In [None]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import numpy as np



# Disaster tweets: exploratory analysis, text cleaning, and a GloVe + LSTM classifier

# 1. Load Data

In [None]:
# Read the train/test CSVs, then preview each frame followed by its row count.
train = pd.read_csv("../input/nlp-getting-started/train.csv")
test = pd.read_csv("../input/nlp-getting-started/test.csv")

for frame in (train, test):
    display(frame.head())
    print(len(frame))

## Data Distribution

In [None]:
# Class balance: number of training rows for each value of "target".
class_counts = train["target"].value_counts()

plt.grid()
# x/y must be passed by keyword: recent seaborn releases reject positional data vectors.
sns.barplot(x=class_counts.index, y=class_counts.values)
plt.gca().set_ylabel("samples")
plt.title("distribution")

In [None]:
# Histogram of character counts for tweets with target == 1.
plt.grid()

disaster_lengths = train.loc[train["target"] == 1, "text"].str.len()
plt.hist(disaster_lengths)
plt.title("Disaster tweets length")

In [None]:
# Histogram of character counts for tweets with target == 0 (drawn in red).
plt.grid()

non_disaster_lengths = train.loc[train["target"] == 0, "text"].str.len()
plt.hist(non_disaster_lengths, color='r')
plt.title("No disaster tweets length")

In [None]:
plt.grid()

# Mean word length (in characters) of each tweet with target == 1.
mean_word_len = (
    train[train["target"] == 1]["text"]
    .str.split()
    .apply(lambda words: np.mean([len(w) for w in words]))
)
sns.distplot(mean_word_len)
# The plotted quantity is average word length, not tweet length.
plt.title("Average word length in disaster tweets")

In [None]:
plt.grid()

# Mean word length (in characters) of each tweet with target == 0.
mean_word_len = (
    train[train["target"] == 0]["text"]
    .str.split()
    .apply(lambda words: np.mean([len(w) for w in words]))
)
sns.distplot(mean_word_len, color='r')
# This cell covers the target == 0 class, so the title must not say "Disaster tweets".
plt.title("Average word length in no disaster tweets")

# 2. Create corpus

In [None]:
def create_corpus(target, data=None):
    """Return every whitespace-separated token from the tweets of one class.

    Args:
        target: Value of the "target" column that selects the tweets to use.
        data: Optional DataFrame with "text" and "target" columns. When omitted,
            the notebook-level ``train`` frame is used.

    Returns:
        A flat list of tokens in tweet order, duplicates included.
    """
    if data is None:
        data = train
    tweets = data.loc[data["target"] == target, "text"].str.split()
    # No per-tweet print here: it wrote one output line for every tweet.
    return [token for tokens in tweets for token in tokens]

In [None]:
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams

# Download the stop-word corpus, then build the English stop-word set.
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

In [None]:
# Count how often each English stop word occurs in tweets with target == 0.
corpus = create_corpus(0)
stop = set(stopwords.words("english"))

dictionary = defaultdict(int)
for word in (token for token in corpus if token in stop):
    dictionary[word] += 1

# Ten most frequent stop words as (word, count) pairs, highest count first.
top = sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True)[:10]

In [None]:
# Unzip the (word, count) pairs into parallel tuples for the bar chart.
words, counts = zip(*top)

plt.grid()
plt.bar(words, counts)
plt.title("top words 0")

In [None]:
# Same stop-word tally as the previous section, for tweets with target == 1.
corpus = create_corpus(1)
stop = set(stopwords.words("english"))

dictionary = defaultdict(int)
for word in filter(stop.__contains__, corpus):
    dictionary[word] += 1

# Keep the ten highest counts as (word, count) pairs.
top = sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True)[:10]

In [None]:
# Split the (word, count) pairs into bar labels and bar heights.
labels, heights = zip(*top)

plt.grid()
plt.bar(labels, heights, color='r')
plt.title("top words 1")

### Punctuation

In [None]:
import string

# Count standalone punctuation tokens in tweets with target == 1.
corpus = create_corpus(1)

dictionary = defaultdict(int)

special_char = string.punctuation
# Membership is tested against a set of single characters. `token in special_char`
# on the raw string is a substring test, so multi-character tokens that happen to
# be a slice of string.punctuation (e.g. "./" or "<=>") were counted as well.
punctuation_marks = set(special_char)

for i in corpus:
    if i in punctuation_marks:
        dictionary[i] += 1
        
        


In [None]:
# Separate the punctuation marks from their counts for plotting.
marks, counts = zip(*dictionary.items())

plt.grid()
plt.bar(marks, counts)
plt.title("Punctuation disaster 1")

In [None]:
import string

# Count standalone punctuation tokens in tweets with target == 0.
corpus = create_corpus(0)

dictionary = defaultdict(int)

special_char = string.punctuation
# Use a set of single characters: `token in special_char` on the raw string is a
# substring test and also matches multi-character slices such as "./" or "<=>".
punctuation_marks = set(special_char)

for i in corpus:
    if i in punctuation_marks:
        dictionary[i] += 1

In [None]:
# Separate the punctuation marks from their counts for plotting (red bars).
marks, counts = zip(*dictionary.items())

plt.grid()
plt.bar(marks, counts, color='r')
plt.title("Punctuation disaster 0")

## Common words

In [None]:
from collections import Counter

In [None]:
# Token frequencies for `corpus`, i.e. whatever the most recent create_corpus call
# produced (target == 0 when the cells are run top to bottom).
counter = Counter(corpus)
most = counter.most_common()

# Of the 40 most frequent tokens, keep only those that are not stop words.
top_non_stop = [(word, count) for word, count in most[:40] if word not in stop]
x = [word for word, _count in top_non_stop]
y = [count for _word, count in top_non_stop]

In [None]:
# Horizontal bars: counts (`y`) on the x-axis, words (`x`) on the y-axis.
ax = plt.gca()
ax.set_title("most common words")
ax.grid()
sns.barplot(x=y, y=x, ax=ax)

# 3. Data cleaning

In [None]:
# Stack train and test (train rows first) so the cleaning steps below are applied
# to both; a later cell relies on this order when it slices `tweet_pad` by
# `train.shape[0]`.
df = pd.concat([train, test])
df.shape

In [None]:
# Preview the first rows of the combined frame instead of rendering all of it.
df.head()

## Remove URLs

In [None]:
import re

In [None]:
def remove_url(text):
    """Strip ``http://`` / ``https://`` links and ``www.``-prefixed links from ``text``.

    A link runs up to the next whitespace character; everything else is returned
    unchanged.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
# Strip links from every tweet.
df["text"] = df["text"].apply(remove_url)

In [None]:
# Preview the first rows after URL removal instead of rendering the whole frame.
df.head()

## Remove HTML tags

In [None]:
def remove_html(text):
    """Delete every ``<...>`` span (shortest match, within a single line) from ``text``."""
    return re.sub(r'<.*?>', r'', text)

In [None]:
# Strip HTML tags from every tweet.
df["text"] = df["text"].apply(remove_html)

In [None]:
# Preview the first rows after HTML-tag removal instead of rendering the whole frame.
df.head()

## Remove emoji

In [None]:
# Compiled once when the cell runs rather than on every call.
# The previous catch-all range U+24C2..U+1F251 also covered CJK, Hangul and many
# other ordinary scripts, so non-emoji text was deleted. It is replaced by the
# narrower symbol ranges listed below.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F000-\U0001F251"  # tiles, playing cards, enclosed alphanumerics/ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector that trails many emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Args:
        text: Input string.

    Returns:
        The string without any character matched by ``_EMOJI_PATTERN``; letters
        of any script (Latin, CJK, Hangul, ...) are left in place.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [None]:
# Strip emoji from every tweet.
df["text"] = df["text"].apply(remove_emoji)

In [None]:
# Preview the first rows after emoji removal instead of rendering the whole frame.
df.head()

## Remove punctuation

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` dropped."""
    return "".join(ch for ch in text if ch not in string.punctuation)

In [None]:
# Strip ASCII punctuation from every tweet.
df["text"] = df["text"].apply(remove_punctuation)

In [None]:
# Preview the first rows after punctuation removal instead of rendering the whole frame.
df.head()

## Spelling checker

Additional: a spelling checker, intended for an Indonesian dataset.

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

In [None]:
# Module-level checker shared by correct_spellings(); built with SpellChecker's defaults.
spell = SpellChecker()

def correct_spellings(text, checker=None):
    """Replace words the spell checker does not know with its best suggestion.

    Args:
        text: Input string; it is split on whitespace.
        checker: Optional object exposing ``unknown(words)`` and
            ``correction(word)``. When omitted, the notebook-level ``spell``
            instance is used.

    Returns:
        The words re-joined with single spaces. A word is kept as-is when the
        checker knows it or has no suggestion for it.
    """
    if checker is None:
        checker = spell

    words = text.split()
    misspelled_words = checker.unknown(words)

    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() can return None when it finds no candidate; joining a
            # None would raise TypeError, so fall back to the original word.
            suggestion = checker.correction(word)
            corrected_text.append(word if suggestion is None else suggestion)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [None]:
#df['text']=df['text'].apply(lambda x : correct_spellings(x))

# 4. GloVe vectorization (pre-trained word embeddings)

In [None]:
from tqdm import tqdm
from nltk.tokenize import word_tokenize
nltk.download('punkt')

In [None]:
def create_corpus(df):
    """Tokenize every tweet into lower-cased, alphabetic, non-stop-word tokens.

    Note: this redefines the earlier ``create_corpus(target)`` helper; once this
    cell has run, the name refers to this DataFrame-based version.

    Args:
        df: DataFrame with a "text" column of strings.

    Returns:
        A list holding one token list per row of ``df``, in row order.
    """
    corpus = []
    for tweet in tqdm(df["text"]):
        # The stop-word test uses the lower-cased form, because that is the form
        # stored in the corpus. Testing the raw token is case-sensitive and let
        # capitalised stop words such as "The" through as "the".
        words = [
            word.lower()
            for word in word_tokenize(tweet)
            if word.isalpha() and word.lower() not in stop
        ]
        corpus.append(words)

    return corpus

In [None]:
# Tokenize the cleaned train+test tweets; `corpus` now holds one token list per row of df.
corpus = create_corpus(df)

In [None]:
# Location of the GloVe vector file (one word followed by its coefficients per line).
GLOVE_PATH = '/content/drive/MyDrive/Projects/Natural Disaster Tweets/glove.6B.100d.txt'

# Maps each word in the file to its float32 vector.
embedding_dict = {}

# The encoding is stated explicitly so the platform default cannot break on
# non-ASCII tokens. The `with` block closes the file, so no close() call follows.
with open(GLOVE_PATH, 'r', encoding='utf-8') as glove:
    for line in glove:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D, Dropout
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

In [None]:
# Fit the tokenizer on the cleaned corpus and convert every tweet to integer ids.
MAX_LEN = 50
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

# Pad and truncate at the end of each sequence, to MAX_LEN entries.
pad_options = dict(maxlen=MAX_LEN, truncating='post', padding='post')
tweet_pad = pad_sequences(sequences, **pad_options)

In [None]:
# Word -> integer id mapping learned by the tokenizer; its size is the vocabulary size.
word_index = tokenizer_obj.word_index
print('number of unique words: ', len(word_index))

In [None]:
# One row per tokenizer id plus one extra row.
# NOTE(review): presumably ids start at 1 and row 0 is reserved for padding — confirm.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

for word, i in tqdm(word_index.items()):
    # Valid rows are 0..num_words-1. The previous `i > num_words` test let
    # i == num_words through, which would index one past the end of the matrix.
    if i >= num_words:
        continue

    # Words missing from GloVe keep their all-zero row.
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
from keras import regularizers

# Frozen embedding layer initialised from the GloVe matrix.
glove_embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)


def _regularized_relu(units):
    """Dense ReLU layer with the L1/L2 kernel penalty shared by both hidden layers."""
    return Dense(
        units,
        activation='relu',
        kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
    )


# Embedding -> LSTM -> two regularized dense layers -> sigmoid output.
model = Sequential([
    glove_embedding,
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    _regularized_relu(128),
    _regularized_relu(256),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

optimizer = Adam(learning_rate=1e-5)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=["accuracy"])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# tweet_pad rows follow df's order (train first, then test), so cut at the train row count.
n_train = train.shape[0]
train_data, test_data = tweet_pad[:n_train], tweet_pad[n_train:]

In [None]:
# Hold out 20% of the labelled tweets for validation. A fixed random_state makes
# the split, and therefore the reported metrics, repeatable across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train["target"].values,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Train for 20 epochs, validating on the held-out split after each epoch.
hist = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=64,
)

In [None]:
# Plot accuracy and loss on separate axes. They are different quantities, and the
# previous single chart titled and labelled all four curves as "Accuracy".
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_acc.plot(hist.history['accuracy'], label="Accuracy")
ax_acc.plot(hist.history['val_accuracy'], label="Validation Accuracy")
ax_acc.set(title="Model Accuracy", xlabel="Epoch", ylabel="Accuracy")
ax_acc.legend()

ax_loss.plot(hist.history['loss'], label="Loss")
ax_loss.plot(hist.history['val_loss'], label="Validation Loss")
ax_loss.set(title="Model Loss", xlabel="Epoch", ylabel="Loss")
ax_loss.legend()

fig.tight_layout()
plt.show()