In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install bs4

In [None]:
import re

import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict, Counter

from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup


from wordcloud import WordCloud 

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Plot styling shared by every figure in the notebook.
sns.set(style="white", font_scale=1.2)
plt.rcParams["figure.figsize"] = [10,8]

# Show every column when a frame is displayed. Options must be set by calling
# `pd.set_option`; assigning attributes on the function object changes nothing.
# Row truncation is left at the pandas default so displaying a whole frame
# does not flood the output.
pd.set_option("display.max_columns", None)

nltk.download('stopwords', quiet=True)
# Read the corpus through `nltk.corpus` so this cell can be re-run: the name
# `stopwords` (imported above as the corpus reader) is rebound to a plain list here.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
train_df.head()

In [None]:
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_df.head()

This train dataset consists of the following features:
- **Id:** a numerical identifier for the tweet. This will be important when we upload our predictions to the leaderboard.
- **Keyword:** a keyword from the tweet which may in some cases be missing.
- **Location:** the location the tweet was sent from. This may also not be present.
- **Text:** the full text of the tweet.
- **Target:** this is the label we are trying to predict. This will be 1 if the tweet is really about a disaster and 0 if not.

In [None]:
#total data length
print('There are {} rows and {} columns in train'.format(train_df.shape[0],train_df.shape[1]))
print('There are {} rows and {} columns in test'.format(test_df.shape[0],test_df.shape[1]))

# unique location and keyword size of data
print("Checking train location column values",len(train_df.location.unique()))
print("Checking train keyword column values",len(train_df.keyword.unique()))
print("Checking test location column values",len(test_df.location.unique()))
print("Checking test keyword column values",len(test_df.keyword.unique()))

#number of disaster tweets
print("disaster tweets", len(train_df[train_df["target"]==1]) )
print("non-disaster tweets", len(train_df[train_df["target"]==0]) )

Now I plot the target value distribution

In [None]:
target_distribution = train_df["target"].value_counts(normalize=True)
print("Not Disaster: {:.2%}, Disaster: {:.2%}".format(target_distribution[0], target_distribution[1]))

sns.barplot(x=target_distribution.index, y=target_distribution)
plt.title("Histogram of Disaster vs. Non-Disaster")
plt.xlabel("0 = Non-Disaster, 1 = Disaster")
plt.show()

As you can see there are more data points with the label 0 meaning tweets that are not disaster tweets and fewer data points with the label 1 which is tweets that are related to a disaster. Usually, for data that has some skewed labels, it is recommended to use an F-score instead of accuracy for model evaluation.

In [None]:
#take look at neutral tweets
train_df[train_df.target == 0].head(2)

In [None]:
#take look at disaster tweets
train_df[train_df.target == 1].head(2)

Now we can take a look at the distribution of `Keywords` and `Locations`. We plot 20 most repeated values for each.

In [None]:
plt.subplots(1,2,figsize=(10,5))
#visualize top 20 train unique keywords
plt.subplot(1,2,1)
train_df.keyword.value_counts()[:20].plot(kind="bar",title="Unique Keywords")

#visualize top 20 train unique locations
plt.subplot(1,2,2)
train_df.location.value_counts()[:20].plot(kind="bar",title="Unique Locations")

plt.show()

We can also plot Keywords and Location for different categories.

In [None]:
plt.subplots(1,2,figsize=(10,5))
#visualize top 20 disaster tweets and their keywords bar graph
plt.subplot(1,2,1)
train_df[train_df["target"]==1].keyword.value_counts()[:20].plot(kind="bar",title="Disaster tweets keywords")

#visualize top 20 non disaster tweets and their keywords bar graph
plt.subplot(1,2,2)
train_df[train_df["target"]==0].keyword.value_counts()[:20].plot(kind="bar",title="Non-Disaster tweets keywords")

plt.show()

In [None]:
plt.subplots(1,2,figsize=(10,5))
#visualize top 20 disaster tweets and their locations bar graph
plt.subplot(1,2,1)
train_df[train_df["target"]==1].location.value_counts()[:20].plot(kind="bar",title="Disaster tweets Locations")

#visualize top 20 non disaster tweets and their locations bar graph
plt.subplot(1,2,2)
train_df[train_df["target"]==0].location.value_counts()[:20].plot(kind="bar",title="Non-Disaster tweets Locations")

plt.show()

#### Taking care of null values

In [None]:
# Per-column missing-value summary for the training set.
null_counts = pd.DataFrame({"Number_Null": train_df.isnull().sum()})
# Divide by the total number of rows: `DataFrame.count()` only counts non-null
# cells, which would overstate the percentage.
null_counts["Percent_Null"] = null_counts["Number_Null"] / len(train_df) * 100
null_counts

In the train dataset, `location` has `2533` null values and `keyword` has `61`.

In [None]:
# Per-column missing-value summary for the test set.
null_counts = pd.DataFrame({"Number_Null": test_df.isnull().sum()})
# Divide by the total number of rows: `DataFrame.count()` only counts non-null
# cells, which would overstate the percentage.
null_counts["Percent_Null"] = null_counts["Number_Null"] / len(test_df) * 100
null_counts

In the test dataset, `location` has `1105` null values and `keyword` has `26`.

We could drop these two columns, as they do not seem necessary; the cell below leaves that step commented out.

In [None]:
# Let's get rid of `Location` and  `keywords` columns as they are unnecessary.
# train_df.drop(['keyword','location'],axis=1,inplace=True)
# test_df.drop(['keyword','location'],axis=1,inplace=True)

Now we can examine whether the neutral and disaster tweets are spread evenly through our dataset.

In [None]:
#take look at neutral tweets index distribution
train_df[train_df.target == 0].index

In [None]:
#take look at disaster tweets index distribution
train_df[train_df.target == 1].index

#### Taking care of duplicate values.

In [None]:
# Number of tweets whose text repeats an earlier row (first occurrences are not counted).
dupli_sum = train_df.text.duplicated().sum()

if dupli_sum > 0:
    print(dupli_sum, " duplicates found\nremoving duplicates...")
    # Keep the first occurrence of each text. `.copy()` makes the result an
    # independent frame, so columns added in later cells are not written to a
    # slice of the original.
    train_df = train_df.loc[~train_df.text.duplicated(), :].copy()
    print('There are {} rows and {} columns in train after removing duplicates'
          .format(train_df.shape[0],train_df.shape[1]))
else:
    print("no duplicates found")

train_df

#### Extracting hashtags from tweets

In [None]:
# Pull the lower-cased hashtag words out of every tweet and store them as one
# space-separated string per row, for both the train and the test frame.
hashtag_pattern = re.compile(r"#(\w+)")
for frame in (train_df, test_df):
    frame["hashtags"] = [' '.join(hashtag_pattern.findall(tweet.lower())) for tweet in frame["text"]]

In [None]:
train_df.head(2)

In [None]:
test_df.head(2)

## Exploratory Data Analysis

Adding length of the tweets before cleaning to the dataframe.

In [None]:
# add the characters length of tweets
train_df['text_len'] = [len(t) for t in train_df.text]
train_df.head(2)

In [None]:
sns.distplot(train_df["text_len"])
plt.title("Histogram of Tweet Length")
plt.xlabel("Number of Characters")
plt.ylabel("Density")
plt.show()

In [None]:
g = sns.FacetGrid(train_df, col="target", height=5)

g = g.map(sns.distplot, "text_len")
g.fig.subplots_adjust(top=.8)

plt.suptitle("Distribution Tweet Length")
plt.show()

It can be seen in the plot above that the character-length distribution is left-skewed.
What about the test dataset?

In [None]:
test_df['text_len'] = [len(t) for t in test_df.text]
test_df.head(2)

In [None]:
sns.distplot(test_df["text_len"])

plt.title("Histogram of Tweet Length")
plt.xlabel("Number of Characters")
plt.ylabel("Density")
plt.show()

The test data set is also left skewed.

In [None]:
def count_words(x):
    '''
        Count the whitespace-separated words in a tweet.
        input : (str) tweet text
        output: (int) number of words
    '''
    words = x.split()
    return len(words)

In [None]:
train_df["num_words_text"] = train_df["text"].apply(count_words)

sns.distplot(train_df["num_words_text"], bins=10)
plt.title("Histogram of Number of Words per Tweet")
plt.xlabel("Number of Words")
plt.ylabel("Density")
plt.show()

It seems that the number of words follows a normal distribution. As we can see, the majority of tweets are between 11 and 19 words long.

In [None]:
g = sns.FacetGrid(train_df, col="target", height=5)

g = g.map(sns.distplot, "num_words_text")
g.fig.subplots_adjust(top=.8)

plt.suptitle("Distribution Number of Words")
plt.show()

We do the same for test dataset.

In [None]:
test_df["num_words_text"] = test_df["text"].apply(count_words)

sns.distplot(test_df["num_words_text"], bins=10)
plt.title("Histogram of Number of Words per Tweet")
plt.xlabel("Number of Words")
plt.ylabel("Density")
plt.show()

I do some further analysis on number of words and the length of the tweet by calculating the average word length of the tweets

In [None]:
def avg_word_length(x):
    '''
        Mean number of characters per whitespace-separated word in a tweet.
        input : (str) tweet text
        output: (float) average word length; NaN when the text contains no words
    '''
    words = x.split()
    if not words:
        # Nothing to average over (e.g. a tweet that is empty after cleaning):
        # return NaN explicitly instead of relying on a 0/0 division.
        return float("nan")
    return sum(len(w) for w in words) / len(words)

In [None]:
train_df["avg_word_length"] = train_df["text"].apply(avg_word_length)

sns.distplot(train_df["avg_word_length"])
plt.title("Histogram of Average Word Length")
plt.xlabel("Average Word Length")
plt.ylabel("Density")
plt.show()

In [None]:
g = sns.FacetGrid(train_df, col="target", height=5)
g = g.map(sns.distplot, "avg_word_length")

g.fig.subplots_adjust(top=.8)

plt.suptitle("Distribution Average Word Length")
plt.show()

In [None]:
test_df["avg_word_length"] = test_df["text"].apply(avg_word_length)

sns.distplot(test_df["avg_word_length"])
plt.title("Histogram of Average Word Length")
plt.xlabel("Average Word Length")
plt.ylabel("Density")
plt.show()

## Preprocessing

In [None]:
tok = WordPunctTokenizer()

# Twitter @mentions.
pat1 = r'@[A-Za-z0-9_]+'
# http:// and https:// links. `\S` stops at any whitespace, so a link followed
# by a newline or tab does not swallow the text after it.
pat2 = r'https?://\S+'
combined_pat = r'|'.join((pat1, pat2))
# Scheme-less links. The dot is escaped so only a literal "www." prefix matches.
www_pat = r'www\.\S+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not", "haven't":"have not","hasn't":"has not",
                 "hadn't":"had not","won't":"will not","wouldn't":"would not","don't":"do not", "doesn't":"does not","didn't":"did not",
                 "can't":"can not","couldn't":"could not","shouldn't":"should not", "mightn't":"might not", "mustn't":"must not",
                 }

neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner(text):
    '''
        Normalise a raw tweet into a lower-case string of alphabetic tokens.
        input : (str) raw tweet text
        output: (str) cleaned tokens joined by single spaces

        Steps, in order: strip HTML markup, drop byte-order marks and turn
        replacement characters into "?", remove @mentions and URLs, lower-case,
        expand the negations listed in `negations_dic`, replace every
        non-letter with a space, and keep tokens longer than one character.
    '''
    souped = BeautifulSoup(text, 'lxml').get_text()

    # `get_text()` already returns str, so the BOM is handled with a plain
    # replace (str has no `decode` method in Python 3).
    bom_removed = souped.replace(u"\ufeff", "").replace(u"\ufffd", "?")

    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    # Negations are expanded before punctuation is removed, while the
    # apostrophes the pattern relies on are still present.
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]

    return (" ".join(words)).strip()

The order of the cleaning is
1. Souping
2. BOM removing
3. URL address (`http:` pattern) and Twitter ID removing
4. URL address (`www.` pattern) removing
5. Lower-case
6. Negation handling
7. Removing numbers and special characters
8. Tokenizing and joining

In [None]:
train_df['clean_text'] = train_df['text'].map(lambda x: tweet_cleaner(x))
test_df['clean_text'] = test_df['text'].map(lambda x: tweet_cleaner(x))

In [None]:
train_df.head(2)

In [None]:
test_df.head(2)

In [None]:
# Length of tweets after cleaning
train_df['clean_text_len'] = [len(t) for t in train_df.clean_text]
train_df.head(2)

In [None]:
# Length of tweets after cleaning
test_df['clean_text_len'] = [len(t) for t in test_df.clean_text]
test_df.head(2)

In [None]:
# Number of words in each training tweet after cleaning
train_df["clean_num_words"] = train_df["clean_text"].apply(count_words)
train_df.head(2)

In [None]:
# Number of words in each test tweet after cleaning
test_df["clean_num_words"] = test_df["clean_text"].apply(count_words)
test_df.head(2)

In [None]:
# Average words length in tweets after cleaning
train_df["clean_avg_word_length"] = train_df["clean_text"].apply(avg_word_length)
train_df.head(2)

In [None]:
# Average words length in tweets after cleaning
test_df["clean_avg_word_length"] = test_df["clean_text"].apply(avg_word_length)
test_df.head(2)

Taking care of Stopwords

In [None]:
train_df['clean_text_stopword'] = train_df['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
train_df.head(2)

In [None]:
test_df['clean_text_stopword'] = test_df['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
test_df.head(2)

In [None]:
# Keep only words longer than 3 characters (words of 3 characters or fewer are dropped)
train_df['clean_text_stopword'] = train_df['clean_text_stopword'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
train_df.head(2)

In [None]:
test_df['clean_text_stopword'] = test_df['clean_text_stopword'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
test_df.head(2)

## Word Cloud
A word cloud represents word usage in a document by resizing individual words proportionally to their frequency and then presenting them in a random arrangement.

#### Plot Disaster tweets wordcloud

In [None]:
# Concatenate the cleaned, stopword-free text of every disaster tweet
# (target == 1) into one space-separated string for the word cloud.
disaster_tweets = train_df[train_df.target == 1]
disaster_string = ' '.join(disaster_tweets.clean_text_stopword)

In [None]:
wordcloud_disaster = WordCloud(width=1600, height=800,max_font_size=200 ,colormap='magma').generate(disaster_string)
plt.figure(figsize=(12,10))
plt.imshow(wordcloud_disaster, interpolation="bilinear")
plt.axis("off")
plt.show()

#### Plot Neutral tweets wordcloud

In [None]:
# Concatenate the cleaned, stopword-free text of every neutral tweet
# (target == 0) into one space-separated string for the word cloud.
neutral_tweets = train_df[train_df.target == 0]
neutral_string = ' '.join(neutral_tweets.clean_text_stopword)

In [None]:
wordcloud_neutral = WordCloud(width=1600, height=800,max_font_size=200).generate(neutral_string)
plt.figure(figsize=(12,10))
plt.imshow(wordcloud_neutral, interpolation="bilinear")
plt.axis("off")
plt.show()

### Visualize Bigram frequency distribution of tweets

In [None]:
plt.subplots(1,2,figsize=(15,10))

plt.subplot(1,2,1)
#Bigram Frequency distribution for disaster tweets
#convert disaster tweets into single string
txt=' '.join(train_df[train_df["target"]==1]["clean_text_stopword"])
disaster_bigram=nltk.FreqDist(nltk.bigrams(nltk.word_tokenize(txt)))
tmplst=disaster_bigram.most_common(30)

#visualize Bigram frequency distribution for disaster tweets using bar graph
wrd,cnt=zip(*tmplst)
wrd=[ x+","+y for (x,y) in wrd]
plt.barh(wrd,cnt)
plt.title("Disaster Bigram BarGraph")

plt.subplot(1,2,2)
#Bigram Frequency distribution for non disaster tweets
#convert non disaster tweets into single string
txt=' '.join(train_df[train_df["target"]==0]["clean_text_stopword"])
nondisaster_bigram=nltk.FreqDist(nltk.bigrams(nltk.word_tokenize(txt)))
tmplst=nondisaster_bigram.most_common(30)

#visualize Bigram frequency distribution for non disaster tweets using bar graph
wrd,cnt=zip(*tmplst)
wrd=[ x+","+y for (x,y) in wrd]
plt.barh(wrd,cnt)
plt.title("Non Disaster Bigram BarGraph")
plt.tight_layout()
plt.show()

#### Visualize unigram frequency distribution for disaster hashtags

In [None]:
plt.subplots(1,2,figsize=(15,15))
plt.subplot(1,2,1)
# Unigram frequency distribution for disaster hashtags
# Join the hashtags of all disaster tweets (target == 1) into a single string
txt=' '.join(train_df[train_df["target"]==1]["hashtags"])
disaster_unigram_hash=nltk.FreqDist(nltk.word_tokenize(txt))

# Draw the word cloud from the hashtag counts (at most 100 words)
disaster_wc = WordCloud(width=800, height=400, max_words=100).generate_from_frequencies(disaster_unigram_hash)
plt.title("Disaster Unigram Frequency Distribution hashtags")
plt.imshow(disaster_wc, interpolation="bilinear")
plt.axis("off")

plt.subplot(1,2,2)
# Unigram frequency distribution for non-disaster hashtags
# Join the hashtags of all non-disaster tweets (target == 0) into a single string
txt=' '.join(train_df[train_df["target"]==0]["hashtags"])
nondisaster_unigram_hash=nltk.FreqDist(nltk.word_tokenize(txt))

# Draw the word cloud from the hashtag counts (at most 100 words)
nondisaster_wc = WordCloud(width=800, height=400, max_words=100).generate_from_frequencies(nondisaster_unigram_hash)
plt.title("Non Disaster Unigram Frequency Distribution hashtags")
plt.axis("off")
plt.imshow(nondisaster_wc, interpolation="bilinear")
plt.show()

## Neural Network

In [None]:
# Word Embedding
from gensim.models import KeyedVectors
# Keras
from keras import optimizers
from keras.models import Model, Sequential
from keras.layers import Dense, Input, Embedding, Dropout
from keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import LSTM, Bidirectional
from keras.layers.convolutional import Conv1D
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Visualization
%matplotlib inline
import matplotlib.pyplot as plt
from keras.utils import plot_model
# Measuring metrics
from sklearn.metrics import f1_score

# fastText word embeddings

In [None]:
# Downloading fastext
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec

In [None]:
# Keep only the modelling columns and expose the stopword-free text as `clean_text`.
# Renaming on the selection returns a new frame instead of mutating a slice of train_df in place.
train = train_df[['id','text','clean_text_stopword','target']].rename(columns = {'clean_text_stopword':'clean_text'})

In [None]:
train.head(2)

In [None]:
# Keep only the modelling columns and expose the stopword-free text as `clean_text`.
# Renaming on the selection returns a new frame instead of mutating a slice of test_df in place.
test = test_df[['id','text','clean_text_stopword']].rename(columns = {'clean_text_stopword':'clean_text'})

In [None]:
test.head(2)

In [None]:
x_test = test['clean_text']
print('Number of testing sentence: ', x_test.shape)
x_test = np.asarray(x_test)

In [None]:
x_train = train['clean_text']
y_train = train['target']

In [None]:
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [None]:
print('Number of training sentence: ', x_train.shape)
print('Number of training label: ', y_train.shape)

In [None]:
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [None]:
# See the data number of sentence in each category 
from collections import Counter
cnt = Counter(y_train)
cnt = dict(cnt)
print(cnt)

In [None]:
labels = list(cnt.keys())
sizes = list(cnt.values())
colors = ['#3fba36', '#66b3ff']
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=colors,
        autopct='%1.1f%%', startangle=90)
#draw circle
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')  
plt.tight_layout()
# Decomment following line if you want to save the figure
# plt.savefig('distribution.png')
plt.show()

# Prepare FastText Model

In [None]:
EMBEDDING_FILE = 'wiki.en.vec'

def import_with_gensim(file_address):
    """Load word2vec-format vectors from `file_address` with gensim.

    Returns a tuple `(model, words)`: the loaded KeyedVectors object and a
    list of its vocabulary keys in `index_to_key` order.
    """
    vectors = KeyedVectors.load_word2vec_format(file_address)
    vocabulary = list(vectors.index_to_key)
    return vectors, vocabulary
  
ft_model, ft_words = import_with_gensim(EMBEDDING_FILE)

In [None]:
# FastText embedding dimensionality
embed_size = 300

In [None]:
# Mean and standard deviation over all pre-trained embedding weights, so that
# randomly generated weights can be drawn with the same statistics.

embedding_list = [ft_model[w] for w in ft_words]

all_embedding = np.stack(embedding_list)
emb_mean = all_embedding.mean()
emb_std = all_embedding.std()

# Prepare data for Deep Learning model

## Setting tokenizer up

In [None]:
# NOTE(review): num_words is not passed to the Tokenizer below and is not used
# in the cells visible here — confirm whether a vocabulary cap was intended.
num_words = 2500

# Create the tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the training documents
tokenizer.fit_on_texts(x_train)

In [None]:
# Find maximum length of training sentences
max_length = max([len(s.split()) for s in x_train])

## Embed sentences

In [None]:
# Embed training sequences
encoded_docs = tokenizer.texts_to_sequences(x_train)

# Pad embeded training sequences
x_train_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [None]:
# Define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index)+1

In [None]:
# The embedding matrix has one row per tokenizer index and uses the
# pre-trained dimensionality. The Keras Tokenizer numbers words from 1
# (index 0 is left for padding), hence the +1.
nb_words = len(tokenizer.word_index)+1

# Start from random rows drawn with the mean/std of the pre-trained weights.
# Rows of words found in fastText are overwritten below; out-of-vocabulary
# words keep their random row.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

embeddedCount = 0
for word, i in tokenizer.word_index.items():
    # Row i must hold the vector of the word the tokenizer encodes as i,
    # because the Embedding layer looks rows up by that same index.
    # `key_to_index` is a dict, so membership is a hash lookup rather than a
    # scan over the whole vocabulary list.
    if word in ft_model.key_to_index:
        embedding_matrix[i] = ft_model[word]
        embeddedCount += 1

print('total embedded:', embeddedCount, 'common words')
print('Embedding matrix shape:', embedding_matrix.shape)

In [None]:
# Embed testing sequences
encoded_docs = tokenizer.texts_to_sequences(x_test)
# Pad testing sequences
x_test_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

### Train and Validation split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(x_train_padded, y_train, test_size=0.2, random_state=2)

In [None]:
# Class balance of each split. target == 1 marks a disaster tweet and
# target == 0 a neutral one, so the first percentage uses the == 1 mask.
print("Train set has total {0} entries with {1:.2f}% disaster, {2:.2f}% neutral".format(len(X_train),
                                                                             (len(X_train[Y_train == 1]) / (len(X_train)*1.))*100,
                                                                            (len(X_train[Y_train == 0]) / (len(X_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% disaster, {2:.2f}% neutral".format(len(X_val),
                                                                             (len(X_val[Y_val == 1]) / (len(X_val)*1.))*100,
                                                                            (len(X_val[Y_val == 0]) / (len(X_val)*1.))*100))

## B-LSTM Model

In [None]:
model_blstm_fast = Sequential()
model_blstm_fast.add(Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False))
model_blstm_fast.add(Bidirectional(LSTM(300, return_sequences=True, name='lstm_layer')))
model_blstm_fast.add(GlobalMaxPool1D())
model_blstm_fast.add(Dropout(0.1))
model_blstm_fast.add(Dense(300, activation="relu"))
model_blstm_fast.add(Dropout(0.1))
model_blstm_fast.add(Dense(1, activation='sigmoid'))

In [None]:
model_blstm_fast.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model_blstm_fast.summary()
batch_size_blstm = 32
epochs_blstm = 5

In [None]:
# Train model
hist_blstm_fast = model_blstm_fast.fit(X_train, Y_train, batch_size=batch_size_blstm, epochs=epochs_blstm,
                             validation_data = (X_val, Y_val))

In [None]:
# Evaluate the model on the validation split (X_val, Y_val)
loss_blstm_fast, acc_blstm_fast = model_blstm_fast.evaluate(X_val, Y_val, verbose=0)
print('Validation Accuracy: %f' % (acc_blstm_fast*100))

In [None]:
# Get prediction labels for the validation split by thresholding the sigmoid
# output at 0.5 (`Sequential.predict_classes` no longer exists in current Keras/TensorFlow).
y_pred_val_blstm_fast = (model_blstm_fast.predict(X_val) > 0.5).astype("int32")

In [None]:
# Get prediction labels for the test set by thresholding the sigmoid output
# at 0.5 (`Sequential.predict_classes` no longer exists in current Keras/TensorFlow).
y_pred_blstm_fast = (model_blstm_fast.predict(x_test_padded) > 0.5).astype("int32")

## CNN Model


In [None]:
model_cnn_fast = Sequential()
model_cnn_fast.add(Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False))
model_cnn_fast.add(Conv1D(filters=64, kernel_size=4, activation='relu', padding='same'))
model_cnn_fast.add(MaxPooling1D(pool_size=2))
model_cnn_fast.add(Conv1D(filters=64, kernel_size=8, activation='relu', padding='same'))
model_cnn_fast.add(MaxPooling1D(pool_size=2))
model_cnn_fast.add(Conv1D(filters=64, kernel_size=16, activation='relu', padding='same'))
model_cnn_fast.add(GlobalMaxPooling1D())
model_cnn_fast.add(Dropout(0.1))
model_cnn_fast.add(Dense(500, activation="sigmoid"))
model_cnn_fast.add(Dense(1, activation='sigmoid'))

In [None]:
model_cnn_fast.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model_cnn_fast.summary()
batch_size_cnn = 64
epochs_cnn = 10

In [None]:
# Train model
hist_cnn_fast = model_cnn_fast.fit(X_train, Y_train, batch_size=batch_size_cnn, epochs=epochs_cnn,
                        validation_data = (X_val, Y_val))

In [None]:
# Evaluate the model on the validation split (X_val, Y_val)
loss_cnn_fast, acc_cnn_fast = model_cnn_fast.evaluate(X_val, Y_val, verbose=0)
print('Validation Accuracy: %f' % (acc_cnn_fast*100))

In [None]:
# Get prediction labels for the validation split by thresholding the sigmoid
# output at 0.5 (`Sequential.predict_classes` no longer exists in current Keras/TensorFlow).
y_pred_val_cnn_fast = (model_cnn_fast.predict(X_val) > 0.5).astype("int32")

In [None]:
# Get prediction labels for the test set by thresholding the sigmoid output
# at 0.5 (`Sequential.predict_classes` no longer exists in current Keras/TensorFlow).
y_pred_cnn_fast = (model_cnn_fast.predict(x_test_padded) > 0.5).astype("int32")

## Model Evaluation and Confusion Matrix

In [None]:
def model_evaluation(model):
    """Plot training and validation accuracy per epoch.

    model: object whose `history` attribute is a dict holding the
           'accuracy' and 'val_accuracy' series (the notebook passes the
           value returned by `fit`).
    """
    t_acc  = model.history['accuracy']
    v_acc  = model.history['val_accuracy']

    fig, ax1 = plt.subplots(1,1,figsize=(12,8))
    ax1.plot(t_acc,color = 'blue',label = 'Train')
    ax1.plot(v_acc,color = 'orange',label = 'Val')
    ax1.set_title('Accuracy Plot')
    ax1.set_xlabel("#Epochs")
    ax1.set_ylabel("Accuracy")
    ax1.legend()

    plt.show()


In [None]:
model_evaluation(hist_blstm_fast)

In [None]:
model_evaluation(hist_cnn_fast)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true, y_pred : label sequences passed to `confusion_matrix`.
    classes : array-like of tick labels, indexed by label value.
    normalize : when True, each row is divided by its sum; a row whose sum
        is zero is shown as all zeros.
    title : optional plot title.
    cmap : colormap used for the matrix image.

    Returns the matplotlib Axes the matrix was drawn on.
    """
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data; `asarray` lets callers pass
    # a plain list as well as an ndarray.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Skip rows with no true samples instead of dividing by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell with its value; the text colour flips on dark cells
    # so it stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

class_names = np.array([0, 1])
np.set_printoptions(precision=2)

In [None]:
# Plot non-normalized confusion matrix
plot_confusion_matrix(Y_val, y_pred_val_blstm_fast, classes=class_names)
# plt.savefig('cm-blstm.png')
# Plot normalized confusion matrix
plot_confusion_matrix(Y_val, y_pred_val_blstm_fast, classes=class_names, normalize=True)
# Decomment following line if you want to save the figure
# plt.savefig('cm-blstm-normalized.png')
plt.show()

In [None]:
# Plot non-normalized confusion matrix
plot_confusion_matrix(Y_val, y_pred_val_cnn_fast, classes=class_names)
# plt.savefig('cm-cnn.png')
# Plot normalized confusion matrix
plot_confusion_matrix(Y_val, y_pred_val_cnn_fast, classes=class_names, normalize=True)
# plt.savefig('cm-cnn-normalized.png')
plt.show()

In [None]:
print("(Weighted) F1 score of FasttextEmb B-LSTM model:")
f1_score(Y_val, y_pred_val_blstm_fast, average='weighted')

In [None]:
# The score below is computed from the CNN predictions, so the label names the CNN model.
print("(Weighted) F1 score of FasttextEmb CNN model:")
f1_score(Y_val, y_pred_val_cnn_fast, average='weighted')

## Keras embedding

### BLSTM

In [None]:
model_blstm_keras = Sequential()
model_blstm_keras.add(Embedding(vocab_size, 300, input_length=max_length))
model_blstm_keras.add(Bidirectional(LSTM(300, return_sequences=True, name='lstm_layer')))
model_blstm_keras.add(GlobalMaxPool1D())
model_blstm_keras.add(Dropout(0.3))
model_blstm_keras.add(Dense(300, activation="relu"))
model_blstm_keras.add(Dropout(0.3))
model_blstm_keras.add(Dense(300, activation="relu"))
model_blstm_keras.add(Dropout(0.3))
model_blstm_keras.add(Dense(1, activation='sigmoid'))

In [None]:
model_blstm_keras.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model_blstm_keras.summary()
batch_size_blstm = 32
epochs_blstm = 5

In [None]:
# Train model
hist_blstm_keras = model_blstm_keras.fit(X_train, Y_train, batch_size=batch_size_blstm, epochs=epochs_blstm,
                             validation_data = (X_val, Y_val))

In [None]:
# Evaluate the model on the validation split (X_val, Y_val)
loss_blstm_keras, acc_blstm_keras = model_blstm_keras.evaluate(X_val, Y_val, verbose=0)
print('Validation Accuracy: %f' % (acc_blstm_keras*100))

In [None]:
# Get prediction labels for the validation split by thresholding the sigmoid
# output at 0.5 (`Sequential.predict_classes` no longer exists in current Keras/TensorFlow).
y_pred_val_blstm_keras = (model_blstm_keras.predict(X_val) > 0.5).astype("int32")

In [None]:
# Get prediction labels for the test set by thresholding the sigmoid output
# at 0.5 (`Sequential.predict_classes` no longer exists in current Keras/TensorFlow).
y_pred_blstm_keras = (model_blstm_keras.predict(x_test_padded) > 0.5).astype("int32")

In [None]:
model_evaluation(hist_blstm_keras)

### CNN

In [None]:
model_cnn_keras = Sequential()
model_cnn_keras.add(Embedding(vocab_size, 300, input_length=max_length))
model_cnn_keras.add(Conv1D(filters=64, kernel_size=4, activation='relu', padding='same'))
model_cnn_keras.add(MaxPooling1D(pool_size=2))
model_cnn_keras.add(Conv1D(filters=64, kernel_size=8, activation='relu', padding='same'))
model_cnn_keras.add(MaxPooling1D(pool_size=2))
model_cnn_keras.add(Conv1D(filters=64, kernel_size=16, activation='relu', padding='same'))
model_cnn_keras.add(GlobalMaxPooling1D())
model_cnn_keras.add(Dropout(0.1))
model_cnn_keras.add(Dense(500, activation="sigmoid"))
model_cnn_keras.add(Dense(1, activation='sigmoid'))

In [None]:

model_cnn_keras.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model_cnn_keras.summary()
batch_size_cnn = 64
epochs_cnn = 10

In [None]:
# Train model
hist_cnn_keras = model_cnn_keras.fit(X_train, Y_train, batch_size=batch_size_cnn, epochs=epochs_cnn,
                        validation_data = (X_val, Y_val))

In [None]:
# Evaluate the model on the validation split (X_val, Y_val)
loss_cnn_keras, acc_cnn_keras = model_cnn_keras.evaluate(X_val, Y_val, verbose=0)
print('Validation Accuracy: %f' % (acc_cnn_keras*100))

In [None]:
# Get prediction labels for the validation split by thresholding the sigmoid
# output at 0.5 (`Sequential.predict_classes` no longer exists in current Keras/TensorFlow).
y_pred_val_cnn_keras = (model_cnn_keras.predict(X_val) > 0.5).astype("int32")

In [None]:
# Get prediction labels for the test set by thresholding the sigmoid output
# at 0.5 (`Sequential.predict_classes` no longer exists in current Keras/TensorFlow).
y_pred_cnn_keras = (model_cnn_keras.predict(x_test_padded) > 0.5).astype("int32")

In [None]:
model_evaluation(hist_cnn_keras)

## Confusion Matrix

In [None]:
# Plot non-normalized confusion matrix
plot_confusion_matrix(Y_val, y_pred_val_blstm_keras, classes=class_names)
# plt.savefig('cm-blstm.png')
# Plot normalized confusion matrix
plot_confusion_matrix(Y_val, y_pred_val_blstm_keras, classes=class_names, normalize=True)
# Decomment following line if you want to save the figure
# plt.savefig('cm-blstm-normalized.png')
plt.show()

In [None]:
# Plot non-normalized confusion matrix
plot_confusion_matrix(Y_val, y_pred_val_cnn_keras, classes=class_names)
# plt.savefig('cm-cnn.png')
# Plot normalized confusion matrix
plot_confusion_matrix(Y_val, y_pred_val_cnn_keras, classes=class_names, normalize=True)
# plt.savefig('cm-cnn-normalized.png')
plt.show()

In [None]:
print("(Weighted) F1 score of KerasEmb B-LSTM model:")
f1_score(Y_val, y_pred_val_blstm_keras, average='weighted')

In [None]:
print("(Weighted) F1 score of KerasEmb CNN model:")
f1_score(Y_val, y_pred_val_cnn_keras, average='weighted')

# Machine Learning Algorithms

In [None]:
# sklearn
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline

# Measuring metrics
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize

In [None]:
# Document-frequency cut-off shared by the vectorizers below: terms that appear
# in fewer than `min_df` documents are left out of the vocabulary.
min_df = 1


def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``.

    Used as the ``tokenizer`` callable of the ``CountVectorizer`` steps.

    Args:
        text: one document, as a string.

    Returns:
        The tokens returned by ``word_tokenize`` for ``text``.
    """
    tokens = word_tokenize(text)
    return tokens

### Naive Bayes

In [None]:
# Peek at the first two rows of the training frame used for the classic ML models.
train.head(2)

In [None]:
# Peek at the first two rows of the test frame used for the classic ML models.
test.head(2)

In [None]:
# `clean_text` column of the test frame as a NumPy array, ready for predict().
x_test = np.asarray(test['clean_text'])
print('Number of testing sentence: ', x_test.shape)

In [None]:
# Labelled data as NumPy arrays: the `clean_text` column (inputs) and `target` (labels).
x_train = np.asarray(train['clean_text'])
y_train = np.asarray(train['target'])
print('Number of training sentence: ', x_train.shape)
print('Number of training label: ', y_train.shape)

In [None]:
# Hold out 20% of the labelled tweets for validation (fixed seed, repeatable split).
# NOTE(review): this rebinds X_train / X_val / Y_train / Y_val, the same names the
# Keras cells above pass to fit()/evaluate(). From here on they hold the arrays split
# from x_train / y_train, so re-running those Keras cells after this one would feed
# them these arrays instead of the ones they were written for.
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=2)

In [None]:
# Naive Bayes Model: word uni-/bi-gram counts -> sublinear TF-IDF -> MultinomialNB.
naive_bayes = Pipeline([
    ('vect', CountVectorizer(
        tokenizer=tokenize,
        analyzer='word',
        ngram_range=(1, 2),
        min_df=min_df,
        lowercase=False,
    )),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('clf', MultinomialNB()),
]).fit(X_train, Y_train)  # Pipeline.fit returns the fitted pipeline itself

# Validation predictions (reused by the confusion-matrix / F1 cells) and accuracy.
predict_val_nb = naive_bayes.predict(X_val)
naive_score = naive_bayes.score(X_val, Y_val)
print('Naive Bayes Model: ', naive_score)

In [None]:
# Labels predicted by the fitted Naive Bayes pipeline for the test tweets in x_test.
predict_nb = naive_bayes.predict(x_test)

### Linear Support Vector Machine Model

In [None]:
# Linear Support Vector Machine Model
# Word uni-/bi-gram counts -> sublinear TF-IDF -> linear SVM (hinge loss, L2 penalty).
svm = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize,
                             analyzer='word', ngram_range=(1, 2),
                             min_df=min_df, lowercase=False)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    # max_iter was 5, which stops the liblinear solver long before it converges;
    # the resulting ConvergenceWarning is hidden by the notebook-wide warnings filter.
    # Use the library default (1000) and seed the solver so the score is repeatable.
    ('clf-svm', LinearSVC(loss='hinge', penalty='l2',
                          max_iter=1000, random_state=42)),
])

svm = svm.fit(X_train, Y_train)
# Mean accuracy on the validation split, then the validation predictions that
# the confusion-matrix / F1 cells reuse.
linear_svc_score = svm.score(X_val, Y_val)
print('Linear SVC Model: ', linear_svc_score)
predict_val_svm = svm.predict(X_val)

In [None]:
# Labels predicted by the fitted linear-SVM pipeline for the test tweets in x_test.
predict_svm = svm.predict(x_test)

### Stochastic Gradient Descent Model

In [None]:
# SGD (Stochastic Gradient Descent) Model
# Word uni-/bi-gram counts -> sublinear TF-IDF -> linear model trained with SGD
# (hinge loss, L2 penalty).
sgd = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize,
                             analyzer='word', ngram_range=(1, 2),
                             min_df=min_df, lowercase=False)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    # SGD shuffles the training data between epochs; without a seed the fitted
    # weights, and therefore the reported scores, change from run to run.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, max_iter=5, random_state=42)),
])
sgd = sgd.fit(X_train, Y_train)
# Mean accuracy on the validation split, then the validation predictions that
# the confusion-matrix / F1 cells reuse.
sgd_score = sgd.score(X_val, Y_val)
print('SGD Model: ', sgd_score)
predict_val_sgd = sgd.predict(X_val)

In [None]:
# Labels predicted by the fitted SGD pipeline for the test tweets in x_test.
predict_sgd = sgd.predict(x_test)

# Confusion Matrix

In [None]:
# Cast the validation labels and the three sets of validation predictions to int
# before they are passed to the confusion-matrix and F1 cells below.
# NOTE(review): Y_val is overwritten in place; the cast is idempotent on re-run.
Y_val = Y_val.astype(int)
predict_val_nb = predict_val_nb.astype(int)
predict_val_svm = predict_val_svm.astype(int)
predict_val_sgd = predict_val_sgd.astype(int)

In [None]:
# Confusion matrices of the Naive Bayes validation predictions against Y_val.
# NOTE(review): plot_confusion_matrix and class_names are defined in earlier
# cells that are not visible here.
# Plot non-normalized confusion matrix
plot_confusion_matrix(Y_val, predict_val_nb, classes=class_names)
# Uncomment the following line if you want to save the figure
# plt.savefig('cm-nb.png')
# Plot normalized confusion matrix
plot_confusion_matrix(Y_val, predict_val_nb, classes=class_names, normalize=True)
# Uncomment the following line if you want to save the figure
# plt.savefig('cm-nb-normalized.png')
plt.show()

In [None]:
# Confusion matrices of the linear-SVM validation predictions against Y_val.
# NOTE(review): plot_confusion_matrix and class_names are defined in earlier
# cells that are not visible here.
# Plot non-normalized confusion matrix
plot_confusion_matrix(Y_val, predict_val_svm, classes=class_names)
# Uncomment the following line if you want to save the figure
# plt.savefig('cm-svm.png')
# Plot normalized confusion matrix
plot_confusion_matrix(Y_val, predict_val_svm, classes=class_names, normalize=True)
# Uncomment the following line if you want to save the figure
# plt.savefig('cm-svm-normalized.png')
plt.show()

In [None]:
# Confusion matrices of the SGD validation predictions against Y_val.
# NOTE(review): plot_confusion_matrix and class_names are defined in earlier
# cells that are not visible here.
# Plot non-normalized confusion matrix
plot_confusion_matrix(Y_val, predict_val_sgd, classes=class_names)
# Uncomment the following line if you want to save the figure
# plt.savefig('cm-sgd.png')
# Plot normalized confusion matrix
plot_confusion_matrix(Y_val, predict_val_sgd, classes=class_names, normalize=True)
# Uncomment the following line if you want to save the figure
# plt.savefig('cm-sgd-normalized.png')
plt.show()

# F1 Score

In [None]:
# Weighted F1 (per-class F1 averaged by class support) of the Naive Bayes
# validation predictions; the bare expression is the value the cell displays.
print("F1 score of NB model:")
f1_score(Y_val, predict_val_nb, average='weighted')

In [None]:
# Weighted F1 (per-class F1 averaged by class support) of the linear-SVM
# validation predictions; the bare expression is the value the cell displays.
print("F1 score of SVM model:")
f1_score(Y_val, predict_val_svm, average='weighted')

In [None]:
# Weighted F1 (per-class F1 averaged by class support) of the SGD
# validation predictions; the bare expression is the value the cell displays.
print("F1 score of SGD model:")
f1_score(Y_val, predict_val_sgd, average='weighted')

# Try other models

In [None]:
# Reminder of the training frame layout before the model benchmark (same view as
# the earlier train.head(2) cell).
train.head(2)

In [None]:
# Reminder of the test frame layout before the model benchmark (same view as
# the earlier test.head(2) cell).
test.head(2)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import re
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score #Compute Area Under the Curve (AUC) from prediction scores

import seaborn as sns
import matplotlib.pyplot as plt

from warnings import filterwarnings
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline

In [None]:
# One stratified 80/20 shuffle split (fixed seed) of the labelled tweets.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

X = train['clean_text'].to_numpy()
y = train['target'].to_numpy()

# n_splits=1, so the splitter yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(sss.split(X, y))
print("TRAIN:", train_index, "TEST:", test_index)

# Note: X_train / y_train are rebound here to the stratified training part.
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

In [None]:
# Class balance of the stratified training part, one bar per label value.
plt.figure(figsize=(2.5,5))
plt.title("Distribution in Train dataset")
# Pass the labels through the `x` keyword: on seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so the positional form no longer counts per class.
p1 = sns.countplot(x=y_train, palette = 'plasma')

# Write each bar's share of the training labels just above the bar.
for p in p1.patches:
    p1.annotate('{:6.2f}%'.format(p.get_height()/len(y_train)*100), (p.get_x()+0.1, p.get_height()+50))

plt.show()

In [None]:
# Class balance of the stratified held-out part, one bar per label value.
plt.figure(figsize=(2.5,5))
plt.title("Distribution in Test dataset")
# Pass the labels through the `x` keyword: on seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so the positional form no longer counts per class.
p1 = sns.countplot(x=y_test, palette = 'plasma')

# Write each bar's share of the held-out labels just above the bar.
for p in p1.patches:
    p1.annotate('{:6.2f}%'.format(p.get_height()/len(y_test)*100), (p.get_x()+0.2, p.get_height()+12))

plt.show()

In [None]:
# Shared feature extraction for the benchmark: bag-of-words counts with English
# stop words removed, re-weighted by TF-IDF.
tweets_pipeline = Pipeline([('CVec', CountVectorizer(stop_words='english')),
                     ('Tfidf', TfidfTransformer())])

# Fit on the training part only, then apply the fitted transform to the held-out
# part so no held-out text influences the learned features.
X_train_tranformed = tweets_pipeline.fit_transform(X_train)
X_test_tranformed = tweets_pipeline.transform(X_test)

In [None]:
# Candidate models for the benchmark, keyed by the label shown in the results table.
# The tree/ensemble estimators are seeded so repeated runs give the same ranking.
classifiers = {
    "Logistic Regression": LogisticRegression(class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "k-Nearest Neighbors": KNeighborsClassifier(),
    # SVC defaults to an RBF kernel; request the linear kernel the label promises.
    "Linear SVM": SVC(kernel='linear', class_weight='balanced'),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    'RidgeClassifier': RidgeClassifier(class_weight='balanced'),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'MNB': MultinomialNB(),
    'Perceptron': Perceptron(class_weight='balanced'),
    'xgboost': XGBClassifier(n_estimators=300),
    'catboost': CatBoostClassifier(verbose=0),
}

In [None]:
# Number of models registered in the `classifiers` dict.
no_classifiers = len(classifiers)

from time import process_time 


def _positive_class_scores(classifier, features):
    """Return a continuous positive-class score for ``features`` (for ROC-AUC)."""
    # Probability of the second class when the estimator can produce it ...
    if hasattr(classifier, "predict_proba"):
        return np.asarray(classifier.predict_proba(features))[:, 1]
    # ... otherwise the signed distance to the decision boundary ...
    if hasattr(classifier, "decision_function"):
        return classifier.decision_function(features)
    # ... and hard labels only as a last resort.
    return classifier.predict(features)


def batch_classify(X_train_tranformed, y_train, X_test_tranformed, y_test, verbose = True,
                   classifiers_to_fit = None):
    """Fit every classifier and report its ROC-AUC and training time.

    Args:
        X_train_tranformed: feature matrix used to fit each classifier.
        y_train: binary labels for ``X_train_tranformed``.
        X_test_tranformed: feature matrix used for scoring.
        y_test: binary labels for ``X_test_tranformed``.
        verbose: when true, print one line per classifier after it is fitted.
        classifiers_to_fit: optional ``{label: estimator}`` mapping; defaults to
            the notebook-level ``classifiers`` dict.

    Returns:
        A DataFrame with one row per classifier and the columns
        ``'Classifier'``, ``'Area Under Curve'`` and ``'Training time'``
        (CPU seconds spent in ``fit``, measured with ``process_time``).
    """
    if classifiers_to_fit is None:
        classifiers_to_fit = classifiers

    # Collect plain records and build the frame once: writing strings into a
    # pre-allocated all-float frame is rejected by recent pandas versions.
    records = []
    for key, classifier in classifiers_to_fit.items():
        t_start = process_time()
        classifier.fit(X_train_tranformed, y_train)
        t_elapsed = process_time() - t_start

        # ROC-AUC ranks by score; hard 0/1 predictions collapse the curve to a
        # single operating point, so use continuous scores when available.
        y_scores = _positive_class_scores(classifier, X_test_tranformed)

        records.append({
            'Classifier': key,
            'Area Under Curve': roc_auc_score(y_test, y_scores),
            'Training time': t_elapsed,
        })
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_elapsed))

    return pd.DataFrame(records, columns=['Classifier', 'Area Under Curve', 'Training time'])

In [None]:
# Fit and score every registered classifier on the TF-IDF features, then list the
# results from the highest to the lowest 'Area Under Curve'.
df_results = batch_classify(X_train_tranformed, y_train,X_test_tranformed, y_test)
print(df_results.sort_values(by='Area Under Curve', ascending=False))