# Real or Not? NLP with Disaster Tweets
Predict which Tweets are about real disasters and which ones are not

# Importing the libraries and loading the data

In [1]:
# For notebook plotting
%matplotlib inline

# Standard packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import pickle
# import sqlite3

# nltk for preprocessing of text data
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# sklearn for preprocessing and machine learning models
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# XGBoost for Machine Learning (Gradient Boosting Machine (GBM))
import xgboost as xgb

# Keras for neural networks
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score,classification_report, recall_score, precision_score
from sklearn.metrics import confusion_matrix,mean_absolute_error,mean_squared_error, f1_score


# Random seeds for consistent results
from tensorflow import set_random_seed
seed = 1234
np.random.seed(seed)
set_random_seed(seed)


Using TensorFlow backend.


In [2]:
os.chdir("C:\\Users\\user\\Desktop\\disaster twitter using nlp")

In [3]:
# Read the dataset

train_df= pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')



In [4]:
print('Train Raw Dataframe:')
train_df.head(5)

Train Raw Dataframe:


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
print('Test Raw Dataframe:')
test_df.head(5)

Test Raw Dataframe:


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# To know column or attribute names in train data
train_df.columns   

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [7]:
# To know column or attribute names in test_df 
test_df.columns   

Index(['id', 'keyword', 'location', 'text'], dtype='object')

#### Data Preprocessing The first step should be to check the shape of the dataframe and then check the number of null values in each column.

In this way we can get an idea of the redundant columns in the data frame depending on which columns have the highest number of null values.

In [8]:
print("Shape of the train dataframe is",train_df.shape)
print("The number of nulls in each column are \n", train_df.isna().sum())

Shape of the train dataframe is (7613, 5)
The number of nulls in each column are 
 id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


In [9]:
print("Shape of the test dataframe is",test_df.shape)
print("The number of nulls in each column are \n", test_df.isna().sum())

Shape of the test dataframe is (3263, 4)
The number of nulls in each column are 
 id             0
keyword       26
location    1105
text           0
dtype: int64


#### Delete the columns in train data and test data

In [10]:
del train_df['id']
del train_df['keyword']
del train_df['location']
#del train_df['target']
train_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [11]:
del test_df['id']
del test_df['keyword']
del test_df['location']
test_df.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


## Preprocessing

We created a preprocessor class to perform all steps that need to be performed before the text data can be vectorized. These preprocessing steps include:

Tokenization Removing stop words Stemming Transform the tokens back to one string

#### Preprocessing for train data

In [12]:
class PreProcessor:
    '''
    Easily performs all the standard preprocessing steps
    like removing stopwords, stemming, etc.
    Only input that you need to provide is the dataframe and column name for the tweets
    '''
    def __init__(self, train_df, column_name):
        self.data = train_df
        self.conversations = list(self.data[column_name])
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        self.preprocessed = []
        
    def tokenize(self, sentence):
        '''
        Splits up words and makes a list of all words in the tweet
        '''
        tokenized_sentence = word_tokenize(sentence)
        return tokenized_sentence
            
    def remove_stopwords(self, sentence):
        '''Removes stopwords like 'a', 'the', 'and', etc.'''
        filtered_sentence = []
        for w in sentence:
            if w not in self.stopwords and len(w) > 1 and w[:2] != '//' and w != 'https': 
                filtered_sentence.append(w)
        return filtered_sentence
    
    def stem(self, sentence):
        '''
        Stems certain words to their root form.
        For example, words like 'computer', 'computation'
        all get truncated to 'comput'
        '''
        return [self.stemmer.stem(word) for word in sentence]
    
    def join_to_string(self, sentence):
        '''
        Joins the tokenized words to one string.
        '''
        return ' '.join(sentence)
    
    def full_preprocess(self, n_rows=None):
        '''
        Preprocess a selected number of rows and
        connects them back to strings
        '''
        # If nothing is given do it for the whole dataset
        if n_rows == None:
            n_rows = len(self.data)
            
        # Perform preprocessing
        for i in range(n_rows):
            tweet = self.conversations[i]
            tokenized = self.tokenize(tweet)
            cleaned = self.remove_stopwords(tokenized)
            stemmed = self.stem(cleaned)
            joined = self.join_to_string(stemmed)
            self.preprocessed.append(joined)
        return self.preprocessed

In [13]:
# Preprocess text and put it in a new column
preprocessor = PreProcessor(train_df, 'text')
train_df['cleaned_text'] = preprocessor.full_preprocess()

In [14]:
del train_df['text']    # deleted text column

In [15]:
train_df    # after cleaning in the train dataframe  

Unnamed: 0,target,cleaned_text
0,1,our deed reason earthquak may allah forgiv us
1,1,forest fire near la rong sask canada
2,1,all resid ask shelter place notifi offic no ev...
3,1,"13,000 peopl receiv wildfir evacu order califo..."
4,1,just got sent photo rubi alaska smoke wildfir ...
5,1,rockyfir updat california hwi 20 close direct ...
6,1,flood disast heavi rain caus flash flood stree...
7,1,'m top hill see fire wood ...
8,1,there 's emerg evacu happen build across street
9,1,'m afraid tornado come area ...


#### Preprocessing for test data

In [16]:
train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [17]:
class PreProcessor:
    '''
    Easily performs all the standard preprocessing steps
    like removing stopwords, stemming, etc.
    Only input that you need to provide is the dataframe and column name for the tweets
    '''
    def __init__(self, test_df, column_name):
        self.data = test_df
        self.conversations = list(self.data[column_name])
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        self.preprocessed = []
        
    def tokenize(self, sentence):
        '''
        Splits up words and makes a list of all words in the tweet
        '''
        tokenized_sentence = word_tokenize(sentence)
        return tokenized_sentence
            
    def remove_stopwords(self, sentence):
        '''Removes stopwords like 'a', 'the', 'and', etc.'''
        filtered_sentence = []
        for w in sentence:
            if w not in self.stopwords and len(w) > 1 and w[:2] != '//' and w != 'https': 
                filtered_sentence.append(w)
        return filtered_sentence
    
    def stem(self, sentence):
        '''
        Stems certain words to their root form.
        For example, words like 'computer', 'computation'
        all get truncated to 'comput'
        '''
        return [self.stemmer.stem(word) for word in sentence]
    
    def join_to_string(self, sentence):
        '''
        Joins the tokenized words to one string.
        '''
        return ' '.join(sentence)
    
    def full_preprocess(self, n_rows=None):
        '''
        Preprocess a selected number of rows and
        connects them back to strings
        '''
        # If nothing is given do it for the whole dataset
        if n_rows == None:
            n_rows = len(self.data)
            
        # Perform preprocessing
        for i in range(n_rows):
            tweet = self.conversations[i]
            tokenized = self.tokenize(tweet)
            cleaned = self.remove_stopwords(tokenized)
            stemmed = self.stem(cleaned)
            joined = self.join_to_string(stemmed)
            self.preprocessed.append(joined)
        return self.preprocessed

In [18]:
# Preprocess text and put it in a new column
preprocessor = PreProcessor(test_df, 'text')
test_df['cleaned_text'] = preprocessor.full_preprocess()

In [19]:
del test_df['text']    # deleted text column

In [20]:
test_df    # after cleaning in the test dataframe  

Unnamed: 0,cleaned_text
0,just happen terribl car crash
1,heard earthquak differ citi stay safe everyon
2,forest fire spot pond gees flee across street ...
3,apocalyps light spokan wildfir
4,typhoon soudelor kill 28 china taiwan
5,we re shake ... it 's earthquak
6,they 'd probabl still show life arsenal yester...
7,hey how
8,what nice hat
9,fuck


# Split train dataframe into train and test 

 We split the train data into a training set, and test set. This is crucial for training and evaluation of good machine learning models.

The data will be splitted into 80/20. 

In [21]:
train_df.head()

Unnamed: 0,target,cleaned_text
0,1,our deed reason earthquak may allah forgiv us
1,1,forest fire near la rong sask canada
2,1,all resid ask shelter place notifi offic no ev...
3,1,"13,000 peopl receiv wildfir evacu order califo..."
4,1,just got sent photo rubi alaska smoke wildfir ...


In [22]:
X = train_df["cleaned_text"]
y = train_df["target"]

In [23]:
print(X.shape)
print(y.shape)

(7613,)
(7613,)


In [24]:
X_train,X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=42)


In [25]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(6090,)
(6090,)
(1523,)
(1523,)


# Data vectorization for train dataframe

Data vectorization Many machine learning models can only be trained on numerical input in the form of vectors or matrices. To prepare our tweets for the machine learning models we create a term frequency-inverse document frequency (tf-idf) vectorization. The result of this vectorization is a sparse matrix which contains a convenient representation of our tweets.

The machine learning will learn which word frequency is important to predict a correct sentiment.

In [26]:
# Create  matrix based on word frequency in tweets

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)


In [27]:
# Print the size of our data

print(f'Train size: {X_train.shape[0]} tweets\n\
Test size: {X_val.shape[0]} tweets\n\
Amount of words (columns): {X_train.shape[1]} words')

Train size: 6090 tweets
Test size: 1523 tweets
Amount of words (columns): 12469 words


# Data vectorization for test dataframe

In [28]:
test_df = test_df["cleaned_text"]

In [29]:
type(test_df)

pandas.core.series.Series

In [30]:
test_df = vectorizer.transform(test_df)

# ML Models

## Multinomial Naive Bayes

In [31]:
# Multinomial Naive Bayes
multi_nb = MultinomialNB()
multi_nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
train_pred_mn = multi_nb.predict(X_train)
val_pred_mn = multi_nb.predict(X_val)

In [33]:
######Getting evaluation metrics and evaluating model performance

###### Check results
print("Train data")
#print(f'Accuracy on training set (MultinomialNB): {round(accuracy_score(y_train, train_pred_mn)*100, 4)}%')
print("f1 score is",f1_score(y_train,train_pred_mn,pos_label=1))
#print(classification_report(y_train,train_pred_mn,digits=4))
#print(confusion_matrix(y_train, train_pred_mn)
print("\n")

print(" Validation data")
#print(f'Accuracy on test set (MultinomialNB): {round(accuracy_score(y_val,val_pred_mn)*100, 4)}%')
print("f1 score is",f1_score(y_val,val_pred_mn,pos_label=1))
#print(classification_report(y_val,val_pred_mn,digits=4))
#print(confusion_matrix(y_val, val_pred_mn ))

Train data
f1 score is 0.8655479593950695


 Validation data
f1 score is 0.7533783783783784


# Decision Tree

In [34]:
estimator = DecisionTreeClassifier(max_depth=10)
estimator.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [35]:
train_pred_dt = estimator.predict(X_train)
val_pred_dt = estimator.predict(X_val)

In [36]:
######Getting evaluation metrics and evaluating model performance

###### Check results
print("Train data")
print("f1 score is",f1_score(y_train,train_pred_dt,pos_label=1))
#print(accuracy_score(y_train,train_pred_dt))
#print(classification_report(y_train,train_pred_dt,digits=4))
#print(confusion_matrix(y_train, train_pred_dt))
print("\n")

print("validation data")
print("f1 score is",f1_score(y_val,val_pred_dt,pos_label=1))
#print(acuracy_score(y_val,val_pred_dt))
#print(classification_report(y_val,val_pred_dt,digits=4))
#print(confusion_matrix(y_val, val_pred_dt))

Train data
f1 score is 0.6770098730606487


validation data
f1 score is 0.6436132674664784


# GradientBoostingClassifier

In [37]:
GBM_model = GradientBoostingClassifier(n_estimators=200,
                                       learning_rate=0.1,
                                       subsample=0.8, max_depth=5)

In [38]:
%time GBM_model.fit(X_train,y_train)

Wall time: 11.3 s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [39]:
train_pred_gb=GBM_model.predict(X_train)
val_pred_gb=GBM_model.predict(X_val)

In [40]:
#### check results
print("Train data")
print("f1 score is",f1_score(y_train,train_pred_gb,pos_label=1))
#print(accuracy_score(y_train,train_pred_gb))
#print(classification_report(y_train,train_pred_gb,digits=4))
#print(confusion_matrix(y_train, train_pred_gb))
print("\n")

print("validation data")
print("f1 score is",f1_score(y_val,val_pred_gb,pos_label=1))
#print(accuracy_score(y_val,val_pred_gb))
#print(classification_report(y_val,val_pred_gb,digits=4))
#print(confusion_matrix(y_val, val_pred_gb))

Train data
f1 score is 0.8442657638595006


validation data
f1 score is 0.7337180544105524


# Build Random Forest model

In [41]:
rf_clf = RandomForestClassifier(n_estimators=10,max_depth=8)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
train_pred_rf = rf_clf.predict(X_train)
val_pred_rf = rf_clf.predict(X_val)

In [43]:
#### check results
print("Train data")
print("f1 score is",f1_score(y_train,train_pred_rf,pos_label=1))
#print(accuracy_score(y_train,train_pred_rf))
#print(classification_report(y_train,train_pred_rf,digits=4))
#print(confusion_matrix(y_train, train_pred_rf))
print("\n")

print("validation data")
print("f1 score is",f1_score(y_val,val_pred_rf,pos_label=1))
#print(accuracy_score(y_val,val_pred_rf))
#print(classification_report(y_val,val_pred_rf,digits=4))
#print(confusion_matrix(y_val, val_pred_rf))

Train data
f1 score is 0.2900713822193381


validation data
f1 score is 0.25498007968127495


# grid search cross validation

In [44]:
rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt') 
 
# Use a grid over parameters of interest
param_grid = { 
           "n_estimators" : [9, 18, 27, 36],
           "max_depth" : [2,3,5],
           "min_samples_leaf" : [2, 4]}

In [45]:
rf_gcv_clf = GridSearchCV(estimator=rfc, param_grid=param_grid,cv=5,
                       scoring='accuracy')

In [46]:
rf_gcv_clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='sqrt',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='

In [47]:
rf_gcv_pred_train =rf_gcv_clf.predict(X_train)
rf_gcv_pred_val=rf_gcv_clf.predict(X_val)

In [48]:
#### check results
print("Train data")
print("f1 score is",f1_score(y_train,rf_gcv_pred_train,pos_label=1))
#print(accuracy_score(y_train,rf_gcv_pred_train))
#print(classification_report(y_train,rf_gcv_pred_train,digits=4))
#print(confusion_matrix(y_train, rf_gcv_pred_train))
print("\n")

print("validation data")
print("f1 score is",f1_score(y_val,rf_gcv_pred_val,pos_label=1))
#print(accuracy_score(y_val,rf_gcv_pred_val))
#print(classification_report(y_val,rf_gcv_pred_val,digits=4))
#print(confusion_matrix(y_val, rf_gcv_pred_val))

Train data
f1 score is 0.21768251841929


validation data
f1 score is 0.22696929238985314


In [49]:
# Generator so we can easily feed batches of data to the neural network
def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = X.shape[0]/batch_size
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

In [50]:
# Onehot encoding of target variable

# Initialize sklearn's one-hot encoder class
onehot_encoder = OneHotEncoder(sparse=False)

# One hot encoding for training set
integer_encoded_train = np.array(y_train).reshape(len(y_train), 1)
onehot_encoded_train = onehot_encoder.fit_transform(integer_encoded_train)

# One hot encoding for validation set
integer_encoded_val = np.array(y_val).reshape(len(y_val), 1)
onehot_encoded_val = onehot_encoder.fit_transform(integer_encoded_val)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [51]:
initializer = keras.initializers.he_normal(seed=seed)
activation = keras.activations.elu
optimizer = keras.optimizers.Adam(lr=0.0002, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=4)

# Build model architecture
model = Sequential()
model.add(Dense(20, activation=activation, kernel_initializer=initializer, input_dim=X_train.shape[1]))
model.add(Dropout(0.5))

model.add(Dense(2, activation='sigmoid', kernel_initializer=initializer))
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Hyperparameters
epochs = 5
batch_size = 128

# Fit the model using the batch_generator
hist = model.fit_generator(generator=batch_generator(X_train, onehot_encoded_train, batch_size=batch_size, shuffle=True),
                           epochs=epochs, validation_data=(X_val, onehot_encoded_val),
                           steps_per_epoch=X_train.shape[0]/batch_size, callbacks=[es])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
prediction=multi_nb.predict(test_df)

In [None]:
sub=pd.read_csv('sample_submission.csv')

In [None]:
sub.head()

In [None]:
sub.drop(["target"], axis =1, inplace = True)

In [None]:
sub['target']=prediction

In [None]:
sub['target'].value_counts()

In [None]:
sub.to_csv("sample.csv", index=False)

In [52]:
prediction1=GBM_model.predict(test_df)

In [53]:
sub1=pd.read_csv('sample_submission.csv')

In [54]:
sub1.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [55]:
sub1.drop(["target"],axis=1,inplace=True)

In [56]:
sub1['target']=prediction1

In [57]:
sub1['target'].value_counts()

0    2096
1    1167
Name: target, dtype: int64

In [58]:
sub1.to_csv("sample1.csv", index=False)

# overall by seeing the performance, Naive bayes giving the best result comparing to other classificatio  models.
