In [37]:
#Importing packages
import numpy as np
import pandas as pd
import re

**Data Exploration**

In [38]:
# Read the dataset
disaster = pd.read_csv("/content/drive/MyDrive/deep-learning-project/tweets.csv")
disaster.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [39]:
# Loading required features
disaster = disaster[['text','target']]
disaster.head()

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,Telangana: Section 144 has been imposed in Bha...,1
2,Arsonist sets cars ablaze at dealership https:...,1
3,Arsonist sets cars ablaze at dealership https:...,1
4,"""Lord Jesus, your love brings freedom and pard...",0


In [40]:
# Checking the Null Values
disaster.isna().sum()

text      0
target    0
dtype: int64

In [41]:
# Checking the DF size
disaster.shape

(11370, 2)

In [42]:
# Lets see how Target Values labled
disaster['target'].value_counts()

0    9256
1    2114
Name: target, dtype: int64

**Data Cleaning**

Lower casing 

*   Converting a word to lower case (NLP -> nlp)



In [43]:
disaster['text'] = [entry.lower() for entry in disaster['text']]
disaster['text'].head()

0    communal violence in bhainsa, telangana. "ston...
1    telangana: section 144 has been imposed in bha...
2    arsonist sets cars ablaze at dealership https:...
3    arsonist sets cars ablaze at dealership https:...
4    "lord jesus, your love brings freedom and pard...
Name: text, dtype: object

Tokenization

In [44]:
# Loading packages for Tokenzation
import nltk
from nltk.tokenize import word_tokenize

Tokenization in NLP is the process by which a large quantity of text is divided into smaller parts called tokens

In [45]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading corpus: Package 'corpus' not found in index
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [46]:
disaster['text'] = [word_tokenize(entry) for entry in disaster['text']]

In [47]:
disaster['text'].head()

0    [communal, violence, in, bhainsa, ,, telangana...
1    [telangana, :, section, 144, has, been, impose...
2    [arsonist, sets, cars, ablaze, at, dealership,...
3    [arsonist, sets, cars, ablaze, at, dealership,...
4    [``, lord, jesus, ,, your, love, brings, freed...
Name: text, dtype: object

Part of Speech tagging and Stemming Words [lemmatization]

*   Part-of-Speech (PoS) tagging, then it may be defined as the process of assigning one of the parts of speech to the given word
*   Lemmatization, unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language and gives its dictionary word

*   Each row has to go through these both so we are going to create a function and add those values into another feature [Final Text]






In [48]:
# loading packages 
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet

In [49]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['j'] = wn.ADJ
tag_map['v'] = wn.VERB
tag_map['v'] = wn.ADV

In [50]:
# Storing all the stopwords into variables
stop_words = set(stopwords.words("english"))
print(stop_words)

{'will', 'theirs', 'no', 'up', 'where', 'if', 'when', 'can', 'being', 'itself', 'himself', 'what', 'or', 'off', 'until', 'of', 'won', "shan't", 'shan', 'it', 'then', 'now', 'to', 'them', "mustn't", 'after', 'hers', 'been', 'between', 'was', 'these', 'on', 'have', 'doesn', 'myself', 'down', 'during', 'other', 'a', 'into', "don't", 'didn', "isn't", 'isn', 'very', 'too', 'each', 'before', "you'd", 'those', 'is', 'from', "couldn't", 'few', 'own', 'having', 'his', 'not', 'am', 'under', 'out', 'against', 'which', 'an', 'by', 'so', "needn't", 'should', "wasn't", 'and', "hasn't", "should've", "weren't", "hadn't", 'y', "you'll", 'yours', 'once', 'hadn', 'that', 'why', 'while', "aren't", 'here', 'its', 'shouldn', 'they', 'had', "doesn't", 'her', 've', 'both', "mightn't", "she's", 'nor', 'you', 'such', 'needn', 'just', 'most', 'herself', 'there', 'more', 'some', 'do', 'as', 'are', 'same', 'the', "you've", 't', 'll', 'mustn', 'for', 'does', 'above', 'over', 'yourselves', 'all', 'any', 'don', 'hasn

In [51]:
for index,entry in enumerate(disaster['text']):
    Final_words = []
    word_lemmstized = WordNetLemmatizer()
    for word,tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lemmstized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_final)
    disaster.loc[index,'text_final'] = str(Final_words)

In [52]:
disaster.head()

Unnamed: 0,text,target,text_final
0,"[communal, violence, in, bhainsa, ,, telangana...",1,"['communal', 'violence', 'bhainsa', 'telangana..."
1,"[telangana, :, section, 144, has, been, impose...",1,"['telangana', 'section', 'imposed', 'bhainsa',..."
2,"[arsonist, sets, cars, ablaze, at, dealership,...",1,"['arsonist', 'set', 'car', 'ablaze', 'dealersh..."
3,"[arsonist, sets, cars, ablaze, at, dealership,...",1,"['arsonist', 'set', 'car', 'ablaze', 'dealersh..."
4,"[``, lord, jesus, ,, your, love, brings, freed...",0,"['lord', 'jesus', 'love', 'brings', 'freedom',..."


**Naive Bayes**

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection,naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [54]:
Train_X,Test_X,Train_Y,Test_Y = model_selection.train_test_split(disaster['text_final'],disaster['target'],test_size = 0.3)
encoder = LabelEncoder()
Train_Y = encoder.fit_transform(Train_Y)
Test_Y = encoder.fit_transform(Test_Y)

In [55]:
y = Train_Y.tolist()

In [56]:
Tfidf_vect = TfidfVectorizer(max_features = 5000)
Tfidf_vect.fit(disaster['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [57]:
data = Train_X_Tfidf.toarray()

In [58]:
naive = naive_bayes.MultinomialNB()
naive.fit(Train_X_Tfidf,Train_Y)

MultinomialNB()

In [59]:
predictions_NB = naive.predict(Test_X_Tfidf)

In [60]:
print("Naive Bayes Model Accuracy : ",accuracy_score(predictions_NB,Test_Y)*100)

Naive Bayes Model Accuracy :  86.66080328349459


**Random Forest**

In [61]:
Train_X,Test_X,Train_Y,Test_Y = model_selection.train_test_split(disaster['text_final'],disaster['target'],test_size = 0.3)
encoder = LabelEncoder()
Train_Y = encoder.fit_transform(Train_Y)
Test_Y = encoder.fit_transform(Test_Y)

Tfidf_vect = TfidfVectorizer(max_features = 5000)
Tfidf_vect.fit(disaster['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [62]:
#loading random forest
from sklearn.ensemble import RandomForestClassifier

#Setting random forest 
RandomForest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state=9)

RandomForest.fit(Train_X_Tfidf,Train_Y)

#making predcitons with randome forest model
label_predict = RandomForest.predict(Test_X_Tfidf)

accuracy = accuracy_score( label_predict,Test_Y)
print(("Accuracy of RandomForest: ", accuracy*100))

('Accuracy of RandomForest: ', 87.51099384344766)


**Logistic Regression**

In [63]:
Train_X,Test_X,Train_Y,Test_Y = model_selection.train_test_split(disaster['text_final'],disaster['target'],test_size = 0.3)
encoder = LabelEncoder()
Train_Y = encoder.fit_transform(Train_Y)
Test_Y = encoder.fit_transform(Test_Y)

Tfidf_vect = TfidfVectorizer(max_features = 5000)
Tfidf_vect.fit(disaster['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')
logreg.fit(Train_X_Tfidf,Train_Y)

label_predict = logreg.predict(Test_X_Tfidf)


accuracy = accuracy_score(label_predict,Test_Y)
print(("Accuracy of logreg: ", accuracy*100))

('Accuracy of logreg: ', 87.01260627382)


**LSTM**

In [64]:
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [65]:
Train_X,Test_X,Train_Y,Test_Y = model_selection.train_test_split(disaster['text_final'],disaster['target'],test_size = 0.3)
encoder = LabelEncoder()
Train_Y = encoder.fit_transform(Train_Y)
Test_Y = encoder.fit_transform(Test_Y)

Tfidf_vect = TfidfVectorizer(max_features = 5000)
Tfidf_vect.fit(disaster['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(Train_X)
sequences = tok.texts_to_sequences(Train_X)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [66]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [67]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 150)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 150, 50)           50000     
                                                                 
 lstm_1 (LSTM)               (None, 64)                29440     
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_2 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

In [68]:
model.fit(sequences_matrix,Train_Y,batch_size=128,epochs=20,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/20
Epoch 2/20


<keras.callbacks.History at 0x7efce9beb890>

In [69]:
test_sequences = tok.texts_to_sequences(Test_X)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [70]:
accr = model.evaluate(test_sequences_matrix,Test_Y)



In [73]:
print('Test set\n Accuracy: {:0.3f}'.format(accr[1]*100))

Test set
 Accuracy: 87.863
