In [1]:
#basic library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#NN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding, LSTM

#preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

In [2]:
#dataset
df=pd.read_csv("/content/drive/MyDrive/DataSet/twitter.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
#value count to check data imbalance
df["label"].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [4]:
#feature and target
x= df["tweet"]
y=df["label"]

#train test split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,random_state=1)

In [5]:
#reshsape 1D data to 2D data
xtrain= xtrain.values.reshape(-1, 1)
xtest = xtest.values.reshape(-1, 1)

In [6]:
#Sampling Technique to balance data
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
xtrain1,ytrain1=ros.fit_resample(xtrain,ytrain)
xtest1,ytest1=ros.fit_resample(xtest,ytest)

In [7]:
#to tokenize data
tok=Tokenizer()
tok.fit_on_texts(xtrain1.ravel())

In [8]:
#to index data with the help of tokenizer
vocab=tok.index_word
print(vocab)



In [9]:
#vocabulary length
vocab_len=len(vocab)
print(vocab_len)

36488


In [10]:
#Token to Sequence
train_seq = tok.texts_to_sequences(xtrain1.ravel())
print(train_seq)

[[2048, 14394, 2879, 45, 2424, 21, 3844, 6, 14395, 1247, 7, 6991, 14396, 8866, 14, 7984, 7, 1291, 9, 3, 2880], [1, 1, 34, 67, 2491, 29, 74, 6, 95, 14397, 60, 386, 6992, 56, 10418, 57, 14398, 6992, 69, 201, 14399], [376, 271, 10, 88, 5, 1, 29, 14400, 10419, 467, 11, 14401], [1, 1, 1, 1, 1, 1, 65, 170, 14402, 45, 10, 2492, 490, 25, 20], [18, 5, 232, 13, 78, 25, 2, 12, 123, 14, 5, 98, 90, 751], [1, 10, 879, 3, 4, 31, 1791, 732, 43, 10420, 14403, 2, 14404, 45, 96, 52, 5070, 23, 4, 336, 892], [1, 1, 35, 115, 17, 10421, 65, 22, 8], [1, 10, 121, 10, 24, 5, 4, 258, 69, 1322, 3845, 14405, 14406, 1930], [10422, 7, 82, 3846, 6649, 7, 7985, 73, 10422, 454, 6650, 10423, 226, 1, 14407, 94, 1881], [3244, 3422, 2172], [893, 47, 259, 6378, 107, 29, 7986, 47, 563, 3, 871, 7986, 7987, 893], [134, 584, 323, 337, 11, 2, 10424, 1134, 1], [1, 1292, 8, 1931, 1836, 1375, 7, 4103, 39, 1], [10, 53, 4, 2666, 8, 10, 139, 109, 12, 178, 3245, 510, 8, 10425, 20, 12, 99, 22, 3596, 3, 468, 100, 70], [1464, 14408, 11, 7

In [11]:
#to calculate length of each document
doc_length= []

for doc in train_seq:
  doc_length.append(len(doc))

print(doc_length)

[21, 21, 12, 15, 14, 21, 9, 14, 17, 3, 14, 9, 10, 23, 6, 14, 20, 6, 19, 10, 6, 11, 12, 19, 14, 20, 12, 11, 12, 7, 7, 18, 18, 5, 17, 10, 11, 3, 17, 8, 8, 6, 7, 10, 9, 12, 9, 11, 20, 9, 26, 18, 15, 11, 18, 7, 16, 7, 18, 12, 15, 11, 13, 13, 5, 11, 18, 23, 13, 10, 7, 22, 16, 6, 16, 6, 23, 11, 7, 6, 16, 18, 15, 17, 9, 24, 21, 13, 11, 3, 7, 6, 13, 20, 12, 25, 19, 16, 17, 12, 8, 19, 16, 4, 8, 21, 12, 3, 7, 21, 7, 10, 12, 8, 8, 19, 8, 9, 14, 14, 16, 12, 13, 6, 17, 26, 10, 11, 10, 11, 18, 8, 25, 8, 12, 19, 13, 7, 14, 13, 19, 12, 24, 23, 23, 21, 4, 14, 6, 21, 8, 20, 15, 5, 8, 14, 15, 15, 5, 17, 9, 5, 13, 20, 15, 7, 29, 9, 9, 7, 18, 6, 6, 3, 11, 28, 3, 5, 4, 5, 24, 8, 7, 13, 4, 18, 14, 22, 14, 9, 9, 12, 12, 12, 12, 20, 11, 15, 25, 12, 4, 18, 13, 6, 15, 10, 7, 19, 14, 7, 10, 10, 13, 15, 9, 9, 9, 20, 13, 15, 6, 13, 9, 7, 12, 15, 12, 13, 8, 12, 15, 12, 7, 8, 15, 16, 10, 9, 8, 3, 15, 19, 20, 19, 15, 13, 7, 16, 7, 19, 12, 10, 10, 13, 18, 8, 8, 14, 10, 21, 9, 20, 17, 10, 13, 11, 9, 17, 11, 5, 17, 6, 14

In [12]:
#to check maximum length
print("max length =",max(doc_length))
print("99% quantile =",np.quantile(doc_length,0.99))
print("95% quantile =",np.quantile(doc_length,0.95))
print("90% quantile =",np.quantile(doc_length,0.90))

max length = 42
99% quantile = 26.0
95% quantile = 23.0
90% quantile = 21.0


In [13]:
max_length=26

In [14]:
#padding
train_matrix=sequence.pad_sequences(train_seq,maxlen=max_length)
print(train_matrix)


[[    0     0     0 ...     9     3  2880]
 [    0     0     0 ...    69   201 14399]
 [    0     0     0 ...   467    11 14401]
 ...
 [    0     0     0 ...   315   557  4837]
 [    0     0     0 ...  2870   119     1]
 [    0     0     0 ...   103    91    75]]


In [15]:
#sequence and padding on Test Data
test_seq = tok.texts_to_sequences(xtest1.ravel())
test_matrix = sequence.pad_sequences(test_seq,maxlen = max_length)
print(test_matrix)

[[    0     0     0 ...   132   198  2585]
 [    0     0     0 ...  9288  9288  7049]
 [    0     0     0 ...   866   111  2110]
 ...
 [    0     0     0 ...   607   408 10380]
 [    0     0     0 ...   484   844   922]
 [    0     0     0 ... 10771   187    39]]


In [18]:
#NN with LSTM
model = Sequential()
model.add(Embedding(input_dim=vocab_len+1,
                    output_dim=32,
                    input_length=max_length,
                    mask_zero=True))

model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))

#model.add(Bidirectional(SimpleRNN(128)))
model.add(Dense(128, activation="tanh"))
model.add(Dense(64, activation="tanh"))
model.add(Dense(32, activation="tanh"))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy")

model.fit(train_matrix, ytrain1, epochs=20, batch_size=256)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa290ae94b0>

In [19]:
#report
y_pred = model.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest1,y_pred))

              precision    recall  f1-score   support

           0       0.68      0.99      0.80      8940
           1       0.97      0.53      0.68      8940

    accuracy                           0.76     17880
   macro avg       0.82      0.76      0.74     17880
weighted avg       0.82      0.76      0.74     17880

