In [3]:
import pandas as pd

In [4]:
df=pd.read_csv('Data/twitter_training.csv',header=None,names=['index','borderlands','sentiment','tweet'])

In [5]:
df.head()

Unnamed: 0,index,borderlands,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
df.drop(columns=['index','borderlands'],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,sentiment,tweet
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [8]:
df.shape

(74682, 2)

In [9]:
df.isna().count

<bound method DataFrame.count of        sentiment  tweet
0          False  False
1          False  False
2          False  False
3          False  False
4          False  False
...          ...    ...
74677      False  False
74678      False  False
74679      False  False
74680      False  False
74681      False  False

[74682 rows x 2 columns]>

In [10]:
df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
74677    False
74678    False
74679    False
74680    False
74681    False
Length: 74682, dtype: bool

In [11]:
#Preprocessing the tweets

1. Lowercasing

In [12]:
df['tweet']=df['tweet'].str.lower()

In [13]:
df['tweet']

0        im getting on borderlands and i will murder yo...
1        i am coming to the borders and i will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands 2 and i will murder ...
                               ...                        
74677    just realized that the windows partition of my...
74678    just realized that my mac window partition is ...
74679    just realized the windows partition of my mac ...
74680    just realized between the windows partition of...
74681    just like the windows partition of my mac is l...
Name: tweet, Length: 74682, dtype: object

In [14]:
df.dropna(inplace=True)

2. Remove Punctuation

In [15]:
import string

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
exclude=string.punctuation

In [18]:
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

In [19]:
df['tweet']=df['tweet'].apply(remove_punc)

Model Training

In [20]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Input

In [21]:
tokenizer=Tokenizer(oov_token="<OOV>")

In [22]:
tokenizer.fit_on_texts(df['tweet'])

In [23]:
len(tokenizer.word_index)

42552

In [24]:
word_counts=tokenizer.word_counts

In [25]:
word_counts

OrderedDict([('im', 3828),
             ('getting', 1280),
             ('on', 12050),
             ('borderlands', 1542),
             ('and', 26579),
             ('i', 29178),
             ('will', 3312),
             ('murder', 81),
             ('you', 12154),
             ('all', 5547),
             ('am', 1765),
             ('coming', 485),
             ('to', 28825),
             ('the', 44419),
             ('borders', 12),
             ('kill', 440),
             ('2', 4055),
             ('me', 6988),
             ('into', 1177),
             ('can', 3476),
             ('so', 7795),
             ('spent', 189),
             ('a', 24095),
             ('few', 539),
             ('hours', 576),
             ('making', 630),
             ('something', 943),
             ('for', 15595),
             ('fun', 1574),
             ('if', 3648),
             ('dont', 2148),
             ('know', 1745),
             ('huge', 418),
             ('fan', 324),
             ('maya', 24)

In [26]:
sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:20]


[('the', 44419),
 ('i', 29178),
 ('to', 28825),
 ('and', 26579),
 ('a', 24095),
 ('of', 19438),
 ('is', 17823),
 ('for', 15595),
 ('in', 15385),
 ('this', 14652),
 ('it', 13775),
 ('you', 12154),
 ('on', 12050),
 ('my', 11841),
 ('that', 10227),
 ('with', 8860),
 ('game', 8052),
 ('so', 7795),
 ('me', 6988),
 ('have', 6682)]

In [27]:
#ideal num words
import numpy as np

counts = np.array(list(word_counts.values()))
counts.sort()
counts = counts[::-1]

cumulative = np.cumsum(counts) / np.sum(counts)

np.where(cumulative >= 0.95)[0][0]


np.int64(12754)

We use num words as 12k 

In [28]:
tokenizer=Tokenizer(num_words=12000,oov_token="OOV")

In [29]:
tokenizer.fit_on_texts(df['tweet'])

max len 

In [30]:
lengths = [len(t.split()) for t in df["tweet"]]

import numpy as np
np.percentile(lengths, [90, 95, 99])


array([40., 47., 57.])

95% of the tweet lengths are 57 so we round it up and make 60

In [31]:
sequences=tokenizer.texts_to_sequences(df['tweet'])

In [32]:
sequences=pad_sequences(sequences,maxlen=60,padding='post',truncating='post')

In [33]:
sequences

array([[  40,  161,   14, ...,    0,    0,    0],
       [   3,  111,  395, ...,    0,    0,    0],
       [  40,  161,   14, ...,    0,    0,    0],
       ...,
       [  22, 1902,    2, ...,    0,    0,    0],
       [  22, 1902,  718, ...,    0,    0,    0],
       [  22,   30,    2, ...,    0,    0,    0]],
      shape=(73996, 60), dtype=int32)

In [34]:
X=sequences

In [35]:
y=df['sentiment']

In [36]:
X.shape

(73996, 60)

In [37]:
y.shape

(73996,)

In [38]:
y.value_counts()

sentiment
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

In [39]:
label_map={
    'Negative':0,
    'Positive':1,
    'Neutral':2,
    'Irrelevant':3
}

In [40]:
y=df['sentiment'].map(label_map)

In [41]:
y

0        1
1        1
2        1
3        1
4        1
        ..
74677    1
74678    1
74679    1
74680    1
74681    1
Name: sentiment, Length: 73996, dtype: int64

In [42]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: sentiment, dtype: int64

In [43]:
y.value_counts()

sentiment
0    22358
1    20655
2    18108
3    12875
Name: count, dtype: int64

Model Building


In [44]:
from tensorflow.keras.layers import Bidirectional,Embedding,Dropout

In [45]:
model=Sequential()

In [46]:
model.add(Input(shape=(60,)))

In [47]:
model.add(Embedding(12000,128))

In [48]:
model.add(Bidirectional(LSTM(64)))

In [49]:
model.add(Dropout(0.5))

In [50]:
model.add(Dense(4,activation='softmax'))

In [51]:
model.summary()

In [52]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [53]:
model.summary()

In [54]:
from tensorflow.keras.callbacks import EarlyStopping

In [55]:
early_stop=EarlyStopping(monitor="val_loss",patience=3,restore_best_weights=True)

In [56]:
history=model.fit(X,y,epochs=10,batch_size=64,validation_split=0.2,callbacks=[early_stop])

Epoch 1/10


[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 51ms/step - accuracy: 0.6419 - loss: 0.8863 - val_accuracy: 0.4742 - val_loss: 1.4727
Epoch 2/10
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.8287 - loss: 0.4701 - val_accuracy: 0.4832 - val_loss: 1.6475
Epoch 3/10
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.8748 - loss: 0.3427 - val_accuracy: 0.4632 - val_loss: 2.0763
Epoch 4/10
[1m925/925[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 44ms/step - accuracy: 0.8999 - loss: 0.2728 - val_accuracy: 0.4588 - val_loss: 2.3067


In [57]:
model.save("model.keras")

In [58]:
import pickle

In [59]:
with open("tokenizer.pkl",'wb') as f:
    pickle.dump(tokenizer,f)