In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras import utils
import re

In [None]:
data = pd.read_csv('Twitter_Data.csv')
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [None]:
data = data[data['category']!=0.0]
data

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
8,with upcoming election india saga going import...,1.0
...,...,...
162972,engine growth modi unveils indias first 12000 ...,1.0
162973,modi promised 2014 lok sabha elections that be...,1.0
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0


In [None]:
# Only keeping the necessary columns

data['sentiment'] = np.where(data['category'] == 1, 1, 0)
data = data[['clean_text','sentiment']]
data

Unnamed: 0,clean_text,sentiment
0,when modi promised “minimum government maximum...,0
2,what did just say vote for modi welcome bjp t...,1
3,asking his supporters prefix chowkidar their n...,1
4,answer who among these the most powerful world...,1
8,with upcoming election india saga going import...,1
...,...,...
162972,engine growth modi unveils indias first 12000 ...,1
162973,modi promised 2014 lok sabha elections that be...,1
162975,why these 456 crores paid neerav modi not reco...,0
162976,dear rss terrorist payal gawar what about modi...,0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107767 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   clean_text  107765 non-null  object
 1   sentiment   107767 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


In [None]:
data['clean_text']= data['clean_text'].astype(str)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107767 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   clean_text  107767 non-null  object
 1   sentiment   107767 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_text']= data['clean_text'].astype(str)


Next, I am dropping the 'Neutral' sentiments as my goal was to only differentiate positive and negative tweets. After that, I am filtering the tweets so only valid texts and words remain.  Then, I define the number of max features as 2000 and use Tokenizer to vectorize and convert text into Sequences so the Network can deal with it as input.

In [None]:
data['clean_text'] = data['clean_text'].apply(lambda x: x.lower())
data['clean_text'] = data['clean_text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 1].size)
print(data[ data['sentiment'] == 0].size)



max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['clean_text'].values)
X = tokenizer.texts_to_sequences(data['clean_text'].values)
print(X[0])
X = pad_sequences(X, padding='post')
print(X[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_text'] = data['clean_text'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_text'] = data['clean_text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))


144500
71034
[46, 1, 339, 748, 71, 1971, 759, 881, 40, 2, 956, 204, 2, 206, 33, 164, 109, 48, 70, 1035, 206, 58, 3, 9, 540, 3, 58, 3]
[  46    1  339  748   71 1971  759  881   40    2  956  204    2  206
   33  164  109   48   70 1035  206   58    3    9  540    3   58    3
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]


In [None]:
X.shape

(107767, 47)

In [None]:
data['clean_text'][0]

'when modi promised minimum government maximum governance expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples'

In [None]:
data['sentiment']

0         0
2         1
3         1
4         1
8         1
         ..
162972    1
162973    1
162975    0
162976    0
162979    1
Name: sentiment, Length: 107767, dtype: int64

In [None]:
Y = data['sentiment']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(72203, 47) (72203,)
(35564, 47) (35564,)


In [None]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X_train.shape[1]))
model.add(LSTM(lstm_out))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])


In [None]:
batch_size = 256
model.fit(X_train, Y_train, epochs = 3, batch_size=batch_size, verbose = 2)

Epoch 1/3
283/283 - 147s - loss: 0.4502 - accuracy: 0.7860 - 147s/epoch - 520ms/step
Epoch 2/3
283/283 - 138s - loss: 0.2570 - accuracy: 0.9022 - 138s/epoch - 489ms/step
Epoch 3/3
283/283 - 138s - loss: 0.2198 - accuracy: 0.9152 - 138s/epoch - 488ms/step


<keras.src.callbacks.History at 0x7ac96ab8dc30>

In [None]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("acc: %.2f" % (acc))

139/139 - 25s - loss: 0.2079 - accuracy: 0.9210 - 25s/epoch - 179ms/step
acc: 0.92


In [None]:
twt = ['Meetings: Because none of us is as dumb as all of us.']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the tweet to have exactly the same shape as `embedding_3` output
twt = pad_sequences(twt, maxlen=47, padding = 'post', dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
sentiment

[[ 73 807 922  16   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0]]
1/1 - 0s - 427ms/epoch - 427ms/step


array([0.01727965], dtype=float32)

In [None]:
if((sentiment) <= 0.5):
    print("negative")
elif ((sentiment) > 0.5):
    print("positive")

negative


Saved model to disk
