In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
import nltk 
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv")

In [3]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
X_an = df["tweet"]
y_an = df["label"]

In [5]:
X_an

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                     thank you @user for you follow  
Name: tweet, Length: 31962, dtype: object

In [6]:
X_tweet = pd.read_csv("Dataset/tweets/tweets/tweets_data.txt",names=["messages"]) 
y_tweet = pd.read_csv("Dataset/tweets/tweets/tweets_label.txt",names=["labels"])
X_facebook = pd.read_csv("Dataset/facebook_comments/facebook_comments/fb_data.txt", sep='\t',names=["messages"])
y_facebook = pd.read_csv("Dataset/facebook_comments/facebook_comments/fb_label.txt",names=["labels"])

In [7]:
X = pd.concat([X_tweet,X_facebook])
y = pd.concat([y_tweet,y_facebook])

In [8]:
X.head()

Unnamed: 0,messages
0,The NBC Commissary has @CokeZero on fountain. ...
1,@cokezero vanilla is really good with fish tacos.
2,just drinkin some @CokeZero and listening to @...
3,@CokeZero just incredible taste! it is hard to...
4,@cokezero If this doesn't make you want a Vani...


In [9]:
y.head()

Unnamed: 0,labels
0,P
1,P
2,P
3,P
4,P


In [10]:
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

## Text Preprocessing 

In [11]:
ps = PorterStemmer()
cleaned_text = []
for i in range(0, len(X_an)):
    text = re.sub("[^a-zA-Z]", ' ',str(X_an[i]))
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words("english")]
    text = ' '.join(text)
    cleaned_text.append(text)

In [12]:
len(cleaned_text)

31962

In [13]:
vocab_size = 50000

In [14]:
ohe = [one_hot(i,vocab_size) for i in cleaned_text]

In [15]:
count_array = []
count = 0
for i in range(len(cleaned_text)):
    count = len(cleaned_text[i].split())
    count_array.append(count)

In [16]:
count_array[10]

12

In [17]:
max(count_array)

37

In [18]:
len(count_array)

31962

In [19]:
cleaned_text[10]

'ireland consum price index mom climb previou may blog silver gold forex'

In [20]:
sentence_length_padding = 37

In [21]:
embeddings = pad_sequences(ohe,padding="pre",maxlen=sentence_length_padding)

In [22]:
embeddings

array([[    0,     0,     0, ..., 29219, 18878, 47613],
       [    0,     0,     0, ..., 39930, 42370, 27796],
       [    0,     0,     0, ...,     0, 29637, 11433],
       ...,
       [    0,     0,     0, ..., 34736, 18503, 28788],
       [    0,     0,     0, ..., 16377, 41329, 13381],
       [    0,     0,     0, ..., 37942, 45796, 14473]])

In [23]:
dimensions = 100

In [24]:
model = Sequential()
model.add(Embedding(vocab_size,dimensions,input_length=sentence_length_padding))
model.add(LSTM(100))
model.add(Dense(1,activation="sigmoid"))
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 37, 100)           5000000   
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 5080501 (19.38 MB)
Trainable params: 5080501 (19.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
X_final = np.array(embeddings)
y_final = np.array(y_an)

In [26]:
X_final.shape

(31962, 37)

In [27]:
y_final.shape

(31962,)

In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_final,y_final,test_size=0.3,random_state=42)

In [29]:
X_train.shape

(22373, 37)

In [30]:
y_train.shape

(22373,)

In [31]:
model.fit(X_train,y_train,epochs=5,batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1b8deb56990>

In [32]:
len(set(y))

1

In [33]:
y_pred = model.predict(X_test)



In [38]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [37]:
y_pred = np.where(y_pred>0.5,1,0)

In [39]:
confusion_matrix(y_test,y_pred)

array([[8629,  276],
       [ 244,  440]], dtype=int64)

In [40]:
accuracy_score(y_test,y_pred)

0.9457711961622692