In [48]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
import nltk 
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout,SpatialDropout1D

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv")

In [4]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
X = df["tweet"]
y = df["label"]

In [6]:
X

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                     thank you @user for you follow  
Name: tweet, Length: 31962, dtype: object

In [7]:
y

0        0
1        0
2        0
3        0
4        0
        ..
31957    0
31958    0
31959    0
31960    1
31961    0
Name: label, Length: 31962, dtype: int64

In [11]:
ps = PorterStemmer()
cleaned_text_ps = []
def text_preprocessing_ps(l):
    text = re.sub("[^a-zA-Z]", ' ',str(l))
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words("english")]
    text = ' '.join(text)
    return text

In [13]:
cleaned_text_ps = []
for i in range(len(X)):
    cleaned_text_ps.append(text_preprocessing_ps(X[i]))

In [14]:
cleaned_text_ps

['user father dysfunct selfish drag kid dysfunct run',
 'user user thank lyft credit use caus offer wheelchair van pdx disapoint getthank',
 'bihday majesti',
 'model love u take u time ur',
 'factsguid societi motiv',
 'huge fan fare big talk leav chao pay disput get allshowandnogo',
 'user camp tomorrow user user user user user user user danni',
 'next school year year exam think school exam hate imagin actorslif revolutionschool girl',
 'love land allin cav champion cleveland clevelandcavali',
 'user user welcom gr',
 'ireland consum price index mom climb previou may blog silver gold forex',
 'selfish orlando standwithorlando pulseshoot orlandoshoot biggerproblem selfish heabreak valu love',
 'get see daddi today day gettingf',
 'user cnn call michigan middl school build wall chant tcot',
 'comment australia opkillingbay seashepherd helpcovedolphin thecov helpcovedolphin',
 'ouch junior angri got junior yugyoem omg',
 'thank paner thank posit',
 'retweet agre',
 'friday smile around

In [16]:
nltk.download('wordnet') 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\samar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [18]:
lm = WordNetLemmatizer()
cleaned_text_lm = []
def text_preprocessing_lm(l):
    text = re.sub("[^a-zA-Z]", ' ',str(l))
    text = text.lower()
    text = text.split()
    text = [lm.lemmatize(word) for word in text if not word in stopwords.words("english")]
    text = ' '.join(text)
    return text

In [19]:
cleaned_text_lm = []
for i in range(len(X)):
    cleaned_text_lm.append(text_preprocessing_lm(X[i]))

In [20]:
cleaned_text_lm

['user father dysfunctional selfish drag kid dysfunction run',
 'user user thanks lyft credit use cause offer wheelchair van pdx disapointed getthanked',
 'bihday majesty',
 'model love u take u time ur',
 'factsguide society motivation',
 'huge fan fare big talking leave chaos pay dispute get allshowandnogo',
 'user camping tomorrow user user user user user user user danny',
 'next school year year exam think school exam hate imagine actorslife revolutionschool girl',
 'love land allin cavs champion cleveland clevelandcavaliers',
 'user user welcome gr',
 'ireland consumer price index mom climbed previous may blog silver gold forex',
 'selfish orlando standwithorlando pulseshooting orlandoshooting biggerproblems selfish heabreaking value love',
 'get see daddy today day gettingfed',
 'user cnn call michigan middle school build wall chant tcot',
 'comment australia opkillingbay seashepherd helpcovedolphins thecove helpcovedolphins',
 'ouch junior angry got junior yugyoem omg',
 'thankf

In [24]:
def text_embedding(vocab_size,cleaned_text):
    ohe = [one_hot(i,vocab_size) for i in cleaned_text]
    count_array = []
    count = 0
    for i in range(len(cleaned_text)):
        count = len(cleaned_text[i].split())
        count_array.append(count)
    sentence_length_padding = max(count_array)
    embeddings = pad_sequences(ohe,padding="pre",maxlen=sentence_length_padding)
    return embeddings,sentence_length_padding

In [25]:
vocab_size = 50000
embeddings_ps,sentence_length_padding = text_embedding(vocab_size,cleaned_text_ps)
embeddings_lm,sentence_length_padding = text_embedding(vocab_size,cleaned_text_lm)

In [54]:
dimensions = 100
model_ps = Sequential()
model_ps.add(Embedding(vocab_size,dimensions,input_length=sentence_length_padding))
model_ps.add(SpatialDropout1D(0.2))
model_ps.add(LSTM(100,dropout=0.2,recurrent_dropout=0.2))
model_ps.add(Dense(1,activation="sigmoid"))
model_ps.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
model_ps.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 37, 100)           5000000   
                                                                 
 spatial_dropout1d_2 (Spati  (None, 37, 100)           0         
 alDropout1D)                                                    
                                                                 
 lstm_6 (LSTM)               (None, 100)               80400     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 5080501 (19.38 MB)
Trainable params: 5080501 (19.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [76]:
dimensions = 100
model_lm = Sequential()
model_lm.add(Embedding(vocab_size,dimensions,input_length=sentence_length_padding))
model_lm.add(LSTM(100,dropout=0.1,recurrent_dropout=0.1))
model_lm.add(Dense(1,activation="sigmoid"))
model_lm.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
model_lm.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 37, 100)           5000000   
                                                                 
 lstm_9 (LSTM)               (None, 100)               80400     
                                                                 
 dense_6 (Dense)             (None, 1)                 101       
                                                                 
Total params: 5080501 (19.38 MB)
Trainable params: 5080501 (19.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
X_final_ps = np.array(embeddings_ps)
X_final_lm = np.array(embeddings_lm)
y_final = np.array(y)

In [35]:
from sklearn.model_selection import train_test_split
X_train_ps,X_test_ps,y_train_ps,y_test_ps = train_test_split(X_final_ps,y_final,test_size=0.2,random_state=42)
X_train_lm,X_test_lm,y_train_lm,y_test_lm = train_test_split(X_final_lm,y_final,test_size=0.2,random_state=42)

In [55]:
model_ps.fit(X_train_ps,y_train_ps,epochs=5,batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x19b0fe5ed50>

In [77]:
model_lm.fit(X_train_lm,y_train_lm,epochs=5,batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x19b1086ad90>

In [71]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [72]:
def accuracy(model,X_test,y_test):    
    y_pred = model.predict(X_test)
    y_pred = np.where(y_pred>0.5,1,0)
    accuracyScore = accuracy_score(y_test,y_pred)
    confusionMatrix = confusion_matrix(y_test,y_pred)
    return accuracyScore,confusionMatrix

In [78]:
as_ps,cm_ps = accuracy(model_ps,X_test_ps,y_test_ps)
as_lm,cm_lm = accuracy(model_lm,X_test_lm,y_test_lm)



In [79]:
as_ps,cm_ps

(0.9568277803847959,
 array([[5826,  111],
        [ 165,  291]], dtype=int64))

In [80]:
as_lm,cm_lm

(0.9552635695291726,
 array([[5805,  132],
        [ 154,  302]], dtype=int64))

## Shallow Machine Learning Models

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import naive_bayes
from sklearn.ensemble import GradientBoostingClassifier

In [52]:
LR_model = LogisticRegression()
SVC_model = LinearSVC()
NB_model = naive_bayes.MultinomialNB()
RFC_model = RandomForestClassifier()
GBC_model = GradientBoostingClassifier()

In [53]:
LR_model.fit(X_train, y_train)
SVC_model.fit(X_train, y_train)
NB_model.fit(X_train, y_train)
RFC_model.fit(X_train, y_train)
GBC_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GradientBoostingClassifier()

In [57]:
shallow_models = [LR_model, SVC_model, NB_model, RFC_model, GBC_model]

In [62]:
def accuracy(shallow_models, X_test, y_test): 
    accuracyScore = []
    confusionMatrix = []
    for i in shallow_models:
        y_pred = i.predict(X_test)
        y_pred = np.where(y_pred>0.5,1,0)
        accuracyScore_val = accuracy_score(y_test,y_pred)
        confusionMatrix_val = confusion_matrix(y_test,y_pred)
        
        accuracyScore.append(accuracyScore_val)
        confusionMatrix.append(confusionMatrix_val)
    return accuracyScore,confusionMatrix

In [68]:
model_names = ["Logistic Regression", "Support Vector Classifier", "Naive Bayes", "Random Forest Classifier", "Gradient Boosting Classifier"]

In [69]:
def print_model_metrics(models, model_names, X_test, y_test):
    accuracy_scores, confusion_matrices = accuracy(models, X_test, y_test)
    for model_name, accuracy_score_value, confusion_matrix_value in zip(model_names, accuracy_scores, confusion_matrices):
        print(f"Model: {model_name}")
        print(f"Accuracy Score: {accuracy_score_value:.4f}")
        print("Confusion Matrix:")
        print(confusion_matrix_value)
        print("\n")

In [70]:
print_model_metrics(shallow_models, model_names, X_test, y_test)

Model: Logistic Regression
Accuracy Score: 0.9287
Confusion Matrix:
[[5937    0]
 [ 456    0]]


Model: Support Vector Classifier
Accuracy Score: 0.9260
Confusion Matrix:
[[5919   18]
 [ 455    1]]


Model: Naive Bayes
Accuracy Score: 0.5584
Confusion Matrix:
[[3312 2625]
 [ 198  258]]


Model: Random Forest Classifier
Accuracy Score: 0.9395
Confusion Matrix:
[[5932    5]
 [ 382   74]]


Model: Gradient Boosting Classifier
Accuracy Score: 0.9343
Confusion Matrix:
[[5937    0]
 [ 420   36]]


