In [32]:
!pip install joblib
import joblib



In [33]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string
import nltk


In [34]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
# Load datasets
train_data = pd.read_csv('train_data.csv', encoding='latin-1', )
test_data = pd.read_csv('test_data.csv', encoding='latin-1')
train_data_sampled = train_data.sample(n=10000, random_state = 42)

In [36]:
# 1. Data Exploration
print(train_data.head())
print(train_data.info())
print(train_data['sentiment'].value_counts())
train_data.shape
test_data.shape

       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  
0  Afghanistan          38928346         652860.0    

(4815, 9)

In [37]:
!pip install clean-text
from cleantext import clean



In [38]:
!pip install contractions
import contractions



In [39]:
!pip install emoji
import emoji



In [40]:
# Data Preprocessing
def preprocess_text_advanced(text):
    if pd.isnull(text):
        return ""
    # Expand contractions
    text = contractions.fix(text)
    # Clean text using clean-text library
    text = clean(text,
                 lower=True,
                 no_line_breaks=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=True,
                 no_digits=True,
                 no_currency_symbols=True,
                 no_punct=True,
                 replace_with_punct="",
                 replace_with_url="<URL>",
                 replace_with_email="<EMAIL>",
                 replace_with_phone_number="<PHONE>",
                 replace_with_number="<NUMBER>",
                 replace_with_digit="0",
                 replace_with_currency_symbol="<CUR>",
                 lang="en")
    # Remove emojis
    text = emoji.demojize(text)

    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Convert all entries to strings and fill missing values
train_data_sampled['text'] = train_data_sampled['text'].astype(str).fillna('')
test_data['text'] = test_data['text'].astype(str).fillna('')

train_data_sampled['cleaned_text'] = train_data_sampled['text'].apply(preprocess_text_advanced)
test_data['cleaned_text'] = test_data['text'].apply(preprocess_text_advanced)

In [44]:
# Import necessary libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fill missing values in both train and test data before encoding
train_data_sampled['sentiment'] = train_data_sampled['sentiment'].fillna(method='ffill')
test_data['sentiment'] = test_data['sentiment'].fillna(method='ffill')


# Tokenize text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data_sampled['cleaned_text'])
X_train_seq = tokenizer.texts_to_sequences(train_data_sampled['cleaned_text'])
X_test_seq = tokenizer.texts_to_sequences(test_data['cleaned_text'])

# Pad sequences
max_sequence_length = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Encode labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(train_data_sampled['sentiment'])
y_test_encoded = le.transform(test_data['sentiment'])


# Convert labels to categorical
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)


In [46]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [47]:
# Build LSTM model
embedding_dim = 100
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(le.classes_), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [48]:
# Train the model
model.fit(X_train_padded, y_train_categorical, epochs=5, batch_size=64, validation_split=0.1, verbose=2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical, verbose=2)
print("LSTM Model Accuracy: ", accuracy)

# Make predictions
lstm_pred = model.predict(X_test_padded)
lstm_pred_classes = np.argmax(lstm_pred, axis=1)


Epoch 1/5
141/141 - 50s - loss: 0.9612 - accuracy: 0.5277 - val_loss: 0.8084 - val_accuracy: 0.6390 - 50s/epoch - 354ms/step
Epoch 2/5
141/141 - 44s - loss: 0.6815 - accuracy: 0.7162 - val_loss: 0.7650 - val_accuracy: 0.6690 - 44s/epoch - 311ms/step
Epoch 3/5
141/141 - 44s - loss: 0.5453 - accuracy: 0.7888 - val_loss: 0.7880 - val_accuracy: 0.6800 - 44s/epoch - 314ms/step
Epoch 4/5
141/141 - 44s - loss: 0.4564 - accuracy: 0.8313 - val_loss: 0.8681 - val_accuracy: 0.6680 - 44s/epoch - 312ms/step
Epoch 5/5
141/141 - 45s - loss: 0.3905 - accuracy: 0.8614 - val_loss: 0.9355 - val_accuracy: 0.6490 - 45s/epoch - 317ms/step
151/151 - 4s - loss: 0.8603 - accuracy: 0.7410 - 4s/epoch - 30ms/step
LSTM Model Accuracy:  0.7410176396369934


In [49]:

# Evaluation Metrics
def print_evaluation_metrics(y_true, y_pred, model_name):
    y_true = y_true.astype(str)
    y_pred = y_pred.astype(str)

    print(f"Evaluation Metrics for {model_name}:")
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    print("Precision: ", precision_score(y_true, y_pred, average='weighted'))
    print("Recall: ", recall_score(y_true, y_pred, average='weighted'))
    print("F1 Score: ", f1_score(y_true, y_pred, average='weighted'))
    print("Confusion Matrix: \n", confusion_matrix(y_true, y_pred))

print_evaluation_metrics(y_test_encoded, lstm_pred_classes, "LSTM Model")

Evaluation Metrics for LSTM Model:
Accuracy:  0.7410176531671859
Precision:  0.7502011262407671
Recall:  0.7410176531671859
F1 Score:  0.744557045265264
Confusion Matrix: 
 [[ 693  262   46]
 [ 363  879  188]
 [  84  304 1996]]
