In [1]:
import nltk
import random
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Download NLTK data (if not already downloaded)
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# load dataset for train and test.
df_train = pd.read_csv("/content/drive/MyDrive/REX Technologies/Text Sentiment Analysis/train.csv",
                       encoding="latin-1")
df_test = pd.read_csv("/content/drive/MyDrive/REX Technologies/Text Sentiment Analysis/test.csv",
                      encoding="latin-1")

df_train = df_train[["text", "sentiment"]]
df_test = df_test[["text", "sentiment"]]
df_train = df_train.dropna()
df_test = df_test.dropna()
display(df_train.head())
display(df_test.head())

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


In [3]:
# Define a function to preprocess the text
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and convert to lowercase
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]
    return " ".join(filtered_words)

# Preprocess the dataset
df_train["text"] = df_train["text"].apply(preprocess_text)
df_test["text"] = df_test["text"].apply(preprocess_text)

X_train = df_train['text'].tolist()
y_train = df_train['sentiment'].tolist()
X_test = df_test['text'].tolist()
y_test = df_test['sentiment'].tolist()

# y labal encodering
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [4]:
# Tokenize and pad sequences
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len)

# Convert class vectors to binary class matrices (one-hot encoding)
y_train_one_hot = to_categorical(y_train, num_classes=3)
y_test_one_hot = to_categorical(y_test, num_classes=3)

In [5]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(3, activation="softmax"))  # Output layer for 3 classes

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [6]:

# Train the model
epochs = 10
batch_size = 32
model.fit(X_train_padded, y_train_one_hot, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# Evaluate the model
y_pred = model.predict(X_test_padded)
y_pred_classes = [int(val.argmax()) for val in y_pred]

accuracy = accuracy_score(y_test, y_pred_classes)
classification_rep = classification_report(y_test, y_pred_classes)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_rep)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.67
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.66      0.65      1001
           1       0.64      0.64      0.64      1430
           2       0.74      0.70      0.72      1103

    accuracy                           0.67      3534
   macro avg       0.67      0.67      0.67      3534
weighted avg       0.67      0.67      0.67      3534

