In [1]:
import pandas as pd

df = pd.read_csv('../data/twitter_dataset.csv')
print(df.columns)
df.head()

Index(['clean_text', 'category'], dtype='object')


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [2]:
import sys
import os
sys.path.append("..")  # or os.getcwd() depending on where the notebook is

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

from utils.preprocessing import clean_text

In [3]:
df = pd.read_csv('../data/twitter_dataset.csv') 
df = df[['clean_text', 'category']]            
df = df.rename(columns={'clean_text': 'text', 'category': 'sentiment'})  
df['cleaned_text'] = df['text'].astype(str).apply(clean_text)
df.head() 

Unnamed: 0,text,sentiment,cleaned_text
0,when modi promised “minimum government maximum...,-1.0,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar names modi ...
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...


In [4]:
label_map = {-1.0: 0, 0.0: 1, 1.0: 2}
df['label'] = df['sentiment'].map(label_map)

# Drop rows with missing labels (i.e., unmapped sentiment)
df = df.dropna(subset=['label'])

# Convert label column to int
df['label'] = df['label'].astype(int)

In [5]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])

sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
padded = pad_sequences(sequences, maxlen=100, padding='post')

In [6]:
X_train, X_test, y_train, y_test = train_test_split(padded, df['label'], test_size=0.2, random_state=42)

In [7]:
model = Sequential()
model.add(Embedding(10000, 64, input_length=100))
model.add(SimpleRNN(128))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
from sklearn.utils import class_weight

# Compute class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))

# Train model with class weights
history = model.fit(
    X_train, y_train,
    epochs=11,
    batch_size=64,
    validation_data=(X_test, y_test),
    class_weight=class_weights_dict
)

Epoch 1/11
[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 24ms/step - accuracy: 0.3409 - loss: 1.0983 - val_accuracy: 0.3363 - val_loss: 1.0962
Epoch 2/11
[1m2038/2038[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 24ms/step - accuracy: 0.3380 - loss: 1.0987 - val_accuracy: 0.3363 - val_loss: 1.0991
Epoch 3/11
[1m 207/2038[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m40s[0m 22ms/step - accuracy: 0.3476 - loss: 1.0946

In [None]:
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
model.save("model/sentiment_model.keras")

with open("model/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
def predict_single(text):
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded_seq = pad_sequences(seq, maxlen=100)
    pred = model.predict(padded_seq)
    return ['Negative', 'Neutral', 'Positive'][pred.argmax()]

# Try it
print(predict_single("I absolutely love this product!"))
print(predict_single("It was terrible and boring."))
print(predict_single("It's fine, not too bad."))

In [None]:
df['label'].value_counts()

In [None]:
df['label'].value_counts()

In [None]:
print(clean_text("I love this!")) 

In [None]:
df['label'].isnull().sum()

In [None]:
print(f"Tokenizer vocabulary size: {len(tokenizer.word_index)}")
print(f"Sample word indices: {dict(list(tokenizer.word_index.items())[:10])}")

In [None]:
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.legend()
plt.show()

plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend()
plt.show()

In [None]:
def predict_single(text):
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded_seq = pad_sequences(seq, maxlen=100)
    pred = model.predict(padded_seq)
    print("Prediction probabilities:", pred)
    return ['Negative', 'Neutral', 'Positive'][pred.argmax()]

In [None]:
for i in range(5):
    print(df['cleaned_text'][i], "=>", df['label'][i])

In [None]:
print(df['label'].value_counts())