In [1]:
import pandas as pd

df = pd.read_csv('../data/twitter_dataset.csv')
print(df.columns)
df.head()

Index(['clean_text', 'category'], dtype='object')


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [2]:
import sys
import os
sys.path.append("..")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import pickle
from utils.preprocessing import clean_text

In [3]:
df = pd.read_csv('../data/twitter_dataset.csv') 
df = df[['clean_text', 'category']]            
df = df.rename(columns={'clean_text': 'text', 'category': 'sentiment'})  
df['cleaned_text'] = df['text'].astype(str).apply(clean_text)
df.head() 

Unnamed: 0,text,sentiment,cleaned_text
0,when modi promised “minimum government maximum...,-1.0,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,say vote modi welcome bjp told rahul main camp...
3,asking his supporters prefix chowkidar their n...,1.0,asking supporters prefix chowkidar names modi ...
4,answer who among these the most powerful world...,1.0,answer among powerful world leader today trump...


In [4]:
df = df.dropna(subset=['sentiment'])

# Debug: Check unique sentiment values
print("Unique sentiment values:", df['sentiment'].unique())
print("Sentiment distribution:")
print(df['sentiment'].value_counts())

# Create label mapping
label_map = {-1.0: 0, 0.0: 1, 1.0: 2}
df['label'] = df['sentiment'].map(label_map)

# Check for unmapped values
unmapped_sentiments = df[df['label'].isna()]['sentiment'].unique()
if len(unmapped_sentiments) > 0:
    print(f"WARNING: Found unmapped sentiment values: {unmapped_sentiments}")

# Drop rows with missing labels
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# Final check
print("Final label distribution:")
print(df['label'].value_counts())
print("Label mapping verification:")
for sentiment, label in df[['sentiment', 'label']].drop_duplicates().values:
    print(f"  {sentiment} -> {label}")

Unique sentiment values: [-1.  0.  1.]
Sentiment distribution:
sentiment
 1.0    72250
 0.0    55213
-1.0    35510
Name: count, dtype: int64
Final label distribution:
label
2    72250
1    55213
0    35510
Name: count, dtype: int64
Label mapping verification:
  -1.0 -> 0.0
  0.0 -> 1.0
  1.0 -> 2.0


In [5]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])

sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
padded = pad_sequences(sequences, maxlen=100, padding='post')

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    padded, df['label'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label']  # Ensure balanced split
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print("Training set label distribution:")
print(pd.Series(y_train).value_counts().sort_index())
print("Test set label distribution:")
print(pd.Series(y_test).value_counts().sort_index())

Training set size: 130378
Test set size: 32595
Training set label distribution:
label
0    28408
1    44170
2    57800
Name: count, dtype: int64
Test set label distribution:
label
0     7102
1    11043
2    14450
Name: count, dtype: int64


In [7]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, mask_zero=True),  # Removed input_length
    SimpleRNN(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3),
    SimpleRNN(64, dropout=0.3, recurrent_dropout=0.3),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

# Use lower learning rate
optimizer = Adam(learning_rate=0.001)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

model.summary()



NameError: name 'Dropout' is not defined

In [None]:
from sklearn.utils import class_weight

# Compute class weights
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))
print(f"Class weights: {class_weights_dict}")

# Add callbacks for better training
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.0001,
    verbose=1
)

# Train model with callbacks
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=128,
    class_weight=class_weights_dict,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

In [None]:
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
model.save("model/sentiment_model.keras")

with open("model/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
def predict_single(text, debug=False):
    """
    Enhanced prediction function
    """
    if debug:
        print(f"Original text: '{text}'")
    
    # Clean the text
    cleaned = clean_text(text)
    if debug:
        print(f"Cleaned text: '{cleaned}'")
    
    # Handle empty text
    if not cleaned.strip():
        return "Neutral" if not debug else ("Neutral", 0.33)
    
    # Convert to sequence
    seq = tokenizer.texts_to_sequences([cleaned])
    if debug:
        print(f"Sequence: {seq}")
    
    # Check for empty sequence
    if not seq[0]:
        return "Neutral" if not debug else ("Neutral", 0.33)
    
    # Pad sequence
    padded_seq = pad_sequences(seq, maxlen=100, padding='post')
    
    # Make prediction
    pred_probs = model.predict(padded_seq, verbose=0)
    pred_class = pred_probs.argmax()
    confidence = pred_probs.max()
    
    if debug:
        print(f"Prediction probabilities: {pred_probs[0]}")
        print(f"Predicted class: {pred_class}")
        print(f"Confidence: {confidence:.3f}")
    
    labels = ['Negative', 'Neutral', 'Positive']
    sentiment = labels[pred_class]
    
    if debug:
        return sentiment, confidence
    else:
        return sentiment

# Test predictions
test_texts = [
    "I absolutely love this product!",
    "It was terrible and boring.",
    "It's fine, not too bad.",
    "This is amazing and wonderful!",
    "I hate this so much, it's awful",
    "It's okay, nothing special"
]

print("=== TESTING PREDICTIONS ===")
for text in test_texts:
    result = predict_single(text, debug=True)
    if isinstance(result, tuple):
        sentiment, confidence = result
        print(f"Final Result: {sentiment} (Confidence: {confidence:.3f})")
    else:
        print(f"Final Result: {result}")
    print("-" * 50)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate on test set
print("\n=== MODEL EVALUATION ===")
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Classification report
labels = ['Negative', 'Neutral', 'Positive']
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, target_names=labels))

# Confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred_classes)
print(cm)

# Check prediction distribution
print("\nPrediction Distribution:")
unique, counts = np.unique(y_pred_classes, return_counts=True)
for i, count in enumerate(counts):
    print(f"Class {i} ({labels[i]}): {count} predictions ({count/len(y_pred_classes)*100:.1f}%)")

In [None]:
df['label'].value_counts()

In [None]:
df['label'].value_counts()

In [None]:
print(clean_text("I love this!")) 

In [None]:
df['label'].isnull().sum()

In [None]:
print(f"Tokenizer vocabulary size: {len(tokenizer.word_index)}")
print(f"Sample word indices: {dict(list(tokenizer.word_index.items())[:10])}")

In [None]:
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.legend()
plt.show()

plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend()
plt.show()

In [None]:
def predict_single(text):
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded_seq = pad_sequences(seq, maxlen=100)
    pred = model.predict(padded_seq)
    print("Prediction probabilities:", pred)
    return ['Negative', 'Neutral', 'Positive'][pred.argmax()]

In [None]:
for i in range(5):
    print(df['cleaned_text'][i], "=>", df['label'][i])

In [None]:
print(df['label'].value_counts())