In [20]:
pip install pandas scikit-learn tensorflow nltk



In [33]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
import string
import re

# Download stopwords
nltk.download("stopwords")

# Load dataset
file_path = "/content/drive/MyDrive/twitter_training.csv"
df = pd.read_csv(file_path, header=None)

# Rename columns
df.columns = ["id", "category", "sentiment", "text"]

# Select only necessary columns
df = df[["text", "sentiment"]]

# Convert sentiment labels to numerical values
label_encoder = LabelEncoder()
df["sentiment"] = label_encoder.fit_transform(df["sentiment"])  # Encode labels (e.g., Positive=2, Negative=0, Neutral=1)

# Text preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercasing
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove leading/trailing spaces
    words = text.split()  # Split text into words
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    return " ".join(words)

# Apply text preprocessing
df["clean_text"] = df["text"].apply(preprocess_text)

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(df["clean_text"], df["sentiment"], test_size=0.2, random_state=42)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Tokenization
max_words = 5000  # Maximum vocabulary size
max_len = 100  # Maximum sentence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post", truncating="post")


In [24]:
# Define the model
model = keras.Sequential([
    keras.layers.Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(3, activation='softmax')  # 3 classes: Positive, Negative, Neutral
])

# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Model summary
model.summary()




In [26]:
y_train = np.array(y_train).astype(int)
y_test = np.array(y_test).astype(int)


In [29]:
model.add(keras.layers.Dense(3, activation='softmax'))
print(y_train.shape)  # Should be (num_samples,)
print(y_test.shape)  # Should be (num_samples,)
  # 3 classes


(59745,)
(14937,)


In [31]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])



In [34]:
# Train the model
y_train[y_train > 2] = 2
y_test[y_test > 2] = 2
y_train = np.array(y_train).astype(int)
y_test = np.array(y_test).astype(int)

history = model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))


Epoch 1/5
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 222ms/step - accuracy: 0.4998 - loss: 1.0116 - val_accuracy: 0.5239 - val_loss: 0.9716
Epoch 2/5
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m410s[0m 220ms/step - accuracy: 0.5189 - loss: 0.9741 - val_accuracy: 0.5239 - val_loss: 0.9716
Epoch 3/5
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m409s[0m 219ms/step - accuracy: 0.5250 - loss: 0.9706 - val_accuracy: 0.5240 - val_loss: 0.9616
Epoch 4/5
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 219ms/step - accuracy: 0.5272 - loss: 0.9602 - val_accuracy: 0.5499 - val_loss: 0.9555
Epoch 5/5
[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 219ms/step - accuracy: 0.5588 - loss: 0.9543 - val_accuracy: 0.5435 - val_loss: 0.9537


In [35]:
# Evaluate on test data
test_loss, test_acc = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

# Test with new text input
def predict_sentiment(text):
    processed_text = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding="post", truncating="post")
    prediction = model.predict(padded_sequence)
    sentiment_label = label_encoder.inverse_transform([np.argmax(prediction)])
    return sentiment_label[0]

# Example test
print(predict_sentiment("I love this movie!"))
print(predict_sentiment("This is the worst experience ever!"))


[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 54ms/step - accuracy: 0.5413 - loss: 0.9529
Test Accuracy: 0.5435
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 833ms/step
Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Negative


In [38]:
print(predict_sentiment("worst movie i have seen"))
print(predict_sentiment("I really love this movie"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Neutral
