In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils import resample, class_weight

file_path = "D:\\sentiment analysis\\Dataset\\sentiment_analysis.csv"
df = pd.read_csv(file_path)

target_column = "sentiment"
text_column = "text"
if target_column not in df.columns or text_column not in df.columns:
    raise ValueError(f"The column '{target_column}' or the column '{text_column}' is missing in the CSV file. Please check the data.")

df = df[[target_column, text_column]]

df["sentiment"] = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0)

df_positive = df[df["sentiment"] == 1]
df_negative = df[df["sentiment"] == 0]
df_negative_upsampled = resample(df_negative, replace=True, n_samples=len(df_positive), random_state=42)
df = pd.concat([df_positive, df_negative_upsampled])

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True) 
stop_words = set(nltk.corpus.stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z ]', '', text.lower())
    tokens = nltk.word_tokenize(text)
    return " ".join([word for word in tokens if word.isalpha() and word not in stop_words])

df["clean_text"] = df["text"].apply(clean_text)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df["clean_text"])
sequences = tokenizer.texts_to_sequences(df["clean_text"])
word_index = tokenizer.word_index

max_length = 50
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
labels = df["sentiment"].values

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

class_weights = class_weight.compute_class_weight("balanced", classes=np.unique(df["sentiment"]), y=df["sentiment"])
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

vocab_size = len(word_index) + 1
embedding_dim = 32

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=128,
    verbose=1,
    validation_data=(X_test, y_test),
    class_weight=class_weight_dict
)

def predict_sentiment(texts, threshold=0.5):
    cleaned_texts = [clean_text(text) for text in texts]
    sequences = tokenizer.texts_to_sequences(cleaned_texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    predictions = model.predict(padded_sequences, verbose=0)
    results = ["Positive " if pred > threshold else "Negative" for pred in predictions]
    return results

test_texts = [
    "I love this product!",
    "I hate this movie!",
    "This is the best experience ever.",
    "Terrible service, I will never come back."
]

results = predict_sentiment(test_texts)
for text, result in zip(test_texts, results):
    print(f"Text: {text} -> {result}")

Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 600ms/step - accuracy: 0.5289 - loss: 0.6936 - val_accuracy: 0.4776 - val_loss: 0.6947
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - accuracy: 0.5009 - loss: 0.6929 - val_accuracy: 0.4776 - val_loss: 0.6965
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.4999 - loss: 0.6933 - val_accuracy: 0.4776 - val_loss: 0.6990
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.5019 - loss: 0.6935 - val_accuracy: 0.4776 - val_loss: 0.7016
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.5048 - loss: 0.6940 - val_accuracy: 0.4776 - val_loss: 0.7048
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.5058 - loss: 0.6929 - val_accuracy: 0.4776 - val_loss: 0.7036
Epoch 7/20
[1m3/3[0m [32m━━━━━━━━━━━