In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense

# Read the IMDB reviews CSV into a DataFrame; the steps below use its
# 'review' and 'sentiment' columns.
dataset = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')

# Cleaning and preprocessing the data
def preprocess_text(review):
    """Normalize a raw review string before tokenization.

    HTML tags such as ``<br />`` are replaced by a single space first.
    Stripping only the punctuation characters of a tag would leave its name
    behind as a bogus token (``br``) and glue it onto the preceding word
    (``movie.<br />`` -> ``moviebr``).

    Args:
        review: Raw review text.

    Returns:
        The review with HTML tags and punctuation removed, lower-cased and
        with leading/trailing whitespace stripped.
    """
    # A tag must start with a letter (optionally preceded by '/'), so a bare
    # comparison such as "3 < 5" is not mistaken for markup.
    review = re.sub(r'</?[a-zA-Z][^>]*>', ' ', review)
    # Drop everything that is neither a word character nor whitespace.
    review = re.sub(r'[^\w\s]', '', review)
    return review.lower().strip()

# Clean every review, then encode the labels as integers
# (positive -> 1, negative -> 0).
dataset['review'] = dataset['review'].map(preprocess_text)
sentiment_codes = {'positive': 1, 'negative': 0}
dataset['sentiment'] = dataset['sentiment'].map(sentiment_codes)

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X, y = dataset['review'], dataset['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Tokenization: the vocabulary is fitted on the training reviews only and
# capped with num_words=8000.
tokenizer = Tokenizer(num_words=8000)
tokenizer.fit_on_texts(X_train)

# Encode both splits as sequences of integer word indices.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad or truncate every sequence to a fixed length of 200 indices.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=200)
    for sequences in (X_train_seq, X_test_seq)
)

# Sentiment model, assembled layer by layer:
# embedding -> dropout -> LSTM -> dense -> single sigmoid output.
model = Sequential()
# input_dim matches the tokenizer's num_words; input_length matches the padded
# sequence length.
# NOTE(review): `input_length` appears to be deprecated in Keras 3 - confirm
# against the installed Keras version.
model.add(Embedding(input_dim=8000, output_dim=128, input_length=200))
model.add(Dropout(0.3))
# return_sequences=False: only the last LSTM output is passed on to the dense layers.
model.add(LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(32, activation='relu'))
# A single sigmoid unit gives a value in (0, 1) for the binary label.
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs in mini-batches of 64; verbose=2 prints one line per epoch.
# NOTE(review): validation_data is the test split, so the per-epoch validation
# metrics are computed on the same rows as the final evaluation.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=2,
    validation_data=(X_test_padded, y_test),
)

# Evaluate on the test split: threshold the sigmoid outputs at 0.5 to obtain
# hard 0/1 class predictions.
probabilities = model.predict(X_test_padded)
predictions = np.greater(probabilities, 0.5).astype(int)

accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Model Accuracy: {accuracy:.2%}")

# Text report of the per-class metrics, followed by the confusion matrix.
report = classification_report(y_test, predictions)
print("\nClassification Report:\n", report)
cm = confusion_matrix(y_test, predictions)

# Confusion matrix visualization: annotated heatmap with integer cell counts.
class_names = ['Negative', 'Positive']
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()
