In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam




In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Trisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Trisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv("IMDB Dataset.csv")

In [5]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [7]:
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
def tokenize_text(review):
    return word_tokenize(review.lower())

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.isalnum() and word not in stop_words]

In [9]:
df['tokenized_text'] = df['review'].apply(tokenize_text)

df['filtered_text'] = df["tokenized_text"].apply(remove_stopwords)

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
label_encoder  = LabelEncoder()
    
df["sentiment"] = label_encoder.fit_transform(df["sentiment"])

In [12]:
df.head()

Unnamed: 0,review,sentiment,tokenized_text,filtered_text
0,One of the other reviewers has mentioned that ...,1,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, 1, oz, e..."
1,A wonderful little production. <br /><br />The...,1,"[a, wonderful, little, production, ., <, br, /...","[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,1,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,0,"[basically, there, 's, a, family, where, a, li...","[basically, family, little, boy, jake, thinks,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, mattei, 's, ``, love, in, the, time, ...","[petter, mattei, love, time, money, visually, ..."


In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df['filtered_text'],df['sentiment'], test_size = 0.2, random_state=42)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

model = make_pipeline(CountVectorizer(),MultinomialNB())

model.fit(x_train.astype(str), y_train)

In [15]:
y_pred = model.predict(x_test.astype(str))

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy: .4f}")
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy:  0.8566

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.86      4961
           1       0.87      0.84      0.85      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000


Confusion Matrix:
 [[4340  621]
 [ 813 4226]]


In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()


tokenizer.fit_on_texts(df['filtered_text'])
x_train_tf = tokenizer.texts_to_sequences(x_train)
x_test_tf = tokenizer.texts_to_sequences(x_test)

model_tf = Sequential([
    Embedding(input_dim= len(tokenizer.word_index) + 1, output_dim=16),
    LSTM(32),
    Dense(32, activation = 'relu'),
    Dense(1,activation = 'sigmoid')
])

In [None]:
model_tf.compile(optimizer='adam', loss='binary_crossentropy', metrics = ['accuracy'])

model_tf.fit(x_train_tf, y_train, epochs=5, batch_size=32, validation_split = 0.2)

In [None]:
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [None]:
x_train_pt = torch.LongTensor(x_train_tf)
y_train_pt = torch.LongTensor(y_train.values)
x_test_pt = torch.LongTensor(x_test_tf)
y_test_pt = torch.LongTensor(y_test.values)

In [None]:
train_dataset = TensorDataset(x_train_pt, y_train_pt)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
class SimpleLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_size):
        super(SimpleLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.lstm(x)
        x = hn[-1, :]  # Take the last hidden state
        x = self.fc(x)
        return x

model_pt = SimpleLSTM(vocab_size=len(tokenizer.word_index) + 1, embed_dim=16, hidden_size=32, output_size=1)

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_pt.parameters(), lr=0.001)

# Train the model
num_epochs = 5
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model_pt(inputs)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Tokenize and pad sequences for the testing set
X_test_tf = tokenizer.texts_to_sequences(X_test)
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_tf, maxlen=len(X_train_tf[0]))

# Predictions on the testing set
y_pred_tf = model_tf.predict(X_test_padded)
y_pred_tf = (y_pred_tf > 0.5).astype(int)

# Convert y_test to numpy array if it's not already
y_test_np = y_test.to_numpy()

# Evaluation metrics
accuracy_tf = accuracy_score(y_test_np, y_pred_tf)
report_tf = classification_report(y_test_np, y_pred_tf)
conf_matrix_tf = confusion_matrix(y_test_np, y_pred_tf)

print("TensorFlow Model (LSTM) Evaluation:")
print(f"Accuracy: {accuracy_tf:.4f}")
print("\nClassification Report:\n", report_tf)
print("\nConfusion Matrix:\n", conf_matrix_tf)

In [None]:
plt.figure(figsize=(12, 6))

# Plot training & validation accuracy values
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.show()

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_tf, annot=True, fmt='d', cmap='Blues', annot_kws={"size": 16})
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Save model
model_tf.save('sentiment_analysis_model.h5')

# Save tokenizer
import pickle
with open('tokenizer.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Generate documentation (you can use more sophisticated tools like Sphinx for larger projects)
with open('documentation.md', 'w') as doc_file:
    doc_file.write("# Sentiment Analysis Project Documentation\n")
    doc_file.write("## Model Overview\n")
    doc_file.write("This sentiment analysis model uses a simple LSTM architecture with word embeddings.\n")
    doc_file.write("## Model Evaluation\n")
    doc_file.write(f"Accuracy: {accuracy_tf:.4f}\n\n")
    doc_file.write("### Classification Report\n")
    doc_file.write(f"{report_tf}\n\n")
    doc_file.write("### Confusion Matrix\n")
    doc_file.write(f"{conf_matrix_tf}\n\n")
    doc_file.write("## Visualizations\n")
    doc_file.write("### Training History\n")
    doc_file.write("![Training History](training_history.png)\n")
    doc_file.write("### Confusion Matrix\n")
    doc_file.write("![Confusion Matrix](confusion_matrix.png)\n")