# Sentiment Analysis (NLP) 

- IMBD Movie Review

### Import necessary libraries:

In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
import warnings
warnings.filterwarnings('ignore')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...


True

### Load the dataset and explore it

In [11]:
# Load the dataset
df = pd.read_csv("IMDB_dataset.csv")

# Display the first few rows
print(df.head())

# Check the shape of the dataset
print("Dataset Shape:", df.shape)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Dataset Shape: (50000, 2)


In [12]:
df["review"][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### Data Preprocessing:

In [13]:
# Tokenization and cleaning
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# def preprocess_text(text):
#     # Tokenize the text
#     words = nltk.word_tokenize(text)
#     # Remove stopwords and non-alphabetic characters
#     cleaned_words = [word.lower() for word in words if word.isalpha() and word not in stop_words]
#     return ' '.join(cleaned_words)
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Remove single characters
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Tokenize the text
    words = nltk.word_tokenize(text)

    cleaned_words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word not in stop_words]
    
    # Handle repeated characters (e.g., coooool -> cool)
    cleaned_words = [re.sub(r'(.)\1+', r'\1\1', word) for word in cleaned_words]
    
    return ' '.join(cleaned_words)

# Apply preprocessing to the 'review' column
df['cleaned_review'] = df['review'].apply(preprocess_text)


In [14]:
df['cleaned_review'][0]

'one reviewer mentioned watching oz episode hooked they right exactly happened the first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid this show pull punch regard drug sex violence it hardcore classic use word it called oz nickname given oswald maximum security state penitentary it focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around the first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence not violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill 

In [15]:
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

### Split the dataset into training and testing sets:

In [16]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [17]:
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Feature Extraction:

In [18]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)


### Build and train a sentiment analysis model:

In [19]:
model = MultinomialNB()
model.fit(X_train_bow, y_train)

### Make predictions and evaluate the model:

In [20]:
y_pred = model.predict(X_test_bow)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8589
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86      4961
           1       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



## Interpretation:

Accuracy: The model is correct about 86% of the time when predicting if a review is positive or negative.

Precision: When the model predicts a review as "negative":

It's right about 84% of the time.
When a review is actually negative, it catches 88% of them.
Precision: When the model predicts a review as "positive":

It's right about 87% of the time.
When a review is actually positive, it catches 84% of them.
F1-score: This number combines both precision and recall. It's a balanced measure of correctness.

Support: The number of reviews in each category (negative or positive).

In [21]:
vectorizer = CountVectorizer(max_features=10000)  # Adjust as necessary
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Convert the sparse matrix to a dense format
X_train_bow_dense = X_train_bow.toarray()
X_test_bow_dense = X_test_bow.toarray()

# Create the model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_bow_dense.shape[1],)))  # Input layer
model.add(Dropout(0.5))  # Dropout layer for regularization
model.add(Dense(64, activation='relu'))  # Hidden layer
model.add(Dropout(0.5))  # Dropout layer for regularization
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_bow_dense, y_train, epochs=10, batch_size=128,validation_split=0.2 )


Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.7609 - loss: 0.4827 - val_accuracy: 0.8866 - val_loss: 0.2741
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.9214 - loss: 0.2112 - val_accuracy: 0.8838 - val_loss: 0.2907
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.9509 - loss: 0.1390 - val_accuracy: 0.8876 - val_loss: 0.3255
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9701 - loss: 0.0911 - val_accuracy: 0.8825 - val_loss: 0.3591
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9813 - loss: 0.0616 - val_accuracy: 0.8846 - val_loss: 0.4329
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9865 - loss: 0.0450 - val_accuracy: 0.8810 - val_loss: 0.4936
Epoch 7/10
[1m250/25

In [22]:
model.save('sentiment_analysis_model.h5')




In [23]:
# Now, to predict sentiment for a sample sentence:
# Load the model
loaded_model = load_model('sentiment_analysis_model.h5')

# sample_sentence = "I hate the movie! it was very boring"
sample_sentence = "I loved the movie! It was fantastic!"

# Preprocess the sample sentence
sample_bow = vectorizer.transform([sample_sentence])  # Transform to bag-of-words
sample_bow_dense = sample_bow.toarray()  # Convert to dense format

# Make prediction
prediction = loaded_model.predict(sample_bow_dense)
print(prediction)
# Interpret prediction
sentiment = 'Positive' if prediction[0][0] > 0.7 else 'Negative'
print(f"Sentiment: {sentiment}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[[0.9948239]]
Sentiment: Positive
