# Sentiment Analysis (NLP) 

- IMBD Movie Review

### Import necessary libraries:

In [32]:
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

### Load the dataset and explore it

In [23]:
# Load the dataset
df = pd.read_csv('../Dataset/IMDB_dataset.csv')

# Display the first few rows
print(df.head())

# Check the shape of the dataset
print("Dataset Shape:", df.shape)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Dataset Shape: (50000, 2)


### Data Preprocessing:

In [26]:
# Tokenization and cleaning
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Remove stopwords and non-alphabetic characters
    cleaned_words = [word.lower() for word in words if word.isalpha() and word not in stop_words]
    
    return ' '.join(cleaned_words)

# Apply preprocessing to the 'review' column
df['cleaned_review'] = df['review'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Split the dataset into training and testing sets:

In [27]:
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Feature Extraction:

In [28]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)


### Build and train a sentiment analysis model:

In [29]:
model = MultinomialNB()
model.fit(X_train_bow, y_train)

### Make predictions and evaluate the model:

In [30]:
y_pred = model.predict(X_test_bow)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8585
Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.88      0.86      4961
    positive       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



## Interpretation:

Accuracy: The model is correct about 86% of the time when predicting if a review is positive or negative.

Precision: When the model predicts a review as "negative":

It's right about 84% of the time.
When a review is actually negative, it catches 88% of them.
Precision: When the model predicts a review as "positive":

It's right about 87% of the time.
When a review is actually positive, it catches 84% of them.
F1-score: This number combines both precision and recall. It's a balanced measure of correctness.

Support: The number of reviews in each category (negative or positive).

In [33]:
vectorizer = CountVectorizer(max_features=10000)  # Adjust as necessary
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Convert the sparse matrix to a dense format
X_train_bow_dense = X_train_bow.toarray()
X_test_bow_dense = X_test_bow.toarray()

# Create the model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_bow_dense.shape[1],)))  # Input layer
model.add(Dropout(0.5))  # Dropout layer for regularization
model.add(Dense(64, activation='relu'))  # Hidden layer
model.add(Dropout(0.5))  # Dropout layer for regularization
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_bow_dense, y_train, epochs=10, batch_size=128, validation_split=0.2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


UnimplementedError: Graph execution error:

Detected at node compile_loss/binary_crossentropy/Cast defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2032.0_x64__qbz5n2kfra8p0\Lib\asyncio\base_events.py", line 641, in run_forever

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2032.0_x64__qbz5n2kfra8p0\Lib\asyncio\base_events.py", line 1986, in _run_once

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2032.0_x64__qbz5n2kfra8p0\Lib\asyncio\events.py", line 88, in _run

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\ravia\AppData\Local\Temp\ipykernel_8908\2645977818.py", line 21, in <module>

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 320, in fit

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 121, in one_step_on_iterator

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 108, in one_step_on_data

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 54, in train_step

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\trainers\trainer.py", line 398, in _compute_loss

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\trainers\trainer.py", line 366, in compute_loss

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\trainers\compile_utils.py", line 618, in __call__

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\trainers\compile_utils.py", line 659, in call

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\losses\loss.py", line 56, in __call__

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\tree\tree_api.py", line 148, in map_structure

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\tree\optree_impl.py", line 79, in map_structure

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\optree\ops.py", line 747, in tree_map

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\losses\loss.py", line 57, in <lambda>

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\ops\core.py", line 917, in convert_to_tensor

  File "d:\NLP-Learning\Natural-Language-Processing\venv\Lib\site-packages\keras\src\backend\tensorflow\core.py", line 132, in convert_to_tensor

Cast string to float is not supported
	 [[{{node compile_loss/binary_crossentropy/Cast}}]] [Op:__inference_one_step_on_iterator_1470]