<a href="https://colab.research.google.com/github/sobiahashmi/BIA_codes/blob/main/imdb_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step-01 Load Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")

## Step-02 Load Dataset

In [6]:
df = pd.read_csv('/content/drive/MyDrive/BIA_class/NLP/IMDB dataset/IMDB_dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Step-03 Data Preprocessing

In [7]:
df.shape

(50000, 2)

In [8]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [9]:
# download and prepare stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


{'has', 'to', 'out', 'which', 'more', "she's", 'who', "needn't", "won't", 'its', 'he', 'am', 'same', 'how', 'here', "you're", 'once', 'your', 'me', 'into', 'over', 'some', 'as', "wouldn't", "mightn't", 'and', 'then', 'shouldn', 'again', 'both', "should've", 'y', "doesn't", 'weren', 'was', 'having', 'mightn', 'can', 'being', 'too', 'below', 'didn', 'ourselves', 'from', 'isn', 'are', 'these', 'when', 'or', 'do', 'against', 'nor', 'those', 'doesn', 'while', 'ain', 'during', "hadn't", 'doing', 'because', 'were', 'with', 'but', 'own', 'themselves', 'just', 'we', 'the', 'why', 'most', 'did', 'very', 'such', 'in', 'at', 'theirs', 'down', 'our', 'all', 'herself', 'my', 'any', "couldn't", 'myself', 'few', 'on', 'other', 'itself', 'what', "weren't", "shan't", 'mustn', 'above', 'shan', 'been', 'aren', 'won', 'an', 'up', 'if', 'wasn', "didn't", "haven't", 'whom', 'than', 'm', 'for', 've', 'couldn', "it's", 'them', 'his', 'd', 'this', 'don', "don't", 't', 'not', 'no', 'only', "wasn't", 'll', 'is', 

In [10]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [13]:
df['review'] = df['review'].apply(lambda x:' '.join(words.lower() for words in word_tokenize(x) if words.isalpha()))
df['review'] = df['review'].apply(lambda x:' '.join(word for word in x.split() if word not in stop_words))

In [14]:
df['review']

Unnamed: 0,review
0,one reviewers mentioned watching oz episode ho...
1,wonderful little production br br filming tech...
2,thought wonderful way spend time hot summer we...
3,basically family little boy jake thinks zombie...
4,petter mattei love time money visually stunnin...
...,...
49995,thought movie right good job creative original...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,catholic taught parochial elementary schools n...
49998,going disagree previous comment side maltin on...


## Step-04 Feature Extraction

In [18]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df['review'])
print(X)

  (0, 60399)	0.017974952839640245
  (0, 71441)	0.06462480990968943
  (0, 54092)	0.05609257336852549
  (0, 93368)	0.06575817373268848
  (0, 61684)	0.45623212993794404
  (0, 27602)	0.0975544426738396
  (0, 39899)	0.07094898893961152
  (0, 71837)	0.0735281265840711
  (0, 28436)	0.04904385280355244
  (0, 37578)	0.04906897696993938
  (0, 10061)	0.1047786971609892
  (0, 30894)	0.054421297960725694
  (0, 85831)	0.03303889550109451
  (0, 82165)	0.14215854177509643
  (0, 10973)	0.07903842493089844
  (0, 89869)	0.09486972298505915
  (0, 74571)	0.03257488692995884
  (0, 92320)	0.2005726368824762
  (0, 76083)	0.04041402416824635
  (0, 95155)	0.05023337023058612
  (0, 35029)	0.032218575955768956
  (0, 88294)	0.06204631857696947
  (0, 77159)	0.1032577785093174
  (0, 29340)	0.0847517288143779
  (0, 38275)	0.07921586789377569
  :	:
  (49999, 89906)	0.1082206330087094
  (49999, 48730)	0.09747854593557065
  (49999, 43001)	0.19065207524758182
  (49999, 28775)	0.11166077174693212
  (49999, 5351)	0.1223981

## Step-05 Split the data into Training and Testing sets

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X,df['sentiment'], test_size = 0.2 , random_state=42)

## Step-06 Build and Train the Model

In [21]:
model = MultinomialNB()
model.fit(X_train,y_train)

## Step-07 Save the Model

In [23]:
import joblib
joblib.dump(model,'/content/drive/MyDrive/BIA_class/NLP/IMDB dataset/model.pkl')

['/content/drive/MyDrive/BIA_class/NLP/IMDB dataset/model.pkl']

## Step-08 Model Prediction

In [24]:
y_pred = model.predict(X_test)

## Step-09 Model Evaluation

In [25]:
print("Accuracy:" , accuracy_score(y_test,y_pred))
print("Classificaton Report:", classification_report(y_test,y_pred))

Accuracy: 0.8672
Classificaton Report:               precision    recall  f1-score   support

    negative       0.86      0.88      0.87      4961
    positive       0.88      0.85      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

