In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import numpy as np
import pandas as pd
import re, string #re= regular expression
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
import pickle
import nltk
nltk.download('stopwords')
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Sentiment analysis/IMDB Dataset.csv', encoding='latin1')


In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
stop_words= set(stopwords.words('english'))
stemmer=PorterStemmer()

Data preprocessing

In [6]:
#function banayera ani apply gariyeko chha

def clean_text(text):
  text= text.lower()
  text= re.sub(r'<.*?>', '', text) #removing HTML tags
  text= re.sub(r'[^a-zA-Z\s]', '', text) #removing numbers and punctuations
  text=text.strip() #removing spaces
  words=text.split()

  words= [stemmer.stem(word) for word in words if word not in stop_words]

  return " ".join(words)
df['clean_review']=df['review']. apply(clean_text)

In [7]:
df

Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod youll hook ...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movi right good job wasnt creativ orig...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogu bad act idiot direct anno...
49997,I am a Catholic taught in parochial elementary...,negative,cathol taught parochi elementari school nun ta...
49998,I'm going to have to disagree with the previou...,negative,im go disagre previou comment side maltin one ...


In [8]:
df['label']=df['sentiment'].map({'positive':1, 'negative':0})

In [9]:
df

Unnamed: 0,review,sentiment,clean_review,label
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod youll hook ...,1
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...,1
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...,1
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movi right good job wasnt creativ orig...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogu bad act idiot direct anno...,0
49997,I am a Catholic taught in parochial elementary...,negative,cathol taught parochi elementari school nun ta...,0
49998,I'm going to have to disagree with the previou...,negative,im go disagre previou comment side maltin one ...,0


splitting the dataset


In [10]:
X= df['clean_review']
y=df['label']

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X

Unnamed: 0,clean_review
0,one review mention watch oz episod youll hook ...
1,wonder littl product film techniqu unassum old...
2,thought wonder way spend time hot summer weeke...
3,basic there famili littl boy jake think there ...
4,petter mattei love time money visual stun film...
...,...
49995,thought movi right good job wasnt creativ orig...
49996,bad plot bad dialogu bad act idiot direct anno...
49997,cathol taught parochi elementari school nun ta...
49998,im go disagre previou comment side maltin one ...


In [12]:
y

Unnamed: 0,label
0,1
1,1
2,1
3,0
4,1
...,...
49995,1
49996,0
49997,0
49998,0


In [13]:
tfidf= TfidfVectorizer(max_features=5000)
X_train_tfidf=tfidf.fit_transform(X_train)
X_test_tfidf= tfidf.transform(X_test)

In [15]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [18]:
y_pred=model.predict(X_test_tfidf)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))

Accuracy:  0.8838
Classification Report:               precision    recall  f1-score   support

           0       0.90      0.87      0.88      4961
           1       0.87      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [19]:
def predict_review(text):
  cleaned= clean_text(text)
  vector=tfidf.transform([cleaned])
  prediction=model.predict(vector)
  return 'Positive' if prediction[0]==1 else 'Negative'

print(predict_review("The movie was absolutely terrible."))
print(predict_review("One of the best films I've ever watched."))


Negative
Positive


In [20]:
with open("sentiment_model.pkl", 'wb') as  file:
  pickle.dump(model, file)

with open ('tfidf_vectorizer.pkl', 'wb') as file:
  pickle.dump(tfidf, file)