In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import joblib
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\datas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
df= pd.read_csv('./Dataset/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
print(df.shape)

(50000, 2)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
df.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
df.sentiment = np.where(df.sentiment == 'positive', 1, 0)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
df.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

Cleaning: Remove HTML tags
Remove special charector
Converting text into lower case
Removing stop words
Stemming

In [10]:
def clean_review(text):
    # Remove HTML tags
    # text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'<.*?>', ' ', text)    

    # Remove special characters and digits (if needed, modify this as per requirement)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Convert text to lowercase
    text = text.lower()

    return text

df.review = df.review.apply(clean_review)
df.review[0]

'one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with me  the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word  it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far away  i would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty 

In [11]:
# download stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\datas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\datas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words

def apply_stemming(text):
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in text]
    return ' '.join(stemmed_words)

In [13]:
df['review'] = df['review'].apply(remove_stopwords)
df['review'][0]

['one',
 'reviewers',
 'mentioned',
 'watching',
 '1',
 'oz',
 'episode',
 'youll',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'manyaryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'moreso',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes'

In [14]:
df['review'] = df['review'].apply(apply_stemming)
df['review'][0]

'one review mention watch 1 oz episod youll hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti surreal couldnt say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort u

In [15]:
X=df.iloc[:0:1].values
y=df.iloc[:,-1].values

In [16]:
cv=CountVectorizer(max_features=2500)
X=cv.fit_transform(df['review']).toarray()

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [18]:
clf=MultinomialNB()
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print('MultinomialNB',accuracy_score(y_test,y_pred))

MultinomialNB 0.8371


In [19]:
from sklearn.metrics import roc_auc_score
y_proba = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
print("AUC of MultinomialNB Model =", auc)

AUC of MultinomialNB Model = 0.9079488648719054


In [21]:
# Save the CountVectorizer
joblib.dump(cv, 'count_vectorizer.pkl')

# Save the model
joblib.dump(clf, 'sentiment_model.pkl')


['sentiment_model.pkl']

In [22]:
data = pd.read_csv('./Dataset/IMDB Dataset.csv')
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [23]:
data.review[8]

"Encouraged by the positive comments about this film on here I was looking forward to watching this film. Bad mistake. I've seen 950+ films and this is truly one of the worst of them - it's awful in almost every way: editing, pacing, storyline, 'acting,' soundtrack (the film's only song - a lame country tune - is played no less than four times). The film looks cheap and nasty and is boring in the extreme. Rarely have I been so happy to see the end credits of a film. <br /><br />The only thing that prevents me giving this a 1-score is Harvey Keitel - while this is far from his best performance he at least seems to be making a bit of an effort. One for Keitel obsessives only."

In [24]:
test_text= """After reading all the positive comments about this film, I was eager to watch it. Unfortunately, that was a mistake. Having seen over 950 films, I can say with confidence that this one ranks among the worst. It falters in nearly every area: editing, pacing, storyline, acting, and even the soundtrack (the same uninspiring country song is played a ridiculous four times). The film looks cheap, feels unpleasant, and is painfully boring. I’ve rarely been so relieved to see the end credits roll.

The only reason I’m not giving this a 1/10 is Harvey Keitel. While it’s far from his best performance, at least he appears to be making an effort. This film is really only for die-hard Keitel fans."""

In [25]:
# Load the CountVectorizer
cv = joblib.load('count_vectorizer.pkl')

# Load the trained model
model = joblib.load('sentiment_model.pkl')


In [26]:
cleaned_text = clean_review(test_text)
cleaned_text = remove_stopwords(cleaned_text)
cleaned_text = apply_stemming(cleaned_text)
cleaned_text = np.array(cleaned_text)
final_text = cv.transform(np.array([cleaned_text])).toarray()
prediction = model.predict(final_text.reshape(1, 2500))
if prediction == 1:
    print("Positive Sentiment!")
else:
    print("Negative Sentiment!")

Negative Sentiment!


In [27]:
import numpy as np

def predict_sentiment(text, model, cv):
    # Clean the review text (removes HTML tags, stopwords, and applies stemming)
    cleaned_text = clean_review(text)
    cleaned_text = remove_stopwords(cleaned_text)
    cleaned_text = apply_stemming(cleaned_text)
    
    # Convert cleaned text to a numpy array (for CountVectorizer)
    cleaned_text = np.array([cleaned_text])
    
    # Transform the cleaned text using the CountVectorizer (cv)
    final_text = cv.transform(cleaned_text).toarray()
    
    # Predict sentiment using the trained model
    prediction = model.predict(final_text.reshape(1, -1))  # Ensure the shape is correct
    
    # Output prediction
    if prediction == 1:
        print("Positive Sentiment!")
    else:
        print("Negative Sentiment!")


In [28]:
predict_sentiment(test_text,model,cv)

Negative Sentiment!
