In [8]:
import numpy as np
import pandas as pd

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [12]:
df = pd.read_csv('imdb_reviews/IMDB_Dataset.csv')

In [14]:
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [16]:
df.shape

(50000, 2)

In [18]:
df['sentiment'].replace({'positive':1,'negative':0},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment'].replace({'positive':1,'negative':0},inplace=True)
  df['sentiment'].replace({'positive':1,'negative':0},inplace=True)


In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [9]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [10]:
df['review'] = df['review'].apply(remove_html_tags)

In [11]:
def remove_special(text):
    x = ''
    for s in text:
        if s.isalnum():
            x = x+s
        else:
            x = x+' '
    return x

In [12]:
df['review'] = df['review'].apply(remove_special)

In [13]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there s a family where a little boy ...,0
4,Petter Mattei s Love in the Time of Money is...,1


In [14]:
nltk.download('stopwords')

print(len(stopwords.words('english')))

179


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/itgroup13/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [16]:
df['sentiment'].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [17]:
ps = PorterStemmer()

In [18]:
def stemming(content):

    stemmed_cont=re.sub('[^a-zA-Z]', ' ', content)
    stemmed_cont=stemmed_cont.lower()
    stemmed_cont=stemmed_cont.split()
    stemmed_cont=[ps.stem(word) for word in stemmed_cont if word not in stopwords.words('english')]
    stemmed_cont=' '.join(stemmed_cont)

    return stemmed_cont

In [19]:
df['review'] = df['review'].apply(stemming)

In [21]:
df.head(10)

Unnamed: 0,review,sentiment
0,one review mention watch oz episod hook right ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1
5,probabl time favorit movi stori selfless sacri...,1
6,sure would like see resurrect date seahunt ser...,1
7,show amaz fresh innov idea first air first yea...,0
8,encourag posit comment film look forward watch...,0
9,like origin gut wrench laughter like movi youn...,1


In [22]:
x=df['review'].values
y=df['sentiment'].values

In [23]:
x

array(['one review mention watch oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side',

In [72]:
# vectorizer=CountVectorizer()

# x=vectorizer.fit_transform(x).toarray()

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [26]:
vectorizer=TfidfVectorizer()

x_train=vectorizer.fit_transform(x_train)
x_test=vectorizer.transform(x_test)

In [None]:
with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [31]:
print((x_train.shape), (x_test.shape))
print((y_train.shape), (y_test.shape))

(40000, 64259) (10000, 64259)
(40000,) (10000,)


In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
clf1 = LogisticRegression(max_iter=1000)

In [36]:
clf1.fit(x_train, y_train)

In [39]:
ypred1 = clf1.predict(x_test)
ypred_traindata = clf1.predict(x_train)

In [40]:
from sklearn.metrics import accuracy_score

print('Logistic regression accuracy on test data : ', accuracy_score(y_test, ypred1))
print('Logistic regression accuracy on train data : ', accuracy_score(y_train, ypred_traindata))

Logistic regression accuracy on test data :  0.894
Logistic regression accuracy on train data :  0.92385


In [41]:
import pickle

logistic_dir='logistic_model.sav'
pickle.dump(clf1, open(logistic_dir, 'wb'))