In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the tab-separated movie-review dataset into a DataFrame.
df = pd.read_csv("moviereviews.tsv", delimiter="\t")

In [4]:
# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [5]:
# Number of rows in the freshly loaded dataset.
df.shape[0]

2000

In [6]:
# Inspect the review text column.
df.loc[:, "review"]

0       how do films like mouse hunt get into theatres...
1       some talented actresses are blessed with a dem...
2       this has been an extraordinary year for austra...
3       according to hollywood movies made in last few...
4       my first press screening of 1998 and already i...
                              ...                        
1995    i like movies with albert brooks , and i reall...
1996    it might surprise some to know that joel and e...
1997    the verdict : spine-chilling drama from horror...
1998    i want to correct what i wrote in a former ret...
1999    a couple of months ago , when i first download...
Name: review, Length: 2000, dtype: object

In [9]:
# df["review"][0]
# print(df["review"][0])

In [10]:
# First, check the data for missing values.

In [11]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

label      0
review    35
dtype: int64

In [12]:
# No labels are missing, but some reviews are NaN ("not a number" / None).
# Drop every row that contains a missing value, modifying df in place.
df.dropna(axis=0, how="any", inplace=True)

In [15]:
# Re-count missing values per column; there should be none left now.
df.isna().sum()

label     0
review    0
dtype: int64

In [16]:
# However, when text data is exported from a database, a missing value is often
# stored as an empty or whitespace-only string rather than NaN.
# So we also need to remove the rows whose review is such a blank string.

In [17]:
# Two sample strings for demonstrating str.isspace():
# an ordinary word and a string holding a single space.
myString, empty = 'hello', ' '

In [18]:
# A string containing non-whitespace characters is not blank.
str.isspace(myString)

False

In [20]:
# True when the string is non-empty and consists solely of whitespace.
# (A truly empty string '' returns False from isspace().)
str.isspace(empty)

True

In [21]:
# Collect the index labels of every review that carries no text at all.
# str.isspace() returns False for the empty string "", so an isspace()-based
# check misses truly empty reviews. Stripping surrounding whitespace and
# comparing with "" catches both empty and whitespace-only reviews.
# Non-string cells (e.g. NaN) become NaN under .str and compare False here,
# so they are simply not reported as blank instead of raising.
is_blank = df["review"].str.strip().eq("")

# Plain Python list of index labels, ready to hand to DataFrame.drop().
blanks = df[is_blank].index.tolist()

In [24]:
# Show the index labels stored in `blanks`.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [22]:
# Remove the rows whose index labels are listed in `blanks`, in place.
# errors="ignore" keeps this cell safe to re-run: once the rows are gone,
# a second drop would otherwise raise KeyError for the missing labels.
df.drop(index=blanks, inplace=True, errors="ignore")

In [23]:
# Number of rows remaining in df after the cleaning steps.
df.shape[0]

1938

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Features: the review text.
X = df.loc[:, 'review']

In [27]:
# Target: the pos/neg label attached to each review.
y = df.loc[:, 'label']

In [28]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [30]:
#now we need to make a pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [31]:
# Chain TF-IDF vectorization with a linear SVM so that raw review text
# can be passed straight to fit() and predict().
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [32]:
# Fit the vectorizer and the classifier on the training split.
text_clf.fit(X=X_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [33]:
#now we need to make predictions

In [34]:
# Predict a label for every held-out review.
predictions = text_clf.predict(X=X_test)

In [35]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [36]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[259  49]
 [ 49 283]]


In [37]:
# Per-class precision, recall and F1 on the held-out reviews.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.84      0.84      0.84       308
         pos       0.85      0.85      0.85       332

    accuracy                           0.85       640
   macro avg       0.85      0.85      0.85       640
weighted avg       0.85      0.85      0.85       640



In [38]:
# Overall fraction of held-out reviews classified correctly.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.846875


In [40]:
# We get roughly 0.85 for precision, recall, and accuracy, which
# is a strong result
# given that it took only a few lines of code
# and, more importantly, that the model works directly on the raw review text.

# So, from the raw text alone, the classifier can tell whether
# a review is positive or negative.
# With this we could set up our own movie-review site that crawls the internet,
# scrapes reviews, runs each one through the classifier,
# and reports whether the review is positive or negative.
