# Importing the Dependencies

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score

# Data Collection & Pre-Processing

In [4]:
# Read the review dataset from the notebook's working storage into a DataFrame.
dataset_path = '/content/Dataset.csv'
data = pd.read_csv(dataset_path)

# Preview the first rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(50000, 2)

In [6]:
# Count the missing values in each column to confirm the data needs no imputation.
data.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [7]:
# Display the first 5 rows of the DataFrame
# (same preview as the one shown right after loading the CSV).
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# Check the number of rows and columns in the DataFrame
# (same check as the earlier `data.shape` cell).
data.shape

(50000, 2)

# Label Encoding

In [10]:
# Replace the text sentiment labels with integer codes.
# The fitted encoder is kept so the codes can be mapped back to label names later.
encoder = LabelEncoder()
encoded_sentiment = encoder.fit_transform(data['sentiment'])
data['sentiment'] = encoded_sentiment

In [11]:
# Preview the frame again to see the sentiment column after encoding.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


positive  -  1

negative  -  0

In [12]:
# Split the frame into model inputs and targets:
# X holds the review text, Y holds the encoded sentiment label.
X, Y = data['review'], data['sentiment']

In [13]:
# Print the review texts that will be used as model input.
print(X)

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object


In [14]:
# Print the encoded sentiment labels that will be used as the target.
print(Y)

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


# Splitting the data into training data & test data

In [15]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [16]:
# Show the size of the full dataset, then of the training and test portions.
for split in (X, X_train, X_test):
    print(split.shape)

(50000,)
(40000,)
(10000,)


# Feature Extraction

In [17]:
# Turn the raw review text into TF-IDF feature vectors that the
# Logistic Regression model can take as input.
feature_extraction = TfidfVectorizer(
    lowercase=True,        # fold all text to lower case before tokenizing
    stop_words='english',  # drop common English words that carry little meaning
    min_df=1,              # keep every term that appears in at least one document
)

# Learn the vocabulary and weights from the training reviews only ...
X_train_features = feature_extraction.fit_transform(X_train)
# ... and reuse that fitted vocabulary to vectorize the test reviews.
X_test_features = feature_extraction.transform(X_test)


In [19]:
# Print the raw training reviews (before vectorization) for comparison with the features.
print(X_train)

6061     VIVAH in my opinion is the best movie of 2006,...
13906    More exciting than the Wesley Snipes film, and...
19664    this attempt at a "thriller" would have no sub...
16830    Sherlock Holmes films from the classic Univers...
16916    Like some other people wrote, I'm a die-hard m...
                               ...                        
25544    America's Next Top Model is a great reality sh...
48056    In 2004, I wrote the following statements on a...
11513    It's a difficult movie to classify "10 Items o...
1688     Pathetic... worse than a bad made-for-TV movie...
5994     I expected a lot more out of this film. The pr...
Name: review, Length: 40000, dtype: object


In [20]:
# Print the TF-IDF features of the training reviews;
# each entry is shown as a (row, column) index pair followed by its weight.
print(X_train_features)

  (0, 88279)	0.21742516664172426
  (0, 58303)	0.034325701343397685
  (0, 8827)	0.020795309461399715
  (0, 54607)	0.10585343653014008
  (0, 580)	0.049467242944920956
  (0, 16893)	0.03316140142011759
  (0, 22994)	0.023363926470404476
  (0, 64515)	0.04523671600804336
  (0, 79014)	0.03900208969414974
  (0, 13319)	0.034479800497907394
  (0, 44668)	0.05477540729403397
  (0, 69441)	0.0364912021410782
  (0, 54634)	0.019956677400268536
  (0, 20722)	0.031998592039374885
  (0, 58002)	0.023787020875927
  (0, 90259)	0.05351771625094744
  (0, 56264)	0.024395932110634896
  (0, 10717)	0.05173382714452572
  (0, 63526)	0.03533107441593057
  (0, 89084)	0.027867781340290377
  (0, 82529)	0.02536328278098244
  (0, 8411)	0.026963070124283457
  (0, 6562)	0.039935194253988934
  (0, 10901)	0.19426039581131768
  (0, 34432)	0.054755677198918676
  :	:
  (39999, 47059)	0.07222621570894228
  (39999, 41906)	0.08447780854047296
  (39999, 61124)	0.09087520368056211
  (39999, 78554)	0.07649074622743088
  (39999, 57962)	

# Training the Model

Logistic Regression

In [21]:
# Create the Logistic Regression classifier; no arguments are passed, so library defaults apply.
model = LogisticRegression()

In [22]:
# Train the Logistic Regression model on the TF-IDF features of the
# training reviews, using the encoded sentiment labels as the target.
model.fit(X_train_features, Y_train)

Evaluating the trained model

In [24]:
# Score the fitted model on the same reviews it was trained on.
prediction_on_training_data = model.predict(X_train_features)

accuracy_on_training_data = accuracy_score(
    y_true=Y_train, y_pred=prediction_on_training_data
)
f1_score_on_training_data = f1_score(
    y_true=Y_train, y_pred=prediction_on_training_data
)

In [25]:
# Report both training-set metrics, one per line.
for metric_label, metric_value in (
    ('Accuracy on training data : ', accuracy_on_training_data),
    ('F1 score on training data : ', f1_score_on_training_data),
):
    print(metric_label, metric_value)

Accuracy on training data :  0.932775
F1 score on training data :  0.9332439611727613


In [26]:
# Score the fitted model on the held-out test reviews.
prediction_on_test_data = model.predict(X_test_features)

accuracy_on_test_data = accuracy_score(
    y_true=Y_test, y_pred=prediction_on_test_data
)
f1_score_on_test_data = f1_score(
    y_true=Y_test, y_pred=prediction_on_test_data
)

In [27]:
# Report both test-set metrics, one per line.
for metric_label, metric_value in (
    ('Accuracy on test data : ', accuracy_on_test_data),
    ('F1 score on test data : ', f1_score_on_test_data),
):
    print(metric_label, metric_value)

Accuracy on test data :  0.8945
F1 score on test data :  0.8962124938514511


# Building a Predictive System

In [30]:
# Classify a single free-text review with the trained model.
#
# The sample is stored in `review_texts` rather than `input`, because assigning
# to `input` would shadow Python's built-in input() for the rest of the session.

# Alternative sample review (swap it in for the one below to try it):
# review_texts = ["This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 or 8 years were brilliant, but things dropped off after that. By 1990, the show was not really funny anymore, and it's continued its decline further to the complete waste of time it is today.<br /><br />It's truly disgraceful how far this show has fallen. The writing is painfully bad, the performances are almost as bad - if not for the mildly entertaining respite of the guest-hosts, this show probably wouldn't still be on the air. I find it so hard to believe that the same creator that hand-selected the original cast also chose the band of hacks that followed. How can one recognize such brilliance and then see fit to replace it with such mediocrity? I felt I must give 2 stars out of respect for the original cast that made this show such a huge success. As it is now, the show is just awful. I can't believe it's still on the air."]

review_texts = ["I sure would like to see a resurrection of a up dated Seahunt series with the tech they have today it would bring back the kid excitement in me.I grew up on black and white TV and Seahunt with Gunsmoke were my hero's every week.You have my vote for a comeback of a new sea hunt.We need a change of pace in TV and this would work for a world of under water adventure.Oh by the way thank you for an outlet like this to view many viewpoints about TV and the many movies.So any ole way I believe I've got what I wanna say.Would be nice to read some more plus points about sea hunt.If my rhymes would be 10 lines would you let me submit,or leave me out to be in doubt and have me to quit,If this is so then I must go so lets do it."]

# convert text to feature vectors with the vectorizer fitted on the training data
input_data_features = feature_extraction.transform(review_texts)

# making prediction
prediction = model.predict(input_data_features)
print(prediction)

# The label encoding used in this notebook is: negative -> 0, positive -> 1.
if prediction[0] == 0:
  print('Negative Review')
else:
  print('Positive Review')

[0]
Negative Review
