In [29]:
#importing all required libraries
import random
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

## Accessing Dataset

In [2]:
reviews=pd.read_csv('Review.csv',names=['ratings','title','review'])
reviews

Unnamed: 0,ratings,title,review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


## Accessing the data

In [3]:
reviews.groupby('ratings').count()

Unnamed: 0_level_0,title,review
ratings,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1799958,1800000
2,1799965,1800000


## Separating Positive and Negative reviews

In [4]:
Positive=reviews.drop(reviews[reviews['ratings'] == 1].index)

In [5]:
Negative=reviews.drop(reviews[reviews['ratings'] == 2].index)

## Using numpy and generating dataset of total 10000 dataset(Positive=5000 and Negative=5000)

In [6]:
%%time
np.random.seed(10)
remove=1795000
indice=np.random.choice(Positive.index,remove,replace=False)
new_positive=Positive.drop(indice)

Wall time: 397 ms


In [7]:
%%time
np.random.seed(10)
remove=1795000
indice=np.random.choice(Negative.index,remove,replace=False)
new_negative=Negative.drop(indice)

Wall time: 397 ms


## Concating both Positive and Negative datasets into one dataset

In [8]:
reviews_subset=pd.concat([new_positive, new_negative], axis=0)

In [9]:
reviews_subset.groupby('ratings').count()

Unnamed: 0_level_0,title,review
ratings,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5000,5000
2,5000,5000


## Resetting Index

In [10]:
reviews_subset.reset_index(inplace=True)

## Dropping old index

In [11]:
reviews_subset.drop('index',axis='columns',inplace=True)

In [12]:
reviews_subset

Unnamed: 0,ratings,title,review
0,2,Belva Plain has done it again - another great ...,"Once again, Belva Plain has written a compelli..."
1,2,Memoir or Memory?,Although Derrida can be difficult to follow so...
2,2,Learn what children really loose when they wat...,This is the classic book on how television aff...
3,2,scary,This book was great.It was scary at some point...
4,2,Paxton Quigley is a realist,"When women ask me about safety and firearms, t..."
...,...,...,...
9995,1,Not to good!,"This toy was horribe to put together, after ab..."
9996,1,Ferris Wheel Does NOT Work,The cars continually fall off the ferris wheel...
9997,1,P is for Psycho,Flipping through this book I found it to be ye...
9998,1,HUGE Disappointment!!!!,I would NOT recommend this DVD. It is a seriou...


## Creating a new column sentiment containing Positive and Negative 

In [13]:
reviews_subset['sentiment']=''
for i in reviews_subset.index:
    if reviews_subset.loc[i,'ratings']==2:
        reviews_subset.loc[i,'sentiment']='POSITIVE'
    else:
        reviews_subset.loc[i,'sentiment']='NEGATIVE'

In [14]:
reviews_subset

Unnamed: 0,ratings,title,review,sentiment
0,2,Belva Plain has done it again - another great ...,"Once again, Belva Plain has written a compelli...",POSITIVE
1,2,Memoir or Memory?,Although Derrida can be difficult to follow so...,POSITIVE
2,2,Learn what children really loose when they wat...,This is the classic book on how television aff...,POSITIVE
3,2,scary,This book was great.It was scary at some point...,POSITIVE
4,2,Paxton Quigley is a realist,"When women ask me about safety and firearms, t...",POSITIVE
...,...,...,...,...
9995,1,Not to good!,"This toy was horribe to put together, after ab...",NEGATIVE
9996,1,Ferris Wheel Does NOT Work,The cars continually fall off the ferris wheel...,NEGATIVE
9997,1,P is for Psycho,Flipping through this book I found it to be ye...,NEGATIVE
9998,1,HUGE Disappointment!!!!,I would NOT recommend this DVD. It is a seriou...,NEGATIVE


## Forming a single column of review by adding title column and existing review column together

In [15]:
reviews_subset['review']=reviews_subset['title']+reviews_subset['review']

In [16]:
reviews_subset.head()

Unnamed: 0,ratings,title,review,sentiment
0,2,Belva Plain has done it again - another great ...,Belva Plain has done it again - another great ...,POSITIVE
1,2,Memoir or Memory?,Memoir or Memory?Although Derrida can be diffi...,POSITIVE
2,2,Learn what children really loose when they wat...,Learn what children really loose when they wat...,POSITIVE
3,2,scary,scaryThis book was great.It was scary at some ...,POSITIVE
4,2,Paxton Quigley is a realist,Paxton Quigley is a realistWhen women ask me a...,POSITIVE


## Dropping old title column

In [17]:
reviews_subset.drop('title',axis='columns',inplace=True)

## Checking data for NaN values 

In [18]:
reviews_subset.isna().sum()

ratings      0
review       0
sentiment    0
dtype: int64

In [19]:
reviews_subset.to_csv('reviews.csv')

## Splitting dataset into train and test dataset using train_test_split method.This is convenient to test the model with datasets not seen by the model

In [20]:
x_train,x_test,y_train,y_test=train_test_split(reviews_subset['review'],reviews_subset['sentiment'],test_size=0.25)

In [21]:
x_train

2806    Nice Medium Duty CasterThe swiveling caster is...
6794    Parents NightmareIn the beginning it was great...
986     Very good reference book for any astrophotogra...
4946    It's really coolThis figure was really cool. B...
5459    Stopped Working After 9 MonthsThe monitor look...
                              ...                        
3702    DANCING ALL OVER THE NYPDThis book wasn't as g...
1229    OMG ths was soooo good.im Sicilian although ra...
7986    boring angel apocolypseThis game is very horri...
8310    Horrible ProductI purchased this item with the...
4802    beautifulthe Class Trip is silly. however The ...
Name: review, Length: 7500, dtype: object

In [22]:
y_train

2806    POSITIVE
6794    NEGATIVE
986     POSITIVE
4946    POSITIVE
5459    NEGATIVE
          ...   
3702    POSITIVE
1229    POSITIVE
7986    NEGATIVE
8310    NEGATIVE
4802    POSITIVE
Name: sentiment, Length: 7500, dtype: object

## Converting string to binary form

In [23]:
cv=CountVectorizer()
x_train_cv=cv.fit_transform(x_train.values)
x_test_cv=cv.transform(x_test.values)

## Model Selection

## Fitting the Model

In [24]:
models=svm.SVC(C=1000,kernel='sigmoid',gamma='auto')
models.fit(x_train_cv,y_train)

SVC(C=1000, gamma='auto', kernel='sigmoid')

In [25]:
models.score(x_test_cv,y_test)

0.8528

In [26]:
y_pred=models.predict(x_test_cv)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

    NEGATIVE       0.87      0.84      0.85      1258
    POSITIVE       0.84      0.87      0.85      1242

    accuracy                           0.85      2500
   macro avg       0.85      0.85      0.85      2500
weighted avg       0.85      0.85      0.85      2500



In [27]:
text=['i am happy','I hate this','This is good','Life is good']
_vector = cv.transform(text)
models.predict(_vector)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE'], dtype=object)

In [31]:
Text=input('Enter text:')
text_series=pd.Series(Text)
_vector = cv.transform(text_series.values)
print('It sounds',models.predict(_vector))

Enter text:bad
It sounds ['NEGATIVE']


In [None]:
Text=input('Enter text:')
text_series=pd.Series(Text)
_vector = cv.transform(text_series.values)
print('It sounds',models.predict(_vector))