In [42]:
import os 
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem.porter import *

from sklearn.feature_extraction.text import CountVectorizer


# from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn import svm

In [2]:
data=pd.read_csv('dataset.csv',encoding = "ISO-8859-1")

In [3]:
data.head()

Unnamed: 0,SentimentText,Sentiment
0,"first think another Disney movie, might good, ...",1
1,"Put aside Dr. House repeat missed, Desperate H...",0
2,"big fan Stephen King's work, film made even gr...",1
3,watched horrid thing TV. Needless say one movi...,0
4,truly enjoyed film. acting terrific plot. Jeff...,1


In [4]:
def remove_pattern(input_text,pattern):
  r=re.findall(pattern,input_text)
  for i in r:
    input_text=re.sub(i,'',input_text)
    
  return input_text

In [5]:
data['tidy']=np.vectorize(remove_pattern)(data['SentimentText'],',[\w]*')
data['tidy']=data['tidy'].str.replace('[^a-zA-z]'," ")
data['tidy']=data['tidy'].apply(lambda x:' '.join([w for w in x.split() if len(w)>2]))

In [6]:
data.head()

Unnamed: 0,SentimentText,Sentiment,tidy
0,"first think another Disney movie, might good, ...",1,first think another Disney movie might good ki...
1,"Put aside Dr. House repeat missed, Desperate H...",0,Put aside House repeat missed Desperate Housew...
2,"big fan Stephen King's work, film made even gr...",1,big fan Stephen King work film made even great...
3,watched horrid thing TV. Needless say one movi...,0,watched horrid thing Needless say one movies w...
4,truly enjoyed film. acting terrific plot. Jeff...,1,truly enjoyed film acting terrific plot Jeff C...


In [7]:
tokenized_review=data['tidy'].apply(lambda x:x.split())
tokenized_review.head()

0    [first, think, another, Disney, movie, might, ...
1    [Put, aside, House, repeat, missed, Desperate,...
2    [big, fan, Stephen, King, work, film, made, ev...
3    [watched, horrid, thing, Needless, say, one, m...
4    [truly, enjoyed, film, acting, terrific, plot,...
Name: tidy, dtype: object

In [8]:
stemmer=PorterStemmer()

In [9]:
tokenized_review=tokenized_review.apply(lambda x:[stemmer.stem(i) for i in x])

In [10]:
tokenized_review.head()

0    [first, think, anoth, disney, movi, might, goo...
1    [put, asid, hous, repeat, miss, desper, housew...
2    [big, fan, stephen, king, work, film, made, ev...
3    [watch, horrid, thing, needless, say, one, mov...
4    [truli, enjoy, film, act, terrif, plot, jeff, ...
Name: tidy, dtype: object

In [11]:
for i in range(len(tokenized_review)):
    tokenized_review[i]=' '.join(tokenized_review[i])

data['tidy']=tokenized_review

In [12]:
data.head()

Unnamed: 0,SentimentText,Sentiment,tidy
0,"first think another Disney movie, might good, ...",1,first think anoth disney movi might good kid m...
1,"Put aside Dr. House repeat missed, Desperate H...",0,put asid hous repeat miss desper housew new wa...
2,"big fan Stephen King's work, film made even gr...",1,big fan stephen king work film made even great...
3,watched horrid thing TV. Needless say one movi...,0,watch horrid thing needless say one movi watch...
4,truly enjoyed film. acting terrific plot. Jeff...,1,truli enjoy film act terrif plot jeff comb tal...


In [13]:
all_words=' '.join([word for word in data['tidy']])

In [14]:
all_words_list=all_words.split(' ')

In [15]:
all_word_frame=pd.DataFrame(all_words_list)

In [16]:
X=all_word_frame[0].value_counts()

In [17]:
X

movi           51690
film           48181
one            27728
like           22778
time           16188
good           15353
make           15205
the            14265
charact        14178
get            14139
see            14109
watch          13942
stori          13165
even           12897
would          12434
realli         11731
well           11024
scene          10595
look           10042
show            9875
much            9760
end             9651
peopl           9389
bad             9338
great           9164
also            9153
first           9060
love            9013
think           8910
don             8847
               ...  
luren              1
outerbridg         1
fondu              1
scopophilia        1
weverka            1
nicodim            1
ungratifi          1
prolix             1
pencier            1
easton             1
lofaso             1
stani              1
chaykin            1
asif               1
majai              1
rocll              1
orpington    

In [18]:
df2 = pd.DataFrame(np.array(all_words.split(' ')).reshape(-1, 1), columns=['words'])

In [19]:
unique_words = list(df2['words'].str.split(' ', expand=True).stack().unique())

In [20]:
len(unique_words)

51292

In [21]:
word_counts = list(df2['words'].value_counts())

In [22]:
df4 = pd.DataFrame(np.hstack((np.array(unique_words).reshape(-1,1),np.array(word_counts).reshape(-1,1))), columns=['words','word_count'])

In [23]:
df4_new=df4[df4['word_count'].map(len)==1]

In [24]:
df4_words_list = list(df4_new['words'])

In [25]:
unique_words=list(unique_words)

In [26]:
for word in df4_words_list:
    unique_words.remove(word)

In [27]:
len(unique_words)

14132

In [28]:
type(unique_words)

list

In [29]:
df4_new.head()

Unnamed: 0,words,word_count
14132,contemptu,9
14133,bicycl,9
14134,victrola,9
14135,vaccin,9
14136,barcelona,9


In [30]:
new_data=data

In [31]:
new_data.head()

Unnamed: 0,SentimentText,Sentiment,tidy
0,"first think another Disney movie, might good, ...",1,first think anoth disney movi might good kid m...
1,"Put aside Dr. House repeat missed, Desperate H...",0,put asid hous repeat miss desper housew new wa...
2,"big fan Stephen King's work, film made even gr...",1,big fan stephen king work film made even great...
3,watched horrid thing TV. Needless say one movi...,0,watch horrid thing needless say one movi watch...
4,truly enjoyed film. acting terrific plot. Jeff...,1,truli enjoy film act terrif plot jeff comb tal...


In [32]:
tokenized_tidy=new_data['tidy'].apply(lambda x:x.split())
tokenized_tidy.head()

0    [first, think, anoth, disney, movi, might, goo...
1    [put, asid, hous, repeat, miss, desper, housew...
2    [big, fan, stephen, king, work, film, made, ev...
3    [watch, horrid, thing, needless, say, one, mov...
4    [truli, enjoy, film, act, terrif, plot, jeff, ...
Name: tidy, dtype: object

In [34]:
for i in tokenized_tidy:
    for j in i:
        if j not in unique_words:
            i.remove(j)

In [35]:
len(tokenized_tidy)

25000

In [36]:
for i in range(len(tokenized_tidy)):
    tokenized_tidy[i]=' '.join(tokenized_tidy[i])

new_data['new_tidy']=tokenized_tidy

In [37]:
new_data.head()

Unnamed: 0,SentimentText,Sentiment,tidy,new_tidy
0,"first think another Disney movie, might good, ...",1,first think anoth disney movi might good kid m...,first think anoth disney movi might good kid m...
1,"Put aside Dr. House repeat missed, Desperate H...",0,put asid hous repeat miss desper housew new wa...,put asid hous repeat miss desper housew new wa...
2,"big fan Stephen King's work, film made even gr...",1,big fan stephen king work film made even great...,big fan stephen king work film made even great...
3,watched horrid thing TV. Needless say one movi...,0,watch horrid thing needless say one movi watch...,watch horrid thing needless say one movi watch...
4,truly enjoyed film. acting terrific plot. Jeff...,1,truli enjoy film act terrif plot jeff comb tal...,truli enjoy film act terrif plot jeff comb tal...


In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
rev_vectorizer=CountVectorizer(stop_words='english')
rev= rev_vectorizer.fit_transform(new_data['new_tidy'])

In [43]:
train_rev=rev

xtrain_rev,xvalid_rev,ytrain,yvalid=train_test_split(train_rev,new_data['Sentiment'],test_size=0.2)

In [49]:
clf=svm.SVC()

In [50]:
xtrain_rev

<20000x16697 sparse matrix of type '<class 'numpy.int64'>'
	with 1589542 stored elements in Compressed Sparse Row format>

In [None]:
clf.fit(xtrain_rev,ytrain)