In [109]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [49]:
imdb = pd.read_csv('imdb_labelled.txt',sep='\t',header=None)

In [52]:
amazon = pd.read_csv('amazon_cells_labelled.txt',sep='\t',header=None)
yelp = pd.read_csv('yelp_labelled.txt',sep='\t',header=None)

In [53]:
df = pd.DataFrame()

In [68]:
df = pd.concat([imdb,amazon,yelp],ignore_index=True)

In [69]:
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [70]:
df.shape

(2748, 2)

In [71]:
df.columns = ['Review','Sentiment']

In [72]:
df.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [73]:
df.iloc[0]

Review       A very, very, very slow-moving, aimless movie ...
Sentiment                                                    0
Name: 0, dtype: object

In [74]:
df.loc[0]

Review       A very, very, very slow-moving, aimless movie ...
Sentiment                                                    0
Name: 0, dtype: object

In [75]:
df['Review'][0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [76]:
tokens = []
for i in range(len(df)):
    tokens.append(word_tokenize(df['Review'][i]))

In [79]:
tokens[0]

['A',
 'very',
 ',',
 'very',
 ',',
 'very',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'about',
 'a',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [90]:
stopwordsList = stopwords.words("english")
stopwordsList.extend([',','.','-','!'])

In [91]:
wordsList = []
for tokenList in tokens:
    words = []
    for word in tokenList:
        if word.lower() not in stopwordsList:
            words.append(word.lower())
    wordsList.append(words)

In [92]:
wordsList[0]

['slow-moving', 'aimless', 'movie', 'distressed', 'drifting', 'young', 'man']

In [93]:
print(wordsList[1])

['sure', 'lost', 'flat', 'characters', 'audience', 'nearly', 'half', 'walked']


In [94]:
wnet = WordNetLemmatizer()

In [95]:
for i in range(len(wordsList)):
    for j in range(len(wordsList[i])):
        wordsList[i][j] = wnet.lemmatize(wordsList[i][j], pos='v')

In [97]:
print(wordsList[:5])

[['slow-moving', 'aimless', 'movie', 'distress', 'drift', 'young', 'man'], ['sure', 'lose', 'flat', 'character', 'audience', 'nearly', 'half', 'walk'], ['attempt', 'artiness', 'black', '&', 'white', 'clever', 'camera', 'angle', 'movie', 'disappoint', 'become', 'even', 'ridiculous', 'act', 'poor', 'plot', 'line', 'almost', 'non-existent'], ['little', 'music', 'anything', 'speak'], ['best', 'scene', 'movie', 'gerardo', 'try', 'find', 'song', 'keep', 'run', 'head']]


In [101]:
cv = CountVectorizer()

In [104]:
wordsList = np.asarray(wordsList)

In [106]:
for i in range(len(wordsList)):
    wordsList[i] = ' '.join(wordsList[i])

In [107]:
wordsList[0]

'slow-moving aimless movie distress drift young man'

In [108]:
vect = cv.fit_transform(wordsList)

In [111]:
y = df['Sentiment'].values
x_train,x_test,y_train,y_test = train_test_split(vect,y,test_size=0.25)

In [112]:
reg = LogisticRegression()

In [113]:
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [114]:
y_pred = reg.predict(x_test)

In [115]:
accuracy_score(y_test,y_pred)

0.784570596797671

In [116]:
confusion_matrix(y_test,y_pred)

array([[291,  67],
       [ 81, 248]], dtype=int64)