In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [56]:
imdb = pd.read_csv('imdb_labelled.txt',sep='\t',header=None)

In [57]:
amazon = pd.read_csv('amazon_cells_labelled.txt',sep='\t',header=None)
yelp = pd.read_csv('yelp_labelled.txt',sep='\t',header=None)

In [58]:
df = pd.DataFrame()

In [59]:
df = pd.concat([imdb,amazon,yelp],ignore_index=True)

In [60]:
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [61]:
df.shape

(2748, 2)

In [62]:
df.columns = ['Review','Sentiment']

In [63]:
df.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [64]:
df.iloc[0]

Review       A very, very, very slow-moving, aimless movie ...
Sentiment                                                    0
Name: 0, dtype: object

In [65]:
df.loc[0]

Review       A very, very, very slow-moving, aimless movie ...
Sentiment                                                    0
Name: 0, dtype: object

In [66]:
df['Review'][0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [67]:
tokens = []
for i in range(len(df)):
    tokens.append(word_tokenize(df['Review'][i]))

In [68]:
tokens[0]

['A',
 'very',
 ',',
 'very',
 ',',
 'very',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'about',
 'a',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [69]:
stopwordsList = stopwords.words("english")
stopwordsList.extend([',','.','-','!'])

In [70]:
wordsList = []
for tokenList in tokens:
    words = []
    for word in tokenList:
        if word.lower() not in stopwordsList:
            words.append(word.lower())
    wordsList.append(words)

In [71]:
wordsList[0]

['slow-moving', 'aimless', 'movie', 'distressed', 'drifting', 'young', 'man']

In [72]:
print(wordsList[1])

['sure', 'lost', 'flat', 'characters', 'audience', 'nearly', 'half', 'walked']


In [73]:
wnet = WordNetLemmatizer()

In [74]:
for i in range(len(wordsList)):
    for j in range(len(wordsList[i])):
        wordsList[i][j] = wnet.lemmatize(wordsList[i][j], pos='v')

In [75]:
print(wordsList[:5])

[['slow-moving', 'aimless', 'movie', 'distress', 'drift', 'young', 'man'], ['sure', 'lose', 'flat', 'character', 'audience', 'nearly', 'half', 'walk'], ['attempt', 'artiness', 'black', '&', 'white', 'clever', 'camera', 'angle', 'movie', 'disappoint', 'become', 'even', 'ridiculous', 'act', 'poor', 'plot', 'line', 'almost', 'non-existent'], ['little', 'music', 'anything', 'speak'], ['best', 'scene', 'movie', 'gerardo', 'try', 'find', 'song', 'keep', 'run', 'head']]


In [76]:
cv = CountVectorizer()

In [77]:
# wordsList = np.asarray(wordsList)

In [78]:
for i in range(len(wordsList)):
    wordsList[i] = ' '.join(wordsList[i])

In [79]:
wordsList[0]

'slow-moving aimless movie distress drift young man'

In [80]:
vect = cv.fit_transform(wordsList)

In [81]:
y = df['Sentiment'].values
x_train,x_test,y_train,y_test = train_test_split(vect,y,test_size=0.25)

In [82]:
reg = LogisticRegression()

In [83]:
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [84]:
y_pred = reg.predict(x_test)

In [85]:
accuracy_score(y_test,y_pred)

0.7947598253275109

In [86]:
confusion_matrix(y_test,y_pred)

array([[270,  65],
       [ 76, 276]], dtype=int64)

In [87]:
rev = "I loved it. i want to watch it again and the credits are still rolling. i will never stop singing tom holland's praise as the best spiderman we've had. and jake gyllenhaal was fantastic. an all around really great film. funny when it needed to be and emotional when it needed to be."

In [88]:
review = {'Review':[rev]}

In [89]:
df_2 = pd.DataFrame(review)

In [90]:
token = word_tokenize(rev.lower())

In [91]:
tokens = []
for word in token:
    if word not in stopwordsList:
        tokens.append(word)

In [92]:
for i in range(len(tokens)):
    tokens[i] = wnet.lemmatize(tokens[i],pos='v')

In [93]:
sent = ' '.join(tokens)

In [94]:
vect = cv.transform([sent])

In [95]:
reg.predict(vect)

array([1], dtype=int64)

In [96]:
tokens = []
for i in range(len(df_2)):
    tokens.append(word_tokenize(df_2['Review'][i]))

In [97]:
tokens[0][:5]

['I', 'loved', 'it', '.', 'i']

In [98]:
wordsList = []
for tokenList in tokens:
    words = []
    for word in tokenList:
        if word.lower() not in stopwordsList:
            words.append(word.lower())
    wordsList.append(words)

In [99]:
for i in range(len(wordsList)):
    for j in range(len(wordsList[i])):
        wordsList[i][j] = wnet.lemmatize(wordsList[i][j], pos='v')

In [100]:
for i in range(len(wordsList)):
    wordsList[i] = ' '.join(wordsList[i])

In [101]:
wordsList

["love want watch credit still roll never stop sing tom holland 's praise best spiderman 've jake gyllenhaal fantastic around really great film funny need emotional need"]

In [102]:
# vect = cv.fit_transform(wordsList)

In [103]:
vect

<1x4372 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [104]:
reg.predict(vect.toarray())

array([1], dtype=int64)

In [105]:
reg.coef_

array([[-0.39533034,  0.89307774,  0.09614749, ...,  0.        ,
        -0.06592646, -0.27482589]])