In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('imdb_labelled.txt', sep='\t', header=None)
columns = ['review','sentiment']
df.columns = columns

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
X = df['review']
y = df['sentiment']

In [5]:
X[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [7]:
for i in range(len(X)):
    X[i] = word_tokenize(X[i].lower())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
print(X[0])

['a', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie', 'about', 'a', 'distressed', ',', 'drifting', 'young', 'man', '.']


In [9]:
X.shape

(748,)

In [10]:
s_words = stopwords.words('english')
s_words.append(',')
s_words.append('.')
s_words.append('-')

In [11]:
newX = []
for i in range(len(X)):
    newX.append(list(set(X[i]) - set(s_words)))

In [12]:
newX[0]

['movie', 'man', 'slow-moving', 'young', 'aimless', 'distressed', 'drifting']

In [13]:
newX[1]

['walked', 'nearly', 'audience', 'half', 'lost', 'characters', 'sure', 'flat']

In [14]:
wordnet = WordNetLemmatizer()

In [15]:
for i in range(len(newX)):
    for j in range(len(newX[i])):
        newX[i][j] = wordnet.lemmatize(newX[i][j], pos='v')

In [16]:
newX[0]

['movie', 'man', 'slow-moving', 'young', 'aimless', 'distress', 'drift']

In [17]:
newX[1]

['walk', 'nearly', 'audience', 'half', 'lose', 'character', 'sure', 'flat']

In [18]:
type(newX)

list

In [19]:
transformer = TfidfTransformer()

In [20]:
newX = np.asarray(newX)

In [21]:
newX[0]

['movie', 'man', 'slow-moving', 'young', 'aimless', 'distress', 'drift']

In [22]:
' '.join(newX[0])

'movie man slow-moving young aimless distress drift'

In [23]:
for i in range(len(newX)):
    newX[i] = ' '.join(newX[i])

In [24]:
newX[1]

'walk nearly audience half lose character sure flat'

In [25]:
vect = TfidfVectorizer()

In [26]:
newX = vect.fit_transform(newX)

In [27]:
newX

<748x2613 sparse matrix of type '<class 'numpy.float64'>'
	with 6890 stored elements in Compressed Sparse Row format>

In [28]:
newX = newX.toarray()

In [29]:
reg = LogisticRegression()

In [30]:
x_train,x_test,y_train,y_test = train_test_split(newX,y,test_size = 0.25)

In [31]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [32]:
y_pred = reg.predict(x_test)

In [33]:
accuracy_score(y_test, y_pred)

0.7540106951871658

In [34]:
nb = GaussianNB()
nb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [35]:
y_pred2 = nb.predict(x_test)

In [36]:
accuracy_score(y_test, y_pred2)

0.7058823529411765

In [37]:
confusion_matrix(y_test, y_pred)

array([[65, 26],
       [20, 76]], dtype=int64)