In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score



In [2]:
columns = ['sentence', 'sentiment']
df = pd.read_csv('imdb_labelled.txt', sep='\t', header=None)

In [3]:
df.columns = columns
df.head()

Unnamed: 0,sentence,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
tokens = []
for sent in df['sentence']:
    tokens.append(word_tokenize(sent))

In [5]:
tokens[0]

['A',
 'very',
 ',',
 'very',
 ',',
 'very',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'about',
 'a',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [6]:
for i in range(len(tokens)):
    for j in range(len(tokens[i])):
        tokens[i][j] = tokens[i][j].lower()

In [7]:
tokens[0]

['a',
 'very',
 ',',
 'very',
 ',',
 'very',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'about',
 'a',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [8]:
for i in range(len(tokens)):
    words = []
    for word in tokens[i]:
        if word not in stopwords.words("english"):
            words.append(word)
        tokens[i] = words

In [9]:
tokens[0]

[',',
 ',',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [10]:
wnet = WordNetLemmatizer()

In [11]:
# for i in range(len(tokens)):
#     for j in range(len(tokens[i])):
#         print(tokens[i][j])

In [12]:
X = df['sentence'].values
y = df['sentiment'].values

In [13]:
X[0],y[0]

('A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
 0)

In [14]:
tfidf = TfidfVectorizer()

In [15]:
X = tfidf.fit_transform(X)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [16]:
X

<748x3047 sparse matrix of type '<class 'numpy.float64'>'
	with 11363 stored elements in Compressed Sparse Row format>

In [17]:
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size = 0.25)

In [19]:
logistic = LogisticRegression()

In [20]:
logistic.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
y_pred = logistic.predict(x_test)

In [22]:
accuracy_score(y_test, y_pred)

0.7647058823529411

In [23]:
review = ["the movie was really boring and i didn't enjoy it",
          "nice movie, worth to watch"]

review_tfidf = tfidf.transform(review)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [24]:
my_pred = logistic.predict(review_tfidf)

In [25]:
logistic.coef_

array([[ 1.14542822,  0.09984998, -0.09792843, ..., -0.20573874,
         0.01549833, -0.14948121]])

In [26]:
len(logistic.coef_[0])

3047

In [27]:
# accuracy_score(y_test, my_pred)

In [28]:
my_pred

array([0, 1], dtype=int64)

In [30]:
y_test[:10],y_pred[:10]

(array([0, 0, 1, 0, 1, 1, 0, 1, 1, 0], dtype=int64),
 array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0], dtype=int64))

In [32]:
nb = GaussianNB()
nb.fit(x_train.toarray(), y_train)

GaussianNB(priors=None)

In [34]:
y_pred_nb = nb.predict(x_test.toarray())

In [35]:
print("Actual :",y_test[:10])
print("Logistic :",y_pred[:10])
print("Naive Bayes :",y_pred_nb[:10])

Actual : [0 0 1 0 1 1 0 1 1 0]
Logistic : [0 0 1 0 0 0 0 0 1 0]
Naive Bayes : [0 0 1 0 1 0 0 1 1 0]


In [36]:
accuracy_score(y_test,y_pred_nb)

0.6417112299465241

In [37]:
review = ["the movie was really boring and i didn't enjoy it"]
review_tfidf = tfidf.transform(review)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [38]:
logistic.predict(review_tfidf)

array([0], dtype=int64)