In [1]:
import pandas as pd

# Read reviews

In [25]:
reviews = pd.read_csv('labeledTrainData.tsv', sep='\t', header=0)

# Split data into train and test sets

In [26]:
from sklearn.model_selection import train_test_split
reviews_train, reviews_test = train_test_split(reviews, test_size=0.2)

# Analyze reviews

In [27]:
reviews_train.head()

Unnamed: 0,id,sentiment,review
1684,6710_1,0,I have seen over 1000 movies and this one stan...
18217,8531_3,0,"In this forgettable trifle, the 40-ish Norma S..."
24386,10766_7,1,A thin story with many fine shots. Eyecatchers...
12360,9918_1,0,Another British cinema flag waver. Real garbag...
24062,861_2,0,A disappointing film.<br /><br />The story est...


# Compute word counts

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
word_count = vectorizer.fit_transform(reviews_train['review'])

In [46]:
word_count.shape

(20000, 68475)

# Build model using Naive Bayes

In [47]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(word_count, reviews_train['sentiment'])

# Analyze built model

In [48]:
clf.coef_.shape

(1, 68475)

In [55]:
clf.coef_

array([[-11.06824434, -10.05418943, -14.67916225, ..., -13.98601507,
        -13.98601507, -13.98601507]])

In [56]:
vectorizer.get_feature_names()

[u'00',
 u'000',
 u'0000000000001',
 u'00001',
 u'00015',
 u'000s',
 u'001',
 u'003830',
 u'006',
 u'007',
 u'0079',
 u'0080',
 u'0083',
 u'00am',
 u'00pm',
 u'00s',
 u'01',
 u'01pm',
 u'02',
 u'020410',
 u'029',
 u'03',
 u'04',
 u'041',
 u'05',
 u'050',
 u'06',
 u'07',
 u'08',
 u'087',
 u'08th',
 u'09',
 u'0f',
 u'0ne',
 u'0r',
 u'0s',
 u'10',
 u'100',
 u'1000',
 u'1000000',
 u'10000000000000',
 u'1000lb',
 u'1000s',
 u'1001',
 u'100k',
 u'100m',
 u'100min',
 u'100mph',
 u'100s',
 u'100th',
 u'100x',
 u'100yards',
 u'101',
 u'101st',
 u'102',
 u'102nd',
 u'103',
 u'104',
 u'1040',
 u'1040a',
 u'1040s',
 u'105',
 u'1050',
 u'106',
 u'106min',
 u'107',
 u'108',
 u'109',
 u'10min',
 u'10minutes',
 u'10p',
 u'10pm',
 u'10s',
 u'10star',
 u'10th',
 u'10x',
 u'10yr',
 u'11',
 u'110',
 u'1100',
 u'11001001',
 u'111',
 u'112',
 u'1138',
 u'114',
 u'1146',
 u'115',
 u'116',
 u'117',
 u'11f',
 u'11m',
 u'11th',
 u'12',
 u'120',
 u'1200',
 u'1200f',
 u'1201',
 u'1202',
 u'123',
 u'12383499143743

# Building a pipeline

In [61]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()), ])

# Train the model using the pipeline

In [62]:
text_clf.fit(reviews_train.review, reviews_train.sentiment)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

# Evaluation of the performance on the test set

In [60]:
predicted = text_clf.predict(reviews_test.review)
np.mean(predicted == reviews_test.sentiment)

0.85999999999999999

# Use SVM to build and evaluate the model

In [67]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()), ])
text_clf.fit(reviews_train.review, reviews_train.sentiment)
predicted = text_clf.predict(reviews_test.review)
np.mean(predicted == reviews_test.sentiment)

0.88619999999999999

# predicting probabilities

In [94]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='log')), ])
text_clf.fit(reviews_train.review, reviews_train.sentiment)
reviews_test['prediction'] = text_clf.predict_proba(reviews_test.review)[:, [True, False]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


# read the best review

In [102]:
pd.options.display.max_colwidth = 1000
print reviews_test.sort_values(['prediction']).head(1).review

8871    One of the best movies out there. Yeah maybe the cinematography wasn't the greatest, but an excellent plot and concept. Great for the time and brilliant and creative ideas. Something different from the usual movies and great fun. One of my favorites and would recommend to anyone who likes creative and imaginative movies. Post World War 3 and fighting in gigantic robots, the actors gave a great performance and made it all worth while. The sets are not amazing, but simple and worked for the overall look of the film. This movie is very hard to find on DVD, but also on VHS. Check it out cause I have loved it since it came out. Not a mainstream flick and not like anything you've ever seen. Take a look and think like a child. It's a great view and very fun.
Name: review, dtype: object


# read the worst review

In [103]:
print reviews_test.sort_values(['prediction'], ascending=False).head(1).review

19224    I would have given it a one instead of a two, but I suppose it COULD have been worse. I guess the acting isn't all that bad, but the plot lacks anything even remotely close to interesting. It is a terrible movie!! TERRIBLE! Complete waste of time! I strongly suggest you do not watch this movie.!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Name: review, dtype: object
