In [4]:
import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1,'neg':0}

df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()

df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:04:40


In [7]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)

In [8]:
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"The premise of this movie, of a comedian talk ...",0
1,I first remember bumping into this zaniness fr...,1
2,First of all I saw this movie without knowing ...,1


In [9]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining', 'The weather is sweet', 'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)
print(count.vocabulary_)

{'sun': 4, 'shining': 3, 'one': 2, 'is': 1, 'the': 6, 'and': 0, 'sweet': 5, 'weather': 8, 'two': 7}


In [10]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.    0.56  0.56  0.    0.43  0.    0.  ]
 [ 0.    0.43  0.    0.    0.    0.56  0.43  0.    0.56]
 [ 0.5   0.45  0.5   0.19  0.19  0.19  0.3   0.25  0.19]]


In [13]:
df.loc[0, 'review'][-50:]

'topping it, which is usually a good thing with me.'

In [17]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')
    return text

In [18]:
preprocessor(df.loc[0, 'review'][-50:])
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :):(:)'

In [19]:
df['review'] = df['review'].apply(preprocessor)

In [27]:
def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [33]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [34]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
x_train = df.loc[:25000, 'review'].values
# print(x_train[0:10])
y_train = df.loc[:25000, 'sentiment'].values
print(y_train[0:10])
x_test = df.loc[25000:, 'review'].values
# print(x_test[0:1])
y_test = df.loc[25000:, 'sentiment'].values
# print(y_test[0:1])

[0 1 1 0 1 1 1 1 1 0]


In [56]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{
    'vect__ngram_range': [(1,1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(x_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 19.1min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 49.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'th...05d08>], 'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2'], 'vect__ngram_range': [(1, 1)]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='ac

In [57]:
print('Best parameter set: %s' % gs_lr_tfidf.best_params_)

Best parameter set: {'vect__stop_words': None, 'clf__penalty': 'l2', 'vect__tokenizer': <function tokenizer at 0x7f1d206bcd08>, 'clf__C': 10.0, 'vect__ngram_range': (1, 1)}


In [58]:
clf = gs_lr_tfidf.best_estimator_
print('CV Accuracy: %.3f' % clf.score(x_test, y_test))

CV Accuracy: 0.900


In [73]:
from sklearn.feature_extraction.text import HashingVectorizer
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
# print(vect.transform(x_train[0:1]))
gs_lr_tfidf.predict_proba(x_train[0:1]).max()
print(x_train[0])

the premise of this movie of a comedian talk show host running for president as an independent just to shake things up is funny entertaining brilliant and even a bit inspiring thought about the west wing debate when tom dobbs leaves his podium thought about steven colbert announcing his candidacy good times the first 15 20 minutes of this movie are therefore very very entertaining the debate especially when he eventually get s elected it s a pity that is because of a computer glitch you d want him to win fair although that is unrealistic but after that this movie goes completely downhill i thought we d get a great movie like dave 1993 in which we see how it would out if a comedian actually ran the country instead the movie turns from comedy into a thriller a romantic comedy and a drama and does none good the computer glitch becomes the main storyline which really sucks boy is this disappointing i give it 3 stars just for the premise and because i actually managed to watch this movie fr