In [1]:
#Merging 50,000 .txt documents into one Pandas data frame
#nltk.download()
import nltk
import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('test', 'train'): 
    for l in ('pos', 'neg'): 
        path = '/Users/jmaller/Downloads/aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path): 
                with open(os.path.join(path, file), 'r') as infile: txt = infile.read() 
                df = df.append([[txt, labels[l]]], ignore_index = True) 
                pbar.update()
df.columns = ['review', 'sentiment']

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:05:24


In [3]:
# Shuffle the data frame using the permutation function and print the first three rows
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [4]:
#word of bags example
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)
print(count.vocabulary_)
print(bag.toarray())


{u'and': 0, u'weather': 6, u'sweet': 4, u'sun': 3, u'is': 1, u'the': 5, u'shining': 2}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [5]:
#eample of term frequency-inverse document frequency
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [6]:
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [7]:
# clean data - remove HTML markup and 
#remove all non-word characters from the text, converted the text into lowercase characters, moved emoticons to end of document string, removed nose character from emoticons
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()).join(emoticons).replace('-', '')
    return text

In [8]:
# testing pre-processor
preprocessor(df.loc[0, 'review'][-50:])
preprocessor("</a>This :) is :( a test :-)!")

':)this is a test :(this is a test :)'

In [9]:
# clean all movie reviews in data frame
df['review'] = df['review'].apply(preprocessor)

In [10]:
#processing documents into tokens
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [11]:
# use word stemming to tranform words into root form
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

[u'runner', u'like', u'run', u'and', u'thu', u'they', u'run']

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jmaller/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

[u'runner', u'like', u'run', u'run', u'lot']

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [20]:
#Exploratory Data Analysis 
#df['tokenized_review'] = df.apply(lambda row: nltk.word_tokenize(row['review']), axis=1)
#tr = df['tokenized_review']
#text = nltk.Text(tr)
#text.collocations()


In [21]:
# training a logistic regression model for document classification
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None, lowercase = False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]
lr_tfidf = Pipeline([('vect', tfidf), 
                     ('clf',
                     LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                          scoring='accuracy',
                          cv=5, verbose=1,
                          n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   51.0s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x111cc6758>, <function tokenizer_porter at 0x111cc6500>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'y...x111cc6500>], 'vect__use_idf': [False], 'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2']}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1

In [22]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_) 

Best parameter set: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function tokenizer at 0x111cc6758>, 'vect__stop_words': [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'wh

In [23]:
# frequency analysis of words in text
print('CV Accuracy: %.3f'
       % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f'
     % clf.score(X_test, y_test))

CV Accuracy: 0.502
Test Accuracy: 0.505


In [27]:

word_vectorizer = CountVectorizer(ngram_range=(3,3), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df['review'])
frequencies = sum(sparse_matrix).toarray()[0]
pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])

Unnamed: 0,frequency
000 000 frf,1
000 frf was,1
000 frf which,1
000 times better,1
007s he looks,2
02 hope this,2
10 000 times,1
10 10 fine,2
10 against the,2
10 because they,1


In [30]:
# count the number of times the word 'bad' was used in each review
txt = df.apply(lambda x: nltk.Text(x).count('bad'))
txt.head(5)

review              0
sentiment           0
tokenized_review    0
dtype: int64