In [1]:
"""
chapter 08
"""

# import libs 

import os, re 
import pandas as pd 
import numpy as np 

import pyprind 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 

from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier




In [4]:
# ----------------- import the dataset -------------------

# IMDb dataset
#
# Assemble the individual review files from the decompressed download
# archive (./aclImdb/<test|train>/<pos|neg>/) into a single CSV file.

# Progress bar sized for 50,000 iterations: one update per document.
pbar = pyprind.ProgBar(50000)

# Directory name -> numeric class label.
labels = {'pos': 1, 'neg': 0}

# Rows are collected in a plain list and the DataFrame is built once at the
# end. Appending to a DataFrame inside the loop copies the whole frame on
# every iteration (quadratic time), and DataFrame.append no longer exists in
# pandas 2.x.
rows = []

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/%s/%s' % (s, l)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded shuffle below produce the same CSV on every machine.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8', errors='ignore') as infile:
                rows.append([infile.read(), labels[l]])
            pbar.update()

df = pd.DataFrame(rows, columns=['review', 'sentiment'])

# Shuffle the DataFrame: rows were appended directory by directory, so the
# class labels in the assembled dataset are sorted.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False, encoding='utf-8')



0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:01:44


In [5]:
# Check that the file was written successfully: reload the CSV from disk
# and preview its first three rows.
df = pd.read_csv('./movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [10]:
# Bag-of-words model: fit a CountVectorizer on three toy sentences and
# print the learned vocabulary and the resulting count vectors.
count = CountVectorizer()
docs = np.array([
       'The sun is shining',
       'The weather is sweet',
       'The sun is shining and the weather is sweet'])

# Learn the vocabulary from `docs` and encode each sentence as term counts.
bag = count.fit_transform(docs)

print(f'the vocabulary: \n {count.vocabulary_} \n')
print(f'the bag-of-word feature vectors: \n {bag.toarray()}')

the vocabulary: 
 {'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0} 

the bag-of-word feature vectors: 
 [[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [11]:
# tf-idf model: re-fit the count vectorizer on the toy sentences and
# re-weight the raw counts with a TfidfTransformer.
tfidf = TfidfTransformer()
# Print numpy arrays with two decimal places so the matrix stays readable.
np.set_printoptions(precision=2)
print('the tf-idf feature vectors: \n', tfidf.fit_transform(count.fit_transform(docs)).toarray())

the tf-idf feature vectors: 
 [[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [14]:
# cleaning text data 
def preprocessor(text):
    """Strip HTML markup, lowercase the text and move emoticons to the end.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The lowercased text with every run of non-word characters collapsed
        to a single space, followed by the emoticons found in the input
        (with the '-' "nose" removed), separated by single spaces.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons are collected before lowercasing so that ':D' and ':P' match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued onto the last
    # word whenever the cleaned text does not already end in a space.
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

# Apply the preprocessor function to all movie reviews.
# NOTE: this overwrites the 'review' column in place, so re-running the cell
# feeds already-cleaned text through preprocessor() again.
df['review'] = df['review'].apply(preprocessor)

In [2]:
# processing documents into tokens 

def tokenizer(text):
    """Clean ``text`` and split it into tokens with stop-words removed.

    HTML tags are stripped, the text is lowercased, runs of non-word
    characters are collapsed to spaces, and any emoticons found in the
    input are appended as separate tokens (with the '-' "nose" removed).

    Relies on the module-level ``stop`` collection of stop-words, which
    must be defined before this function is called.

    Parameters
    ----------
    text : str
        Raw or pre-cleaned document text.

    Returns
    -------
    list of str
        The remaining tokens, in order, with emoticons last.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern's 'D' / 'P' alternatives
    # can never match once the text has been lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit separator keeps the first emoticon from being glued onto
    # the last word; surplus whitespace is discarded by split().
    combined = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in combined.split() if w not in stop]

# Word stemming:
# the process of transforming a word into its root form, which allows us to
# map related words to the same stem.
# A single shared stemmer instance is created here and used by tokenizer_porter.
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem every resulting word.

    Uses the module-level ``porter`` stemmer and returns the stems as a
    list, in the order the words appear in ``text``.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

# Stop-word removal:
# stop-words are words that are extremely common in all sorts of texts and
# likely bear no (or only little) useful information for distinguishing
# between different classes of documents.
# `stop` is read by tokenizer() and passed to the grid search as a
# candidate value for 'vect__stop_words'.
stop = stopwords.words('english')

In [None]:
# logistic regression for document classification

# Training data and test data: the first 25,000 rows train the model, the
# remaining rows evaluate it.
# Positional slicing is used on purpose: label-based `df.loc[:25000]` is
# inclusive of row 25000, which `df.loc[25000:]` would then place in the
# test set as well, leaking one sample from training into evaluation.
N_TRAIN = 25000
X_train = df['review'].values[:N_TRAIN]
y_train = df['sentiment'].values[:N_TRAIN]
X_test = df['review'].values[N_TRAIN:]
y_test = df['sentiment'].values[N_TRAIN:]

# Use a GridSearchCV object to find the optimal set of parameters.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Settings searched in both halves of the grid.
shared_grid = {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,
                                   tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}

param_grid = [
    # First grid: the vectorizer's default tf-idf weighting.
    shared_grid,
    # Second grid: idf weighting and normalisation switched off, i.e. the
    # classifier is trained on raw term frequencies.
    dict(shared_grid, **{'vect__use_idf': [False],
                         'vect__norm': [None]}),
]

# The solver is pinned to liblinear because the grid searches both the 'l1'
# and the 'l2' penalty, and liblinear supports both.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf',
                      LogisticRegression(random_state=0, solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=1,
                           n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
# stochastic gradient descent


def stream_docs(path):
    """Read the review CSV lazily and yield one document at a time.

    The file is expected to start with a header line and to hold one record
    per physical line in the form ``<text>,<label>``, where the label is the
    integer after the last comma. The text is yielded exactly as it appears
    in the file, including any CSV quoting.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw text and its integer class label.
    """
    # The CSV is written as UTF-8, so read it back as UTF-8 rather than with
    # the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as csv_file:
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside the generator (a RuntimeError since PEP 479).
        next(csv_file, None)
        for line in csv_file:
            line = line.rstrip('\n')
            if not line:
                continue  # ignore blank lines
            # Split on the last comma so the final line parses correctly
            # even when it has no trailing newline.
            text, label = line.rsplit(',', 1)
            yield text, int(label)
            
            
def get_minibatch(doc_stream, size):
    """Pull the next ``size`` documents from a document stream.

    Parameters
    ----------
    doc_stream : iterator of (text, label) pairs
        For example the generator returned by ``stream_docs``.
    size : int
        Number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)``: the list of texts and the list of labels. If the
        stream runs out before ``size`` documents have been collected,
        ``(None, None)`` is returned and the partially collected batch is
        discarded.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y
    
# out of core learning

# Sizes of the streaming split: 45 minibatches of 1,000 documents each are
# used for training, and the next 5,000 documents for evaluation.
N_TRAIN_BATCHES = 45
TRAIN_BATCH_SIZE = 1000
TEST_SIZE = 5000

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# NOTE(review): `n_iter` and `loss='log'` are spelled differently in newer
# scikit-learn releases; confirm against the installed version.
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')


# Incrementally train the classifier, one minibatch at a time.
pbar = pyprind.ProgBar(N_TRAIN_BATCHES)  # one progress tick per minibatch
classes = np.array([0, 1])  # partial_fit is given the full set of class labels
for _ in range(N_TRAIN_BATCHES):
    X_train, y_train = get_minibatch(doc_stream, size=TRAIN_BATCH_SIZE)
    if not X_train:
        # The stream ran dry before all minibatches were consumed.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

# Use the last 5,000 documents to evaluate the performance of the model.
X_test, y_test = get_minibatch(doc_stream, size=TEST_SIZE)
if not X_test:
    # get_minibatch returns (None, None) when the stream is exhausted; fail
    # with a clear message instead of handing None to the vectorizer.
    raise RuntimeError('document stream exhausted before %d test documents '
                       'could be read' % TEST_SIZE)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

# Finally, use the last 5,000 documents to update the model as well.
clf = clf.partial_fit(X_test, y_test)