In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
import glob
import requests
import tarfile
import tqdm
import os

## From: Python Machine Learning by Raschka and Mirjalili

---

## Get the data

In [2]:
# only need to rerun if csv's are not in ../data

imdb_url = 'http://ai.stanford.edu/~amaas/data/sentiment/'
imdb_tar = 'aclImdb_v1.tar.gz'
data_base = '../data/'
imdb_base = 'aclImdb'
imdb_csv = 'acl_imdb_data.csv'

if not glob.glob(os.path.join(data_base,imdb_tar)):
    req = requests.get(imdb_url+imdb_tar)
    open('../data/'+imdb_tar,'wb').write(req.content)

if not glob.glob(os.path.join(data_base,imdb_base)):
    with tarfile.open(os.path.join(data_base,imdb_tar),'r:gz') as tar:
            tar.extractall(path=data_base)
if not glob.glob(os.path.join(data_base,imdb_csv)):
    labels = {'pos':1,'neg':0}
    df = pd.DataFrame()
    with tqdm.tqdm(total=50000) as pbar:
        for s in ('test','train'):
            for l in ('pos','neg'):
                path = os.path.join(data_base,imdb_base,s,l)
                for file in sorted(os.listdir(path)):
                    with open(os.path.join(path,file),
                              'r', encoding='utf-8') as infile:
                        txt = infile.read()
                    df = df.append([[txt, labels[l]]],ignore_index=True)
                    pbar.update()
    df.columns = ['review','sentiment']
    
    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index))
    df.to_csv('../data/acl_imdb_data.csv',index=False,encoding='utf-8')

In [3]:
df = pd.read_csv('../data/acl_imdb_data.csv',encoding='utf-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [4]:
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


---

## Transform Data

In [5]:
test_doc = df.loc[1,'review'][:52]
test_doc

'OK... so... I really like Kris Kristofferson and his'

In [6]:
# preprocess the text to remove html tags, normalize emoticons, remove other non-word characters and lowercase
import re
def preprocessor(text):
    text = re.sub('<[^>]*>','',text)                             # remove html tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)  # find emoticons, see https://regex101.com/
    # remove non-word chars, lowercase, add back in emoticons
    text = (re.sub('[\W]+',' ',text.lower()) + ' '.join(emoticons).replace('-','')) 
    return text

In [7]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [8]:
preprocessor(test_doc)

'ok so i really like kris kristofferson and his'

In [9]:
# We'll do this once here to speed up the grid search later on
df['review_processed'] = df.review.apply(preprocessor)

In [10]:
test_doc_processed = preprocessor(test_doc)
test_doc_processed

'ok so i really like kris kristofferson and his'

## Different Tokenizers

In [11]:
def tokenizer_split(text):
    return text.split() # split on whitespace

tokenizer_split(preprocessor(test_doc_processed))

['ok', 'so', 'i', 'really', 'like', 'kris', 'kristofferson', 'and', 'his']

In [12]:
# need to run: conda install -n eods-s22 nltk
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_split_porter(text):
    return [porter.stem(word) for word in tokenizer_split(text)]

tokenizer_split_porter(test_doc_processed)

['ok', 'so', 'i', 'realli', 'like', 'kri', 'kristofferson', 'and', 'hi']

---

## Train Model

In [13]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(df.review_processed,df.sentiment)

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

lr_tfidf = Pipeline([('vect',tfidf),
                     ('clf',LogisticRegression(random_state=0,solver='liblinear'))])

In [15]:
%%time
# NOTE!! This step takes a long time: ~1.5 hours on 8-core i7 @ 1.8 GHz

# try with two different parameter grids, one using tfidf, the other only tf
param_grid = [
    {'vect__ngram_range': [(1,1)],
     'vect__max_df':[.25,.5,.75],
     'vect__tokenizer': [tokenizer_split,tokenizer_split_porter], # try with and without stemming
     'clf__penalty': ['l1','l2'],
     'clf__C':[1.0,10.0,100.0],
    },
    
    {'vect__ngram_range': [(1,1)],
     'vect__max_df':[.25,.5,.75],
     'vect__tokenizer': [tokenizer_split,tokenizer_split_porter],
     'vect__use_idf': [False],                        # try using term frequencies without tf-idf
     'vect__norm': [None],                            # turn off norming when using tf
     'clf__penalty':['l1','l2'],
     'clf__C': [1.0,10.0,100.0]
    }
]

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=3, verbose=2, n_jobs=-1)
gs_lr_tfidf.fit(X_train,y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
CPU times: user 36.5 s, sys: 37.5 s, total: 1min 13s
Wall time: 1h 30min 56s


In [16]:
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')
print(f'CV Accuracy: {gs_lr_tfidf.best_score_:0.3f}')

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 1), 'vect__tokenizer': <function tokenizer_split at 0x7f12a44e05e0>}
CV Accuracy: 0.896


In [17]:
clf = gs_lr_tfidf.best_estimator_
print(f'Test Accuracy: {clf.score(X_test, y_test):0.3f}')

Test Accuracy: 0.905


In [18]:
(y_test == 1).sum() / len(y_test)

0.50408

In [19]:
gs_lr_tfidf.predict(['this was great']),np.round(gs_lr_tfidf.predict_proba(['this was great']))

(array([1]), array([[0., 1.]]))

In [20]:
gs_lr_tfidf.predict(['this was bad']),np.round(gs_lr_tfidf.predict_proba(['this was bad']),2)

(array([0]), array([[1., 0.]]))

In [21]:
gs_lr_tfidf.predict(['this was ok in parts']),np.round(gs_lr_tfidf.predict_proba(['this was ok in parts']))

(array([1]), array([[0., 1.]]))

[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7fc219ea29e0>; total time=   7.2s
[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.5, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7fc219ea2cb0>; total time= 6.3min
[CV] END clf__C=1.0, clf__penalty=l2, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7fc219ea2c20>; total time= 6.0min
[CV] END clf__C=10.0, clf__penalty=l1, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7fc219ea2cb0>; total time=  15.3s
[CV] END clf__C=10.0, clf__penalty=l1, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7fc219ea2c20>; total time= 7.6min
[CV] END clf__C=10.0, clf__penalty=l2, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7fc219ea2cb0>; total time= 

[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7f1edcebe9e0>; total time=   6.9s
[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.5, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7f1edcebecb0>; total time=   9.9s
[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.75, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7f1edcebec20>; total time=  11.0s
[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.75, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7f1edcebecb0>; total time= 6.3min
[CV] END clf__C=1.0, clf__penalty=l2, vect__max_df=0.5, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7f1edcebec20>; total time= 6.1min
[CV] END clf__C=10.0, clf__penalty=l1, vect__max_df=0.5, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7f1edcebecb0>; total time=  18.4s
[CV]

[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.5, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7f8385fda9e0>; total time=   7.1s
[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.5, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7f8385fdacb0>; total time= 6.2min
[CV] END clf__C=1.0, clf__penalty=l2, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7f8385fdac20>; total time= 5.9min
[CV] END clf__C=10.0, clf__penalty=l1, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7f8385fdacb0>; total time=  14.0s
[CV] END clf__C=10.0, clf__penalty=l1, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7f8385fdac20>; total time=  17.8s
[CV] END clf__C=10.0, clf__penalty=l1, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7f8385fdacb0>; total time= 7

[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7efed594a9e0>; total time=   7.0s
[CV] END clf__C=1.0, clf__penalty=l1, vect__max_df=0.5, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7efed594acb0>; total time= 6.2min
[CV] END clf__C=1.0, clf__penalty=l2, vect__max_df=0.25, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7efed594ac20>; total time= 5.9min
[CV] END clf__C=1.0, clf__penalty=l2, vect__max_df=0.75, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7efed594acb0>; total time= 7.5min
[CV] END clf__C=10.0, clf__penalty=l1, vect__max_df=0.75, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split at 0x7efed594ac20>; total time=  12.7s
[CV] END clf__C=10.0, clf__penalty=l1, vect__max_df=0.75, vect__ngram_range=(1, 1), vect__tokenizer=<function tokenizer_split_porter at 0x7efed594acb0>; total 