 See links for instructions on installation if not already installed.
  - [NLTK](https://www.nltk.org/install.html) (tested with 3.6.7 and with 3.2.5.)
  - [Scikit-Learn](https://scikit-learn.org/stable/install.html) (test with 1.0.2)
  - [SciPy](https://scipy.org/install/) (tested with 1.7.3 and with 1.4.1)

In [1]:
!pip install gdown
!gdown --id 1thWkUj7uGOApr_dXRvMr9TsEHpo_H_2q -O sst2.zip
!mkdir -p data
!unzip sst2.zip -d data
!rm sst2.zip

 # Feature Engineering - Lemmatization

In [2]:
from collections import Counter
import json
import re
from pathlib import Path

from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

print("Build lemmatization vocab from sst2.train")
data_dir = Path('sst2/')
tokenizer = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()
lem_counter = Counter()
lem_counter.update(['<pad>', '<unk>'])
data_train = open(data_dir.joinpath('sst2.train')).readlines()
print(f"Size of training data: {len(data_train)}")

token_lines = []

for line in data_train:
    lower_line = line.lower()
    token_lines.append(tokenizer.tokenize(lower_line))

lem_lines = []

for line in token_lines:
    lem_lines.append([lemmatizer.lemmatize(word, 'v') for word in line])
        
for sentence in lem_lines:
    sentence = sentence[1:]
    #sentence = [word for word in sentence if word not in lst_stopwords]
    for word in sentence:
        lem_counter[word] = lem_counter.get(word, 0) + 1

print(f"Vocab size before frequency filtering: {len(lem_counter)}")

lem_vocab = {key: val for key, val in lem_counter.items() if val >= 3 or key == '<pad>' or key == '<unk>'}
lem_vocab = {key: 0 for key in lem_vocab}
lem_vocab.update((k, i) for i, k in enumerate(lem_vocab))

print(f"Vocab size after frequency filtering: {len(lem_vocab)}")
output_filepath = data_dir.joinpath('lemmatization_vocab.json')
json.dump(lem_vocab, open(output_filepath, mode='w'))


Build lemmatization vocab from sst2.train
Size of training data: 6920
Vocab size before frequency filtering: 11568
Vocab size after frequency filtering: 4480


In [3]:
import json
from nltk.tokenize import WordPunctTokenizer
import numpy as np
import math
from scipy import sparse

def lem_features(vocab, data_dir, dataset, data_type, tokenizer, feature_name, tfidf):
    '''
    Generates sparse matrices for lemmatization with TF-IDF
    '''
        
    data = open(data_dir.joinpath(dataset)).readlines()
    tokenizer = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    
    tokens = []
    for line in data:
        lower_line = line.lower()
        tokens.append(tokenizer.tokenize(lower_line))
        
    lemma = []
    for line in tokens:
        lemma.append([lemmatizer.lemmatize(word, 'v') for word in line])
    
    final_lemma = []
    for line in lemma:
        split_line = line[1:]
        final_lemma.append(split_line)

    idf_dict = {key: 0 for key in vocab.keys()}
    for line in final_lemma:
        for word in line:
            if word in vocab.keys():
                idf_dict[word] = idf_dict.get(word, 0) + 1
            
    for key, value in idf_dict.items():
        idf_dict[key] = math.log10(len(final_lemma)/(value + 1))
        
    data_dict = {index: {key: value for key, value in idf_dict.items()} for index, value in enumerate(final_lemma)}
    for line in final_lemma:
        for word in line:
            tf = line.count(word) / len(line)
            if word in vocab.keys():
                data_dict[final_lemma.index(line)][word] = data_dict[final_lemma.index(line)].get(word)*tf
    
    binary_lem = {index: {key: 0 for key in vocab.keys()} for index, value in enumerate(final_lemma)}
    for line in final_lemma:
        for word in line:
            if word in vocab.keys():
                binary_lem[final_lemma.index(line)][word] = 1
            else:
                binary_lem[final_lemma.index(line)]['<unk>'] = 1
    
    data_matrix = np.array([[data_dict[index][key] for key in vocab.keys()] for index, value in enumerate(final_lemma)])
    lem_matrix = np.array([[binary_lem[index][key] for key in vocab.keys()] for index, value in enumerate(final_lemma)])
    
    if tfidf is True: 
        sparse_matrix = sparse.csr_matrix(data_matrix)
        print(("The shape of the " + data_type + " matrix is: "), data_matrix.shape)
        print()
    else:
        sparse_matrix = sparse.csr_matrix(lem_matrix)
        print(("The shape of the " + data_type + " matrix is: "), lem_matrix.shape)
        print()
        
    sparse.save_npz('sst2/' + data_type + '_' + feature_name + '_features.npz', sparse_matrix)
    

In [4]:
lem_features(lem_vocab, Path('sst2/'), 'sst2.train', 'train', WordPunctTokenizer(), 'lemmatization', True)
lem_features(lem_vocab, Path('sst2/'), 'sst2.dev', 'dev', WordPunctTokenizer(), 'lemmatization', True)
lem_features(lem_vocab, Path('sst2/'), 'sst2.test', 'test', WordPunctTokenizer(), 'lemmatization', True)

The shape of the train matrix is:  (6920, 4480)

The shape of the dev matrix is:  (872, 4480)

The shape of the test matrix is:  (1821, 4480)



In [5]:
train_lemma = sparse.load_npz('sst2/train_lemmatization_features.npz')
dev_lemma = sparse.load_npz('sst2/dev_lemmatization_features.npz')
test_lemma = sparse.load_npz('sst2/test_lemmatization_features.npz')

train_labels = np.load('sst2/train_labels.npz')
dev_labels = np.load('sst2/dev_labels.npz')
test_labels = np.load('sst2/test_labels.npz')

In [6]:
def print_important_weights(weights, words):
    """
    Print important pairs of weights and words.
    # Parameters
    weights : `Iterable`, required.
        Weights from a learned model.
    words : `Iterable`, required.
        Word types of the vocabulary.  
        It must be true that `len(weights) == len(words)`.
    # Returns
        `None`
    """

    def print_pairs(pairs):
        for weight, word in pairs:
            print("{: .4f} | {}".format(weight, word))

    assert len(weights) == len(words)
    pairs = list(zip(weights, words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=True)
    print("Most positive words:")
    print_pairs(reversed(pairs[-10:]))
    print("\nMost negative words:")
    print_pairs(pairs[:10])

    pairs = list(zip(abs(weights), words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=False)
    print("\nMost neutral words:")
    print_pairs(pairs[:10])


 # Logistic regression with scikit-learn

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def fit_and_eval_logistic_regression(data_dir: Path,
                                     train_X, train_Y,
                                     test_X, test_Y,
                                     feature_name: str) -> LogisticRegression:
    '''
    Fits and evaluates the logistic regression model using the scikit-learn library
    
    Inputs:
        data_dir (path): the data directory
        trn_data (file): training data 
        tst_data (file): testing or dev data
        
    Output:
        model_trained (LogisticRegression): object of LogisticRegression after it is trained
    '''
    
    model = LogisticRegression()
    model.fit(train_X, train_Y['arr_0'])
    y_pred = model.predict(test_X)
    
    print("The accuracy score is: ", accuracy_score(test_Y['arr_0'], y_pred))
    print("The f1 score is: ", f1_score(test_Y['arr_0'], y_pred, average = 'weighted'))
    
    return model

### dev model

In [8]:
print("These are the scores for dev")
print()

fit_and_eval_logistic_regression(feature_name = 'lemmatization',
                                 train_X = train_lemma, train_Y = train_labels,
                                 test_X = dev_lemma, test_Y = dev_labels,
                                 data_dir = Path('sst2/'))

These are the scores for dev

The accuracy score is:  0.6422018348623854
The f1 score is:  0.5967287870722757


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### test model

In [9]:
print("These are the scores for test")
print()

fit_and_eval_logistic_regression(feature_name = 'lemmatization',
                                 train_X = train_lemma, train_Y = train_labels,
                                 test_X = test_lemma, test_Y = test_labels,
                                 data_dir = Path('sst2/'))

These are the scores for test

The accuracy score is:  0.7660626029654036
The f1 score is:  0.7616246637172223


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

## Weights Analysis

In [10]:
model_trained: LogisticRegression = fit_and_eval_logistic_regression(
    feature_name='lemmatization', train_X = train_lemma, train_Y = train_labels, test_X = test_lemma, test_Y = test_labels,data_dir=Path('data'))
weights = model_trained.coef_[0]
vocab = json.load(open(data_dir.joinpath('lemmatization_vocab.json')))
print_important_weights(weights=weights, words=vocab.keys())


The accuracy score is:  0.7660626029654036
The f1 score is:  0.7616246637172223
Most positive words:
-1.8231 | solid
-1.7849 | powerful
-1.7820 | remarkable
-1.7757 | enjoyable
-1.7091 | fun
-1.6996 | and
-1.6156 | refresh
-1.3956 | hilarious
-1.3933 | love
-1.3918 | beautifully

Most negative words:
 1.7768 | stupid
 1.6524 | suffer
 1.6224 | lack
 1.6055 | bland
 1.6001 | dull
 1.5975 | bore
 1.5635 | worst
 1.5444 | depress
 1.4741 | flat
 1.4548 | instead

Most neutral words:
 0.0000 | stage
 0.0000 | narrative
 0.0002 | sane
 0.0003 | bake
 0.0004 | handle
 0.0004 | --
 0.0008 | hunter
 0.0012 | legged
 0.0014 | upper
 0.0018 | mcgrath


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Error Analysis

In [11]:
def test_errors(data_dir: Path, train_X, train_Y, test_X, test_Y, feature_name: str) -> LogisticRegression:
    '''
    Returns predicted label output for given training and testing data
    '''
    
    model = LogisticRegression()
    model.fit(train_X, train_Y['arr_0'])
    y_pred = model.predict(test_X)
    
    return y_pred


In [12]:
test_model = test_errors(feature_name = 'lemmatization',
                         train_X = train_lemma, train_Y = train_labels,
                         test_X = test_lemma, test_Y = test_labels,
                         data_dir = Path('sst2/'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

def error_frame(pred_model, labels):
    '''
    Generates a pandas dataframe of mislabeled reviews
    '''
    
    label_column = pd.DataFrame(labels)
    label_column.rename(columns = {0: "True"}, inplace = True)
    model_column = pd.DataFrame(pred_model)
    model_column.rename(columns = {0: "Predicted"}, inplace = True)
    
    data_test = open(data_dir.joinpath('sst2.test')).readlines()
    tokens = []
    for line in data_test:
        lower_line = line.lower()
        tokens.append(tokenizer.tokenize(lower_line))
    
    full_frame = label_column.join(model_column)
    full_frame["Match"] = (full_frame["True"] == full_frame["Predicted"]).astype(int)
    full_frame["Line"] = tokens
    full_frame.index += 1
    
    print("The number of errors found: ", len(full_frame.loc[full_frame['Match'] == 0]))
    return full_frame.loc[full_frame['Match'] == 0]
     

  pd.set_option('display.max_colwidth', -1)


In [14]:
error_frame(test_model, test_labels['arr_0'])

The number of errors found:  426


Unnamed: 0,True,Predicted,Match,Line
2,0,1,0,"[0, a, gob, of, drivel, so, sickly, sweet, ,, even, the, eager, consumers, of, moore, ', s, pasteurized, ditties, will, retch, it, up, like, rancid, crème, brûlée, .]"
8,1,0,0,"[1, the, movie, exists, for, its, soccer, action, and, its, fine, acting, .]"
9,0,1,0,"[0, arnold, ', s, jump, from, little, screen, to, big, will, leave, frowns, on, more, than, a, few, faces, .]"
17,1,0,0,"[1, if, your, senses, have, n, ', t, been, dulled, by, slasher, films, and, gorefests, ,, if, you, ', re, a, connoisseur, of, psychological, horror, ,, this, is, your, ticket, .]"
19,0,1,0,"[0, as, conceived, by, mr, ., schaeffer, ,, christopher, and, grace, are, little, more, than, collections, of, quirky, traits, lifted, from, a, screenwriter, ', s, outline, and, thrown, at, actors, charged, with, the, impossible, task, of, making, them, jell, .]"
20,0,1,0,"[0, those, who, managed, to, avoid, the, deconstructionist, theorizing, of, french, philosopher, jacques, derrida, in, college, can, now, take, an, 85, -, minute, brush, -, up, course, with, the, documentary, derrida, .]"
22,0,1,0,"[0, but, what, saves, lives, on, the, freeway, does, not, necessarily, make, for, persuasive, viewing, .]"
23,1,0,0,"[1, steve, irwin, ', s, method, is, ernest, hemmingway, at, accelerated, speed, and, volume, .]"
29,0,1,0,"[0, the, premise, for, this, kegger, comedy, probably, sounded, brilliant, four, six, -, packs, and, a, pitcher, of, margaritas, in, ,, but, the, film, must, have, been, written, ..., in, the, thrall, of, a, vicious, hangover, .]"
44,1,0,0,"[1, much, monkeyfun, for, all, .]"


In [15]:
def print_weight(weights, words):
    """
    Prints all word-weight pairs for given data and vocabulary
    """

    def print_pairs(pairs):
        for weight, word in pairs:
            print("{: .4f} | {}".format(weight, word))

    assert len(weights) == len(words)
    pairs = list(zip(weights, words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=True)
    print_pairs(pairs)


In [16]:
print_weight(weights = weights, words=vocab.keys())

 1.7768 | stupid
 1.6524 | suffer
 1.6224 | lack
 1.6055 | bland
 1.6001 | dull
 1.5975 | bore
 1.5635 | worst
 1.5444 | depress
 1.4741 | flat
 1.4548 | instead
 1.4349 | too
 1.4344 | failure
 1.4247 | lousy
 1.3922 | none
 1.3906 | tv
 1.3571 | unfortunately
 1.3524 | devoid
 1.3366 | tiresome
 1.3352 | mess
 1.2984 | fail
 1.2974 | listless
 1.2928 | unfunny
 1.2701 | barely
 1.2666 | bad
 1.2406 | pretentious
 1.2330 | ill
 1.1980 | no
 1.1938 | violence
 1.1870 | uneven
 1.1612 | awful
 1.1482 | empty
 1.1465 | settle
 1.1434 | waste
 1.1428 | tire
 1.1376 | already
 1.1292 | be
 1.1129 | lazy
 1.1067 | repetitive
 1.1053 | wannabe
 1.0744 | slip
 1.0731 | lame
 1.0703 | lose
 1.0672 | generic
 1.0642 | ultimately
 1.0628 | sheridan
 1.0569 | episode
 1.0556 | idea
 1.0536 | incoherent
 1.0498 | problem
 1.0359 | obvious
 1.0334 | only
 1.0284 | contrive
 1.0280 | load
 1.0254 | neither
 1.0216 | choose
 1.0179 | pointless
 1.0149 | mistake
 1.0094 | dreary
 1.0094 | insult
 0.99

 0.3304 | inoffensive
 0.3303 | humor
 0.3303 | lilia
 0.3302 | immature
 0.3297 | term
 0.3296 | repeat
 0.3296 | importance
 0.3295 | malaise
 0.3291 | labor
 0.3290 | trot
 0.3278 | believability
 0.3277 | hitler
 0.3275 | studios
 0.3275 | nevertheless
 0.3274 | import
 0.3271 | jackass
 0.3269 | chekhov
 0.3269 | $
 0.3268 | accent
 0.3263 | feminist
 0.3260 | clumsily
 0.3259 | infuriate
 0.3258 | misguide
 0.3258 | daughters
 0.3243 | lesson
 0.3239 | chop
 0.3236 | 51
 0.3235 | childlike
 0.3227 | loosely
 0.3224 | incongruous
 0.3223 | gag
 0.3218 | kung
 0.3218 | hickenlooper
 0.3214 | rest
 0.3213 | emptiness
 0.3213 | former
 0.3210 | rescue
 0.3209 | collection
 0.3209 | exact
 0.3206 | tom
 0.3199 | develop
 0.3198 | almost
 0.3197 | invest
 0.3193 | low
 0.3187 | beyond
 0.3177 | vs
 0.3171 | target
 0.3170 | sick
 0.3169 | accuse
 0.3167 | schmaltz
 0.3167 | exploitation
 0.3155 | insipid
 0.3155 | modernize
 0.3154 | excruciate
 0.3150 | simply
 0.3150 | qualities
 0.3

 0.0901 | consolation
 0.0900 | emotional
 0.0899 | prophecies
 0.0891 | sequence
 0.0890 | americans
 0.0886 | curious
 0.0884 | grasp
 0.0882 | summertime
 0.0878 | machine
 0.0874 | main
 0.0873 | bump
 0.0871 | leave
 0.0864 | ensue
 0.0862 | plotline
 0.0861 | previous
 0.0857 | meticulously
 0.0856 | awfully
 0.0852 | otherwise
 0.0851 | wilde
 0.0850 | advance
 0.0850 | dong
 0.0848 | reject
 0.0844 | sudden
 0.0842 | sleeper
 0.0841 | ponderous
 0.0841 | business
 0.0838 | humanize
 0.0831 | wollter
 0.0830 | maintain
 0.0827 | six
 0.0825 | equal
 0.0824 | disney
 0.0819 | screwball
 0.0818 | compose
 0.0813 | manipulation
 0.0812 | crane
 0.0810 | cut
 0.0807 | leather
 0.0807 | zombie
 0.0806 | near
 0.0806 | xxx
 0.0801 | re
 0.0801 | out
 0.0798 | this
 0.0786 | goose
 0.0783 | arrive
 0.0773 | cumulative
 0.0770 | ultimate
 0.0769 | put
 0.0764 | selection
 0.0762 | nifty
 0.0760 | everything
 0.0759 | unite
 0.0754 | characterizations
 0.0751 | piccoli
 0.0751 | there
 0

-0.0230 | seal
-0.0230 | show
-0.0231 | magnificent
-0.0234 | daytime
-0.0236 | eat
-0.0236 | solondz
-0.0238 | count
-0.0242 | host
-0.0245 | along
-0.0247 | jerk
-0.0247 | hammy
-0.0250 | arguments
-0.0251 | profoundly
-0.0253 | pro
-0.0255 | finally
-0.0255 | exterior
-0.0259 | :
-0.0259 | game
-0.0262 | gibson
-0.0263 | nick
-0.0263 | private
-0.0265 | gangster
-0.0265 | cash
-0.0267 | teachers
-0.0267 | bravado
-0.0269 | button
-0.0269 | nijinsky
-0.0275 | hell
-0.0275 | mike
-0.0275 | celebration
-0.0277 | time
-0.0288 | door
-0.0288 | two
-0.0290 | italian
-0.0291 | schumacher
-0.0296 | notable
-0.0298 | reserve
-0.0300 | never
-0.0301 | demonstrate
-0.0304 | uncertain
-0.0305 | fully
-0.0307 | actually
-0.0311 | clone
-0.0315 | anomie
-0.0316 | stunt
-0.0317 | deprecate
-0.0321 | allen
-0.0327 | mount
-0.0330 | floor
-0.0341 | food
-0.0342 | lively
-0.0344 | makers
-0.0358 | mann
-0.0365 | insistent
-0.0366 | shop
-0.0372 | closer
-0.0373 | awkwardness
-0.0378 | member
-0.0379 

-0.2661 | binoche
-0.2661 | crack
-0.2664 | spotlight
-0.2666 | comedy
-0.2666 | simone
-0.2669 | standards
-0.2669 | confessions
-0.2671 | separate
-0.2673 | terror
-0.2675 | commercialism
-0.2677 | jackson
-0.2677 | field
-0.2679 | list
-0.2683 | know
-0.2686 | exotic
-0.2687 | strongest
-0.2688 | salma
-0.2690 | detective
-0.2691 | cram
-0.2691 | clever
-0.2691 | nuanced
-0.2696 | journey
-0.2701 | trace
-0.2704 | sublime
-0.2710 | valuable
-0.2714 | 1970s
-0.2716 | share
-0.2716 | worry
-0.2719 | tear
-0.2726 | iranian
-0.2726 | escapist
-0.2727 | scope
-0.2731 | web
-0.2732 | seam
-0.2732 | disquiet
-0.2733 | morton
-0.2737 | cleverness
-0.2740 | nicely
-0.2740 | bullock
-0.2741 | example
-0.2742 | generous
-0.2747 | janice
-0.2748 | spectacle
-0.2748 | intelligently
-0.2751 | wang
-0.2752 | complex
-0.2752 | their
-0.2754 | undercurrent
-0.2757 | frequently
-0.2763 | achievement
-0.2764 | powerpuff
-0.2765 | poetic
-0.2773 | de
-0.2774 | vein
-0.2775 | alert
-0.2787 | aficionados

-0.4227 | felt
-0.4233 | chronicle
-0.4233 | neo
-0.4237 | adult
-0.4239 | complexity
-0.4239 | believer
-0.4243 | shrek
-0.4256 | oddly
-0.4257 | kidman
-0.4262 | picture
-0.4264 | tremendous
-0.4266 | improvement
-0.4269 | betrayal
-0.4284 | broomfield
-0.4285 | freshness
-0.4293 | impart
-0.4294 | intense
-0.4299 | drive
-0.4301 | event
-0.4306 | bartlett
-0.4311 | comedies
-0.4313 | fellow
-0.4315 | taut
-0.4319 | phenomenal
-0.4335 | exude
-0.4339 | sweetness
-0.4347 | absolutely
-0.4349 | creations
-0.4349 | minor
-0.4351 | charismatic
-0.4351 | wild
-0.4357 | assemble
-0.4358 | ones
-0.4358 | concern
-0.4371 | invent
-0.4372 | presence
-0.4378 | lucas
-0.4379 | metaphorical
-0.4380 | brim
-0.4381 | perspective
-0.4384 | lips
-0.4384 | glorious
-0.4392 | korean
-0.4412 | charlotte
-0.4421 | leap
-0.4434 | wash
-0.4439 | image
-0.4444 | duration
-0.4444 | courage
-0.4445 | heroic
-0.4465 | group
-0.4467 | pretty
-0.4474 | tough
-0.4478 | walter
-0.4478 | gritty
-0.4480 | way
-0.44