 See links for instructions on installation if not already installed.
  - [NLTK](https://www.nltk.org/install.html) (tested with 3.6.7 and with 3.2.5.)
  - [Scikit-Learn](https://scikit-learn.org/stable/install.html) (test with 1.0.2)
  - [SciPy](https://scipy.org/install/) (tested with 1.7.3 and with 1.4.1)

In [1]:
!pip install gdown
!gdown --id 1thWkUj7uGOApr_dXRvMr9TsEHpo_H_2q -O sst2.zip
!mkdir -p data
!unzip sst2.zip -d data
!rm sst2.zip

# Feature Engineering - Lemmatization

In [2]:
from collections import Counter
import json
import re
from pathlib import Path

from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

print("Build lemmatization vocab from sst2.train")
data_dir = Path('sst2/')
tokenizer = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()
lem_counter = Counter()
lem_counter.update(['<pad>', '<unk>'])
data_train = open(data_dir.joinpath('sst2.train')).readlines()
print(f"Size of training data: {len(data_train)}")

token_lines = []

for line in data_train:
    lower_line = line.lower()
    token_lines.append(tokenizer.tokenize(lower_line))

lem_lines = []

for line in token_lines:
    lem_lines.append([lemmatizer.lemmatize(word, 'v') for word in line])
        
for sentence in lem_lines:
    sentence = sentence[1:]
    #sentence = [word for word in sentence if word not in lst_stopwords]
    for word in sentence:
        lem_counter[word] = lem_counter.get(word, 0) + 1

print(f"Vocab size before frequency filtering: {len(lem_counter)}")

lem_vocab = {key: val for key, val in lem_counter.items() if val >= 3 or key == '<pad>' or key == '<unk>'}
lem_vocab = {key: 0 for key in lem_vocab}
lem_vocab.update((k, i) for i, k in enumerate(lem_vocab))

print(f"Vocab size after frequency filtering: {len(lem_vocab)}")
output_filepath = data_dir.joinpath('lemmatization_vocab.json')
json.dump(lem_vocab, open(output_filepath, mode='w'))


Build lemmatization vocab from sst2.train
Size of training data: 6920
Vocab size before frequency filtering: 11568
Vocab size after frequency filtering: 4480


In [3]:
import json
from nltk.tokenize import WordPunctTokenizer
import numpy as np
import math
from scipy import sparse

def lem_features(vocab, data_dir, dataset, data_type, tokenizer, feature_name, tfidf):
    '''
    Generates sparse matrices for lemmatization with unigram binary features
    '''
        
    data = open(data_dir.joinpath(dataset)).readlines()
    tokenizer = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    
    tokens = []
    for line in data:
        lower_line = line.lower()
        tokens.append(tokenizer.tokenize(lower_line))
        
    lemma = []
    for line in tokens:
        lemma.append([lemmatizer.lemmatize(word, 'v') for word in line])
    
    final_lemma = []
    for line in lemma:
        split_line = line[1:]
        final_lemma.append(split_line)

    idf_dict = {key: 0 for key in vocab.keys()}
    for line in final_lemma:
        for word in line:
            if word in vocab.keys():
                idf_dict[word] = idf_dict.get(word, 0) + 1
            
    for key, value in idf_dict.items():
        idf_dict[key] = math.log10(len(final_lemma)/(value + 1))
        
    data_dict = {index: {key: value for key, value in idf_dict.items()} for index, value in enumerate(final_lemma)}
    for line in final_lemma:
        for word in line:
            tf = line.count(word) / len(line)
            if word in vocab.keys():
                data_dict[final_lemma.index(line)][word] = data_dict[final_lemma.index(line)].get(word)*tf
    
    binary_lem = {index: {key: 0 for key in vocab.keys()} for index, value in enumerate(final_lemma)}
    for line in final_lemma:
        for word in line:
            if word in vocab.keys():
                binary_lem[final_lemma.index(line)][word] = 1
            else:
                binary_lem[final_lemma.index(line)]['<unk>'] = 1
    
    data_matrix = np.array([[data_dict[index][key] for key in vocab.keys()] for index, value in enumerate(final_lemma)])
    lem_matrix = np.array([[binary_lem[index][key] for key in vocab.keys()] for index, value in enumerate(final_lemma)])
    
    if tfidf is True: 
        sparse_matrix = sparse.csr_matrix(data_matrix)
        print(("The shape of the " + data_type + " matrix is: "), data_matrix.shape)
        print()
    else:
        sparse_matrix = sparse.csr_matrix(lem_matrix)
        print(("The shape of the " + data_type + " matrix is: "), lem_matrix.shape)
        print()
        
    sparse.save_npz('sst2/' + data_type + '_' + feature_name + '_features.npz', sparse_matrix)
    

In [4]:
lem_features(lem_vocab, Path('sst2/'), 'sst2.train', 'train', WordPunctTokenizer(), 'lemmatization_binary', False)
lem_features(lem_vocab, Path('sst2/'), 'sst2.dev', 'dev', WordPunctTokenizer(), 'lemmatization_binary', False)
lem_features(lem_vocab, Path('sst2/'), 'sst2.test', 'test', WordPunctTokenizer(), 'lemmatization_binary', False)

The shape of the train matrix is:  (6920, 4480)

The shape of the dev matrix is:  (872, 4480)

The shape of the test matrix is:  (1821, 4480)



In [5]:
train_lemma = sparse.load_npz('sst2/train_lemmatization_binary_features.npz')
dev_lemma = sparse.load_npz('sst2/dev_lemmatization_binary_features.npz')
test_lemma = sparse.load_npz('sst2/test_lemmatization_binary_features.npz')

train_labels = np.load('sst2/train_labels.npz')
dev_labels = np.load('sst2/dev_labels.npz')
test_labels = np.load('sst2/test_labels.npz')

In [6]:
def print_important_weights(weights, words):
    """
    Print important pairs of weights and words.
    # Parameters
    weights : `Iterable`, required.
        Weights from a learned model.
    words : `Iterable`, required.
        Word types of the vocabulary.  
        It must be true that `len(weights) == len(words)`.
    # Returns
        `None`
    """

    def print_pairs(pairs):
        for weight, word in pairs:
            print("{: .4f} | {}".format(weight, word))

    assert len(weights) == len(words)
    pairs = list(zip(weights, words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=True)
    print("Most positive words:")
    print_pairs(pairs[:10])
    print("\nMost negative words:")
    print_pairs(reversed(pairs[-10:]))

    pairs = list(zip(abs(weights), words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=False)
    print("\nMost neutral words:")
    print_pairs(pairs[:10])


 # Logistic regression with scikit-learn

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def fit_and_eval_logistic_regression(data_dir: Path,
                                     train_X, train_Y,
                                     test_X, test_Y,
                                     feature_name: str) -> LogisticRegression:
    '''
    Fits and evaluates the logistic regression model using the scikit-learn library
    
    Inputs:
        data_dir (path): the data directory
        trn_data (file): training data 
        tst_data (file): testing or dev data
        
    Output:
        model_trained (LogisticRegression): object of LogisticRegression after it is trained
    '''
    
    model = LogisticRegression()
    model.fit(train_X, train_Y['arr_0'])
    y_pred = model.predict(test_X)
    
    print("The accuracy score is: ", accuracy_score(test_Y['arr_0'], y_pred))
    print("The f1 score is: ", f1_score(test_Y['arr_0'], y_pred, average = 'weighted'))
    
    return model


### dev model

In [8]:
print("These are the scores for dev")
print()

fit_and_eval_logistic_regression(feature_name = 'lemmatization',
                                 train_X = train_lemma, train_Y = train_labels,
                                 test_X = dev_lemma, test_Y = dev_labels,
                                 data_dir = Path('sst2/'))

These are the scores for dev

The accuracy score is:  0.801605504587156
The f1 score is:  0.8015634682594862


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### test model

In [9]:
print("These are the scores for test")
print()

fit_and_eval_logistic_regression(feature_name = 'lemmatization',
                                 train_X = train_lemma, train_Y = train_labels,
                                 test_X = test_lemma, test_Y = test_labels,
                                 data_dir = Path('sst2/'))

These are the scores for test

The accuracy score is:  0.8099945085118067
The f1 score is:  0.8099904975297524


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

## Weights Analysis

In [10]:
model_trained: LogisticRegression = fit_and_eval_logistic_regression(
    feature_name='lemmatization', train_X = train_lemma, train_Y = train_labels, test_X = test_lemma, test_Y = test_labels,data_dir=Path('data'))
weights = model_trained.coef_[0]
vocab = json.load(open(data_dir.joinpath('lemmatization_vocab.json')))
print_important_weights(weights=weights, words=vocab.keys())


The accuracy score is:  0.8099945085118067
The f1 score is:  0.8099904975297524
Most positive words:
 1.9667 | powerful
 1.9043 | remarkable
 1.8830 | solid
 1.8455 | refresh
 1.8033 | fun
 1.7913 | enjoyable
 1.6969 | beautifully
 1.5337 | hilarious
 1.5003 | brilliant
 1.4966 | fascinate

Most negative words:
-2.0642 | stupid
-1.8817 | lack
-1.8490 | suffer
-1.8436 | dull
-1.7698 | depress
-1.7622 | bland
-1.7572 | bore
-1.7399 | failure
-1.7028 | lousy
-1.6861 | flat

Most neutral words:
 0.0000 | <pad>
 0.0002 | particularly
 0.0003 | kitchen
 0.0005 | darkness
 0.0005 | flower
 0.0005 | period
 0.0006 | credibility
 0.0007 | lush
 0.0011 | act
 0.0014 | notion


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Error Analysis

In [11]:
def test_errors(data_dir: Path, train_X, train_Y, test_X, test_Y, feature_name: str) -> LogisticRegression:
    '''
    Returns predicted label output for given training and testing data
    '''
    
    model = LogisticRegression()
    model.fit(train_X, train_Y['arr_0'])
    y_pred = model.predict(test_X)
    
    return y_pred


In [12]:
test_model = test_errors(feature_name = 'unigram_binary',
                         train_X = train_lemma, train_Y = train_labels,
                         test_X = test_lemma, test_Y = test_labels,
                         data_dir = Path('sst2/'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

def error_frame(pred_model, labels):
    '''
    Generates a pandas dataframe of mislabeled reviews
    '''
    
    label_column = pd.DataFrame(labels)
    label_column.rename(columns = {0: "True"}, inplace = True)
    model_column = pd.DataFrame(pred_model)
    model_column.rename(columns = {0: "Predicted"}, inplace = True)
    
    data_test = open(data_dir.joinpath('sst2.test')).readlines()
    tokens = []
    for line in data_test:
        lower_line = line.lower()
        tokens.append(tokenizer.tokenize(lower_line))
    
    full_frame = label_column.join(model_column)
    full_frame["Match"] = (full_frame["True"] == full_frame["Predicted"]).astype(int)
    full_frame["Line"] = tokens
    full_frame.index += 1
    
    print("The number of errors found: ", len(full_frame.loc[full_frame['Match'] == 0]))
    return full_frame.loc[full_frame['Match'] == 0]
     

  pd.set_option('display.max_colwidth', -1)


In [14]:
error_frame(test_model, test_labels['arr_0'])

The number of errors found:  346


Unnamed: 0,True,Predicted,Match,Line
2,0,1,0,"[0, a, gob, of, drivel, so, sickly, sweet, ,, even, the, eager, consumers, of, moore, ', s, pasteurized, ditties, will, retch, it, up, like, rancid, crème, brûlée, .]"
8,1,0,0,"[1, the, movie, exists, for, its, soccer, action, and, its, fine, acting, .]"
11,1,0,0,"[1, jason, x, has, cheesy, effects, and, a, hoary, plot, ,, but, its, macabre, ,, self, -, deprecating, sense, of, humor, makes, up, for, a, lot, .]"
14,1,0,0,"[1, though, the, violence, is, far, less, sadistic, than, usual, ,, the, film, is, typical, miike, :, fast, ,, furious, and, full, of, off, -, the, -, cuff, imaginative, flourishes, .]"
17,1,0,0,"[1, if, your, senses, have, n, ', t, been, dulled, by, slasher, films, and, gorefests, ,, if, you, ', re, a, connoisseur, of, psychological, horror, ,, this, is, your, ticket, .]"
19,0,1,0,"[0, as, conceived, by, mr, ., schaeffer, ,, christopher, and, grace, are, little, more, than, collections, of, quirky, traits, lifted, from, a, screenwriter, ', s, outline, and, thrown, at, actors, charged, with, the, impossible, task, of, making, them, jell, .]"
20,0,1,0,"[0, those, who, managed, to, avoid, the, deconstructionist, theorizing, of, french, philosopher, jacques, derrida, in, college, can, now, take, an, 85, -, minute, brush, -, up, course, with, the, documentary, derrida, .]"
22,0,1,0,"[0, but, what, saves, lives, on, the, freeway, does, not, necessarily, make, for, persuasive, viewing, .]"
23,1,0,0,"[1, steve, irwin, ', s, method, is, ernest, hemmingway, at, accelerated, speed, and, volume, .]"
29,0,1,0,"[0, the, premise, for, this, kegger, comedy, probably, sounded, brilliant, four, six, -, packs, and, a, pitcher, of, margaritas, in, ,, but, the, film, must, have, been, written, ..., in, the, thrall, of, a, vicious, hangover, .]"


In [15]:
def print_weight(weights, words):
    """
    Prints all word-weight pairs for given data and vocabulary
    """

    def print_pairs(pairs):
        for weight, word in pairs:
            print("{: .4f} | {}".format(weight, word))

    assert len(weights) == len(words)
    pairs = list(zip(weights, words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=True)
    print_pairs(pairs)


In [16]:
print_weight(weights = weights, words=vocab.keys())

 1.9667 | powerful
 1.9043 | remarkable
 1.8830 | solid
 1.8455 | refresh
 1.8033 | fun
 1.7913 | enjoyable
 1.6969 | beautifully
 1.5337 | hilarious
 1.5003 | brilliant
 1.4966 | fascinate
 1.4780 | terrific
 1.4596 | unexpected
 1.4560 | manage
 1.4535 | entertain
 1.4382 | heart
 1.4312 | always
 1.4310 | wonderful
 1.3982 | definitely
 1.3874 | delight
 1.3752 | assure
 1.3707 | summer
 1.3597 | smarter
 1.3560 | resist
 1.3536 | years
 1.3483 | human
 1.3292 | spirit
 1.3230 | best
 1.3195 | worth
 1.3112 | portrait
 1.3032 | enjoy
 1.3015 | refreshingly
 1.3005 | reward
 1.2981 | embrace
 1.2888 | masterpiece
 1.2850 | genre
 1.2795 | engross
 1.2554 | smart
 1.2524 | rare
 1.2518 | inventive
 1.2485 | somewhat
 1.2448 | delicate
 1.2427 | treat
 1.2376 | perfectly
 1.2359 | impressive
 1.2358 | bring
 1.2310 | bowl
 1.2274 | explore
 1.2261 | deeply
 1.2216 | love
 1.2150 | funny
 1.2076 | fast
 1.2070 | please
 1.2031 | foster
 1.1918 | miracle
 1.1886 | likable
 1.1859 | notch

 0.2644 | chaplin
 0.2642 | style
 0.2642 | c
 0.2637 | delivery
 0.2636 | soak
 0.2632 | confessions
 0.2629 | obsessive
 0.2627 | greater
 0.2622 | parent
 0.2620 | extent
 0.2616 | my
 0.2614 | treatment
 0.2614 | land
 0.2612 | early
 0.2609 | hong
 0.2609 | kong
 0.2605 | screen
 0.2601 | web
 0.2600 | sexual
 0.2598 | tradition
 0.2597 | religious
 0.2593 | crew
 0.2592 | liberate
 0.2592 | neatly
 0.2592 | tasteful
 0.2587 | rohmer
 0.2587 | contribute
 0.2585 | jumbo
 0.2584 | evolve
 0.2564 | multi
 0.2563 | baran
 0.2563 | entertainingly
 0.2561 | span
 0.2558 | favorite
 0.2555 | odyssey
 0.2553 | significantly
 0.2542 | ,
 0.2536 | eloquently
 0.2532 | artistic
 0.2530 | blockbusters
 0.2529 | skillfully
 0.2528 | racial
 0.2527 | sober
 0.2524 | notorious
 0.2518 | competition
 0.2516 | together
 0.2510 | likely
 0.2509 | fresh
 0.2503 | band
 0.2503 | mostly
 0.2500 | uncommonly
 0.2499 | hit
 0.2495 | tie
 0.2482 | singer
 0.2480 | lull
 0.2477 | differences
 0.2475 | fa

 0.1226 | stuff
 0.1223 | feces
 0.1221 | innocence
 0.1220 | feature
 0.1216 | associate
 0.1215 | bias
 0.1212 | participants
 0.1212 | hedonistic
 0.1211 | deception
 0.1204 | jazzy
 0.1202 | bang
 0.1194 | fit
 0.1192 | from
 0.1187 | large
 0.1186 | insurance
 0.1183 | unsentimental
 0.1182 | juliette
 0.1180 | ferrara
 0.1178 | contemplation
 0.1178 | cheeky
 0.1171 | unflinching
 0.1170 | endure
 0.1169 | sayles
 0.1166 | 13
 0.1165 | disregard
 0.1162 | fable
 0.1157 | than
 0.1155 | destructive
 0.1149 | steven
 0.1148 | ensemble
 0.1146 | affable
 0.1144 | auteuil
 0.1142 | tensions
 0.1140 | pantheon
 0.1130 | jewish
 0.1129 | wondrous
 0.1127 | cash
 0.1127 | giggle
 0.1118 | iconoclastic
 0.1114 | men
 0.1113 | system
 0.1111 | ever
 0.1110 | root
 0.1103 | parker
 0.1101 | makers
 0.1100 | inclination
 0.1097 | ode
 0.1094 | ravish
 0.1094 | medium
 0.1087 | retain
 0.1084 | lee
 0.1081 | directly
 0.1080 | surrender
 0.1071 | precious
 0.1067 | special
 0.1064 | bogged
 

-0.1509 | language
-0.1511 | visible
-0.1511 | agent
-0.1512 | mill
-0.1517 | château
-0.1519 | adams
-0.1522 | main
-0.1522 | stagger
-0.1522 | impossible
-0.1526 | complications
-0.1526 | aid
-0.1527 | interest
-0.1528 | angry
-0.1533 | ve
-0.1534 | mean
-0.1538 | dong
-0.1541 | attention
-0.1541 | first
-0.1541 | should
-0.1544 | frank
-0.1544 | domestic
-0.1549 | 3
-0.1557 | <unk>
-0.1558 | fixate
-0.1560 | raunchy
-0.1565 | emphasize
-0.1568 | marvelous
-0.1570 | walsh
-0.1574 | accurate
-0.1576 | marine
-0.1578 | dragonfly
-0.1581 | through
-0.1582 | paradiso
-0.1589 | consolation
-0.1590 | british
-0.1591 | urge
-0.1594 | rhetoric
-0.1604 | assault
-0.1609 | 10
-0.1609 | them
-0.1614 | opportunities
-0.1620 | between
-0.1621 | florid
-0.1626 | sequence
-0.1627 | her
-0.1628 | girls
-0.1630 | lavish
-0.1635 | boot
-0.1636 | million
-0.1638 | how
-0.1638 | astonishingly
-0.1644 | nature
-0.1647 | unleash
-0.1648 | yard
-0.1649 | haneke
-0.1650 | learn
-0.1652 | fumble
-0.1654 | yo

-0.2975 | shame
-0.2986 | d
-0.2992 | occasionally
-0.2992 | cloak
-0.2995 | order
-0.2998 | deniro
-0.2998 | today
-0.3008 | 86
-0.3012 | woo
-0.3019 | eastwood
-0.3022 | beer
-0.3024 | innovation
-0.3024 | write
-0.3028 | fizz
-0.3030 | stretch
-0.3035 | achieve
-0.3038 | cooper
-0.3040 | cox
-0.3044 | hotel
-0.3060 | willingness
-0.3061 | seas
-0.3078 | acceptable
-0.3085 | problematic
-0.3087 | coma
-0.3087 | loss
-0.3088 | inherent
-0.3096 | rerun
-0.3097 | note
-0.3099 | players
-0.3101 | more
-0.3107 | joel
-0.3110 | hour
-0.3111 | prep
-0.3131 | fiennes
-0.3138 | shoe
-0.3139 | procession
-0.3140 | emptiness
-0.3144 | sweetest
-0.3150 | hal
-0.3155 | below
-0.3156 | explosion
-0.3158 | movie
-0.3173 | marquis
-0.3179 | emphasis
-0.3181 | cage
-0.3181 | shadyac
-0.3183 | schwarzenegger
-0.3183 | loosely
-0.3185 | practically
-0.3186 | chop
-0.3190 | downright
-0.3191 | sap
-0.3192 | sonny
-0.3192 | such
-0.3198 | exposition
-0.3198 | formula
-0.3203 | ararat
-0.3204 | psyche
-0.

-0.9288 | night
-0.9313 | over
-0.9332 | flaccid
-0.9363 | horrible
-0.9372 | poorly
-0.9399 | chan
-0.9410 | sandler
-0.9454 | then
-0.9476 | junk
-0.9477 | exercise
-0.9486 | alienation
-0.9536 | bag
-0.9537 | name
-0.9544 | whole
-0.9562 | virtually
-0.9564 | overwrought
-0.9572 | mildly
-0.9590 | hole
-0.9639 | monster
-0.9651 | fall
-0.9673 | product
-0.9693 | wear
-0.9711 | smash
-0.9723 | plot
-0.9781 | crap
-0.9798 | somewhere
-0.9846 | showgirls
-0.9848 | nothing
-0.9850 | pedigree
-0.9862 | n
-0.9885 | terribly
-0.9897 | scatter
-0.9904 | ghost
-0.9924 | artificial
-0.9948 | strain
-0.9974 | harder
-0.9987 | apparent
-1.0013 | reason
-1.0050 | manipulative
-1.0076 | squander
-1.0089 | obscure
-1.0100 | unless
-1.0116 | try
-1.0125 | resemble
-1.0141 | fluffy
-1.0184 | cold
-1.0271 | toss
-1.0298 | length
-1.0309 | comparison
-1.0318 | sadly
-1.0345 | money
-1.0367 | dog
-1.0382 | remake
-1.0383 | grate
-1.0416 | violent
-1.0450 | house
-1.0482 | disappointment
-1.0483 | reput