 See links for instructions on installation if not already installed.
  - [NLTK](https://www.nltk.org/install.html) (tested with 3.6.7 and with 3.2.5.)
  - [Scikit-Learn](https://scikit-learn.org/stable/install.html) (test with 1.0.2)
  - [SciPy](https://scipy.org/install/) (tested with 1.7.3 and with 1.4.1)

In [1]:
!pip install gdown
!gdown --id 1thWkUj7uGOApr_dXRvMr9TsEHpo_H_2q -O sst2.zip
!mkdir -p data
!unzip sst2.zip -d data
!rm sst2.zip

 # Feature Engineering - Lemmatization

In [2]:
from collections import Counter
import json
import re
from pathlib import Path

from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

print("Build lemmatization vocab from sst2.train")
data_dir = Path('sst2/')
tokenizer = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()
lem_counter = Counter()
lem_counter.update(['<pad>', '<unk>'])
data_train = open(data_dir.joinpath('sst2.train')).readlines()
print(f"Size of training data: {len(data_train)}")

token_lines = []

for line in data_train:
    lower_line = line.lower()
    token_lines.append(tokenizer.tokenize(lower_line))

lem_lines = []

for line in token_lines:
    lem_lines.append([lemmatizer.lemmatize(word, 'v') for word in line])
        
for sentence in lem_lines:
    sentence = sentence[1:]
    #sentence = [word for word in sentence if word not in lst_stopwords]
    for word in sentence:
        lem_counter[word] = lem_counter.get(word, 0) + 1

print(f"Vocab size before frequency filtering: {len(lem_counter)}")

lem_vocab = {key: val for key, val in lem_counter.items() if val >= 3 or key == '<pad>' or key == '<unk>'}
lem_vocab = {key: 0 for key in lem_vocab}
lem_vocab.update((k, i) for i, k in enumerate(lem_vocab))

print(f"Vocab size after frequency filtering: {len(lem_vocab)}")
output_filepath = data_dir.joinpath('lemmatization_vocab.json')
json.dump(lem_vocab, open(output_filepath, mode='w'))


Build lemmatization vocab from sst2.train
Size of training data: 6920
Vocab size before frequency filtering: 11568
Vocab size after frequency filtering: 4480


In [3]:
import json
from nltk.tokenize import WordPunctTokenizer
import numpy as np
import math
from scipy import sparse

def lem_features(vocab, data_dir, dataset, data_type, tokenizer, feature_name, tfidf, count):
    '''
    Generates sparse matrices for lemmatization with count feature
    '''
        
    data = open(data_dir.joinpath(dataset)).readlines()
    tokenizer = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    
    tokens = []
    for line in data:
        lower_line = line.lower()
        tokens.append(tokenizer.tokenize(lower_line))
        
    lemma = []
    for line in tokens:
        lemma.append([lemmatizer.lemmatize(word, 'v') for word in line])
    
    final_lemma = []
    for line in lemma:
        split_line = line[1:]
        final_lemma.append(split_line)

    idf_dict = {key: 0 for key in vocab.keys()}
    for line in final_lemma:
        for word in line:
            if word in vocab.keys():
                idf_dict[word] = idf_dict.get(word, 0) + 1
            
    for key, value in idf_dict.items():
        idf_dict[key] = math.log10(len(final_lemma)/(value + 1))
        
    data_dict = {index: {key: value for key, value in idf_dict.items()} for index, value in enumerate(final_lemma)}
    for line in final_lemma:
        for word in line:
            tf = line.count(word) / len(line)
            if word in vocab.keys():
                data_dict[final_lemma.index(line)][word] = data_dict[final_lemma.index(line)].get(word)*tf
    
    binary_lem = {index: {key: 0 for key in vocab.keys()} for index, value in enumerate(final_lemma)}
    for line in final_lemma:
        for word in line:
            if word in vocab.keys():
                binary_lem[final_lemma.index(line)][word] = 1
            else:
                binary_lem[final_lemma.index(line)]['<unk>'] = 1
                
    count_lem = {index: {key: 0 for key in vocab.keys()} for index, value in enumerate(final_lemma)}
    for line in final_lemma:
        for word in line:
            if word in vocab.keys():
                count_lem[final_lemma.index(line)][word] = count_lem[final_lemma.index(line)].get(word, 0) + 1
            else:
                count_lem[final_lemma.index(line)]['<unk>'] = count_lem[final_lemma.index(line)].get('<unk>', 0) + 1
    
    data_matrix = np.array([[data_dict[index][key] for key in vocab.keys()] for index, value in enumerate(final_lemma)])
    lem_matrix = np.array([[binary_lem[index][key] for key in vocab.keys()] for index, value in enumerate(final_lemma)])
    count_matrix = np.array([[count_lem[index][key] for key in vocab.keys()] for index, value in enumerate(final_lemma)])
    
    if tfidf is True: 
        sparse_matrix = sparse.csr_matrix(data_matrix)
        print(("The shape of the " + data_type + " matrix is: "), data_matrix.shape)
        print()
        
    if count is True:
        sparse_matrix = sparse.csr_matrix(count_matrix)
        print(("The shape of the " + data_type + " matrix is: "), count_matrix.shape)
        print()
        
    sparse.save_npz('sst2/' + data_type + '_' + feature_name + '_features.npz', sparse_matrix)
    

In [4]:
lem_features(lem_vocab, Path('sst2/'), 'sst2.train', 'train', WordPunctTokenizer(), 'lemmatization_count', False, True)
lem_features(lem_vocab, Path('sst2/'), 'sst2.dev', 'dev', WordPunctTokenizer(), 'lemmatization_count', False, True)
lem_features(lem_vocab, Path('sst2/'), 'sst2.test', 'test', WordPunctTokenizer(), 'lemmatization_count', False, True)

The shape of the train matrix is:  (6920, 4480)

The shape of the dev matrix is:  (872, 4480)

The shape of the test matrix is:  (1821, 4480)



In [5]:
train_lemma = sparse.load_npz('sst2/train_lemmatization_count_features.npz')
dev_lemma = sparse.load_npz('sst2/dev_lemmatization_count_features.npz')
test_lemma = sparse.load_npz('sst2/test_lemmatization_count_features.npz')

train_labels = np.load('sst2/train_labels.npz')
dev_labels = np.load('sst2/dev_labels.npz')
test_labels = np.load('sst2/test_labels.npz')

In [6]:
def print_important_weights(weights, words):
    """
    Print important pairs of weights and words.
    # Parameters
    weights : `Iterable`, required.
        Weights from a learned model.
    words : `Iterable`, required.
        Word types of the vocabulary.  
        It must be true that `len(weights) == len(words)`.
    # Returns
        `None`
    """

    def print_pairs(pairs):
        for weight, word in pairs:
            print("{: .4f} | {}".format(weight, word))

    assert len(weights) == len(words)
    pairs = list(zip(weights, words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=True)
    print("Most positive words:")
    print_pairs(pairs[:10])
    print("\nMost negative words:")
    print_pairs(reversed(pairs[-10:]))

    pairs = list(zip(abs(weights), words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=False)
    print("\nMost neutral words:")
    print_pairs(pairs[:10])


 # Logistic regression with scikit-learn

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def fit_and_eval_logistic_regression(data_dir: Path,
                                     train_X, train_Y,
                                     test_X, test_Y,
                                     feature_name: str) -> LogisticRegression:
    '''
    Fits and evaluates the logistic regression model using the scikit-learn library
    
    Inputs:
        data_dir (path): the data directory
        trn_data (file): training data 
        tst_data (file): testing or dev data
        
    Output:
        model_trained (LogisticRegression): object of LogisticRegression after it is trained
    '''
    
    model = LogisticRegression()
    model.fit(train_X, train_Y['arr_0'])
    y_pred = model.predict(test_X)
    
    print("The accuracy score is: ", accuracy_score(test_Y['arr_0'], y_pred))
    print("The f1 score is: ", f1_score(test_Y['arr_0'], y_pred, average = 'weighted'))
    
    return model


### dev model

In [8]:
print("These are the scores for dev")
print()

fit_and_eval_logistic_regression(feature_name = 'lemmatization',
                                 train_X = train_lemma, train_Y = train_labels,
                                 test_X = dev_lemma, test_Y = dev_labels,
                                 data_dir = Path('sst2/'))

These are the scores for dev

The accuracy score is:  0.7912844036697247
The f1 score is:  0.7912316621949108


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### test model

In [9]:
print("These are the scores for test")
print()

fit_and_eval_logistic_regression(feature_name = 'lemmatization',
                                 train_X = train_lemma, train_Y = train_labels,
                                 test_X = test_lemma, test_Y = test_labels,
                                 data_dir = Path('sst2/'))

These are the scores for test

The accuracy score is:  0.8056013179571664
The f1 score is:  0.8055989729904106


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

## Weights Analysis

In [10]:
model_trained: LogisticRegression = fit_and_eval_logistic_regression(
    feature_name='lemmatization', train_X = train_lemma, train_Y = train_labels, test_X = test_lemma, test_Y = test_labels,data_dir=Path('data'))
weights = model_trained.coef_[0]
vocab = json.load(open(data_dir.joinpath('lemmatization_vocab.json')))
print_important_weights(weights=weights, words=vocab.keys())


The accuracy score is:  0.8056013179571664
The f1 score is:  0.8055989729904106
Most positive words:
 1.9600 | powerful
 1.9344 | remarkable
 1.8574 | solid
 1.8098 | fun
 1.7499 | refresh
 1.7000 | beautifully
 1.6686 | definitely
 1.6582 | terrific
 1.6015 | enjoyable
 1.5293 | hilarious

Most negative words:
-2.0013 | stupid
-1.9124 | lack
-1.8806 | suffer
-1.8669 | dull
-1.7895 | depress
-1.7815 | bland
-1.7693 | failure
-1.7560 | lousy
-1.7188 | unfortunately
-1.7115 | none

Most neutral words:
 0.0000 | <pad>
 0.0001 | ''
 0.0002 | gibson
 0.0002 | light
 0.0004 | possibilities
 0.0006 | funnier
 0.0008 | controversy
 0.0009 | convince
 0.0009 | mend
 0.0010 | comment


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Error Analysis

In [11]:
def test_errors(data_dir: Path, train_X, train_Y, test_X, test_Y, feature_name: str) -> LogisticRegression:
    '''
    Returns predicted label output for given training and testing data
    '''
    
    model = LogisticRegression()
    model.fit(train_X, train_Y['arr_0'])
    y_pred = model.predict(test_X)
    
    return y_pred


In [12]:
test_model = test_errors(feature_name = 'unigram_binary',
                         train_X = train_lemma, train_Y = train_labels,
                         test_X = test_lemma, test_Y = test_labels,
                         data_dir = Path('sst2/'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

def error_frame(pred_model, labels):
    '''
    Generates a pandas dataframe of mislabeled reviews
    '''
    
    label_column = pd.DataFrame(labels)
    label_column.rename(columns = {0: "True"}, inplace = True)
    model_column = pd.DataFrame(pred_model)
    model_column.rename(columns = {0: "Predicted"}, inplace = True)
    
    data_test = open(data_dir.joinpath('sst2.test')).readlines()
    tokens = []
    for line in data_test:
        lower_line = line.lower()
        tokens.append(tokenizer.tokenize(lower_line))
    
    full_frame = label_column.join(model_column)
    full_frame["Match"] = (full_frame["True"] == full_frame["Predicted"]).astype(int)
    full_frame["Line"] = tokens
    full_frame.index += 1
    
    print("The number of errors found: ", len(full_frame.loc[full_frame['Match'] == 0]))
    return full_frame.loc[full_frame['Match'] == 0]
     

  pd.set_option('display.max_colwidth', -1)


In [14]:
error_frame(test_model, test_labels['arr_0'])

The number of errors found:  354


Unnamed: 0,True,Predicted,Match,Line
8,1,0,0,"[1, the, movie, exists, for, its, soccer, action, and, its, fine, acting, .]"
11,1,0,0,"[1, jason, x, has, cheesy, effects, and, a, hoary, plot, ,, but, its, macabre, ,, self, -, deprecating, sense, of, humor, makes, up, for, a, lot, .]"
14,1,0,0,"[1, though, the, violence, is, far, less, sadistic, than, usual, ,, the, film, is, typical, miike, :, fast, ,, furious, and, full, of, off, -, the, -, cuff, imaginative, flourishes, .]"
17,1,0,0,"[1, if, your, senses, have, n, ', t, been, dulled, by, slasher, films, and, gorefests, ,, if, you, ', re, a, connoisseur, of, psychological, horror, ,, this, is, your, ticket, .]"
19,0,1,0,"[0, as, conceived, by, mr, ., schaeffer, ,, christopher, and, grace, are, little, more, than, collections, of, quirky, traits, lifted, from, a, screenwriter, ', s, outline, and, thrown, at, actors, charged, with, the, impossible, task, of, making, them, jell, .]"
20,0,1,0,"[0, those, who, managed, to, avoid, the, deconstructionist, theorizing, of, french, philosopher, jacques, derrida, in, college, can, now, take, an, 85, -, minute, brush, -, up, course, with, the, documentary, derrida, .]"
22,0,1,0,"[0, but, what, saves, lives, on, the, freeway, does, not, necessarily, make, for, persuasive, viewing, .]"
23,1,0,0,"[1, steve, irwin, ', s, method, is, ernest, hemmingway, at, accelerated, speed, and, volume, .]"
29,0,1,0,"[0, the, premise, for, this, kegger, comedy, probably, sounded, brilliant, four, six, -, packs, and, a, pitcher, of, margaritas, in, ,, but, the, film, must, have, been, written, ..., in, the, thrall, of, a, vicious, hangover, .]"
44,1,0,0,"[1, much, monkeyfun, for, all, .]"


In [15]:
def print_weight(weights, words):
    """
    Prints all word-weight pairs for given data and vocabulary
    """

    def print_pairs(pairs):
        for weight, word in pairs:
            print("{: .4f} | {}".format(weight, word))

    assert len(weights) == len(words)
    pairs = list(zip(weights, words))
    pairs = sorted(pairs, key=lambda x: x[0], reverse=True)
    print_pairs(pairs)


In [16]:
print_weight(weights = weights, words=vocab.keys())

 1.9600 | powerful
 1.9344 | remarkable
 1.8574 | solid
 1.8098 | fun
 1.7499 | refresh
 1.7000 | beautifully
 1.6686 | definitely
 1.6582 | terrific
 1.6015 | enjoyable
 1.5293 | hilarious
 1.5090 | brilliant
 1.4772 | manage
 1.4452 | smarter
 1.4382 | entertain
 1.4280 | always
 1.4255 | heart
 1.4229 | unexpected
 1.4222 | wonderful
 1.3925 | delight
 1.3703 | human
 1.3584 | reward
 1.3554 | fast
 1.3496 | summer
 1.3440 | best
 1.3432 | somewhat
 1.3346 | embrace
 1.3302 | worth
 1.3296 | resist
 1.3290 | fascinate
 1.3263 | rare
 1.3141 | assure
 1.2975 | spirit
 1.2797 | portrait
 1.2775 | please
 1.2760 | genre
 1.2696 | smart
 1.2573 | impressive
 1.2531 | engross
 1.2527 | deeply
 1.2492 | inventive
 1.2415 | enjoy
 1.2385 | explore
 1.2378 | masterpiece
 1.2357 | refreshingly
 1.2356 | cinema
 1.2197 | bring
 1.1992 | bowl
 1.1985 | capture
 1.1889 | perfectly
 1.1862 | gem
 1.1845 | foster
 1.1820 | love
 1.1765 | lie
 1.1685 | intoxicate
 1.1529 | horrify
 1.1478 | funny


 0.4250 | community
 0.4247 | duke
 0.4240 | splendid
 0.4239 | napoleon
 0.4237 | locations
 0.4236 | interaction
 0.4235 | discovery
 0.4233 | deftly
 0.4231 | haynes
 0.4229 | believer
 0.4226 | see
 0.4223 | hair
 0.4221 | visit
 0.4221 | modern
 0.4210 | study
 0.4209 | discipline
 0.4199 | richer
 0.4196 | escapist
 0.4193 | simple
 0.4192 | grand
 0.4187 | visually
 0.4184 | austin
 0.4182 | recommend
 0.4173 | reno
 0.4170 | chronicle
 0.4167 | range
 0.4161 | yarn
 0.4146 | whale
 0.4131 | pokemon
 0.4131 | heartbreak
 0.4128 | franklin
 0.4125 | dragon
 0.4117 | easy
 0.4112 | clooney
 0.4107 | wash
 0.4095 | radiant
 0.4087 | glorious
 0.4074 | spectacle
 0.4073 | standards
 0.4068 | sumptuous
 0.4067 | 50s
 0.4066 | fill
 0.4060 | within
 0.4055 | raw
 0.4055 | enthusiasm
 0.4054 | intense
 0.4042 | with
 0.4040 | 1970s
 0.4038 | prism
 0.4032 | compellingly
 0.4031 | since
 0.4029 | flavor
 0.4029 | scale
 0.4027 | hugh
 0.4021 | morph
 0.4017 | canon
 0.4014 | creations
 

 0.1161 | brown
 0.1155 | retro
 0.1154 | exploitative
 0.1153 | irwin
 0.1146 | directly
 0.1142 | esteem
 0.1140 | insurance
 0.1133 | strike
 0.1129 | 90
 0.1129 | jaglom
 0.1129 | bartleby
 0.1123 | er
 0.1122 | rollick
 0.1119 | unsentimental
 0.1113 | mass
 0.1110 | pellington
 0.1108 | photograph
 0.1105 | mann
 0.1104 | paranoia
 0.1103 | crowd
 0.1103 | conversations
 0.1098 | enigmatic
 0.1090 | deception
 0.1087 | original
 0.1086 | cuban
 0.1081 | startle
 0.1081 | embarrass
 0.1080 | 1
 0.1073 | aggressive
 0.1068 | stitch
 0.1067 | resonance
 0.1066 | sing
 0.1062 | incessant
 0.1062 | \/
 0.1059 | two
 0.1059 | size
 0.1057 | most
 0.1055 | translate
 0.1055 | floor
 0.1054 | 15
 0.1049 | naturally
 0.1042 | a
 0.1041 | sword
 0.1040 | excitement
 0.1040 | spider
 0.1039 | grind
 0.1039 | high
 0.1037 | animals
 0.1036 | seat
 0.1035 | buoyant
 0.1031 | insomnia
 0.1029 | sun
 0.1029 | ichi
 0.1025 | pit
 0.1023 | clear
 0.1021 | spend
 0.1012 | stage
 0.1000 | extremely

-0.0190 | celebrity
-0.0190 | cruel
-0.0194 | auteur
-0.0194 | cleverly
-0.0195 | gore
-0.0198 | friendship
-0.0206 | imaginative
-0.0210 | unabashedly
-0.0215 | raymond
-0.0217 | dot
-0.0231 | thirty
-0.0232 | predecessors
-0.0241 | cheesy
-0.0248 | monty
-0.0249 | cartoon
-0.0252 | photo
-0.0252 | legged
-0.0253 | brit
-0.0253 | terminally
-0.0253 | arty
-0.0254 | cumulative
-0.0257 | zombie
-0.0258 | question
-0.0258 | edgy
-0.0259 | consequences
-0.0261 | hero
-0.0262 | danny
-0.0263 | ensue
-0.0266 | far
-0.0266 | presentation
-0.0275 | program
-0.0278 | enlighten
-0.0280 | rocket
-0.0283 | counter
-0.0288 | sundance
-0.0292 | boldly
-0.0295 | cyber
-0.0298 | derivative
-0.0302 | leather
-0.0303 | guide
-0.0303 | flamboyant
-0.0308 | add
-0.0310 | line
-0.0323 | belly
-0.0323 | reject
-0.0325 | adage
-0.0326 | cover
-0.0327 | plague
-0.0328 | palestinian
-0.0329 | amy
-0.0329 | issue
-0.0331 | problems
-0.0335 | degrade
-0.0341 | ...
-0.0343 | fully
-0.0349 | dean
-0.0350 | seal
-

-0.3029 | fiennes
-0.3031 | theaters
-0.3033 | choices
-0.3040 | humble
-0.3043 | alone
-0.3047 | park
-0.3047 | observe
-0.3048 | marine
-0.3049 | friend
-0.3057 | painful
-0.3061 | material
-0.3071 | control
-0.3073 | droll
-0.3074 | iwai
-0.3075 | initial
-0.3089 | strictly
-0.3091 | regret
-0.3095 | skills
-0.3096 | stab
-0.3097 | farrelly
-0.3101 | inexplicably
-0.3104 | flimsy
-0.3105 | territory
-0.3107 | stories
-0.3108 | d
-0.3116 | tunney
-0.3118 | sermon
-0.3119 | involve
-0.3119 | expose
-0.3121 | graceless
-0.3121 | soundtrack
-0.3122 | project
-0.3123 | loss
-0.3124 | belt
-0.3129 | anyone
-0.3139 | station
-0.3140 | distant
-0.3143 | procession
-0.3143 | ruin
-0.3146 | shift
-0.3147 | continuity
-0.3149 | very
-0.3151 | cooper
-0.3164 | prep
-0.3172 | note
-0.3173 | come
-0.3174 | fake
-0.3175 | tart
-0.3176 | ringu
-0.3178 | sentence
-0.3182 | tailor
-0.3186 | outtakes
-0.3188 | substance
-0.3189 | australian
-0.3190 | generate
-0.3191 | r
-0.3192 | innovation
-0.3192 |

-0.4930 | sever
-0.4938 | random
-0.4943 | stock
-0.4947 | schneider
-0.4947 | truck
-0.4949 | mechanical
-0.4949 | painfully
-0.4953 | daughters
-0.4955 | weird
-0.4956 | whiny
-0.4964 | freak
-0.4967 | halloween
-0.4969 | embarrassment
-0.4978 | speed
-0.4984 | shake
-0.4984 | not
-0.4986 | inoffensive
-0.4987 | same
-0.4987 | rote
-0.4990 | windtalkers
-0.5001 | whodunit
-0.5004 | joyless
-0.5012 | melodrama
-0.5022 | vast
-0.5027 | tend
-0.5032 | radical
-0.5032 | pass
-0.5033 | budget
-0.5033 | veer
-0.5034 | fewer
-0.5034 | ballistic
-0.5036 | self
-0.5036 | hot
-0.5039 | heritage
-0.5051 | ineptly
-0.5060 | gooding
-0.5062 | gag
-0.5065 | mediocrity
-0.5078 | party
-0.5079 | hawke
-0.5093 | scary
-0.5106 | palma
-0.5112 | mire
-0.5112 | report
-0.5118 | string
-0.5130 | desire
-0.5131 | sham
-0.5132 | third
-0.5134 | vague
-0.5137 | personality
-0.5145 | taylor
-0.5146 | card
-0.5149 | tragedies
-0.5152 | drain
-0.5156 | teenagers
-0.5158 | payoff
-0.5158 | support
-0.5160 | tob