In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np

torch.manual_seed(1)

from tqdm.notebook import tqdm
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

## Load data
source is https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [2]:
data_path = '/Users/tyler/Documents/programming/pytorch_nlp/data/word2vec-nlp-tutorial/'

In [3]:
df = pd.read_csv(data_path+'labeledTrainData.tsv',sep='\t')

In [4]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
train = df[:15000]
val = df[15000:20000]
test = df[20000:]

In [6]:
len(train),len(val),len(test)

(15000, 5000, 5000)

In [7]:
def process_review(review):
    chars = ['/','\\','>','<','-','br']
    chars.extend('1 2 3 4 5 6 7 8 9 0'.split())
    for char in chars:
        review = review.replace(char,'')
    
    tokens = word_tokenize(review)
    tokens = [t.lower() for t in tokens]
    return tokens

## Make vocab and train data

In [8]:
labels = list(train.sentiment)
reviews = list(train.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

train_data = list(zip(all_words,labels))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))




In [11]:
flat_list = [item for sublist in all_words for item in sublist]
vocab = set(flat_list)

len(vocab)

word_to_idx = {word:idx for idx,word in enumerate(list(vocab))}

counts = Counter(flat_list)

In [12]:
idx_to_word = {idx:word for word,idx in word_to_idx.items()}

## Make validation and test data

In [14]:
labels = list(test.sentiment)
reviews = list(test.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

test_data = list(zip(all_words,labels))

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [15]:
labels = list(val.sentiment)
reviews = list(val.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

val_data = list(zip(all_words,labels))

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




## Set up model

In [51]:
x_train_split,y_train = list(zip(*train_data))
x_train = [''.join(f'{w} ' for w in s) for s in x_train_split]

x_val_split,y_val = list(zip(*val_data))
x_val = [''.join(f'{w} ' for w in s) for s in x_val_split]

x_test_split,y_test = list(zip(*test_data))
x_test = [''.join(f'{w} ' for w in s) for s in x_test_split]

In [30]:
vectorizer = TfidfVectorizer()

In [31]:
vectorizer.fit(x_train,y_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [35]:
x_train_vec = vectorizer.transform(x_train)

In [37]:
clf = LogisticRegression(random_state=1,max_iter=10000,solver='lbfgs')

In [38]:
clf.fit(x_train_vec,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## Evaluate model

In [54]:
x_list = [x_train, x_val, x_test]
y_list = [y_train, y_val, y_test]
names = 'train val test'.split()

for x,y,name in zip(x_list,y_list,names):
    x_vec = vectorizer.transform(x)
    preds = clf.predict(x_vec)

    num_correct = (preds == np.array(y)).sum()
    total = len(y)
    acc = round(100 * num_correct / total,2)
    print(f'{name} acc: {acc}%')

train acc: 93.55%
val acc: 88.76%
test acc: 87.5%


## Weight analysis - vectorizer

In [141]:
vocab = vectorizer.get_feature_names()
tfidf_matrix = x_train_vec

In [142]:
examples = ['i hated this movie',
           ' i thought this movie was fantastic']

test_matrix = vectorizer.transform(examples)

In [143]:
doc = 1
feature_index = tfidf_matrix[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index])

In [144]:
words = np.array([vocab[i] for i in feature_index])
weights = np.array([tfidf_matrix[doc, x] for x in feature_index])

In [145]:
sorted_idx = weights.argsort()

In [146]:
words[sorted_idx[-5:]]

array(['this', 'was', 'movie', 'thought', 'fantastic'], dtype='<U9')

In [147]:
weights[sorted_idx[-5:]]

array([0.18197432, 0.23675945, 0.2483629 , 0.53003086, 0.75379936])

## Weight analysis - logistic clf

In [148]:
coef = clf.coef_.flatten()
sorted_coef = coef.argsort()

In [149]:
vocab_array = np.array(vocab)

In [150]:
print('most negative'.upper())
for i in range(5):
    word = vocab_array[sorted_coef[i]]
    weight = coef[sorted_coef[i]]
    print(word,weight)

MOST NEGATIVE
bad -7.609790539797
worst -7.064196414382661
awful -5.06940912023693
waste -4.933565925781307
poor -4.620746014170679


In [151]:
print('most positive'.upper())
for i in range(-5,0):
    word = vocab_array[sorted_coef[i]]
    weight = coef[sorted_coef[i]]
    print(word,weight)

MOST POSITIVE
fun 3.5806254154522956
wonderful 3.685302706533393
best 4.682305922778914
excellent 4.806357642775008
great 6.84484136146761
