In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import string
import nltk
import multiprocessing as mp
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore")

# W2V embedding

<img src="https://shestakoff.github.io/hse_se_ml/2020/l09-nlp-w2v/img/w2v_window.png" width="480">

Ideally, we would like to have following probabilities (multi-class classification problem):

$$p_{\theta}(v|w) = \frac{\exp(in_w^T out_v)}{\sum_{v'}\exp(in_w^T out_{v'})}$$

However, it's hard to compute the sum in the denominator. So, instead of multi-class problem, we solve binary classification problem (is there a context or not). We minimize

$$L(w,v) = - \log(\sigma(in_w^T out_v)) - \hat{\sum_{v'}}\log(\sigma(-in_w^T out_{v'}))$$

As on the previous seminar, we are going to use a <i>sentiment analysis</i> problem for demonstration, but with slightly different dataset

In [3]:
raw_corpus, sentiment = pd.read_csv('IMDB Dataset.csv').values.T
sentiment = (sentiment=='positive').astype(int)

In [4]:
raw_corpus[42], sentiment[42]

('Of all the films I have seen, this one, The Rage, has got to be one of the worst yet. The direction, LOGIC, continuity, changes in plot-script and dialog made me cry out in pain. "How could ANYONE come up with something so crappy"? Gary Busey is know for his "B" movies, but this is a sure "W" movie. (W=waste).<br /><br />Take for example: about two dozen FBI & local law officers surround a trailer house with a jeep wagoneer. Inside the jeep is MA and is "confused" as to why all the cops are about. Within seconds a huge gun battle ensues, MA being killed straight off. The cops blast away at the jeep with gary and company blasting away at them. The cops fall like dominoes and the jeep with Gary drives around in circles and are not hit by one single bullet/pellet. MA is killed and gary seems to not to have noticed-damn that guy is tough. Truly a miracle, not since the six-shooter held 300 bullets has there been such a miracle.',
 0)

**we would like to drop stop-words**

In [5]:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/matyushinleonid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

**preprocess: do lowercase, drop stop-words, do stemmatization**

In [6]:
from nltk.stem.snowball import SnowballStemmer 
stemmer = SnowballStemmer("english")

In [7]:
word_tokenizer = nltk.WordPunctTokenizer()
def lowercase_dropstop_stemmatize(doc):
    doc = [word.lower() for word in word_tokenizer.tokenize(doc) if 
           (word.lower() not in string.punctuation and word.lower() not in stop_words)
          ]
    text_stemmed = list(map(stemmer.stem, doc))
    return ' '.join(text_stemmed)

In [8]:
%%time
with mp.Pool() as pool:
    corpus = pool.map(lowercase_dropstop_stemmatize, raw_corpus)

CPU times: user 105 ms, sys: 250 ms, total: 355 ms
Wall time: 6.11 s


In [9]:
%%time
path = get_tmpfile("word2vec.model")
model = Word2Vec([doc.split(' ') for doc in corpus], size=300, window=5, min_count=2, iter=10)
model.save("word2vec.model")

CPU times: user 3min 28s, sys: 1.76 s, total: 3min 29s
Wall time: 1min 20s


**let us compute most relevant contexts for the word "VADER"**

vector representation (first 50 components):

In [10]:
model.wv['vader'][:50]

array([-0.5729108 , -0.22881374,  0.50018376, -0.20409894, -0.55492777,
       -0.35430026,  0.07701688,  0.25573292,  0.7422419 ,  0.20204526,
       -0.32518914,  0.07587666,  0.821006  ,  0.4427495 ,  0.49637473,
        0.2580907 , -0.02924393, -0.17726961, -0.15624647,  0.0852446 ,
       -0.6651756 ,  0.08680069, -0.6850511 ,  0.09538114, -0.16225617,
       -0.0288447 ,  0.17406179, -0.1850053 , -0.26299593,  0.11451057,
        0.3836558 , -0.08444699, -0.05174693, -0.14174159,  0.38017547,
        0.14349411, -0.3878912 ,  0.30732694,  0.00203037, -0.24798863,
        0.34370682, -0.0265651 ,  0.6500566 , -0.08291614, -0.2405748 ,
       -0.12190673,  0.7439231 ,  0.14104372,  0.3179491 ,  0.50621796],
      dtype=float32)

most relevant vectors wrt cosine distance:

In [11]:
model.most_similar('vader')

[('darth', 0.8780304193496704),
 ('anakin', 0.7383174300193787),
 ('vadar', 0.7381731271743774),
 ('skywalk', 0.7120633125305176),
 ('leia', 0.630388617515564),
 ('palpatin', 0.6245428919792175),
 ('sith', 0.578973114490509),
 ('obi', 0.5750287771224976),
 ('jedi', 0.5672026872634888),
 ('annakin', 0.5638409852981567)]

# Train a model on the embeddings

In [12]:
def corpus_and_model_to_X(corpus, model):
    X = []
    for doc in corpus:
        x = []
        for word in doc.split(' '):
            if word in model.wv.vocab.keys():
                x.append(model.wv[word])
        X.append(np.mean(x, axis=0))
    X = np.stack(X)
    return X

In [13]:
%%time
X = corpus_and_model_to_X(corpus, model)

CPU times: user 19.6 s, sys: 220 ms, total: 19.8 s
Wall time: 17.2 s


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, sentiment, test_size=0.2, random_state=42)

In [15]:
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

0.9476257735520629

# Pre-Trained W2V

In [17]:
import gensim.downloader as api

Pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.

In [18]:
model = api.load('word2vec-google-news-300')

In [19]:
model.most_similar("dollar")

[('greenback', 0.746670663356781),
 ('currency', 0.6809227466583252),
 ('Dollar', 0.6606971025466919),
 ('Japanese_Yen', 0.6504158973693848),
 ('loonie', 0.6266234517097473),
 ('Swiss_franc', 0.6245606541633606),
 ('rupee', 0.620018482208252),
 ('yen.The_Nikkei', 0.6100620031356812),
 ('euro', 0.6061668395996094),
 ('USdollar', 0.6032359600067139)]

In [20]:
model.most_similar("data")

[('Data', 0.7262316942214966),
 ('datasets', 0.603030264377594),
 ('dataset', 0.5796632170677185),
 ('databases', 0.5450121164321899),
 ('statistics', 0.537885844707489),
 ('information', 0.5368290543556213),
 ('database', 0.5325667262077332),
 ('Data_System_IPEDS', 0.5222617983818054),
 ('data.The', 0.5189103484153748),
 ('OpenSpirit_enabled', 0.5174090266227722)]

In [21]:
model.similar_by_vector(model.wv['Merkel'] - model.wv['Germany'] + model['Russia'])

[('Merkel', 0.7865056991577148),
 ('Putin', 0.7305814027786255),
 ('Medvedev', 0.7112522125244141),
 ('Tymoshenko', 0.6713743209838867),
 ('Lavrov', 0.6601910591125488),
 ('Yanukovych', 0.6371150016784668),
 ('Ms_Tymoshenko', 0.6273255348205566),
 ('Yanukovich', 0.6268482208251953),
 ('Kudrin', 0.625721275806427),
 ('Russia', 0.6246747374534607)]

In [22]:
model.similar_by_vector(model.wv['king'] - model.wv['man'] + model['woman'])

[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.6454660892486572),
 ('princess', 0.6156250834465027),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.577711820602417),
 ('kings', 0.5613664388656616),
 ('sultan', 0.5376776456832886),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]

In [23]:
model.similar_by_vector(model.wv['apples'] - model.wv['apple'] + model.wv['cat'])

[('cat', 0.7650878429412842),
 ('cats', 0.7223140001296997),
 ('dogs', 0.6266546249389648),
 ('puppies', 0.6123023629188538),
 ('felines', 0.5935230851173401),
 ('dog', 0.5880736708641052),
 ('kitties', 0.5775256752967834),
 ('kittens', 0.5743311047554016),
 ('chihuahuas', 0.5673359632492065),
 ('pup', 0.5663490295410156)]

In [24]:
%%time
X = corpus_and_model_to_X(corpus, model)
X_train, X_test, y_train, y_test = train_test_split(X, sentiment, test_size=0.2, random_state=42)

CPU times: user 41.4 s, sys: 179 ms, total: 41.6 s
Wall time: 39 s


In [25]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

0.8948508827277051