In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import string
import nltk
import multiprocessing as mp
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

In [None]:
import warnings
warnings.filterwarnings("ignore")

# W2V embedding

<img src="https://shestakoff.github.io/hse_se_ml/2020/l09-nlp-w2v/img/w2v_window.png" width="480">

Ideally, we would like to model the following probabilities (a multi-class classification problem):

$$p_{\theta}(v|w) = \frac{\exp(in_w^T out_v)}{\sum_{v'}\exp(in_w^T out_{v'})}$$

However, the sum in the denominator is expensive to compute. So, instead of a multi-class problem, we solve a binary classification problem (is the word a true context or not). We minimize

$$L(w,v) = - \log(\sigma(in_w^T out_v)) - \hat{\sum_{v'}}\log(\sigma(-in_w^T out_{v'}))$$

As in the previous seminar, we are going to use a <i>sentiment analysis</i> problem for demonstration, but with a slightly different dataset

In [None]:
# Download the IMDB reviews CSV; -N turns on wget's timestamping, so an up-to-date local copy is not fetched again.
! wget -N https://raw.githubusercontent.com/matyushinleonid/hse_se_ml/master/2020/s09-word2vec/IMDB%20Dataset.csv

In [None]:
# Read the CSV and transpose its value matrix so that each of the two columns
# unpacks into its own array (texts first, labels second).
raw_corpus, sentiment = pd.read_csv('IMDB Dataset.csv').values.transpose()
# Binary target: 1 where the label equals 'positive', 0 otherwise.
sentiment = np.asarray(sentiment == 'positive', dtype=int)

In [None]:
# Show one (raw text, encoded label) pair as a sanity check of the load.
raw_corpus[42], sentiment[42]

**we would like to drop stop-words**

In [None]:
# Fetch the NLTK stop-word corpus first, then load its English word list.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
# Display the first ten entries.
stop_words[:10]

**preprocess: lowercase, drop stop-words, apply stemming**

In [None]:
from nltk.stem.snowball import SnowballStemmer 
# English Snowball stemmer; used by the preprocessing function defined below.
stemmer = SnowballStemmer("english")

In [None]:
word_tokenizer = nltk.WordPunctTokenizer()
def lowercase_dropstop_stemmatize(doc):
    """Normalise one raw document into a space-joined string of stems.

    The text is split with ``word_tokenizer``; every token is lowercased,
    punctuation tokens and stop-words (module-level ``stop_words``) are
    discarded, and the remaining tokens are passed through ``stemmer``.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces (an empty string if no token
        survives the filtering).
    """
    # Build the lookup sets per call so the function always honours the current
    # module-level ``stop_words``; set membership avoids scanning a list for
    # every token.
    stop_set = set(stop_words)
    punctuation_chars = set(string.punctuation)

    kept = []
    for token in word_tokenizer.tokenize(doc):
        lowered = token.lower()
        # The tokenizer emits runs of punctuation such as "..." or ")." as one
        # token. A substring test against string.punctuation lets those through,
        # so check every character of the token instead.
        if all(ch in punctuation_chars for ch in lowered):
            continue
        if lowered in stop_set:
            continue
        kept.append(stemmer.stem(lowered))
    return ' '.join(kept)

In [None]:
%%time
# Preprocess all documents in parallel; mp.Pool() without arguments starts one
# worker per available CPU, and pool.map returns results in input order.
with mp.Pool() as pool:
    corpus = pool.map(lowercase_dropstop_stemmatize, raw_corpus)

In [None]:
%%time
path = get_tmpfile("word2vec.model")
model = Word2Vec([doc.split(' ') for doc in corpus], size=300, window=5, min_count=2, iter=10)
model.save("word2vec.model")

**let us find the words closest to the word "VADER" in the embedding space**

vector representation (first 50 components):

In [None]:
# First 50 of the 300 components of the learned vector for the token "vader".
model.wv['vader'][:50]

most similar words by cosine similarity:

In [None]:
# Query through `model.wv`: calling `most_similar` on the Word2Vec model itself
# is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar('vader')

# Train a model on the embeddings

In [None]:
def corpus_and_model_to_X(corpus, model):
    """Embed every document as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose tokens are separated by single spaces.
    model : object
        Either a trained model exposing its vectors as ``model.wv`` or the
        word-vector container itself. The container must support
        ``word in vectors``, ``vectors[word]`` and ``vectors.vector_size``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_documents, vector_size)``. A document without any
        in-vocabulary token gets an all-zero row.
    """
    # Accept both a full model (vectors live under `.wv`) and bare word vectors.
    vectors = getattr(model, 'wv', model)
    dim = vectors.vector_size

    rows = []
    for doc in corpus:
        known = [vectors[word] for word in doc.split(' ') if word in vectors]
        if known:
            rows.append(np.mean(known, axis=0))
        else:
            # Averaging an empty list would give a scalar NaN, which breaks
            # np.stack and any downstream estimator; use a zero vector instead.
            # float32 keeps a zero row from upcasting float32 word vectors.
            rows.append(np.zeros(dim, dtype=np.float32))

    if not rows:
        # Empty corpus: np.stack([]) would raise, so return an empty matrix.
        return np.empty((0, dim), dtype=np.float32)
    return np.stack(rows)

In [None]:
%%time
# Build one feature row per document by averaging the vectors of the self-trained model.
X = corpus_and_model_to_X(corpus, model)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, sentiment, test_size=0.2, random_state=42)

In [None]:
# Linear baseline on the averaged word vectors, with default hyper-parameters.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the held-out split, scored with the predicted probability of class 1.
roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

# Pre-Trained W2V

In [None]:
import gensim.downloader as api

Pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.

In [None]:
# NOTE: this rebinds `model`. From here on it refers to the pretrained vectors
# fetched through gensim's downloader, no longer to the Word2Vec model trained
# earlier in the notebook, so earlier cells must not be re-run after this one.
model = api.load('word2vec-google-news-300')

In [None]:
# Nearest neighbours of "dollar" among the pretrained vectors.
model.most_similar("dollar")

In [None]:
# Nearest neighbours of "data" among the pretrained vectors.
model.most_similar("data")

In [None]:
# Analogy query: Merkel - Germany + Russia.
# Index the loaded vectors directly and consistently; `.wv` on an already
# loaded word-vector object is only a deprecated alias of the object itself.
model.similar_by_vector(model['Merkel'] - model['Germany'] + model['Russia'])

In [None]:
# Analogy query: king - man + woman.
# Index the loaded vectors directly and consistently; `.wv` on an already
# loaded word-vector object is only a deprecated alias of the object itself.
model.similar_by_vector(model['king'] - model['man'] + model['woman'])

In [None]:
# Analogy query: apples - apple + cat (plural direction applied to "cat").
# Index the loaded vectors directly; `.wv` on an already loaded word-vector
# object is only a deprecated alias of the object itself.
model.similar_by_vector(model['apples'] - model['apple'] + model['cat'])

In [None]:
%%time
# Re-embed the corpus with the pretrained vectors and redo the same 80/20 split (same seed).
# NOTE(review): `corpus` holds lowercased, stemmed tokens, while the pretrained
# vectors are queried above by surface forms such as 'Merkel' and 'apples' --
# presumably many stems are out of vocabulary here; confirm the coverage.
X = corpus_and_model_to_X(corpus, model)
X_train, X_test, y_train, y_test = train_test_split(X, sentiment, test_size=0.2, random_state=42)

In [None]:
# Same default logistic-regression baseline, now fitted on the pretrained-vector features.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# ROC AUC on the held-out split for the pretrained-vector features.
roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])