# Intermediate Natural Language Processing (NLP)
## Real World Applications of Word Embeddings

### Notebook Organization:
- Loading and comparing pretrained word embeddings
- Applying word embeddings to a problem
- Training your own embeddings


In [None]:
import os

path = '../input/'
# List every directory and file under the input root.
# The loop variables are deliberately NOT named `path`: the original loop
# rebound `path` to the last directory visited, silently clobbering the
# root defined above.
for dirpath, _dirnames, filenames in os.walk(path):
    print(dirpath)
    for filename in filenames:
        print(filename)

### Loading and comparing pretrained word embeddings
We load a few different models to compare how they evaluate similar queries

In [None]:
# Root folder of the pretrained vector files; later cells reuse `path`.
path = '../input/gensim-word-vectors/'
from gensim.models import KeyedVectors

# GloVe vectors trained on Twitter, stored in word2vec text format
# (100 dimensions, per the file name).
GLOVE_TWITTER = f'{path}glove-twitter-100/glove-twitter-100'
twitter_model = KeyedVectors.load_word2vec_format(GLOVE_TWITTER)

In [None]:
# GloVe vectors from the wiki-gigaword corpus (300 dimensions, per the file name).
# Relies on `path` set in the cell that loads the Twitter model.
GLOVE_WIKI = f'{path}glove-wiki-gigaword-300/glove-wiki-gigaword-300'
wiki_model = KeyedVectors.load_word2vec_format(GLOVE_WIKI)

Different corpora emphasize different senses of a word, so the same query can return very different neighbours in each model.

In [None]:
# Words most similar to "arms" according to the Twitter-trained vectors.
twitter_model.most_similar("arms")

In [None]:
# Same query against the wiki-gigaword vectors, for comparison.
wiki_model.most_similar("arms")

In [None]:
# Words most similar to "cloud" according to the wiki-gigaword vectors.
wiki_model.most_similar("cloud")

In [None]:
# Same query against the Twitter-trained vectors, for comparison.
twitter_model.most_similar("cloud")

In [None]:
# Words most similar to "occupy" according to the wiki-gigaword vectors.
wiki_model.most_similar("occupy")

In [None]:
# Same query against the Twitter-trained vectors, for comparison.
twitter_model.most_similar("occupy")

#### Spelling

In [None]:
# "cluod" is a misspelling of "cloud": inspect its neighbours in the wiki-gigaword vectors.
# NOTE(review): presumably raises if the token is not in the model's vocabulary - confirm.
wiki_model.most_similar("cluod")

In [None]:
# Same misspelled query ("cluod") against the Twitter-trained vectors.
twitter_model.most_similar("cluod")

In [None]:
# "foriegn" is a misspelling of "foreign": inspect its neighbours in the Twitter-trained vectors.
twitter_model.most_similar("foriegn")

In [None]:
# Same misspelled query ("foriegn") against the wiki-gigaword vectors.
wiki_model.most_similar("foriegn")

This blogpost describes a strategy for [correcting spelling using word embeddings](https://blog.usejournal.com/a-simple-spell-checker-built-from-word-vectors-9f28452b6f26)

#### Analogies
The classical example: **man::king as woman::?**

In [None]:
# Analogy "man is to king as woman is to ?": add 'woman' and 'king', subtract 'man'.
# POSITIVE_LIST / NEGATIVE_LIST are globals reused by the following cell.
POSITIVE_LIST = ['woman', 'king']
NEGATIVE_LIST = ['man']
twitter_model.most_similar(positive=POSITIVE_LIST, negative=NEGATIVE_LIST)

In [None]:
# Same analogy (lists set in the previous cell) against the wiki-gigaword vectors.
wiki_model.most_similar(positive=POSITIVE_LIST, negative=NEGATIVE_LIST)

The ubiquitous example of bias:
**man::programmer as woman::?**

In [None]:
# Analogy "man is to programmer as woman is to ?".
# POSITIVE_LIST / NEGATIVE_LIST are globals reused by the following cell.
POSITIVE_LIST = ['woman', 'programmer']
NEGATIVE_LIST = ['man']
twitter_model.most_similar(positive=POSITIVE_LIST, negative=NEGATIVE_LIST)

In [None]:
# Same analogy (lists set in the previous cell) against the wiki-gigaword vectors.
wiki_model.most_similar(positive=POSITIVE_LIST, negative=NEGATIVE_LIST)

**man::doctor as woman::?**

In [None]:
# Analogy "man is to doctor as woman is to ?".
# POSITIVE_LIST / NEGATIVE_LIST are globals reused by the following cell.
POSITIVE_LIST = ['woman', 'doctor']
NEGATIVE_LIST = ['man']
twitter_model.most_similar(positive=POSITIVE_LIST, negative=NEGATIVE_LIST)

In [None]:
# Same analogy (lists set in the previous cell) against the wiki-gigaword vectors.
wiki_model.most_similar(positive=POSITIVE_LIST, negative=NEGATIVE_LIST)

And now the reverse:
**woman::doctor as man::?**

In [None]:
# Reversed analogy "woman is to doctor as man is to ?": the gender terms swap sides.
# POSITIVE_LIST / NEGATIVE_LIST are globals reused by the following cell.
POSITIVE_LIST = ['man', 'doctor']
NEGATIVE_LIST = ['woman']
twitter_model.most_similar(positive=POSITIVE_LIST, negative=NEGATIVE_LIST)

In [None]:
# Same reversed analogy (lists set in the previous cell) against the wiki-gigaword vectors.
wiki_model.most_similar(positive=POSITIVE_LIST, negative=NEGATIVE_LIST)

### Dimensionality: a curse or not?
Now we compare two GloVe models trained on the same (Twitter) data: one represented by 100-dimensional vectors and the other by 25-dimensional vectors.

In [None]:
# Smaller GloVe Twitter vectors (25 dimensions, per the file name).
# Relies on `path` still pointing at the gensim-word-vectors folder.
GLOVE_TWITTER_S = f'{path}glove-twitter-25/glove-twitter-25'
twitter_model_s = KeyedVectors.load_word2vec_format(GLOVE_TWITTER_S)

In [None]:
# "arms" in the 100-dimensional Twitter model.
twitter_model.most_similar("arms")

In [None]:
# "arms" in the 25-dimensional Twitter model, for comparison.
twitter_model_s.most_similar("arms")

In [None]:
# "cloud" in the 100-dimensional Twitter model.
twitter_model.most_similar("cloud")

In [None]:
# "cloud" in the 25-dimensional Twitter model, for comparison.
twitter_model_s.most_similar("cloud")

In [None]:
# "man is to king as woman is to ?" in the 100-dimensional Twitter model.
twitter_model.most_similar(positive=['woman', 'king'], negative=['man'])

In [None]:
# Same analogy in the 25-dimensional Twitter model, for comparison.
twitter_model_s.most_similar(positive=['woman', 'king'], negative=['man'])

In [None]:
# Load spaCy and the English language model.
# `nlp` is the shared pipeline object used by every spaCy cell below.
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

In [None]:
# Run the phrase through the pipeline; `doc` is indexed by token position.
phrase = "NLP is so fun!"
doc = nlp(phrase)
# Length of a single token's embedding (token at index 3).
print(f'spaCy vectors are {len(doc[3].vector)} dimensions long')
# Get the vector for 'fun':
print(f'First 20 values of vector for "{doc[3]}"\n', doc[3].vector[:20])

In [None]:
# Mean vector for the entire sentence: `doc.vector` is the document-level
# embedding, as opposed to the per-token `doc[i].vector` shown above.
print(f'First 20 values of vector for phrase "{phrase}"\n', doc.vector[:20])

Comparing 'NLP' to 'fun'

In [None]:
# Similarity between the first token and the token at index 3 of "NLP is so fun!".
doc[0].similarity(doc[3])

Comparing 'NLP' to 'bad'

In [None]:
# Same sentence with "bad" in place of "fun"; `doc2` is reused by the next cells.
# Note that this rebinds the global `phrase`.
phrase = "NLP is so bad!"
doc2 = nlp(phrase)
doc2[0].similarity(doc2[3])

Comparing 'fun' to 'bad'

In [None]:
# Token-level similarity between index 3 of each sentence ("fun" vs "bad").
doc[3].similarity(doc2[3])

Comparing 'NLP is so fun!' to 'NLP is so bad!'

In [None]:
# Document-level similarity between the two full sentences.
doc.similarity(doc2)

In [None]:
# Two sentences with related meaning but little vocabulary in common.
sentence_obama = 'Obama speaks to the media in Illinois'
sentence_president = 'The President greets the press in Chicago'
obama = nlp(sentence_obama)
president = nlp(sentence_president)
# Document-level similarity; `president` is reused by later cells.
obama.similarity(president)

In [None]:
# Shorter variant of the Obama sentence (drops "to the media"), compared
# against the same `president` document from the previous cell.
sentence_obama2 = 'Obama speaks in Illinois'
obama2 = nlp(sentence_obama2)
obama2.similarity(president)

In [None]:
# Unrelated sentence used as a baseline for the similarity scores above.
sentence_nlp = 'NLP is so fun!'
nlp_fun = nlp(sentence_nlp)
obama2.similarity(nlp_fun)

In [None]:
# Baseline: the President sentence against the unrelated NLP sentence.
president.similarity(nlp_fun)

### Applying word embeddings to a ML pipeline
Dataset: [IMDB Data set for NLP analysis](https://www.kaggle.com/rajathmc/bag-of-words-meets-bags-of-popcorn-#labeledTrainData.tsv)

In [None]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_lg
# The same model is already loaded into `nlp` in an earlier cell; presumably
# reloaded here so this section can be run on its own.
nlp = en_core_web_lg.load()

In [None]:
# Note: rebinds the global `path` (previously the word-vector folder).
path = '../input/bag-of-words-meets-bags-of-popcorn-/'
# Tab-separated files with a header row; quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the review text are treated as ordinary characters.
train = pd.read_csv(f'{path}labeledTrainData.tsv', header = 0, delimiter = '\t', quoting = 3)
test = pd.read_csv(f'{path}testData.tsv', header = 0, delimiter = '\t', quoting = 3)
print(f'train dim:{train.shape}, test dim:{test.shape}')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
def _mean_review_vectors(frame):
    """Return one spaCy document vector per row of ``frame["review"]``.

    ``nlp.pipe`` streams the texts through the pipeline in batches instead
    of calling ``nlp`` once per ``DataFrame.iterrows`` row; each yielded
    document exposes the same ``doc.vector`` as a direct ``nlp(text)`` call.
    The result is a list in the same order as the rows of ``frame``.
    """
    return [doc.vector for doc in nlp.pipe(frame["review"])]


# One document vector per review; consumed by the classifier cells below.
features = _mean_review_vectors(train)
features_test = _mean_review_vectors(test)

In [None]:
labels = train["sentiment"]
# `train_vectors` / `test_vectors` are never defined anywhere in this
# notebook (the original lines raised NameError). The document vectors built
# in the previous cell live in `features` / `features_test`; stack them into
# 2-D arrays with one row per review so scikit-learn can consume them.
features = np.vstack(features)

# NOTE(review): this requires a "sentiment" column in testData.tsv - confirm
# the test file is actually labelled, otherwise this raises KeyError.
labels_test = test["sentiment"]
features_test = np.vstack(features_test)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Any scikit-learn classifier can be substituted here.
clf = LinearSVC()
clf.fit(features, labels)
preds_test = clf.predict(features_test)

# `target_names` must be strings given in sorted-label order. The original
# passed `train["sentiment"].unique()`, which yields the raw label values in
# order of first appearance: non-string names break the report formatting
# and the order may not line up with the labels. `clf.classes_` holds the
# fitted labels in sorted order.
print(classification_report(labels_test,
                            preds_test,
                            target_names=[str(label) for label in clf.classes_]))

### Training your own embeddings

In [None]:
# `sp` is the string of ASCII punctuation characters used to build SYMBOLS below.
from string import punctuation as sp
import re
from spacy.lang.en import English
# NOTE(review): `parser` is not referenced anywhere else in the visible notebook.
parser = English()
import en_core_web_lg
nlp = en_core_web_lg.load()
from gensim.models import word2vec

Before we start transforming and processing text, let's look at what the libraries provide out of the box, starting with spaCy's default stop-word list.

In [None]:
# Show spaCy's default English stop words and how many there are.
print(nlp.Defaults.stop_words)
print(len(nlp.Defaults.stop_words))

In [None]:
# The stop-word collection is mutable: add a custom entry and re-count.
nlp.Defaults.stop_words.add("my_new_stopword")
print(len(nlp.Defaults.stop_words))

In [None]:
# Undo the previous cell so the default list is back to its original contents.
nlp.Defaults.stop_words.remove("my_new_stopword")
print(len(nlp.Defaults.stop_words))

In [None]:
import re
from bs4 import BeautifulSoup

# Live reference to spaCy's stop words, so later edits to the defaults apply here too.
STOPLIST = nlp.Defaults.stop_words
# Every ASCII punctuation character plus a few multi-character / typographic
# symbols. Both curly double quotes are listed; the original repeated the
# closing quote and never included the opening one.
SYMBOLS = list(sp) + ["-", "...", "“", "”"]

# Any run of whitespace (spaces, tabs, newlines, carriage returns).
_WHITESPACE_RUN = re.compile(r"\s+")


def lemmatizeText(document):
    """Turn one raw (possibly HTML) document into a list of cleaned lemmas.

    Steps:
      1. Strip HTML tags, joining the text fragments with single spaces.
      2. Collapse every run of whitespace (including newlines and carriage
         returns) into one space, then lowercase the text.
      3. Run the text through the spaCy pipeline ``nlp`` and take each
         token's lowercased, stripped lemma. Tokens whose lemma is the
         ``-PRON-`` placeholder keep their lowercased surface form instead.
      4. Drop empty strings, stop words (``STOPLIST``) and symbols
         (``SYMBOLS``).

    Parameters:
        document: the raw text or markup of a single document.

    Returns:
        A list of lemma strings in document order.
    """
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "no parser" warning).
    text = BeautifulSoup(document, "html.parser").get_text(" ")
    # The original used str.replace("\s\s+", " "), which looks for that
    # literal character sequence rather than a regex, so repeated whitespace
    # was never collapsed.
    text = _WHITESPACE_RUN.sub(" ", text).strip().lower()

    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in nlp(text)
    ]
    return [
        lemma for lemma in lemmas
        if lemma and lemma not in STOPLIST and lemma not in SYMBOLS
    ]

In [None]:
# Try the lemmatizer on the review at index label 0 of the training set.
lemmatizeText(train["review"][0])

In [None]:
import time

# Lemmatize every training review into `corpus` (one token list per review),
# printing the elapsed time whenever the row index is a multiple of 500.
corpus = []
start = time.time()
for index, row in train.iterrows():
    corpus.append(lemmatizeText(row['review']))
    if index % 500 != 0:
        continue
    # Report the time since the last report, then restart the timer.
    end = time.time()
    print('{} rows processed in {} seconds'.format(index,end-start))
    start = time.time()

In [None]:
# Total number of tokens across all lemmatized documents.
word_count = sum(map(len, corpus))
print(f'The corpus has {len(corpus)} documents and {word_count} words')

In [None]:

from gensim.models import Phrases
from gensim.models.phrases import Phraser
import time

# Learn phrase (collocation) statistics from the lemmatized corpus, then wrap
# them in a Phraser that is applied to each sentence below.
# NOTE(review): min_count / threshold presumably control how frequent and how
# strongly associated a word pair must be to be merged - confirm in gensim docs.
phrases = Phrases(sentences=corpus,min_count=25,threshold=50)
bigram = Phraser(phrases)
start = time.time() 
# Rewrite the corpus in place, one sentence at a time, with detected phrases applied.
for index,sentence in enumerate(corpus):
    corpus[index] = bigram[sentence]
    if((index % 5000)==0):
        # Report the time since the last report, then restart the timer.
        end = time.time()
        print('{} rows processed in {} seconds'.format(index,end-start))
        start = time.time() 
   

In [None]:
# Train word2vec embeddings on the phrase-merged corpus and time the run.
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimensionality;
# gensim 4 renamed it to `vector_size` - confirm against the installed version.
start =  time.time()
model = word2vec.Word2Vec(corpus, workers = 4, size = 100, min_count = 40, window = 10, sample = 0.0001)
end = time.time()
print(end-start)

In [None]:
# NOTE(review): init_sims(replace=True) presumably normalises the vectors in
# place and discards the originals, so the model cannot be trained further
# afterwards - confirm in the gensim docs for the installed version.
model.init_sims(replace = True)
# Persist the model to disk; it is reloaded from this file name further down.
model.save(fname_or_handle = "w2v_imdb_100d")

In [None]:
# class MySentences(object):
#     def __init__(self, dirname):
#         self.dirname = dirname
 
#     def __iter__(self):
#         for fname in os.listdir(self.dirname):
#             for line in open(os.path.join(self.dirname, fname)):
#                 yield line.split()
 
# #sentences = MySentences('/some/directory') # a memory-friendly iterator

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import bokeh
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

# Reload the embeddings saved earlier under this file name.
# Only the `Word2Vec` class is imported (the bare `gensim` package name is
# not), so the original `gensim.models.Word2Vec.load(...)` raised NameError.
# The path is passed positionally because `load` names its parameter `fname`,
# not `fname_or_handle` (that keyword belongs to `save`).
imdb_model = Word2Vec.load('w2v_imdb_100d')

In [None]:
def display_closestwords_tsnescatterplot(model, word):
    """Scatter-plot ``word`` and its nearest neighbours in a 2-D t-SNE projection.

    Parameters:
        model: a gensim ``Word2Vec`` model (vectors are read from ``model.wv``)
            or a ``KeyedVectors``-like object that provides
            ``similar_by_word`` and ``[word]`` lookup itself. Any vector
            dimensionality is supported.
        word: the query word. NOTE(review): the lookup presumably raises
            ``KeyError`` when the word is out of vocabulary - confirm.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Trained models expose their vectors on `.wv`; indexing the model itself
    # is deprecated in gensim. Plain KeyedVectors are used as-is.
    vectors = getattr(model, "wv", model)

    # The query word first, followed by its closest words.
    word_labels = [word]
    word_labels.extend(
        neighbour for neighbour, _score in vectors.similar_by_word(word)
    )

    # Build the matrix in one go: no hard-coded width of 100 and no repeated
    # np.append (which copies the whole array on every call).
    arr = np.array([vectors[label] for label in word_labels], dtype='f')

    # find tsne coords for 2 dimensions
    # t-SNE needs perplexity < number of samples; the default of 30 is too
    # large for the handful of points plotted here.
    perplexity = min(30.0, len(word_labels) - 1)
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # Pad both ends of each axis outwards. The original added the margin to
    # the minimum as well, which pushed the lowest point past the axis edge.
    margin = 0.00005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.show()

In [None]:
# Plot the neighbourhood of 'chef' using the in-memory `model` trained above
# (not the `imdb_model` reloaded from disk).
display_closestwords_tsnescatterplot(model, 'chef')


References:
- Training word2vec embeddings: https://rare-technologies.com/word2vec-tutorial/
- Sentiment analysis using word2vec: https://www.kaggle.com/kyen89/2-sentiment-analysis-word2vec
- Data streaming using generators: https://rare-technologies.com/data-streaming-in-python-generators-iterators-iterables/
- Using GloVE + Keras: https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout