# Word embeddings & analysis of Poetry
* Digital humanities
* The Gutenberg corpus includes preambles and other non-poetry text, which could be filtered out.
* Primarily English. 
* The data may be cleaned further, e.g. by lemmatization, with or without pretrained (multilingual) w2v models. ([Example of loading a pretrained W2V model and finetuning it](https://www.kaggle.com/rtatman/fine-tuning-word2vec))

In [None]:
from gensim.utils import simple_preprocess
from gensim.sklearn_api.phrases import PhrasesTransformer # phrases/ coallocations - https://radimrehurek.com/gensim/sklearn_api/phrases.html
from gensim.sklearn_api import phrases
from gensim.models.phrases import Phrases #, ENGLISH_CONNECTOR_WORDS
from gensim.models import Word2Vec

# import re
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Probe words (spices, tastes, tableware) whose nearest neighbours are printed
# for every embedding model below. Entries are kept unique so that no word is
# queried and printed twice.
interesting_words_list = [
    "salt", "pepper", "spice", "spices", "herbs", "herbal", "sweet", "spicy", "salty", "moist",
    "paprika", "saffron", "mace", "lavender", "honey", "honeysuckle", "chile", "marjoram", "sugar", "tea", "mint",
    "taste", "smell", "aroma",
    "cinnamon", "cardamom", "peppercorn", "turmeric", "anise", "zaatar",
    "fork", "knife", "dish", "food", "plate",
    "basil", "cilantro", "chili", "cumin", "onion", "garlic", "dill", "horseradish", "radish", "mustard",
    "peppermint", "sage", "vanilla", "wasabi",
]

In [None]:
# Read only the verse-line column ("s") of the Gutenberg poetry dump.
# (Pass nrows=<small number> to read_csv for a quick trial run.)
df = pd.read_csv(
    "../input/gutenberg-poetry-dataset/Gutenberg-Poetry.csv",
    usecols=["s"],
)
# Remove repeated lines and give the column a descriptive name.
df = df.drop_duplicates()
df = df.rename(columns={"s": "text"})
# Keep only lines made of more than one whitespace-separated word.
df = df[df["text"].str.split().str.len() > 1]
df

In [None]:
## Tokenise every line with gensim's simple_preprocess: accents are stripped
## (deacc=True) and only tokens of 2 to 50 characters are kept.
## Lemmatisation would be a possible further step
## (background: https://stackoverflow.com/questions/51049568/attributeerror-on-spacy-token-pos).
df["text"] = df["text"].apply(simple_preprocess, deacc=True, max_len=50, min_len=2)
print(df.shape)
## Note: the tokenised lines are not de-duplicated again; the cells now hold
## lists, which drop_duplicates cannot compare directly.

## Phrase (collocation) detection: fit on the token lists, then rewrite them.
m = PhrasesTransformer(min_count=6, max_vocab_size=30000000)
m.fit(df["text"])
df["text"] = m.transform(df["text"])

## Corpus handed to all embedding models below.
sentences = df["text"]

In [None]:
# Summary statistics of the number of tokens per line after preprocessing.
df["text"].str.len().describe()

In [None]:
# Show one example sentence (the token list at position 7) from the corpus.
print(sentences.iloc[7])

In [None]:
# Train a gensim Word2Vec model on the corpus using the CBOW architecture
# (sg=0, which is also gensim's default training algorithm).
model_cbow = Word2Vec(
    sentences,
    size=200,
    window=6,
    min_count=2,
    sg=0,
    workers=3,
)

In [None]:
# Print the learned vector of one example word. The lookup goes through the
# model's KeyedVectors (`.wv`), as the rest of this notebook does, instead of
# indexing the model object directly (deprecated in gensim).
print('chief\n:', model_cbow.wv['chief'])

In [None]:
# Similarity between two words, queried on the model's KeyedVectors (`.wv`)
# rather than through the deprecated model-level shortcut.
print(model_cbow.wv.similarity('chief', 'indian'))

#### Most similar words to a word
* CBOW model

In [None]:
print('the 10 most similar words to indian:\n')
# Query the KeyedVectors (`.wv`); topn is spelled out so it matches the message.
model_cbow.wv.most_similar('indian', topn=10)

In [None]:
# Print the 13 nearest neighbours of every probe word under the CBOW model.
for w in interesting_words_list:
    # Probe words missing from the vocabulary are skipped explicitly, so any
    # other error is no longer silently swallowed by a bare `except`.
    if w not in model_cbow.wv.vocab:
        continue
    print(w, "\n", [i[0] for i in model_cbow.wv.most_similar(w, topn=13)])

In [None]:
# defining a tsne function to visualize
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
def plot_tsne(model, num):
    """Scatter-plot a 2-D t-SNE projection of the first ``num`` vocabulary words.

    Args:
        model: Trained gensim ``Word2Vec`` model. Words are taken from
            ``model.wv.vocab`` and their vectors from ``model.wv``.
        num: Maximum number of words to project and plot. If the vocabulary
            holds fewer words, all of them are plotted.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        there are no words to plot.
    """
    # Slice the vocabulary first so vectors are gathered only for plotted words.
    labels = list(model.wv.vocab)[:num]
    if not labels:
        return
    tokens = [model.wv[word] for word in labels]

    tsne = TSNE(perplexity = 40, n_components = 2, init = 'pca', n_iter = 1500, random_state = 23) # orig 2500 n_iter
    data = tsne.fit_transform(tokens)

    plt.figure(figsize = (13, 13))
    # Iterating over the projected rows (not range(num)) keeps this safe when
    # fewer than `num` words exist. One scatter call per word, as before.
    for label, (x, y) in zip(labels, data):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy = (x, y),
                     xytext = (5,2),
                     textcoords = 'offset points',
                     ha = 'right',
                     va = 'bottom')
    plt.show()

In [None]:
# Visualise the CBOW embeddings with t-SNE (only the first 120 vocabulary words).
plot_tsne(model_cbow, 120)

In [None]:
## Same corpus and hyper-parameters as the CBOW model, but trained with the
## skip-gram architecture (sg=1) for comparison.
model_skipgram = Word2Vec(
    sentences,
    size=200,
    window=6,
    min_count=2,
    sg=1,
    workers=3,
)

In [None]:
# Similarity between two words under the skip-gram model, queried on the
# KeyedVectors (`.wv`) rather than through the deprecated model-level shortcut.
print(model_skipgram.wv.similarity('indian', 'chief'))

#### skipgram - most similar words

In [None]:
# Print the 13 nearest neighbours of every probe word under the skip-gram model.
for w in interesting_words_list:
    # Probe words missing from the vocabulary are skipped explicitly, so any
    # other error is no longer silently swallowed by a bare `except`.
    if w not in model_skipgram.wv.vocab:
        continue
    print(w, "\n", [i[0] for i in model_skipgram.wv.most_similar(w, topn=13)])

In [None]:
print('the 10 most similar words to indian:\n')
# Query the KeyedVectors (`.wv`); topn is spelled out so it matches the message.
model_skipgram.wv.most_similar('indian', topn=10)

In [None]:
# Visualise the skip-gram embeddings with t-SNE (only the first 100 vocabulary words).
plot_tsne(model_skipgram,100)

### glove based model/embeddings

In [None]:
# Install the third-party `glove_python` package used for the GloVe embeddings
# below (IPython shell escape; this line only works inside a notebook).
!pip install glove_python

In [None]:
from glove import Corpus, Glove
# Build the co-occurrence corpus from the token lists with a context window of 5.
corpus = Corpus()
corpus.fit(sentences, window = 5)
# Fit 150-dimensional GloVe vectors on the co-occurrence matrix (30 epochs, 4 threads).
glove = Glove(no_components = 150, learning_rate = 0.05)
glove.fit(corpus.matrix, epochs = 30, no_threads = 4, verbose = False)
# Attach the corpus dictionary so the model can be queried by word; kept after
# fit() as in the original order (presumably required by the library; verify).
glove.add_dictionary(corpus.dictionary)

In [None]:
# Print the words most similar to 'indian' under the GloVe model (number = 9).
print(glove.most_similar('indian', number = 9))

In [None]:
# Print the nearest neighbours of every probe word under the GloVe model.
for w in interesting_words_list:
    # Probe words absent from the GloVe dictionary are skipped explicitly, so
    # any other error is no longer silently swallowed by a bare `except`.
    if w not in glove.dictionary:
        continue
    print(w, "\n", [i[0] for i in glove.most_similar(w, number=13)])

In [None]:
# Visualise GloVe word vectors with t-SNE; the caller chooses how many words.
def plot_tsne_glove(model, num):
    """Scatter-plot a 2-D t-SNE projection of up to ``num`` GloVe word vectors.

    Args:
        model: Either a fitted GloVe model (any object exposing ``dictionary``
            and ``word_vectors``), whose own words and vectors are plotted, or,
            for backward compatibility, a gensim ``Word2Vec`` model. In the
            latter case ``model.wv.vocab`` only selects the words, and the
            vectors are read from the module-level ``glove`` object.
        num: Maximum number of words to project and plot.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        there are no words to plot.
    """
    if hasattr(model, "dictionary") and hasattr(model, "word_vectors"):
        glove_model = model
        candidates = glove_model.dictionary
    else:
        # Legacy call style: a Word2Vec model supplies the word list only.
        glove_model = glove
        candidates = model.wv.vocab

    # Collect at most `num` words that GloVe knows, stopping early so vectors
    # are not gathered for the entire vocabulary.
    labels = []
    for word in candidates:
        if len(labels) >= num:
            break
        if word in glove_model.dictionary:
            labels.append(word)
    if not labels:
        return
    tokens = [glove_model.word_vectors[glove_model.dictionary[word]] for word in labels]

    tsne = TSNE(perplexity = 40, n_components = 2, init = 'pca', n_iter = 1500, random_state = 23) # was n_iter 2500 originally
    data = tsne.fit_transform(tokens)

    plt.figure(figsize = (12, 12))
    # Iterating over the projected rows (not range(num)) keeps this safe when
    # fewer than `num` words exist. One scatter call per word, as before.
    for label, (x, y) in zip(labels, data):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy = (x, y),
                     xytext = (5,2),
                     textcoords = 'offset points',
                     ha = 'right',
                     va = 'bottom')
    plt.title('Word vectorization using Glove')
    plt.show()

In [None]:
# Plot GloVe vectors for 120 words. NOTE(review): the skip-gram model is passed
# here, so the word list comes from its vocabulary while the vectors come from
# the `glove` object; confirm this is intended.
plot_tsne_glove(model_skipgram, 120)

#### Finetuning a pretrained model
* Example : https://www.kaggle.com/rtatman/fine-tuning-word2vec
* We will use conceptnet numberbatch embeddings - can be downloaded manually, via [Gensim's downloader api](https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html), or imported from one of the kaggle datasets (as done here)
    * https://www.kaggle.com/joeskimo/conceptnet
    * https://www.kaggle.com/blackitten13/gensim-embeddings-dataset
    * Example numberbatch loading + "cleaning" code snippets: https://gist.github.com/ixaxaar/9fc209e7ba1c88b87f287028396609f1

In [None]:
import gensim
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors

In [None]:
## Load the pretrained ConceptNet Numberbatch (English) vectors from their
## word2vec text format, reading at most 800000 vectors. Takes ~2 minutes.
## Note: no cleaning of language prefixes such as "/c/en" is performed here.
model = KeyedVectors.load_word2vec_format(
    "../input/conceptnet/numberbatch-en-19.08.txt",
    binary=False,
    unicode_errors='ignore',
    limit=800000,
)

In [None]:
# Nearest neighbours of 'indian' in the pretrained (not fine-tuned) vectors.
model.most_similar('indian')

In [None]:
print("Pretrained model, without finetuning on poetry:")
for w in interesting_words_list:
    # Probe words missing from the pretrained vocabulary are skipped explicitly,
    # so any other error is no longer silently swallowed by a bare `except`.
    if w not in model.vocab:
        continue
    print(w, "\n", [i[0] for i in model.most_similar(w, topn=10)])

In [None]:
# Fine-tune the pretrained Numberbatch vectors on the poetry corpus.
# size=300 has to match the dimensionality of the vectors merged in below.
model_2 = Word2Vec(size=300, min_count=1)
model_2.build_vocab(sentences)
# Remember the corpus size before the vocabulary is extended.
total_examples = model_2.corpus_count
# Add the pretrained model's words to the vocabulary (as one pseudo-sentence).
model_2.build_vocab([list(model.vocab.keys())], update=True)

# Overwrite the vectors of shared words with the pretrained ones.
# lockf=1.0 lets these imported vectors keep training; the default of 0.0
# freezes them, so they would never be fine-tuned on the poetry.
model_2.intersect_word2vec_format("../input/conceptnet/numberbatch-en-19.08.txt", binary=False, unicode_errors='ignore', lockf=1.0)
model_2.train(sentences, total_examples=total_examples, epochs=model_2.iter)

In [None]:
print("Finetuned model on poetry:")
for w in interesting_words_list:
    # Probe words missing from the vocabulary are skipped explicitly, so any
    # other error is no longer silently swallowed by a bare `except`. The query
    # goes through the KeyedVectors (`.wv`), not the deprecated model shortcut.
    if w not in model_2.wv.vocab:
        continue
    print(w, "\n", [i[0] for i in model_2.wv.most_similar(w, topn=13)])

In [None]:
# model.train(sentences, total_examples=total_examples, epochs=model_2.iter)
### AttributeError: 'Word2VecKeyedVectors' object has no attribute 'train' 
# print("Pretrained model, without finetuning on poetry:")
# for w in interesting_words_list:
#     try: print(w,"\n",[i[0] for i in model.most_similar(w, topn=12)])
#     except: pass