In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import FastText

from ipynb.fs.full.utils_functions import load_corpus

# Hyper-parameter grid: one FastText model is trained per
# (size, window, epochs) combination.
size_p = [5, 12, 18]         # passed as FastText(size=...)
window_p = [4, 5]            # passed as FastText(window=...)
epochs_p = [50, 200, 400]    # passed as model.train(epochs=...)

# Path prefixes; "<size>_<window>_<epochs>.pkl" is appended for each model.
FASTTEXT_MODEL_PATH = "models/fasttext/fasttext_"
FASTTEXT_DF_PATH = "data/fasttext/df_from_mean_fasttext_"

In [None]:
def get_features(doc, model):
    """Return the mean word vector of every non-empty sentence in ``doc``.

    Parameters
    ----------
    doc : iterable of iterables of str
        Tokenised sentences.
    model : object
        Anything that supports ``model.wv[word]`` and returns a numeric
        vector for each word.

    Returns
    -------
    list
        One element-wise mean vector (``np.mean(..., axis=0)``) per sentence
        that contains at least one word. Empty sentences are skipped, so the
        result can be shorter than ``doc`` and is then not position-aligned
        with it.
    """
    features = []
    for sentence in doc:
        vectors = [model.wv[word] for word in sentence]
        # Skip any empty sentence (list, tuple, ...): averaging zero vectors
        # would produce a NaN feature and a RuntimeWarning.
        if not vectors:
            continue
        features.append(np.mean(vectors, axis=0))
    return features

In [None]:
# Load the corpus; `data` is iterated below as phrases made of words.
data, original_data = load_corpus()

# Vocabulary: every distinct word exactly once, in order of first appearance
# (dict keys keep insertion order, which is what de-duplicates here).
words = list(dict.fromkeys(word for phrase in data for word in phrase))

In [None]:
# Training and saving models: one FastText model per hyper-parameter
# combination, plus the per-sentence mean feature vectors of the corpus.

# Create the output directories up front so that a long training run cannot
# fail at save time because a folder is missing.
for path_prefix in (FASTTEXT_MODEL_PATH, FASTTEXT_DF_PATH):
    out_dir = os.path.dirname(path_prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

for epochsp in epochs_p:
    for sizep in size_p:
        for windowp in window_p:
            print ("Model with size={}, window={},epochs={}".format(sizep,windowp,epochsp))
            # File-name suffix shared by the model file and the feature file.
            suffix = "{}_{}_{}.pkl".format(sizep, windowp, epochsp)

            model = FastText(size=sizep, window=windowp, min_count=2, sg=1)  # instantiate
            model.build_vocab(sentences=data)
            model.train(sentences=data, total_examples=len(data), epochs=epochsp)

            # Saving model
            model.save(FASTTEXT_MODEL_PATH + suffix)

            # Getting and saving features of the corpus
            # (one mean vector per non-empty sentence).
            data_mean_feature = get_features(data, model)
            dataframe_mean = pd.DataFrame(data_mean_feature)
            dataframe_mean.to_pickle(FASTTEXT_DF_PATH + suffix)

In [None]:
# Loading models: walk the hyper-parameter grid in the same nesting order as
# training (epochs, then size, then window), so that the index printed here is
# the position of that model inside `models`.
param_grid = [
    (epochsp, sizep, windowp)
    for epochsp in epochs_p
    for sizep in size_p
    for windowp in window_p
]

models = []
for ind, (epochsp, sizep, windowp) in enumerate(param_grid):
    print (ind,"Model with size={}, window={},epochs={}".format(sizep,windowp,epochsp))
    model_file = FASTTEXT_MODEL_PATH + "{}_{}_{}.pkl".format(sizep,windowp,epochsp)
    models.append(FastText.load(model_file))

# `ind` ends up as the number of loaded models (it is used as a loop bound
# further down).
ind = len(models)

In [None]:
# Comparing the models in order to choose one.
# `bads` holds the indices of models that are skipped below. The word next to
# each row is presumably the probe word whose nearest neighbours looked wrong
# for those models -- TODO confirm.
# NOTE(review): with the current 3 x 3 x 2 grid only indices 0..17 exist, so
# entries >= 18 never match anything; they may come from an earlier, larger
# grid -- verify.
bads = [
    0, 1, 2, 16, 26, 19, 22,  # "lunes" (Monday)
    12, 15, 20, 29,           # "garganta" (throat)
    13, 14, 25,               # "rendir" (to sit an exam)
    10, 27, 34, 35,           # "bondi" (bus, Argentine slang)
    24, 28,                   # "fernet" (the liqueur)
    7, 33,                    # "miercoles" (Wednesday)
]

# Print the 5 nearest neighbours of 'cama' for every model not ruled out.
for x in range(ind):
    if x not in bads:
        print(x,models[x].wv.most_similar('cama',topn=5),'\n')

### Let's move on with model 3

# Visualization of the words

In [None]:
# Setting up: take the chosen model and collect one vector per vocabulary
# word, in the same order as `words`.
model = models[3]
vecs = np.array([model.wv[word] for word in words])

# Using Singular Value Decomposition to reduce dimensions: the first two
# columns of U serve as the 2-D coordinates of each word.
U, s, Vh = np.linalg.svd(vecs, full_matrices=False)

# Plotting data.
# Figure size and axis limits do not depend on the word being drawn, so they
# are set once here instead of on every loop iteration.
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
# Hand-picked zoom window.
# NOTE(review): these limits look tuned to one particular trained model;
# they will probably need adjusting if the model is retrained -- confirm.
plt.xlim((-0.1, -0.05))
plt.ylim((-0.01, 0.01))

for i, word in enumerate(words):
    plt.text(U[i, 0], U[i, 1], word)

plt.savefig('viz.jpg')