In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import FastText
from ipynb.fs.full.utils_functions import load_corpus

# Hyper-parameter grid for the FastText sweep. Every combination of the three
# lists is trained and later reloaded (4 sizes x 3 windows x 3 epoch counts = 36 models).
# Values passed as FastText(size=...): dimensionality of the word vectors.
size_p = [5,10,12,18]
# Values passed as FastText(window=...).
window_p = [4,5,6]
# Values passed as model.train(epochs=...).
epochs_p = [50,300,600]

# Path *prefix* for saved models; "<size>_<window>_<epochs>.pkl" is appended to it.
FASTTEXT_MODEL_PATH = 'models/fasttext/fasttext_'

In [None]:
def get_features(doc, model):
    """Return one mean word-vector per non-empty sentence of ``doc``.

    Parameters
    ----------
    doc : iterable of iterables of str
        Tokenised sentences.
    model : object exposing ``model.wv[word]``
        Each lookup must return a numeric vector; all vectors are averaged
        element-wise with ``np.mean(..., axis=0)``.

    Returns
    -------
    list
        One averaged vector per sentence that contained at least one token,
        in the order the sentences appear in ``doc``.

    Notes
    -----
    Sentences without tokens are skipped rather than emitted as NaN rows, so
    the result can be shorter than ``doc`` and its positions do not
    necessarily line up with the positions in ``doc``.
    Any exception raised by ``model.wv[word]`` is propagated unchanged.
    """
    features = []
    for sentence in doc:
        vectors = [model.wv[word] for word in sentence]
        # Skip on "no vectors" instead of `sentence == []`: the equality check
        # only recognises empty *lists*, so an empty tuple/str/array used to
        # reach np.mean([]) and add a NaN entry (plus a RuntimeWarning).
        if not vectors:
            continue
        features.append(np.mean(vectors, axis=0))
    return features

In [None]:
# Getting data.
# `load_corpus` is imported from the project's `utils_functions` notebook.
# `data` is consumed by the following cells as an iterable of phrases, each an
# iterable of word tokens (it is fed to FastText as `sentences`).
# `original_data` is not used in the cells visible here -- presumably the
# unprocessed text; confirm against `load_corpus`.
data, original_data = load_corpus()

In [None]:
# Getting list of words.
# Flatten the corpus and de-duplicate while keeping first-seen order
# (dict keys preserve insertion order).
words = list(dict.fromkeys(word for phrase in data for word in phrase))

# Training and saving model.

# Create the output directories up front: otherwise the first save would fail
# only *after* a complete training run has already been paid for.
model_dir = os.path.dirname(FASTTEXT_MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
os.makedirs("data", exist_ok=True)

# Sweep order is epochs -> size -> window; the cell that reloads the models
# relies on the same order to index them.
for epochsp in epochs_p:
    for sizep in size_p:
        for windowp in window_p:
            print ("Model with size={}, window={},epochs={}".format(sizep,windowp,epochsp))
            # File-name suffix shared by the model file and the feature frame.
            suffix = "{}_{}_{}.pkl".format(sizep, windowp, epochsp)

            # NOTE(review): `size=` is the gensim 3.x keyword; gensim >= 4
            # renamed it to `vector_size`. Left as-is to match the installed API.
            model = FastText(size=sizep, window=windowp, min_count=2, sg=1)  # instantiate
            model.build_vocab(sentences=data)
            model.train(sentences=data, total_examples=len(data), epochs=epochsp)

            # Saving model
            model.save(FASTTEXT_MODEL_PATH + suffix)

            # Getting and saving features of the corpus.
            # One row per non-empty phrase (get_features skips empty ones), so
            # row positions may not match positions in `data`.
            data_mean_feature = get_features(data, model)
            dataframe_mean = pd.DataFrame(data_mean_feature)
            dataframe_mean.to_pickle("data/df_from_mean_fasttext_" + suffix)

In [None]:
# Reload every saved model of the sweep. The nesting order (epochs -> size ->
# window) fixes each model's position in `models`; later cells refer to models
# by that position.
models = []
for epochsp in epochs_p:
    for sizep in size_p:
        for windowp in window_p:
            position = len(models)
            file_suffix = "{}_{}_{}.pkl".format(sizep, windowp, epochsp)
            print (position, "Model with size={}, window={},epochs={}".format(sizep, windowp, epochsp))
            models.append(FastText.load(FASTTEXT_MODEL_PATH + file_suffix))

# Number of loaded models; used as the upper bound when iterating over them.
ind = len(models)

In [103]:
# Indices into `models` that are excluded from the comparison below.
# Each group is tagged with a Spanish word -- presumably the probe word whose
# nearest neighbours looked wrong for those models (confirm with the author).
bads = [
    0, 1, 2, 16, 26, 19, 22,  # 'lunes' (Monday)
    12, 15, 20, 29,           # 'garganta' (throat)
    13, 14, 25,               # 'rendir' (to sit an exam)
    10, 27, 34, 35,           # 'bondi' (bus, slang)
    24, 28,                   # 'fernet' (the liquor)
    7, 33,                    # 'miercoles' (Wednesday)
]

# For every model still in the running, show the 5 nearest neighbours of
# 'cama' (bed) so the candidates can be compared by eye.
for x in range(ind):
    if x not in bads:
        print(x, models[x].wv.most_similar('cama', topn=5), '\n')

3 [('dormir', 0.9498540759086609), ('despertar', 0.9139439463615417), ('levante', 0.9076552391052246), ('peli', 0.8941628932952881), ('sueño', 0.8828880786895752)] 

4 [('dormir', 0.968225359916687), ('despertar', 0.9300165176391602), ('acostado', 0.9167370200157166), ('diente', 0.9046576023101807), ('living', 0.9032378792762756)] 

5 [('dormir', 0.9424377679824829), ('despertar', 0.9314313530921936), ('diente', 0.9107272624969482), ('peli', 0.8979694843292236), ('levantar', 0.8935658931732178)] 

6 [('dormir', 0.9100430011749268), ('despertar', 0.8954784274101257), ('diente', 0.8871906995773315), ('peli', 0.8625540733337402), ('dormi', 0.852523684501648)] 

8 [('dormir', 0.923529326915741), ('living', 0.9164628982543945), ('despertar', 0.9055368900299072), ('peli', 0.8891691565513611), ('dormi', 0.865797221660614)] 

9 [('living', 0.8600726127624512), ('levante', 0.8035306930541992), ('diente', 0.784979522228241), ('levantar', 0.7772561311721802), ('alcohol', 0.7713462710380554)] 

11

### Let's move on with model 11

# Visualization of the words

In [None]:
# Setting up
# Index 11 in the epochs -> size -> window sweep order corresponds to
# size=18, window=6, epochs=50.
model = models[11]
# One vector per distinct corpus word, in the same order as `words`.
vecs = [model.wv[word] for word in words]

# Using Singular Value Decomposition to reduce dimensions.
# With full_matrices=False, U has one row per word; its first two columns are
# used as the 2-D coordinates of each word.
U, s, Vh = np.linalg.svd(vecs, full_matrices=False)

# Plotting data
# Figure size and axis limits do not depend on the word being drawn, so they
# are set once instead of being re-applied on every loop iteration.
fig, ax = plt.subplots(figsize=(18.5, 10.5))
# Hard-coded zoom window -- presumably tuned by eye for this particular
# trained model; revisit if the models are retrained.
ax.set_xlim(-0.03, -0.015)
ax.set_ylim(-0.01, 0.01)

for i, word in enumerate(words):
    ax.text(U[i, 0], U[i, 1], word)

fig.savefig('viz.jpg')