In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


from ipynb.fs.full.utils_functions import load_corpus
from gensim.models import FastText

In [None]:
def get_features(doc, model):
    """Return the mean word vector of every non-empty sentence in ``doc``.

    Parameters
    ----------
    doc : iterable
        Sentences, each one an iterable of words.
    model : object
        Anything exposing ``model.wv[word]`` that returns the vector of a word.

    Returns
    -------
    list
        One ``np.mean(vectors, axis=0)`` result per non-empty sentence, in
        input order. Empty sentences are skipped, so the result can be
        shorter than ``doc``.
    """
    features = []
    for sentence in doc:
        vectors = [model.wv[word] for word in sentence]
        # Skip any empty sentence (list, tuple, generator, ...): averaging
        # zero vectors would produce NaN and a RuntimeWarning.
        if not vectors:
            continue
        features.append(np.mean(vectors, axis=0))
    return features

In [None]:
# Getting data.
data, original_data = load_corpus()

# Getting list of unique words, in order of first appearance.
# (dict.fromkeys de-duplicates while preserving insertion order.)
words = list(dict.fromkeys(word for phrase in data for word in phrase))

# Training model.
#
# gensim expects `sentences` to be an iterable of token lists. Passing the
# flat `words` list would make every word a "sentence" whose tokens are its
# characters, so the model would learn a character vocabulary instead of a
# word vocabulary. The tokenised corpus `data` is used instead.
# NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) — confirm the installed version before upgrading.
# NOTE(review): epochs=5000 is kept from the original; it is very expensive
# on a full corpus — confirm it is intentional.
model = FastText(size=20, window=2, min_count=5)  # instantiate
model.build_vocab(sentences=data)
model.train(sentences=data, total_examples=model.corpus_count, epochs=5000)

# Getting and saving features of the corpus.
# get_features skips empty sentences, so the frame can have fewer rows than
# `data` has phrases.
data_mean_feature = get_features(data, model)
dataframe_mean = pd.DataFrame(data_mean_feature)
dataframe_mean.to_pickle("data/df_from_mean_fasttext.pkl")

# Visualization of the words

In [None]:
# Setting up: one embedding vector per unique word, in the order of `words`.
vecs = [model.wv[word] for word in words]

# Using Singular Value Decomposition to reduce dimensions.
# Row i of U holds the coordinates of words[i]; the first two columns are
# used as the 2-D layout.
U, s, Vh = np.linalg.svd(vecs, full_matrices=False)

# Plotting data
# An explicit figure/axes pair is created once, so re-running the cell never
# draws on top of a stale pyplot figure, and the size and limits are not
# re-applied for every word.
fig, ax = plt.subplots(figsize=(18.5, 10.5))
ax.set_xlim(-0.05, 0.05)
ax.set_ylim(-0.05, 0.05)
ax.set_xlabel("First left-singular vector")
ax.set_ylabel("Second left-singular vector")
ax.set_title("FastText word vectors projected with SVD")

for i, word in enumerate(words):
    ax.text(U[i, 0], U[i, 1], word)

fig.savefig('viz.jpg')