<a href="https://colab.research.google.com/github/sahandv/science_science/blob/master/FastText_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FASTTEXT EMBEDDING


## Init

Local OR Colab?

In [1]:
datapath = '/home/sahand/GoogleDrive/Data/' # Local
# datapath = 'drive/My Drive/Data/' # Remote

### Clone Project Git Repo

In [3]:
!rm -rf 'science_science'
username = "sahandv"#@param {type:"string"}
# password = ""#@param {type:"string"} 

!git clone https://github.com/$username/science_science.git
!ls

Cloning into 'science_science'...
remote: Enumerating objects: 148, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (127/127), done.[K
remote: Total 565 (delta 79), reused 60 (delta 17), pack-reused 417[K
Receiving objects: 100% (565/565), 81.91 MiB | 14.20 MiB/s, done.
Resolving deltas: 100% (281/281), done.
drive  sample_data  science_science


### Mount Google Drive


In [4]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [5]:
# Check files!
!ls 'drive/My Drive/Data-Permenant/FastText-crawl-300d-2M-subword'

crawl-300d-2M-subword.bin  crawl-300d-2M-subword.vec


### Install requirements

In [6]:
!pip install fasttext
!pip install -r 'science_science/requirements.txt'
!pip install gensim==3.8.1



### Import Libs

In [3]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import gc
import json
import re

import fasttext
import gensim
from gensim.models import FastText as fasttext_gensim
from gensim.test.utils import get_tmpfile

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sciosci.assets import keyword_assets as kw
from sciosci.assets import generic_assets as sci
from sciosci.assets import advanced_assets as aa

# from science_science.sciosci.assets import keyword_assets as kw
# from science_science.sciosci.assets import generic_assets as sci
# from science_science.sciosci.assets import advanced_assets as aa

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
stops = ['a','an','we','result','however','yet','since','previously','although','propose','proposed','e_g','method',
         'published_elsevier','b','v','problem','paper','approach','within','with','by','via','way','t','case','issue','level','area','system',
         'work','discussed','seen','put','usually','take','make','author','versus','enables','result','research','design','based']
punkts = [' ','','(',')','[',']','{','}','.',',','!','?','<','>','-','_',':',';','\\','/','|','&','%',"'s","`s",'#','$','@']

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = list(set(stopwords.words("english")))+stops+punkts
np.random.seed(50)

[nltk_data] Downloading package stopwords to /home/sahand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sahand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sahand/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Get embeddings from a pre-trained model


### Load Corpus

In [14]:
period = '2014-2016'
percentile = 97

#### Option A - Load corpus

In [15]:
directory = datapath+'Corpus/copyr_lemmatized_stopword_removed_thesaurus/by period/uni-grams/'
file_name = period+' abstract_title'#corpus abstract-title - with n-grams'
corpus = pd.read_csv(directory+file_name,names=['abstracts'])
corpus_tokens = [item for sublist in corpus['abstracts'].values.tolist() for item in sublist.split()]
gc.collect()

40

In [16]:
corpus

Unnamed: 0,abstracts
0,human machine interaction facial expression re...
1,rule base fuzzy cognitive map natural language...
2,fog compute architecture healthcare wireless p...
3,argumentation knowledge representation conflic...
4,hydrothermal coordination power system scale i...
5,statistical method manage miss data applicatio...
6,intelligent digital signal processing feature ...
7,dewey enactivism greek thought chapter examine...
8,artificial vision manufacturing system artific...
9,overview application image processing technolo...


#### Option B - Load keywords

In [0]:
directory = datapath+'LDA/'
file_name = period+' top_90-percentile_keywords_terms.csv'
corpus = pd.read_csv(directory+file_name)
corpus = corpus.fillna('this_is_null')
corpus_tokens = []
for idx,row in tqdm(corpus.iterrows(),total=corpus.shape[0]):
    for token in row.values.tolist():
        if token != 'this_is_null': 
            corpus_tokens.append(token) 
del corpus
print("\nNumber of unique tokens:",len(corpus_tokens))

100%|██████████| 6546/6546 [00:00<00:00, 8418.10it/s]


Number of unique tokens: 52365





#### Option C - Load author keywords

In [0]:
directory = datapath+'Author keywords - 29 Oct 2019/'
file_name = period+' keyword frequency'
corpus = pd.read_csv(directory+file_name,names=['keyword','frequency'])
corpus = corpus.fillna('this_is_null')
threshold = np.percentile(corpus['frequency'].values,percentile)
corpus = corpus[corpus['frequency']>threshold]

corpus_tokens = []
for idx,row in tqdm(corpus.iterrows(),total=corpus.shape[0]):
    if row['keyword'] != 'this_is_null': 
        corpus_tokens.append(row['keyword']) 
print("\nNumber of unique tokens:",len(corpus_tokens))

100%|██████████| 1376/1376 [00:00<00:00, 9769.37it/s]


Number of unique tokens: 1376





## Facebook Model

#### Load model

In [0]:
fb_model_address = datapath+'/FastText-crawl-300d-2M-subword/crawl-300d-2M-subword.bin'

In [0]:
model = fasttext.load_model(fb_model_address)




#### Get embeddings

*   Save to dictionary and json
*   This takes much less space on disk


##### No n-gram handle

In [0]:
# Save in a dict
output_dict = {}
comment_embedding = ''
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    output_dict[token] = str(model.get_word_vector(token))

100%|██████████| 280/280 [00:00<00:00, 337.81it/s]


##### Manual n-gram handle

In [0]:
# Save in a dict
comment_embedding = 'average_manual '
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    token_split = token.split(' ')
    if len(token_split) > 0:
        tmp_vector_grams = []
        for item in token_split:
            tmp_vector_grams.append(model.get_word_vector(item))
        output_dict[token] = str(list(np.array(tmp_vector_grams).mean(axis=0)))
    else:
        output_dict[token] = str(model.get_word_vector(item))

100%|██████████| 280/280 [00:00<00:00, 3792.22it/s]


##### Save to disk

In [0]:
# Save embeddings to disk
with open(directory+'vectors/100D/FastText vector '+comment_embedding+period+'.json', 'w') as json_file:
    json.dump(output_dict, json_file)

#### Get embeddings (alternative) : 

*   save to a list instead of a dicktionary and csv
*   Will have many redundant words in it and will take lots of disk space




In [0]:
# Save in a list
batches = 1000
batch_size = len(corpus_tokens)/batches

for step in tqdm(range(batches),total=batches):
    batch_tokens = corpus_tokens[int(step*batch_size):int((step+1)*batch_size)]
    corpus_vectors = [model.get_word_vector(x) for x in batch_tokens]
    corpus_vectors = pd.DataFrame(corpus_vectors)
    corpus_vectors['tokens'] = batch_tokens

    # Save embeddings to disk
    with open(directory+'vector '+period,'a') as f:
        corpus_vectors.to_csv(f,index=False,header=False)

## Gensim Model

#### Load model

Load gensim model

In [0]:
gensim_model_address = datapath+'FastText Models/50D w1/fasttext-scopus-2.2-million_docs-gensim 50D-w1.model'
model = fasttext_gensim.load(gensim_model_address)

In [9]:
gensim_model_address = datapath+'FastText Models/50D/fasttext-scopus_wos-merged-310k_docs-gensim 50D.model'
model = fasttext_gensim.load(gensim_model_address)

In [0]:
gensim_model_address = 'drive/My Drive/Data-Permenant/FastText-crawl-300d-2M-subword/crawl-300d-2M-subword.bin'
model = gensim.models.fasttext.load_facebook_model(gensim_model_address)

Simple Tests

In [0]:
print('intelligence' in model.wv.vocab)
print(model.similarity("anns", "ann"))
print(model.most_similar(['eye','vision','processing'], ['computer']))
print(model.wmdistance(['stop', 'word', 'removed', 'tokens', 'of', 'sentence 1'], ['stop word removed tokens of sentence 2']))

In [0]:
# get distance of two words
from scipy import spatial,sparse,sign
vec_a = model.wv['']
vec_b = model.wv['fpga']
distance_tmp = spatial.distance.cosine(vec_a, vec_b)
similarity_tmp = 1 - distance_tmp
similarity_tmp

0.33358505368232727

In [0]:
model.wv['artificial intelligence']

array([-4.25525   ,  0.03749213,  1.8276842 , -3.1528432 , -3.3440664 ,
        0.66207427,  1.0964861 , -2.038055  ,  3.0331683 , -2.1755521 ,
        2.1063838 ,  1.6578307 ,  1.3311137 , -2.030598  , -0.69794494,
        2.6208954 ,  1.9154872 ,  1.6715113 ,  0.23561044, -0.50721526,
        3.1775064 , -2.069317  , -2.4310536 , -1.8514946 ,  1.3029549 ,
        3.482592  , -2.1535952 ,  1.078043  , -3.8000522 ,  0.08382007,
       -0.6016187 ,  3.3550935 ,  2.5037699 , -2.8812122 , -0.11693893,
       -0.51311666,  3.1224    ,  0.46978405, -0.4427654 , -2.5400903 ,
        2.0880878 ,  3.123557  ,  0.8703581 , -1.0431769 , -2.8512125 ,
        2.2627175 ,  1.0080537 ,  0.1098367 ,  1.5881126 , -1.870272  ],
      dtype=float32)

In [0]:
(model.wv['artificial']+model.wv['intelligence'])/2

In [0]:
model.wv['artificial_intelligence']

#### Compare vectors to ACM categories

In [0]:
AI_categories = [
              'Artificial Intelligence Applications - Expert Systems',
              'Automatic Programming',
              'Deduction - Theorem Proving',
              'Knowledge Representation Formalisms - Knowledge Representation Methods',
              'Programming Languages - Software',
              'Learning',
              'Natural Language Processing',
              'Problem Solving - Control Methods - Search',
              'Robotics',
              'Vision - Scene Understanding',
              'Distributed Artificial Intelligence',
              'ARTIFICIAL INTELLIGENCE'
]

In [0]:
categories = [
              'Natural language processing - Information extraction - Machine translation - Discourse, dialogue  pragmatics - Natural language generation - Speech recognition - Lexical semantics - Phonology / morphology',
              'Knowledge representation reasoning - Description logics - Semantic networks Nonmonotonic default reasoning  belief revision - Probabilistic reasoning - Vagueness fuzzy logic - Causal reasoning  diagnostics - Temporal reasoning - Cognitive robotics - Ontology engineering - Logic programming answer set programming - Spatial  physical reasoning - Reasoning about belief  knowledge',
              'Planning  scheduling - Planning for deterministic actions - Planning under uncertainty - Multi-agent planning - Planning  abstraction  generalization - Robotic planning - Evolutionary robotics',
              'Search methodologies - Heuristic function construction - Discrete space search - Continuous space search - Randomized search - Game tree search - Abstraction  micro-operators - Search with partial observations - ',
              'Control methods - Robotic planning - Evolutionary robotics - Computational control theory - Motion path planning',
              'Philosophical theoretical foundations artificial intelligence - Cognitive science - Theory mind',
              'Distributed artificial intelligence - Multi agent systems - Intelligent agents - Mobile agents - Cooperation  coordination',
              'Computer vision - Biometrics - Scene understanding - Activity recognition  understanding - Video summarization - Visual content based indexing  retrieval - Visual inspection - Vision for robotics - Scene anomaly detection - Image  video acquisition - Camera calibration - Epipolar geometry - Computational photography - Hyperspectral imaging - Motion capture - 3D imaging - Active vision - Image representations - Shape representations - Appearance  texture - Hierarchical representations - Computer vision problems - Interest point  salient region detections - Image segmentation  - Video segmentation - Shape inference - Object detection - Object recognition - Object identification - Tracking - Reconstruction - Matching',
              
              'Learning paradigms - Supervised learning - Ranking - Learning to rank - Supervised learning  classification - Supervised learning  regression - Structured outputs - Cost sensitive learning - Unsupervised learning - Cluster analysis - Anomaly detection - Mixture modeling - Topic modeling - Source separation - Motif discovery - Dimensionality reduction  manifold learning - Reinforcement learning - Sequential decision making - Inverse reinforcement learning - Apprenticeship learning - Multi-agent reinforcement learning - Adversarial learning - Multi-task learning - Transfer learning - Lifelong machine learning - Learning under covariate shift',
              'Learning settings - Batch learning - Online learning settings - Learning from demonstrations - Learning from critiques - Learning from implicit feedback - Active learning settings - Semi supervised learning settings',
              'Machine learning approaches - Classification  regression trees - Kernel methods - Support vector machines - Gaussian processes - Neural networks - Logical  relational learning - Inductive logic learning - Statistical relational learning - Learning in probabilistic graphical models - Maximum likelihood modeling - Maximum entropy modeling - Maximum a posteriori modeling - Mixture models - Latent variable models - Bayesian network models - Learning linear models - Perceptron algorithm - Factorization methods - Non-negative matrix factorization - Factor analysis - Principal component analysis - Canonical correlation analysis - Latent Dirichlet allocation - Rule learning - Instance-based learning - Markov decision processes -  Markov decision processes - Stochastic games - Learning latent representations - Deep belief networks - Bio inspired approaches - Artificial life - Evolvable hardware - Genetic algorithms - Genetic programming - Evolutionary robotics - Generative  developmental approaches',
              'Machine learning algorithms - Dynamic programming Markov decision processes - Value iteration - Q learning - Policy iteration - Temporal difference learning - Approximate dynamic programming methods - Ensemble methods - Boosting - Bagging - Spectral methods - Feature selection - Regularization',
]

In [0]:
AI_vectors = []
labels = []
for item in categories:
    vector_tmp = []
    label = item.split('-')[0]
    for phrase in item.split('-'):
        phrase = phrase.lower().strip()
        # print(phrase)
        vector_tmp.append(model.wv[phrase])
    # print('---')
    AI_vectors.append(list(np.array(vector_tmp).mean(axis=0)))
    labels.append(label)
print(AI_vectors)

[[-4.2585297, -1.09474, -0.51740396, -1.5583799, -1.277385, 1.2230147, 3.6439853, -1.6600533, 1.4912666, -2.3865879, 1.2189151, 0.5719612, 2.7518144, -2.781408, -2.061385, 2.1494553, 1.0412952, 2.237492, 0.14161092, -1.4961629, 1.8666816, -1.0945007, -0.40818986, -1.4722152, -0.20459145, 2.328488, -0.7185293, 0.63060343, -2.3967595, -1.0597452, 1.1229446, 1.805113, 1.015998, -2.1873384, -0.12749422, -0.016316757, 1.9769442, 0.9840427, -1.7754203, -3.5659814, 3.1010442, 2.0076227, 0.61076725, -0.32515174, -3.6688373, -1.054969, 2.3143127, 0.7147262, 1.1658139, 0.8725452], [-4.7960773, -1.7704331, -0.17159314, -1.5654887, -1.696463, 1.3633636, 2.3642745, -0.60040635, 1.4829278, -2.4751508, 1.0201765, 0.6441909, 2.688061, -2.4529297, 0.47901928, 2.3566878, 1.1981105, 0.46998572, 0.58817506, -0.64123666, 2.3364296, -2.3828585, 0.27646962, -0.96989816, 0.2530441, 1.0085167, -0.6223591, 0.35445678, -2.2382672, -0.57872456, 1.659824, 2.0099118, 2.595362, -0.9997726, -0.09018091, 0.8816102, 2.

In [0]:
pd.DataFrame(AI_vectors).to_csv(datapath+'FastText doc clusters - SIP/50D/classification/ACM_classifications_vectors')
pd.DataFrame(labels,columns=['label']).to_csv(datapath+'FastText doc clusters - SIP/50D/classification/ACM_classifications_labels')

### Get Word Embeddings

##### No n-gram handle

In [0]:
# Save in a dict
comment_embedding = ''
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    output_dict[token] = str(model.wv[token])

100%|██████████| 3184174/3184174 [27:52<00:00, 1903.51it/s]


##### Manual n-gram handle

In [0]:
# Save in a dict
comment_embedding = 'average_manual '
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    token_split = token.split(' ')
    if len(token_split) > 0:
        tmp_vector_grams = []
        for item in token_split:
            tmp_vector_grams.append(model.wv[item])
        output_dict[token] = str(list(np.array(tmp_vector_grams).mean(axis=0)))
    else:
        output_dict[token] = str(model.wv[token])

100%|██████████| 1590/1590 [00:00<00:00, 10574.09it/s]


##### Save to disk

In [0]:
# Save embeddings to disk
with open(directory+'/FastText vector with n-grams '+comment_embedding+period+'.json', 'w') as json_file:
    json.dump(output_dict, json_file)

In [0]:
print(directory)

drive/My Drive/Data/corpus/improved_copyr_lemmatized_stopwords_removed_thesaurus_n-grams/


### Get Doc Embeddings (SIF)

It is not recommended to perform this on cloud, as it is not process intesive, yet takes too long depending on the doc-count. It might take over 30 hours.

Make a probability dictionary

In [17]:
corpus_tokens_s = pd.Series(corpus_tokens)
corpus_tokens_probabilities = (corpus_tokens_s.groupby(corpus_tokens_s).transform('count') / len(corpus_tokens_s)).values
corpus_tokens_probabilities = pd.DataFrame(corpus_tokens_probabilities)
corpus_tokens_probabilities['tokens'] = corpus_tokens_s
corpus_tokens_probabilities.columns = ['probability','token']
corpus_tokens_probabilities = corpus_tokens_probabilities.groupby('token').mean()
corpus_tokens_probabilities = corpus_tokens_probabilities.reset_index()
corpus_tokens_probabilities.columns = ['token','probability']

Get vectors

In [18]:
vectors = []
for token in tqdm(corpus_tokens_probabilities['token'],total=corpus_tokens_probabilities.shape[0]):
    vectors.append(model.wv[token])
corpus_tokens_probabilities['vector'] = vectors

100%|██████████| 21794/21794 [00:00<00:00, 139978.84it/s]


Calculate weighted average vectors

In [19]:
a = 1e-3
embedding_size = 50

doc_set = []
for doc in tqdm(corpus['abstracts'].values.tolist(),total=len(corpus['abstracts'].values.tolist())):
    vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
    doc_length = len(doc.split())
#     print(doc.split())
    for word in doc.split():
        a_value = a / (a + corpus_tokens_probabilities[corpus_tokens_probabilities['token']==word]['probability'].values.tolist()[0])  # smooth inverse frequency, SIF
        vs = np.add(vs, np.multiply(a_value, corpus_tokens_probabilities[corpus_tokens_probabilities['token']==word]['vector'].values.tolist()[0]))  # vs += sif * word_vector

    vs = np.divide(vs, doc_length)  # weighted average
    doc_set.append(vs)  # add to our existing re-calculated set of sentences


100%|██████████| 6218/6218 [1:16:11<00:00,  1.07s/it]


In [20]:
pd.DataFrame(doc_set).to_csv(datapath+'Document Embedding/50D/'+file_name,index=False)

# Train on a large Scopus corpus


#### Load Corpus Sentences

In [0]:
sentence_corpus = pd.read_csv(datapath+'corpus/AI ALL/1900-2019 corpus sentences abstract-title')

#### Preprocess and prepare corpus for FastText training

In [0]:
sentences = []
lemmatizer=WordNetLemmatizer()
with open(datapath+'corpus/AI ALL/1900-2019 corpus sentences abstract-title further processed.csv', 'w') as f:
    for index,row in tqdm(sentence_corpus.iterrows(),total=sentence_corpus.shape[0]):
        sentence = row['sentence']
        sentence = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",sentence)
        sentence = word_tokenize(sentence)
        sentence = [word for word in sentence if not word in punkts] 
        sentence = [lemmatizer.lemmatize(word) for word in sentence]
        # sentences.append(sentence)
        f.write("%s\n" % ' '.join(sentence))

100%|██████████| 13307816/13307816 [1:54:08<00:00, 1943.10it/s]


#### Save sentences to disk for future use -- Not needed anymore

In [0]:
pd.DataFrame([' '.join(words) for words in sentences],columns=['sentences']).to_csv(
    datapath+'corpus/AI ALL/1900-2019 corpus sentences abstract-title further processed.csv',
    header=True,index=False)

#### Load pre-processed sentences

In [0]:
sentence_corpus = pd.read_csv(datapath+'corpus/AI ALL/1900-2019 corpus sentences abstract-title further processed.csv',delimiter=";;;")
sentence_corpus.columns = ["sentences"]
sentences = []
sentence_corpus = sentence_corpus.fillna('')
for index,row in tqdm(sentence_corpus.iterrows(),total=sentence_corpus.shape[0]):
    sentences.append(row['sentences'].split(' '))

100%|██████████| 13230009/13230009 [28:30<00:00, 7736.00it/s]


In [0]:
sentence_corpus.head(10)

Unnamed: 0,sentences
0,circuit sequential circuit or vlsi chip realiz...
1,the design of efficient hardware is a fundamen...
2,because of the large cost for the physical con...
3,for these purpose data structure for boolean f...
4,the corresponding state of the art data struct...
5,efficient algorithm for the operation on obdds...
6,a generalized data structure called graph-driv...
7,the new data structure allows for many importa...
8,efficient algorithm for the operation on graph...
9,consider a finite graph g v e


### Train Fasttext - Gensim

#### Load a model to continue training

* If want to continue training, run this section

In [0]:
model = load(gensim_model_address)

In [0]:
model.build_vocab(sentences, update=True)
model.train(sentences, total_examples=len(sentences), epochs=model.epochs)

* Otherwise run this section

#### Train

In [0]:
model = fasttext_gensim(min_n=3, max_n=6, size=15, window=5, min_count=1, seed = 50)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)

In [0]:
fname = "datapath+Models/fasttext-scopus-2.2-million_docs-gensim 15D.model"
model.save(fname)

In [0]:
model = fasttext_gensim(min_n=3, max_n=6, size=50, window=5, min_count=1, seed = 50)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)
fname = datapath+"Models/fasttext-scopus-2.2-million_docs-gensim 50D.model"
model.save(fname)

In [0]:
model = fasttext_gensim(min_n=3, max_n=6, size=100, window=5, min_count=1, seed = 50)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)
fname = datapath+"Models/fasttext-scopus-2.2-million_docs-gensim 50D.model"
model.save(fname)

#### Test model

In [0]:
similarities = model.wv.most_similar(positive=['logic','fuzzy','expert'],negative=['deep','neural','network','cnn','ann'])
most_similar = similarities[0]

In [0]:
most_similar

('mam-rnn', 0.9763791561126709)

In [0]:
not_matching = model.wv.doesnt_match("human computer interface tree".split())

In [0]:
not_matching

'tree'

In [0]:
sim_score = model.wv.similarity('computer', 'human')

In [0]:
sim_score

0.7571839

In [0]:
print(model.wv['artificial intelligence'])
print(model.wv['artificial'])
print(model.wv['intelligence'])

[ 1.2783480e+00 -4.2018552e+00  7.1276689e-01  4.2023015e+00
 -5.0359420e-03  4.4385982e+00  6.2421050e+00 -8.9032326e+00
  1.7556003e+00  1.3425230e+00  9.4295764e-01 -4.4485557e-01
 -5.8648558e+00  2.6428668e+00 -1.2076639e+00]
[  3.7072854   -3.616749     1.3040072    0.234361    -2.753659
   7.528801    14.293305   -14.688236     5.3885765    6.496681
   1.9917868    2.855616    -0.05153261   7.8660994   -2.22459   ]
[  0.28606984  -6.971052    -0.9232919   11.48035      0.2561571
   4.084776     2.4220266   -8.616226     0.94255084  -2.2498865
   1.7112938   -3.370861   -12.577294    -1.1608386   -0.04991044]


### Train Fasttext - Facebook

In [0]:
sentences_joined = ' '.join(sentences)
model = fasttext.train_unsupervised(sentences_joined, "cbow", minn=2, maxn=5, dim=50, epoch=10,lr=0.05)

#### Test model

In [0]:
model.words

In [0]:
model.get_word_vector("the")

In [0]:
model.get_nearest_neighbors('asparagus')

In [0]:
model.get_analogies("intelligence", "math", "fuzzy")

#### Save model

In [0]:
model.save_model(datapath+"fasttext-scopus_wos-merged-310k_docs-facebook.ftz")