<a href="https://colab.research.google.com/github/sahandv/science_science/blob/master/FastText_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FASTTEXT EMBEDDING


## Init

### Import Libs

In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import gc
import json
import re

# import fasttext
from pyemd import emd
from gensim.similarities import WmdSimilarity
from gensim.models import FastText as fasttext_gensim
from gensim.test.utils import get_tmpfile

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sciosci.assets import keyword_assets as kw
from sciosci.assets import generic_assets as sci
from sciosci.assets import advanced_assets as aa


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
stops = ['a','an','we','result','however','yet','since','previously','although','propose','proposed','e_g','method',
         'published_elsevier','b','v','problem','paper','approach','within','with','by','via','way','t','case','issue','level','area','system',
         'work','discussed','seen','put','usually','take','make','author','versus','enables','result','research','design','based']
punkts = [' ','','(',')','[',']','{','}','.',',','!','?','<','>','-','_',':',';','\\','/','|','&','%',"'s","`s",'#','$','@','≅','=']

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = list(set(stopwords.words("english")))+stops+punkts
np.random.seed(50)

[nltk_data] Downloading package stopwords to /home/sahand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sahand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sahand/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Get embeddings from a pre-trained model


### Load Corpus

In [63]:
period = '2014-2016'
percentile = 97

#### Option A - Load corpus

In [64]:
directory = '/home/sahand/Data/corpus/improved_copyr_thesaurus/n-grams/'
file_name = period+' corpus abstract-title'
corpus = pd.read_csv(directory+file_name,names=['abstracts'])
corpus_tokens = [item for sublist in corpus['abstracts'].values.tolist() for item in sublist.split()]
gc.collect()

1580225

#### Option B - Load keywords

In [0]:
directory = 'drive/My Drive/Data/LDA/'
file_name = period+' top_90-percentile_keywords_terms.csv'
corpus = pd.read_csv(directory+file_name)
corpus = corpus.fillna('this_is_null')
corpus_tokens = []
for idx,row in tqdm(corpus.iterrows(),total=corpus.shape[0]):
    for token in row.values.tolist():
        if token != 'this_is_null': 
            corpus_tokens.append(token) 
del corpus
print("\nNumber of unique tokens:",len(corpus_tokens))

100%|██████████| 6546/6546 [00:00<00:00, 8418.10it/s]


Number of unique tokens: 52365





#### Option C - Load author keywords

In [0]:
directory = 'drive/My Drive/Data/Author keywords - 29 Oct 2019/'
file_name = period+' keyword frequency'
corpus = pd.read_csv(directory+file_name,names=['keyword','frequency'])
corpus = corpus.fillna('this_is_null')
threshold = np.percentile(corpus['frequency'].values,percentile)
corpus = corpus[corpus['frequency']>threshold]

corpus_tokens = []
for idx,row in tqdm(corpus.iterrows(),total=corpus.shape[0]):
    if row['keyword'] != 'this_is_null': 
        corpus_tokens.append(row['keyword']) 
print("\nNumber of unique tokens:",len(corpus_tokens))

100%|██████████| 1376/1376 [00:00<00:00, 9769.37it/s]


Number of unique tokens: 1376





## Facebook Model

#### Load model

In [0]:
fb_model_address = 'drive/My Drive/Data-Permenant/FastText-crawl-300d-2M-subword/crawl-300d-2M-subword.bin'

In [0]:
model = fasttext.load_model(fb_model_address)




#### Get embeddings

*   Save to dictionary and json
*   This takes much less space on disk


##### No n-gram handle

In [0]:
# Save in a dict
output_dict = {}
comment_embedding = ''
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    output_dict[token] = str(model.get_word_vector(token))

100%|██████████| 280/280 [00:00<00:00, 337.81it/s]


##### Manual n-gram handle

In [0]:
# Save in a dict
comment_embedding = 'average_manual '
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    token_split = token.split(' ')
    if len(token_split) > 0:
        tmp_vector_grams = []
        for item in token_split:
            tmp_vector_grams.append(model.get_word_vector(item))
        output_dict[token] = str(list(np.array(tmp_vector_grams).mean(axis=0)))
    else:
        output_dict[token] = str(model.get_word_vector(item))

100%|██████████| 280/280 [00:00<00:00, 3792.22it/s]


##### Save to disk

In [0]:
# Save embeddings to disk
with open(directory+'vectors/100D/FastText vector '+comment_embedding+period+'.json', 'w') as json_file:
    json.dump(output_dict, json_file)

#### Get embeddings (alternative) : 

*   save to a list instead of a dicktionary and csv
*   Will have many redundant words in it and will take lots of disk space




In [0]:
# Save in a list
batches = 1000
batch_size = len(corpus_tokens)/batches

for step in tqdm(range(batches),total=batches):
    batch_tokens = corpus_tokens[int(step*batch_size):int((step+1)*batch_size)]
    corpus_vectors = [model.get_word_vector(x) for x in batch_tokens]
    corpus_vectors = pd.DataFrame(corpus_vectors)
    corpus_vectors['tokens'] = batch_tokens

    # Save embeddings to disk
    with open(directory+'vector '+period,'a') as f:
        corpus_vectors.to_csv(f,index=False,header=False)

## Gensim Model

#### Load model

Load gensim model

In [65]:
gensim_model_address = '/home/sahand/Data/FastText Models/50D w1/fasttext-scopus-2.2-million_docs-gensim 50D-w1.model'
model = fasttext_gensim.load(gensim_model_address)

Simple Tests

In [66]:
print('intelligence' in model.wv.vocab)
print(model.similarity("machine learning", "artificial intelligence"))
print(model.most_similar(positive=['baghdad', 'england'], negative=['london']))
print(model.n_similarity(['neural network','deep learning'], ['ann']))
print(model.wmdistance(['stop', 'word', 'removed', 'tokens', 'of', 'sentence 1'], ['stop word removed tokens of sentence 2']))

True
0.80139047
[('bagha', 0.784936785697937), ('bagua', 0.7760497331619263), ('bagherzandi', 0.7488542795181274), ('bagaen', 0.7457777857780457), ('bagheria', 0.7434571385383606), ('baghi', 0.7406474351882935), ('bagana', 0.7358278632164001), ('bagci', 0.7353381514549255), ('kainji', 0.7331032156944275), ('we-lbv', 0.7320331335067749)]
0.73269963
25.106913131596503


In [49]:
# get distance of two words
from scipy import spatial,sparse,sign
vec_a = (model.wv['machine']+model.wv['learning'])/2
vec_b = (model.wv['artificial']+model.wv['intelligence'])/2
distance_tmp = spatial.distance.cosine(vec_a, vec_b)
similarity_tmp = 1 - distance_tmp
similarity_tmp

0.7625598907470703

In [50]:
vec_a = model.wv["machine learning"]
vec_b = model.wv["artificial intelligence"]
distance_tmp = spatial.distance.cosine(vec_a, vec_b)
similarity_tmp = 1 - distance_tmp
similarity_tmp

0.801390528678894

In [53]:
model.wv["machine learn"]

array([-0.45623803,  1.6125572 ,  1.7761922 , -0.7947932 , -5.7335396 ,
        4.5503316 ,  2.6465545 , -1.6921854 ,  0.10005848,  2.6700506 ,
       -2.770975  , -0.0614607 ,  0.65475965, -2.7329035 ,  5.578808  ,
       -2.5066867 , -1.8912905 ,  0.3634988 ,  2.7395883 , -5.5817003 ,
        0.6066571 , -2.2596807 , -2.6291995 ,  0.7854226 ,  1.9896873 ,
        2.4007287 , -1.1938859 ,  2.7635882 ,  0.4408575 , -1.9049958 ,
        0.526986  , -1.5615294 ,  1.6513894 ,  0.6793895 ,  0.5717171 ,
       -2.8145156 ,  0.07670619,  1.4759918 , -0.63885576, -3.3751516 ,
       -2.610316  , -0.7968178 ,  1.3870629 ,  0.16686264,  1.9244195 ,
       -4.654755  , -1.3727199 ,  2.095118  , -2.8104064 ,  0.7448188 ],
      dtype=float32)

In [54]:
(model.wv['machine']+model.wv['learn'])/2

array([-0.24606967,  0.02924442,  2.606117  , -1.927406  , -9.501593  ,
        6.6716948 ,  1.722619  , -2.0595136 , -0.7732086 ,  4.7558193 ,
       -7.045576  , -1.5984142 ,  2.2007296 , -6.8144236 ,  9.329351  ,
       -4.3544493 , -2.220007  , -1.5806001 ,  5.610407  , -8.2393265 ,
        0.7217357 , -3.5028415 , -3.4166799 ,  1.1223906 ,  6.230218  ,
        3.7619758 , -2.4158576 ,  4.420979  , -0.49613547, -3.1977272 ,
        0.13617638, -4.364634  ,  3.0377398 ,  0.69782805, -0.01590317,
       -2.9101572 ,  1.7207272 ,  1.0152985 , -2.8919737 , -3.701437  ,
       -4.3352633 , -3.248755  ,  1.6562223 , -0.13792694,  2.2225308 ,
       -9.367432  , -2.4254265 ,  4.229023  , -5.2904277 ,  1.785675  ],
      dtype=float32)

In [0]:
AI_categories = [
              'Artificial Intelligence Applications - Expert Systems',
              'Automatic Programming',
              'Deduction - Theorem Proving',
              'Knowledge Representation Formalisms - Knowledge Representation Methods',
              'Programming Languages - Software',
              'Learning',
              'Natural Language Processing',
              'Problem Solving - Control Methods - Search',
              'Robotics',
              'Vision - Scene Understanding',
              'Distributed Artificial Intelligence',
              'ARTIFICIAL INTELLIGENCE'
]

In [0]:
categories = [
              'Natural language processing - Information extraction - Machine translation - Discourse, dialogue  pragmatics - Natural language generation - Speech recognition - Lexical semantics - Phonology / morphology',
              'Knowledge representation reasoning - Description logics - Semantic networks Nonmonotonic default reasoning  belief revision - Probabilistic reasoning - Vagueness fuzzy logic - Causal reasoning  diagnostics - Temporal reasoning - Cognitive robotics - Ontology engineering - Logic programming answer set programming - Spatial  physical reasoning - Reasoning about belief  knowledge',
              'Planning  scheduling - Planning for deterministic actions - Planning under uncertainty - Multi-agent planning - Planning  abstraction  generalization - Robotic planning - Evolutionary robotics',
              'Search methodologies - Heuristic function construction - Discrete space search - Continuous space search - Randomized search - Game tree search - Abstraction  micro-operators - Search with partial observations - ',
              'Control methods - Robotic planning - Evolutionary robotics - Computational control theory - Motion path planning',
              'Philosophical theoretical foundations artificial intelligence - Cognitive science - Theory mind',
              'Distributed artificial intelligence - Multi agent systems - Intelligent agents - Mobile agents - Cooperation  coordination',
              'Computer vision - Biometrics - Scene understanding - Activity recognition  understanding - Video summarization - Visual content based indexing  retrieval - Visual inspection - Vision for robotics - Scene anomaly detection - Image  video acquisition - Camera calibration - Epipolar geometry - Computational photography - Hyperspectral imaging - Motion capture - 3D imaging - Active vision - Image representations - Shape representations - Appearance  texture - Hierarchical representations - Computer vision problems - Interest point  salient region detections - Image segmentation  - Video segmentation - Shape inference - Object detection - Object recognition - Object identification - Tracking - Reconstruction - Matching',
              
              'Learning paradigms - Supervised learning - Ranking - Learning to rank - Supervised learning  classification - Supervised learning  regression - Structured outputs - Cost sensitive learning - Unsupervised learning - Cluster analysis - Anomaly detection - Mixture modeling - Topic modeling - Source separation - Motif discovery - Dimensionality reduction  manifold learning - Reinforcement learning - Sequential decision making - Inverse reinforcement learning - Apprenticeship learning - Multi-agent reinforcement learning - Adversarial learning - Multi-task learning - Transfer learning - Lifelong machine learning - Learning under covariate shift',
              'Learning settings - Batch learning - Online learning settings - Learning from demonstrations - Learning from critiques - Learning from implicit feedback - Active learning settings - Semi supervised learning settings',
              'Machine learning approaches - Classification  regression trees - Kernel methods - Support vector machines - Gaussian processes - Neural networks - Logical  relational learning - Inductive logic learning - Statistical relational learning - Learning in probabilistic graphical models - Maximum likelihood modeling - Maximum entropy modeling - Maximum a posteriori modeling - Mixture models - Latent variable models - Bayesian network models - Learning linear models - Perceptron algorithm - Factorization methods - Non-negative matrix factorization - Factor analysis - Principal component analysis - Canonical correlation analysis - Latent Dirichlet allocation - Rule learning - Instance-based learning - Markov decision processes -  Markov decision processes - Stochastic games - Learning latent representations - Deep belief networks - Bio inspired approaches - Artificial life - Evolvable hardware - Genetic algorithms - Genetic programming - Evolutionary robotics - Generative  developmental approaches',
              'Machine learning algorithms - Dynamic programming Markov decision processes - Value iteration - Q learning - Policy iteration - Temporal difference learning - Approximate dynamic programming methods - Ensemble methods - Boosting - Bagging - Spectral methods - Feature selection - Regularization',
]

In [15]:
AI_vectors = []
labels = []
for item in categories:
    vector_tmp = []
    label = item.split('-')[0]
    for phrase in item.split('-'):
        phrase = phrase.lower().strip()
        # print(phrase)
        vector_tmp.append(model.wv[phrase])
    # print('---')
    AI_vectors.append(list(np.array(vector_tmp).mean(axis=0)))
    labels.append(label)
print(AI_vectors)

[[-4.2585297, -1.09474, -0.51740396, -1.5583799, -1.277385, 1.2230147, 3.6439853, -1.6600533, 1.4912666, -2.3865879, 1.2189151, 0.5719612, 2.7518144, -2.781408, -2.061385, 2.1494553, 1.0412952, 2.237492, 0.14161092, -1.4961629, 1.8666816, -1.0945007, -0.40818986, -1.4722152, -0.20459145, 2.328488, -0.7185293, 0.63060343, -2.3967595, -1.0597452, 1.1229446, 1.805113, 1.015998, -2.1873384, -0.12749422, -0.016316757, 1.9769442, 0.9840427, -1.7754203, -3.5659814, 3.1010442, 2.0076227, 0.61076725, -0.32515174, -3.6688373, -1.054969, 2.3143127, 0.7147262, 1.1658139, 0.8725452], [-4.7960773, -1.7704331, -0.17159314, -1.5654887, -1.696463, 1.3633636, 2.3642745, -0.60040635, 1.4829278, -2.4751508, 1.0201765, 0.6441909, 2.688061, -2.4529297, 0.47901928, 2.3566878, 1.1981105, 0.46998572, 0.58817506, -0.64123666, 2.3364296, -2.3828585, 0.27646962, -0.96989816, 0.2530441, 1.0085167, -0.6223591, 0.35445678, -2.2382672, -0.57872456, 1.659824, 2.0099118, 2.595362, -0.9997726, -0.09018091, 0.8816102, 2.

In [0]:
pd.DataFrame(AI_vectors).to_csv('drive/My Drive/Data/FastText doc clusters - SIP/50D/classification/ACM_classifications_vectors')
pd.DataFrame(labels,columns=['label']).to_csv('drive/My Drive/Data/FastText doc clusters - SIP/50D/classification/ACM_classifications_labels')

### Get Word Embeddings

##### No n-gram handle

In [0]:
# Save in a dict
comment_embedding = ''
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    output_dict[token] = str(model.wv[token])

100%|██████████| 3184174/3184174 [27:52<00:00, 1903.51it/s]


##### Manual n-gram handle

In [0]:
# Save in a dict
comment_embedding = 'average_manual '
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    token_split = token.split(' ')
    if len(token_split) > 0:
        tmp_vector_grams = []
        for item in token_split:
            tmp_vector_grams.append(model.wv[item])
        output_dict[token] = str(list(np.array(tmp_vector_grams).mean(axis=0)))
    else:
        output_dict[token] = str(model.wv[token])

100%|██████████| 1590/1590 [00:00<00:00, 10574.09it/s]


##### Save to disk

In [0]:
# Save embeddings to disk
with open(directory+'/FastText vector with n-grams '+comment_embedding+period+'.json', 'w') as json_file:
    json.dump(output_dict, json_file)

In [0]:
print(directory)

drive/My Drive/Data/corpus/improved_copyr_lemmatized_stopwords_removed_thesaurus_n-grams/


### Get Doc Embeddings (SIF)

It is not recommended to perform this on cloud, as it is not process intesive, yet takes too long depending on the doc-count. It might take over 30 hours.

Make a probability dictionary

In [67]:
corpus_tokens_s = pd.Series(corpus_tokens)
corpus_tokens_probabilities = (corpus_tokens_s.groupby(corpus_tokens_s).transform('count') / len(corpus_tokens_s)).values
corpus_tokens_probabilities = pd.DataFrame(corpus_tokens_probabilities)
corpus_tokens_probabilities['tokens'] = corpus_tokens_s
corpus_tokens_probabilities.columns = ['probability','token']
corpus_tokens_probabilities = corpus_tokens_probabilities.groupby('token').mean()
corpus_tokens_probabilities = corpus_tokens_probabilities.reset_index()
corpus_tokens_probabilities.columns = ['token','probability']

In [68]:
corpus_tokens_probabilities

Unnamed: 0,token,probability
0,aa,0.000039
1,aaa,0.000002
2,aaai,0.000010
3,aaai_conference,0.000015
4,aaai_fall,0.000003
...,...,...
34364,zrm,0.000005
34365,zro,0.000003
34366,zsm,0.000003
34367,zurich,0.000002


Get vectors

In [69]:
vectors = []
for token in tqdm(corpus_tokens_probabilities['token'],total=corpus_tokens_probabilities.shape[0]):
    phrase = token.replace("_", " ")
    phrase = phrase.lower().strip()
    phrase = phrase.split()
    gram_vecs = []
    for gram in phrase:
        gram_vecs.append(model.wv[gram])
    phrase_vec = np.array(gram_vecs).mean(axis=0)
    vectors.append(phrase_vec)
corpus_tokens_probabilities['vector'] = vectors
len(corpus_tokens_probabilities)



  0%|          | 0/34369 [00:00<?, ?it/s][A[A

 11%|█         | 3613/34369 [00:00<00:00, 36128.14it/s][A[A

 22%|██▏       | 7546/34369 [00:00<00:00, 37004.49it/s][A[A

 35%|███▍      | 11958/34369 [00:00<00:00, 38884.76it/s][A[A

 48%|████▊     | 16344/34369 [00:00<00:00, 40253.14it/s][A[A

 60%|██████    | 20647/34369 [00:00<00:00, 41047.58it/s][A[A

 70%|███████   | 24172/34369 [00:00<00:00, 38259.63it/s][A[A

 80%|████████  | 27630/34369 [00:00<00:00, 35683.70it/s][A[A

100%|██████████| 34369/34369 [00:00<00:00, 38439.71it/s][A[A


34369

In [70]:
corpus

Unnamed: 0,abstracts
0,human machine interaction facial_expression_re...
1,rule fuzzy_cognitive map natural_language_proc...
2,fog_computing architecture healthcare wireless...
3,argumentation knowledge_representation conflic...
4,hydrothermal coordination power system scale i...
...,...
6213,sequential fuzzy_clustering dynamic fuzzy neur...
6214,daddy social_network_analysis car artificial_i...
6215,unconventional cognitive enhancement option ad...
6216,technological_unemployment artificial_neural_n...


Calculate weighted average vectors

In [None]:
a = 1e-3
embedding_size = 50

doc_set = []
for doc in tqdm(corpus['abstracts'].values.tolist(),total=len(corpus['abstracts'].values.tolist())):
    vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
    doc_length = len(doc.split())
#     print(doc.split())
    for word in doc.split():
        a_value = a / (a + corpus_tokens_probabilities[corpus_tokens_probabilities['token']==word]['probability'].values.tolist()[0])  # smooth inverse frequency, SIF
        vs = np.add(vs, np.multiply(a_value, corpus_tokens_probabilities[corpus_tokens_probabilities['token']==word]['vector'].values.tolist()[0]))  # vs += sif * word_vector

    vs = np.divide(vs, doc_length)  # weighted average
    doc_set.append(vs)  # add to our existing re-calculated set of sentences




  0%|          | 0/6218 [00:00<?, ?it/s][A[A

  0%|          | 1/6218 [00:01<1:50:41,  1.07s/it][A[A

  0%|          | 2/6218 [00:04<2:59:35,  1.73s/it][A[A

  0%|          | 3/6218 [00:06<3:06:03,  1.80s/it][A[A

  0%|          | 4/6218 [00:07<2:58:50,  1.73s/it][A[A

  0%|          | 5/6218 [00:09<3:02:30,  1.76s/it][A[A

  0%|          | 6/6218 [00:10<2:45:47,  1.60s/it][A[A

  0%|          | 7/6218 [00:11<2:26:24,  1.41s/it][A[A

  0%|          | 8/6218 [00:13<2:18:00,  1.33s/it][A[A

  0%|          | 9/6218 [00:13<1:59:10,  1.15s/it][A[A

  0%|          | 10/6218 [00:14<1:49:49,  1.06s/it][A[A

  0%|          | 11/6218 [00:16<2:28:45,  1.44s/it][A[A

  0%|          | 12/6218 [00:19<3:04:24,  1.78s/it][A[A

  0%|          | 13/6218 [00:21<3:10:48,  1.84s/it][A[A

  0%|          | 14/6218 [00:22<2:56:09,  1.70s/it][A[A

  0%|          | 15/6218 [00:23<2:29:27,  1.45s/it][A[A

  0%|          | 16/6218 [00:25<2:44:00,  1.59s/it][A[A

  0%|          

  2%|▏         | 138/6218 [03:01<2:11:20,  1.30s/it][A[A

  2%|▏         | 139/6218 [03:02<2:00:08,  1.19s/it][A[A

  2%|▏         | 140/6218 [03:03<1:47:38,  1.06s/it][A[A

  2%|▏         | 141/6218 [03:04<1:44:55,  1.04s/it][A[A

  2%|▏         | 142/6218 [03:05<1:52:55,  1.12s/it][A[A

  2%|▏         | 143/6218 [03:07<2:03:15,  1.22s/it][A[A

  2%|▏         | 144/6218 [03:09<2:25:24,  1.44s/it][A[A

  2%|▏         | 145/6218 [03:10<2:23:07,  1.41s/it][A[A

  2%|▏         | 146/6218 [03:11<2:08:03,  1.27s/it][A[A

  2%|▏         | 147/6218 [03:13<2:23:53,  1.42s/it][A[A

  2%|▏         | 148/6218 [03:14<2:12:30,  1.31s/it][A[A

  2%|▏         | 149/6218 [03:15<2:22:53,  1.41s/it][A[A

  2%|▏         | 150/6218 [03:17<2:34:48,  1.53s/it][A[A

  2%|▏         | 151/6218 [03:19<2:37:53,  1.56s/it][A[A

  2%|▏         | 152/6218 [03:21<2:59:24,  1.77s/it][A[A

  2%|▏         | 153/6218 [03:23<3:00:45,  1.79s/it][A[A

  2%|▏         | 154/6218 [03:24<2:42:58

  4%|▍         | 274/6218 [06:18<2:06:03,  1.27s/it][A[A

  4%|▍         | 275/6218 [06:20<2:03:38,  1.25s/it][A[A

  4%|▍         | 276/6218 [06:20<1:55:51,  1.17s/it][A[A

  4%|▍         | 277/6218 [06:21<1:44:04,  1.05s/it][A[A

  4%|▍         | 278/6218 [06:23<1:56:47,  1.18s/it][A[A

  4%|▍         | 279/6218 [06:24<1:49:26,  1.11s/it][A[A

  5%|▍         | 280/6218 [06:25<1:52:22,  1.14s/it][A[A

  5%|▍         | 281/6218 [06:28<2:40:59,  1.63s/it][A[A

  5%|▍         | 282/6218 [06:29<2:31:45,  1.53s/it][A[A

  5%|▍         | 283/6218 [06:30<2:26:45,  1.48s/it][A[A

  5%|▍         | 284/6218 [06:32<2:38:49,  1.61s/it][A[A

  5%|▍         | 285/6218 [06:33<2:27:07,  1.49s/it][A[A

  5%|▍         | 286/6218 [06:35<2:23:18,  1.45s/it][A[A

  5%|▍         | 287/6218 [06:36<2:16:15,  1.38s/it][A[A

  5%|▍         | 288/6218 [06:38<2:27:19,  1.49s/it][A[A

  5%|▍         | 289/6218 [06:38<2:04:29,  1.26s/it][A[A

  5%|▍         | 290/6218 [06:40<2:02:33

  7%|▋         | 410/6218 [09:31<2:32:22,  1.57s/it][A[A

In [None]:
pd.DataFrame(doc_set).to_csv('/home/sahand/Data/corpus/improved_copyr_thesaurus/n-grams/vectors/'+period+' vectors SIF',index=False)

# Train on a large Scopus corpus


#### Load Corpus Sentences

In [4]:
sentence_corpus = pd.read_csv('/home/sahand/Data/1900-2019 corpus sentences abstract-title')

#### Preprocess and prepare corpus for FastText training

In [5]:
sentences = []
lemmatizer=WordNetLemmatizer()
with open('/home/sahand/Data/1900-2019 corpus sentences abstract-title further processed - no lem - w1.csv', 'w') as f:
    for index,row in tqdm(sentence_corpus.iterrows(),total=sentence_corpus.shape[0]):
        sentence = row['sentence']
        sentence = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",sentence)
        sentence = word_tokenize(sentence)
        sentence = [word for word in sentence if (not word in punkts) and len(word)>1] 
#         sentence = [lemmatizer.lemmatize(word) for word in sentence]
        # sentences.append(sentence)
        f.write("%s\n" % ' '.join(sentence))

100%|██████████| 13307816/13307816 [1:20:43<00:00, 2747.84it/s]


#### Save sentences to disk for future use -- Not needed anymore

In [0]:
pd.DataFrame([' '.join(words) for words in sentences],columns=['sentences']).to_csv(
    'drive/My Drive/Data/corpus/AI ALL/1900-2019 corpus sentences abstract-title further processed.csv',
    header=True,index=False)

#### Load pre-processed sentences

In [6]:
sentence_corpus = pd.read_csv('/home/sahand/Data/1900-2019 corpus sentences abstract-title further processed - no lem - w2.csv',delimiter=";;;")
sentence_corpus.columns = ["sentences"]
sentences = []
sentence_corpus = sentence_corpus.fillna('')
for index,row in tqdm(sentence_corpus.iterrows(),total=sentence_corpus.shape[0]):
    sentences.append(row['sentences'].split(' '))

100%|██████████| 13145868/13145868 [22:04<00:00, 9924.10it/s] 


In [27]:
sentence_corpus.head(10)

Unnamed: 0,article_index,sentence,year
0,0,"the hardware of computers, e.g.",1994.0
1,0,"circuits, sequential circuits or vlsi chips, r...",1994.0
2,0,the design of efficient hardware is a fundamen...,1994.0
3,0,because of the large cost for the physical con...,1994.0
4,0,for these purposes data structures for boolean...,1994.0
5,0,the corresponding state of the art data struct...,1994.0
6,0,efficient algorithms for the operations on obd...,1994.0
7,0,a generalized data structure called graph-driv...,1994.0
8,0,the new data structure allows for many importa...,1994.0
9,0,efficient algorithms for the operations on gra...,1994.0


### Train Fasttext - Gensim

#### Load a model to continue training

* If want to continue training, run this section

In [0]:
model = load(gensim_model_address)

In [0]:
model.build_vocab(sentences, update=True)
model.train(sentences, total_examples=len(sentences), epochs=model.epochs)

* Otherwise run this section

#### Train

In [0]:
model = fasttext_gensim(min_n=3, max_n=6, size=15, window=5, min_count=1, seed = 50)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)

In [0]:
fname = "drive/My Drive/Data/Models/fasttext-scopus-2.2-million_docs-gensim 15D.model"
model.save(fname)

In [7]:
model = fasttext_gensim(min_n=3, max_n=6, size=50, window=5, min_count=1, seed = 50)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)
fname = "/home/sahand/Data/models/fasttext-scopus-2.2-million_docs-gensim 50D-w1.model"
model.save(fname)

In [8]:
model = fasttext_gensim(min_n=3, max_n=6, size=100, window=5, min_count=1, seed = 50)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)
fname = "/home/sahand/Data/models/fasttext-scopus-2.2-million_docs-gensim 100D-w1.model"
model.save(fname)

#### Test model

In [0]:
similarities = model.wv.most_similar(positive=['logic','fuzzy','expert'],negative=['deep','neural','network','cnn','ann'])
most_similar = similarities[0]

In [0]:
most_similar

('mam-rnn', 0.9763791561126709)

In [0]:
not_matching = model.wv.doesnt_match("human computer interface tree".split())

In [0]:
not_matching

'tree'

In [0]:
sim_score = model.wv.similarity('computer', 'human')

In [0]:
sim_score

0.7571839

In [0]:
print(model.wv['artificial intelligence'])
print(model.wv['artificial'])
print(model.wv['intelligence'])

[ 1.2783480e+00 -4.2018552e+00  7.1276689e-01  4.2023015e+00
 -5.0359420e-03  4.4385982e+00  6.2421050e+00 -8.9032326e+00
  1.7556003e+00  1.3425230e+00  9.4295764e-01 -4.4485557e-01
 -5.8648558e+00  2.6428668e+00 -1.2076639e+00]
[  3.7072854   -3.616749     1.3040072    0.234361    -2.753659
   7.528801    14.293305   -14.688236     5.3885765    6.496681
   1.9917868    2.855616    -0.05153261   7.8660994   -2.22459   ]
[  0.28606984  -6.971052    -0.9232919   11.48035      0.2561571
   4.084776     2.4220266   -8.616226     0.94255084  -2.2498865
   1.7112938   -3.370861   -12.577294    -1.1608386   -0.04991044]


### Train Fasttext - Facebook

In [0]:
sentences_joined = ' '.join(sentences)
model = fasttext.train_unsupervised(sentences_joined, "cbow", minn=2, maxn=5, dim=50, epoch=10,lr=0.05)

#### Test model

In [0]:
model.words

In [0]:
model.get_word_vector("the")

In [0]:
model.get_nearest_neighbors('asparagus')

In [0]:
model.get_analogies("intelligence", "math", "fuzzy")

#### Save model

In [0]:
model.save_model("drive/My Drive/Data/fasttext-scopus_wos-merged-310k_docs-facebook.ftz")