<a href="https://colab.research.google.com/github/sahandv/science_science/blob/master/FastText_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FASTTEXT EMBEDDING


## Init

Local OR Colab?

In [1]:
# Root data folder; every input and output path in this notebook is built from it.
# Keep exactly one of the two assignments active, depending on where the notebook runs.
datapath = '/mnt/16A4A9BCA4A99EAD/GoogleDrive/Data/' # Local
# datapath = 'drive/My Drive/Data/' # Remote

### Clone Project Git Repo

In [None]:
# Remove any previous checkout so the clone below starts from a clean folder.
!rm -rf 'science_science'
username = "sahandv"#@param {type:"string"}
# password = ""#@param {type:"string"} 

# Clone the project repository of the given GitHub user, then list the working directory.
!git clone https://github.com/$username/science_science.git
!ls

### Mount Google Drive


In [None]:
# Colab only: mount Google Drive under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Check files!
# Lists the folder that holds the pre-trained crawl-300d-2M-subword model on the mounted drive.
!ls 'drive/My Drive/Data-Permenant/FastText-crawl-300d-2M-subword'

### Install requirements

In [None]:
# Install the Facebook fastText bindings and the project requirements.
!pip install fasttext
!pip install -r 'science_science/requirements.txt'
# gensim is installed last and pinned to 3.8.1.
!pip install gensim==3.8.1

### Import Libs

In [2]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import gc
import json
import re

# import fasttext
import gensim
from gensim.models import FastText as fasttext_gensim
from gensim.test.utils import get_tmpfile

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sciosci.assets import keyword_assets as kw
from sciosci.assets import generic_assets as sci
from sciosci.assets import advanced_assets as aa

# from science_science.sciosci.assets import keyword_assets as kw
# from science_science.sciosci.assets import generic_assets as sci
# from science_science.sciosci.assets import advanced_assets as aa

In [3]:
# Silence all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Extra stop words added on top of NLTK's English list ('result' appears twice; harmless).
stops = ['a','an','we','result','however','yet','since','previously','although','propose','proposed','e_g','method',
         'published_elsevier','b','v','problem','paper','approach','within','with','by','via','way','t','case','issue','level','area','system',
         'work','discussed','seen','put','usually','take','make','author','versus','enables','result','research','design','based']
# Punctuation-like tokens; also used later to filter tokenized sentences.
punkts = [' ','','(',')','[',']','{','}','.',',','!','?','<','>','-','_',':',';','\\','/','|','&','%',"'s","`s",'#','$','@']

# NLTK resources needed by stopwords, word_tokenize and WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Combined list: de-duplicated NLTK English stop words + custom stops + punctuation.
stop_words = list(set(stopwords.words("english")))+stops+punkts
# Seed NumPy's global random generator.
np.random.seed(50)

[nltk_data] Downloading package stopwords to /home/sahand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sahand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sahand/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Get embeddings from a pre-trained model


### Load Corpus

In [None]:
# `period` is used in input/output file names further down.
period = '2017-2018'
# Percentile of keyword frequency used as the cut-off in "Option C".
percentile = 97

#### Option A - Load corpus

In [5]:
# Option A: read the cleaned corpus (one document per row, single 'abstracts'
# column) and flatten it into one list of lower-cased, whitespace-split tokens.
directory = datapath+'Corpus/KPRIS/clean/'
file_name = 'abstract_title'#corpus abstract-title - with n-grams'
corpus = pd.read_csv(directory+file_name,names=['abstracts'])
corpus_tokens = [
    word.lower()
    for abstract in corpus['abstracts'].values.tolist()
    for word in abstract.split()
]
gc.collect()

27

In [6]:
# Display the loaded corpus frame.
corpus

Unnamed: 0,abstracts
0,light system disclos compris plural imag proje...
1,photograph devic elimin photometr error occur ...
2,print demand camera system camera unit incorpo...
3,simplifi camera mechan method allow oper selec...
4,beam splitter digit camera split light pass ph...
...,...
19705,display engin video graphic system includ proc...
19706,facilit interact may enabl commun protocol api...
19707,present invent gener direct system method tran...
19708,disclos system produc imag includ applic progr...


#### Option B - Load keywords

In [None]:
# Option B: use the LDA keyword terms of the selected period as the token list.
directory = datapath+'LDA/'
file_name = period+' top_90-percentile_keywords_terms.csv'
corpus = pd.read_csv(directory+file_name)
# Empty cells get a sentinel value so they can be filtered out below.
corpus = corpus.fillna('this_is_null')
corpus_tokens = [
    token
    for idx,row in tqdm(corpus.iterrows(),total=corpus.shape[0])
    for token in row.values.tolist()
    if token != 'this_is_null'
]
del corpus
print("\nNumber of unique tokens:",len(corpus_tokens))

#### Option C - Load author keywords

In [None]:
# Option C: use author keywords whose frequency lies above the chosen percentile.
directory = datapath+'Author keywords - 29 Oct 2019/'
file_name = period+' keyword frequency'
corpus = pd.read_csv(directory+file_name,names=['keyword','frequency']).fillna('this_is_null')
threshold = np.percentile(corpus['frequency'].values,percentile)
corpus = corpus[corpus['frequency']>threshold]

# Keep every keyword that is not the missing-value sentinel.
corpus_tokens = [
    row['keyword']
    for idx,row in tqdm(corpus.iterrows(),total=corpus.shape[0])
    if row['keyword'] != 'this_is_null'
]
print("\nNumber of unique tokens:",len(corpus_tokens))

## Facebook Model

#### Load model

In [None]:
# Location of the pre-trained Facebook FastText binary, relative to `datapath`.
fb_model_address = datapath+'/FastText-crawl-300d-2M-subword/crawl-300d-2M-subword.bin'

In [None]:
# The shared import cell leaves `import fasttext` commented out, so the official
# Facebook bindings are imported here, where they are first needed.
import fasttext

# Load the pre-trained Facebook FastText binary model from disk.
model = fasttext.load_model(fb_model_address)

#### Get embeddings

*   Save to dictionary and json
*   This takes much less space on disk


##### No n-gram handle

In [None]:
# Save in a dict
output_dict = {}
comment_embedding = ''
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    output_dict[token] = str(model.get_word_vector(token))

##### Manual n-gram handle

In [None]:
# Save in a dict
comment_embedding = 'average_manual '
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    token_split = token.split(' ')
    if len(token_split) > 0:
        tmp_vector_grams = []
        for item in token_split:
            tmp_vector_grams.append(model.get_word_vector(item))
        output_dict[token] = str(list(np.array(tmp_vector_grams).mean(axis=0)))
    else:
        output_dict[token] = str(model.get_word_vector(item))

##### Save to disk

In [None]:
# Save embeddings to disk
with open(directory+'vectors/100D/FastText vector '+comment_embedding+period+'.json', 'w') as json_file:
    json.dump(output_dict, json_file)

#### Get embeddings (alternative) : 

*   Save to a list and a CSV file instead of a dictionary and JSON
*   Will have many redundant words in it and will take lots of disk space




In [None]:
# Save in a list
batches = 1000
batch_size = len(corpus_tokens)/batches

for step in tqdm(range(batches),total=batches):
    batch_tokens = corpus_tokens[int(step*batch_size):int((step+1)*batch_size)]
    corpus_vectors = [model.get_word_vector(x) for x in batch_tokens]
    corpus_vectors = pd.DataFrame(corpus_vectors)
    corpus_vectors['tokens'] = batch_tokens

    # Save embeddings to disk
    with open(directory+'vector '+period,'a') as f:
        corpus_vectors.to_csv(f,index=False,header=False)

## Gensim Model

#### Load model

Load gensim model

In [None]:
# Load a previously saved gensim FastText model (file name indicates the 50D, window-1 Scopus model).
gensim_model_address = datapath+'FastText Models/50D w1/fasttext-scopus-2.2-million_docs-gensim 50D-w1.model'
model = fasttext_gensim.load(gensim_model_address)

In [7]:
# Alternative: load the saved gensim FastText model stored under '100D Oct2020'.
gensim_model_address = datapath+"FastText Models/100D Oct2020/large corpus.model"
model = fasttext_gensim.load(gensim_model_address)

In [None]:
# Alternative: load the Facebook crawl-300d-2M-subword binary through gensim
# (path is relative to the mounted Google Drive).
gensim_model_address = 'drive/My Drive/Data-Permenant/FastText-crawl-300d-2M-subword/crawl-300d-2M-subword.bin'
model = gensim.models.fasttext.load_facebook_model(gensim_model_address)

Simple Tests

In [None]:
# Quick sanity checks on the loaded gensim model. All queries go through the
# KeyedVectors object (`model.wv`), like the other test cells in this notebook;
# the same-named methods on the model itself are deprecated wrappers in gensim 3.x.
print('intelligence' in model.wv.vocab)
print(model.wv.similarity("anns", "ann"))
print(model.wv.most_similar(['eye','vision','processing'], ['computer']))
print(model.wv.wmdistance(['stop', 'word', 'removed', 'tokens', 'of', 'sentence 1'], ['stop word removed tokens of sentence 2']))

In [None]:
# get distance of two words
from scipy import spatial,sparse,sign
vec_a = model.wv['']
vec_b = model.wv['fpga']
distance_tmp = spatial.distance.cosine(vec_a, vec_b)
similarity_tmp = 1 - distance_tmp
similarity_tmp

In [None]:
# Vector returned when the two-word phrase is looked up as a single string.
model.wv['artificial intelligence']

In [None]:
# For comparison: the mean of the two single-word vectors.
(model.wv['artificial']+model.wv['intelligence'])/2

In [None]:
# For comparison: the vector of the underscore-joined form of the phrase.
model.wv['artificial_intelligence']

#### Compare vectors to ACM categories

In [None]:
# Short list of AI category names.
# NOTE(review): no other cell visible in this notebook reads `AI_categories`;
# the vector-building cell iterates over `categories` instead.
AI_categories = [
              'Artificial Intelligence Applications - Expert Systems',
              'Automatic Programming',
              'Deduction - Theorem Proving',
              'Knowledge Representation Formalisms - Knowledge Representation Methods',
              'Programming Languages - Software',
              'Learning',
              'Natural Language Processing',
              'Problem Solving - Control Methods - Search',
              'Robotics',
              'Vision - Scene Understanding',
              'Distributed Artificial Intelligence',
              'ARTIFICIAL INTELLIGENCE'
]

In [None]:
# One string per category. Inside a string the phrases are separated by ' - ';
# the vector-building cell uses the first phrase as the category label.
# NOTE(review): the 'Search methodologies' entry ends with a trailing ' - ',
# which leaves an empty last phrase when the string is split.
categories = [
              'Natural language processing - Information extraction - Machine translation - Discourse, dialogue  pragmatics - Natural language generation - Speech recognition - Lexical semantics - Phonology / morphology',
              'Knowledge representation reasoning - Description logics - Semantic networks Nonmonotonic default reasoning  belief revision - Probabilistic reasoning - Vagueness fuzzy logic - Causal reasoning  diagnostics - Temporal reasoning - Cognitive robotics - Ontology engineering - Logic programming answer set programming - Spatial  physical reasoning - Reasoning about belief  knowledge',
              'Planning  scheduling - Planning for deterministic actions - Planning under uncertainty - Multi-agent planning - Planning  abstraction  generalization - Robotic planning - Evolutionary robotics',
              'Search methodologies - Heuristic function construction - Discrete space search - Continuous space search - Randomized search - Game tree search - Abstraction  micro-operators - Search with partial observations - ',
              'Control methods - Robotic planning - Evolutionary robotics - Computational control theory - Motion path planning',
              'Philosophical theoretical foundations artificial intelligence - Cognitive science - Theory mind',
              'Distributed artificial intelligence - Multi agent systems - Intelligent agents - Mobile agents - Cooperation  coordination',
              'Computer vision - Biometrics - Scene understanding - Activity recognition  understanding - Video summarization - Visual content based indexing  retrieval - Visual inspection - Vision for robotics - Scene anomaly detection - Image  video acquisition - Camera calibration - Epipolar geometry - Computational photography - Hyperspectral imaging - Motion capture - 3D imaging - Active vision - Image representations - Shape representations - Appearance  texture - Hierarchical representations - Computer vision problems - Interest point  salient region detections - Image segmentation  - Video segmentation - Shape inference - Object detection - Object recognition - Object identification - Tracking - Reconstruction - Matching',
              
              'Learning paradigms - Supervised learning - Ranking - Learning to rank - Supervised learning  classification - Supervised learning  regression - Structured outputs - Cost sensitive learning - Unsupervised learning - Cluster analysis - Anomaly detection - Mixture modeling - Topic modeling - Source separation - Motif discovery - Dimensionality reduction  manifold learning - Reinforcement learning - Sequential decision making - Inverse reinforcement learning - Apprenticeship learning - Multi-agent reinforcement learning - Adversarial learning - Multi-task learning - Transfer learning - Lifelong machine learning - Learning under covariate shift',
              'Learning settings - Batch learning - Online learning settings - Learning from demonstrations - Learning from critiques - Learning from implicit feedback - Active learning settings - Semi supervised learning settings',
              'Machine learning approaches - Classification  regression trees - Kernel methods - Support vector machines - Gaussian processes - Neural networks - Logical  relational learning - Inductive logic learning - Statistical relational learning - Learning in probabilistic graphical models - Maximum likelihood modeling - Maximum entropy modeling - Maximum a posteriori modeling - Mixture models - Latent variable models - Bayesian network models - Learning linear models - Perceptron algorithm - Factorization methods - Non-negative matrix factorization - Factor analysis - Principal component analysis - Canonical correlation analysis - Latent Dirichlet allocation - Rule learning - Instance-based learning - Markov decision processes -  Markov decision processes - Stochastic games - Learning latent representations - Deep belief networks - Bio inspired approaches - Artificial life - Evolvable hardware - Genetic algorithms - Genetic programming - Evolutionary robotics - Generative  developmental approaches',
              'Machine learning algorithms - Dynamic programming Markov decision processes - Value iteration - Q learning - Policy iteration - Temporal difference learning - Approximate dynamic programming methods - Ensemble methods - Boosting - Bagging - Spectral methods - Feature selection - Regularization',
]

In [None]:
# Build one vector per category: embed every phrase of the category string and
# average the phrase vectors. The first phrase becomes the category label.
AI_vectors = []
labels = []
for item in categories:
    # Split on the ' - ' delimiter rather than on every '-', so hyphenated terms
    # such as 'multi-agent planning' stay in one piece.
    phrases = [phrase.lower().strip() for phrase in item.split(' - ')]
    # A trailing ' - ' leaves an empty phrase, which cannot be embedded: skip it.
    phrases = [phrase for phrase in phrases if phrase]
    vector_tmp = [model.wv[phrase] for phrase in phrases]
    AI_vectors.append(list(np.array(vector_tmp).mean(axis=0)))
    # Label keeps its original casing; surrounding whitespace is removed.
    labels.append(item.split(' - ')[0].strip())
print(AI_vectors)

In [None]:
# Write the category vectors and their labels to two CSV files in the same
# folder; both files have one row per category, in the same order.
classification_dir = datapath+'FastText doc clusters - SIP/50D/classification/'
vectors_frame = pd.DataFrame(AI_vectors)
labels_frame = pd.DataFrame(labels,columns=['label'])
vectors_frame.to_csv(classification_dir+'ACM_classifications_vectors')
labels_frame.to_csv(classification_dir+'ACM_classifications_labels')

### Get Word Embeddings

##### No n-gram handle

In [None]:
# Save in a dict
comment_embedding = ''
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    output_dict[token] = str(model.wv[token])

##### Manual n-gram handle

In [None]:
# Save in a dict
# Manual n-gram handling: a token is embedded as the mean of the vectors of its
# space-separated parts. Vectors are stored as strings for the JSON dump.
comment_embedding = 'average_manual '
output_dict = {}
for token in tqdm(corpus_tokens[:],total=len(corpus_tokens[:])):
    # split(' ') never returns an empty list, so no separate single-word branch
    # is needed. Empty strings produced by repeated/leading/trailing spaces are
    # dropped; if nothing is left, the raw token itself is looked up.
    grams = [gram for gram in token.split(' ') if gram] or [token]
    gram_vectors = [model.wv[gram] for gram in grams]
    output_dict[token] = str(list(np.array(gram_vectors).mean(axis=0)))

##### Save to disk

In [None]:
# Save embeddings to disk
with open(directory+'/FastText vector with n-grams '+comment_embedding+period+'.json', 'w') as json_file:
    json.dump(output_dict, json_file)

In [None]:
# Show which folder the JSON file above was written to.
print(directory)

### Get Doc Embeddings (SIF)

It is not recommended to run this in the cloud: it is not processing-intensive, yet it can take a very long time depending on the document count. It might take over 30 hours.

Make a probability dictionary

In [None]:
# Relative frequency of every distinct token:
#     p(token) = count(token) / total number of tokens
# Result: one row per distinct token, columns ['token', 'probability'],
# sorted by token.
corpus_tokens_s = pd.Series(corpus_tokens)
corpus_tokens_probabilities = (
    corpus_tokens_s
    .value_counts(normalize=True)      # count / total, one entry per distinct token
    .sort_index()                      # token order, as the sorted groupby produced
    .rename_axis('token')
    .reset_index(name='probability')
)

Get vectors

In [None]:
# Look up the embedding of every distinct token and attach the vectors to the
# probability table as a new 'vector' column.
vectors = [
    model.wv[token]
    for token in tqdm(corpus_tokens_probabilities['token'],total=corpus_tokens_probabilities.shape[0])
]
corpus_tokens_probabilities['vector'] = vectors

Calculate weighted average vectors

In [None]:
# SIF (smooth inverse frequency) document embeddings: every word vector is
# weighted by a / (a + p(word)) and the weighted vectors are averaged per document.
a = 1e-3
# Read the dimensionality from the loaded model instead of hard-coding it, so the
# accumulator always has the same length as the word vectors.
embedding_size = model.wv.vector_size

# Plain dictionaries for the per-word lookups; filtering the whole DataFrame
# twice for every word of every document is what made this cell so slow.
token_probability = dict(zip(corpus_tokens_probabilities['token'].tolist(),
                             corpus_tokens_probabilities['probability'].tolist()))
token_vector = dict(zip(corpus_tokens_probabilities['token'].tolist(),
                        corpus_tokens_probabilities['vector'].tolist()))

doc_set = []
for doc in tqdm(corpus['abstracts'].values.tolist(),total=corpus.shape[0]):
    vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
    doc_length = len(doc.split())
    for word in doc.lower().split():
        a_value = a / (a + token_probability[word])  # smooth inverse frequency, SIF
        vs = np.add(vs, np.multiply(a_value, token_vector[word]))  # vs += sif * word_vector

    vs = np.divide(vs, doc_length)  # weighted average
    doc_set.append(vs)  # add to our existing re-calculated set of sentences


In [None]:
# Write the document vectors to CSV, one row per entry of doc_set, without the index column.
pd.DataFrame(doc_set).to_csv(datapath+'Corpus/KPRIS/embeddings/large corpus',index=False)

##### Wikipedia embedding

In [None]:
import os
import time
from stat import S_ISREG, ST_CTIME, ST_MODE

# Collect the regular files of the cleaned Wiki-text folder, ordered by their
# st_ctime timestamp, and remember their paths (`files`) and base names (`names`).
print("\nSearching for Wiki texts...\n")
dir_path = datapath+'Corpus/AI Wiki Classifications/applications/clean/'

data = []
for fn in os.listdir(dir_path):
    entry_path = os.path.join(dir_path, fn)
    entry_stat = os.stat(entry_path)
    if S_ISREG(entry_stat[ST_MODE]):  # skip sub-directories and other non-regular entries
        data.append((entry_stat[ST_CTIME], entry_path))
data.sort()

names = []
files = []
for cdate, path in data:
    size_mb = int(os.path.getsize(path)/1000000)
    print('   - ', time.ctime(cdate), os.path.basename(path),size_mb,'MB')
    files.append(path)
    names.append(os.path.basename(path))

In [None]:
# SIF sentence embeddings for every Wikipedia text file listed in `files`.
# Per file: token probabilities -> token vectors -> weighted-average sentence
# vectors (saved per file). The mean vector of each file is collected and all
# means are saved together in the 'all' file.
a = 1e-3
# Read the dimensionality from the loaded model instead of hard-coding it, so the
# accumulator always has the same length as the word vectors.
embedding_size = model.wv.vector_size
# Named `cluster_means` rather than `all`, which would shadow the builtin all().
cluster_means = []
for file_index,file in enumerate(files):
    print('\n',file,'\n')
    corpus = pd.read_csv(file)
    corpus_tokens = [item.lower() for sublist in corpus['sentence'].values.tolist() if pd.notnull(sublist) for item in sublist.split()]
    gc.collect()

    # p(token) = count(token) / total number of tokens in this file
    corpus_tokens_s = pd.Series(corpus_tokens)
    corpus_tokens_probabilities = (
        corpus_tokens_s
        .value_counts(normalize=True)
        .sort_index()
        .rename_axis('token')
        .reset_index(name='probability')
    )

    vectors = []
    for token in tqdm(corpus_tokens_probabilities['token'],total=corpus_tokens_probabilities.shape[0]):
        vectors.append(model.wv[token])
    corpus_tokens_probabilities['vector'] = vectors

    # Plain dictionaries for the per-word lookups instead of filtering the whole
    # DataFrame twice for every word of every sentence.
    token_probability = dict(zip(corpus_tokens_probabilities['token'].tolist(),
                                 corpus_tokens_probabilities['probability'].tolist()))
    token_vector = dict(zip(corpus_tokens_probabilities['token'].tolist(), vectors))

    doc_set = []
    for doc in tqdm(corpus['sentence'].values.tolist(),total=corpus.shape[0]):
        # Missing sentences are skipped, so doc_set can hold fewer rows than the file.
        if pd.notnull(doc):
            vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
            doc_length = len(doc.split())
            for word in doc.lower().split():
                a_value = a / (a + token_probability[word])  # smooth inverse frequency, SIF
                vs = np.add(vs, np.multiply(a_value, token_vector[word]))  # vs += sif * word_vector

            vs = np.divide(vs, doc_length)  # weighted average
            doc_set.append(vs)  # add to our existing re-calculated set of sentences

    pd.DataFrame(doc_set).to_csv(datapath+'Corpus/AI Wiki Classifications/applications/clean/vectors/'+names[file_index],index=False)

    cluster_means.append(pd.DataFrame(doc_set).mean(axis=0))

# One row per file: its mean sentence vector plus the file name as 'clusters'.
all_df = pd.DataFrame(cluster_means)
all_df['clusters'] = names
all_df.to_csv(datapath+'Corpus/AI Wiki Classifications/applications/clean/vectors/all',index=False)

### Get Doc Embedding (averaging)

In [25]:
# Plain (unweighted) document embedding: the mean of the word vectors of the
# whitespace-separated tokens of each abstract, written to CSV without index.
doc_vectors = [
    np.array([model.wv[token] for token in doc.split()]).mean(axis=0)
    for doc in tqdm(corpus['abstracts'])
]
pd.DataFrame(doc_vectors).to_csv(datapath+'Corpus/KPRIS/embeddings/FastText Avg large corpus',index=False)

100%|██████████| 19710/19710 [00:09<00:00, 2073.87it/s]


# Train on a large Scopus corpus


#### Load Corpus Sentences

In [None]:
# Training corpus; the cells below read its 'sentence' column.
sentence_corpus = pd.read_csv(datapath+'Corpus/patent_wos_training_very_large')

#### Preprocess and prepare corpus for FastText training

In [None]:
# Clean every sentence and stream it straight to disk, one sentence per line:
# replace markup tags, tokenize, drop punctuation tokens, lemmatize.
# `sentences` stays empty on purpose (appending is disabled to save memory).
sentences = []
lemmatizer=WordNetLemmatizer()
# Path uses 'Corpus/' like every other corpus path in this notebook.
with open(datapath+'Corpus/AI ALL/1900-2019 corpus sentences abstract-title further processed.csv', 'w') as f:
    # Missing sentences become empty strings so that re.sub does not fail on NaN
    # and the output keeps one line per input row.
    for sentence in tqdm(sentence_corpus['sentence'].fillna(''),total=sentence_corpus.shape[0]):
        # Match real tags: the pattern previously contained the HTML entities
        # '&lt;' / '&gt;' instead of '<' / '>', so no tag was ever matched.
        sentence = re.sub("</?.*?>"," <> ",sentence)
        sentence = word_tokenize(sentence)
        sentence = [word for word in sentence if not word in punkts] 
        sentence = [lemmatizer.lemmatize(word) for word in sentence]
        # sentences.append(sentence)
        f.write("%s\n" % ' '.join(sentence))

#### Save sentences to disk for future use -- Not needed anymore

In [None]:
# Legacy save of the in-memory `sentences` list (a list of token lists).
# When `sentences` is empty - as it is right after the streaming preprocessing
# cell - writing would replace the processed file with a header-only CSV, so
# the write is skipped in that case.
if sentences:
    pd.DataFrame([' '.join(words) for words in sentences],columns=['sentences']).to_csv(
        # Path uses 'Corpus/' like every other corpus path in this notebook.
        datapath+'Corpus/AI ALL/1900-2019 corpus sentences abstract-title further processed.csv',
        header=True,index=False)
else:
    print('No in-memory sentences to save - skipping so the existing file is not overwritten.')

#### Load pre-processed sentences

In [None]:
# sentence_corpus = pd.read_csv(datapath+'Corpus/AI ALL/1900-2019 corpus sentences abstract-title further processed.csv',delimiter=";;;")
# sentence_corpus.columns = ["sentence"]
# Turn every sentence into a list of tokens by splitting on single spaces;
# missing values are replaced by empty strings first.
sentence_corpus = sentence_corpus.fillna('')
sentences = [
    text.split(' ')
    for text in tqdm(sentence_corpus['sentence'],total=sentence_corpus.shape[0])
]

In [None]:
# Preview the first ten rows of the sentence corpus.
sentence_corpus.head(10)

In [None]:
# Preview the first five tokenized sentences.
sentences[:5]

### Train Fasttext - Gensim

#### Load a model to continue training

* If you want to continue training, run this section

In [None]:
# Re-load the saved gensim FastText model to continue training it. A bare
# `load` is not defined anywhere in this notebook; the loader is the classmethod
# of the imported FastText class, as used in the "Load model" cells.
model = fasttext_gensim.load(gensim_model_address)

In [None]:
# Add the new sentences to the existing vocabulary, then train on them for the
# number of epochs stored on the model.
model.build_vocab(sentences, update=True)
model.train(sentences, total_examples=len(sentences), epochs=model.epochs)

* Otherwise run this section

#### Train

In [None]:
# Train a new 15-dimensional model: character n-grams of length 3-6, context
# window 5, every word kept (min_count=1), fixed seed, 10 epochs.
model = fasttext_gensim(min_n=3, max_n=6, size=15, window=5, min_count=1, seed = 50)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)

In [None]:
# Save the 15D model under the data folder. `datapath` must be concatenated as a
# variable; inside the string literal it produced the relative path
# "datapath+Models/..." instead.
fname = datapath+"Models/fasttext-scopus-2.2-million_docs-gensim 15D.model"
model.save(fname)

In [None]:
# Train a new 50-dimensional model with the same settings as the 15D run
# (n-grams 3-6, window 5, min_count=1, seed 50, 10 epochs) and save it.
model = fasttext_gensim(min_n=3, max_n=6, size=50, window=5, min_count=1, seed = 50)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)
fname = datapath+"Models/fasttext-scopus-2.2-million_docs-gensim 50D.model"
model.save(fname)

In [None]:
# Train a new 100-dimensional model: character n-grams of length 3-12, window 5,
# words seen fewer than 5 times dropped, fixed seed, 10 epochs.
# The number of training threads is the constructor's `workers` argument;
# `train()` has no `threads` parameter.
model = fasttext_gensim(min_n=3, max_n=12, size=100, window=5, min_count=5, seed = 50, workers=15)
model.build_vocab(sentences=sentences)
model.train(sentences=sentences, total_examples=len(sentences), epochs=10)
# File name matches the one the "Load model" cell reads ("large corpus.model").
fname = datapath+"FastText Models/100D Oct2020/large corpus.model"
model.save(fname)

#### Test model

In [None]:
# Analogy-style query: words close to the positive terms and far from the negative ones.
similarities = model.wv.most_similar(positive=['logic','fuzzy','expert'],negative=['deep','neural','network','cnn','ann'])
# Keep only the first entry of the result.
most_similar = similarities[0]

In [None]:
# Display the top result of the query above.
most_similar

In [None]:
# Ask the model which of the four words fits the group least.
not_matching = model.wv.doesnt_match("human computer interface tree".split())

In [None]:
# Display the odd-one-out word.
not_matching

In [None]:
# Similarity score between the two words.
sim_score = model.wv.similarity('computer', 'human')

In [None]:
# Display the similarity score.
sim_score

In [None]:
# Print the vector of the two-word phrase next to the vectors of its single words.
print(model.wv['artificial intelligence'])
print(model.wv['artificial'])
print(model.wv['intelligence'])

### Train Fasttext - Facebook

In [None]:
# Train an unsupervised CBOW model with the official Facebook bindings.
# `fasttext.train_unsupervised` reads its training text from a file, and
# `sentences` is a list of token lists, so the sentences are first written to a
# temporary text file, one sentence per line.
import os
import tempfile

import fasttext  # not imported in the shared import cell

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False, encoding='utf-8') as training_file:
    for sentence_tokens in sentences:
        training_file.write(' '.join(sentence_tokens) + '\n')
    training_path = training_file.name

try:
    model = fasttext.train_unsupervised(training_path, "cbow", minn=2, maxn=5, dim=50, epoch=10,lr=0.05)
finally:
    # The temporary training file is no longer needed once training has finished or failed.
    os.remove(training_path)

#### Test model

In [None]:
# Display the vocabulary of the trained Facebook model.
model.words

In [None]:
# Vector of a single word.
model.get_word_vector("the")

In [None]:
# Nearest neighbours of a query word.
model.get_nearest_neighbors('asparagus')

In [None]:
# Analogy query over three words.
model.get_analogies("intelligence", "math", "fuzzy")

#### Save model

In [None]:
# Save the trained Facebook model under the data folder.
# NOTE(review): the '.ftz' extension is conventionally used for quantized fastText
# models, while this model is not quantized in this notebook - confirm the intended extension.
model.save_model(datapath+"fasttext-scopus_wos-merged-310k_docs-facebook.ftz")