# FastText embedding

## training
## use
## toy classification

### Homework: apply FastText embeddings to classify Stack Overflow questions by quality rating

In [None]:
### training the model on custom data

In [None]:
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
# Peek at the bundled toy corpus: its first tokenised sentence and the number of sentences.
print(common_texts[0])
print(len(common_texts))

# Create the model untrained so that vocabulary building and training stay separate, visible steps.
model = FastText(
    vector_size=4,
    window=3,
    min_count=1,
)
model.build_vocab(corpus_iterable=common_texts)

# Trailing expression: the notebook displays whatever train() returns.
model.train(
    corpus_iterable=common_texts,
    total_examples=len(common_texts),
    epochs=10,
)

['human', 'interface', 'computer']
9


(36, 290)

In [None]:
# Show the vector learned for 'human', a word that occurs in the training sentences.
print(model.wv['human'])

[ 0.02830743  0.01806018 -0.03648041  0.0230654 ]


### For larger datasets, we recommend streaming the file, for example from disk or the network. In Gensim, we refer to such datasets as “corpora” (singular “corpus”), and keep them in the format described in `LineSentence`.

In [None]:
from gensim.test.utils import datapath

In [None]:
# Train straight from a corpus file on disk instead of an in-memory list of sentences.
corpus_file = datapath('lee_background.cor')

model3 = FastText(vector_size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)

# The word count read back from the model after build_vocab is handed to train().
total_words = model3.corpus_total_words
model3.train(
    corpus_file=corpus_file,
    total_words=total_words,
    epochs=5,
)

(240956, 301935)

### creating your own model based on a text file

In [None]:
### text from https://www.gutenberg.org/files/2554/2554-0.txt
from gensim.utils import tokenize
from gensim import utils
class MyIter:
    """Re-iterable corpus: each iteration re-opens the text file and yields one token list per line."""

    def __iter__(self):
        source = datapath('crime-and-punishment.txt')
        with utils.open(source, 'r', encoding='utf-8') as handle:
            yield from (list(tokenize(text_line)) for text_line in handle)


# A fresh MyIter() is passed to each call, so every pass starts at the top of the file.
model4 = FastText(vector_size=4, window=3, min_count=1)
model4.build_vocab(corpus_iterable=MyIter())
total_examples = model4.corpus_count
model4.train(
    corpus_iterable=MyIter(),
    total_examples=total_examples,
    epochs=5,
)

(1507, 1995)

In [None]:
from gensim.test.utils import get_tmpfile
# Path for the saved model, obtained from gensim's test utilities.
fname = get_tmpfile("fasttext.model")

# Save the trained model, then rebind `model` to the copy loaded back from that file.
model.save(fname)
model = FastText.load(fname)

In [None]:
# Query a word that is not part of the trained vocabulary.
import numpy as np

oov_word = 'computation'
print(oov_word in model.wv.key_to_index)  # membership test against the model's vocabulary
print(model.wv[oov_word])  # a vector is still returned for the word

False
[ 0.01539698 -0.03783063 -0.03136532 -0.01059705]


In [None]:
# Snapshot the current vector so it can be compared once the model has seen more text.
old_vector = np.copy(model.wv['computation'])

# Additional tokenised sentences, written as phrases and split on whitespace.
new_sentences = [
    phrase.split()
    for phrase in (
        'computer aided design',
        'computer science',
        'computational complexity',
        'military supercomputer',
        'central processing unit',
        'onboard car computer',
    )
]

# Extend the existing vocabulary, then continue training on the new sentences only.
model.build_vocab(corpus_iterable=new_sentences, update=True)
model.train(
    corpus_iterable=new_sentences,
    total_examples=len(new_sentences),
    epochs=model.epochs,
)
print(model.wv['computation'])

[ 0.01539842 -0.03783076 -0.03136622 -0.01059766]


In [None]:
### querying with out of vocab word

In [None]:
# Re-query the same out-of-vocabulary word after the incremental training step.
new_vector = model.wv['computation']
print(new_vector)

# A notebook cell only displays its last expression, so the np.allclose result used to be
# computed and silently dropped. Both checks are printed explicitly instead.
# True  -> the vector is unchanged within atol=1e-4;
# False -> the vector moved by more than that tolerance.
print(np.allclose(old_vector, new_vector, atol=1e-4))

# False here means the word has still not been added to the vocabulary.
print('computation' in model.wv.key_to_index)

[ 0.01539698 -0.03783063 -0.03136532 -0.01059705]


False

In [None]:
### working with ready-made vectors

In [None]:
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_vectors
# Load pre-trained vectors stored in Facebook's binary FastText format.
cap_path = datapath("crime-and-punishment.bin")
wv = load_facebook_vectors(cap_path)

# The two membership checks used to be bare mid-cell expressions, so their results were never
# shown (only a cell's last expression is displayed). They are printed so the claim can be verified.
print('landlord' in wv.key_to_index)  # expected False: the original note says this word is out of vocabulary
oov_vector = wv['landlord']  # a vector is returned even for a word missing from the vocabulary
print(oov_vector)

print('landlady' in wv.key_to_index)  # expected True: the original note says this word is in the vocabulary
iv_vector = wv['landlady']
print(iv_vector)

[-0.05853396 -0.00144831  0.00096381  0.09085083  0.08532218]
[-0.06020484 -0.00170379  0.00868763  0.13152218  0.05103018]


In [None]:
### perform usual nlp tasks
keyed_vectors = model.wv

# Similarity query with positive and negative example words; print the top-ranked hit.
similarities = keyed_vectors.most_similar(
    positive=['computer', 'human'],
    negative=['interface'],
)
most_similar = similarities[0]
print(most_similar)

# Odd-one-out among four words.
not_matching = keyed_vectors.doesnt_match(['human', 'computer', 'interface', 'tree'])
print(not_matching)

# Pairwise similarity score between two words.
sim_score = keyed_vectors.similarity('computer', 'human')
print(sim_score)

('processing', 0.8094061017036438)
tree
0.058223367


### Toy classification example

In [None]:
import pandas as pd

In [None]:
# Toy labelled corpus: one rating per short sentence, kept in matching order.
ratings = [3, 5, 1, 2]
texts = [
    "I love sunflowers",
    "Sunflowers fill my heart with joy",
    "I love to look into the garden and see the flowers",
    "Flowers especially sunflowers are the most beautiful",
]
df = pd.DataFrame({'Rating': ratings, 'Text': texts})

In [None]:
# Display the toy frame (Rating and Text columns).
df

Unnamed: 0,Rating,Text
0,3,I love sunflowers
1,5,Sunflowers fill my heart with joy
2,1,I love to look into the garden and see the flo...
3,2,Flowers especially sunflowers are the most bea...


In [None]:
from nltk.tokenize import word_tokenize
# Lower-case every sentence first, then split it into word tokens with NLTK.
lowercased_text = df['Text'].str.lower()
df['Text_Tokenized'] = lowercased_text.apply(word_tokenize)

In [None]:
# Display the frame again, now including the Text_Tokenized column.
df

Unnamed: 0,Rating,Text,Text_Tokenized
0,3,I love sunflowers,"[i, love, sunflowers]"
1,5,Sunflowers fill my heart with joy,"[sunflowers, fill, my, heart, with, joy]"
2,1,I love to look into the garden and see the flo...,"[i, love, to, look, into, the, garden, and, se..."
3,2,Flowers especially sunflowers are the most bea...,"[flowers, especially, sunflowers, are, the, mo..."


In [None]:
# Embedding dimensionality, kept in a variable alongside the model settings.
vector_size_n_w2v = 5

# Training settings gathered in one place; sg: 0=CBOW, 1=Skip-gram.
fasttext_settings = dict(
    vector_size=vector_size_n_w2v,
    window=3,
    min_count=1,
    sg=0,
    epochs=5,
)

# Passing the tokenised sentences to the constructor builds the vocabulary and trains in one call.
w2v_model = FastText(df['Text_Tokenized'], **fasttext_settings)

print(w2v_model)

FastText(vocab=20, vector_size=5, alpha=0.025)


In [None]:
# Persist the toy model under the relative path "fasttext_model".
w2v_model.save("fasttext_model")

In [None]:
# List the model's vocabulary in index order.
w2v_model.wv.index_to_key

['sunflowers',
 'the',
 'i',
 'love',
 'flowers',
 'to',
 'fill',
 'my',
 'heart',
 'with',
 'joy',
 'beautiful',
 'most',
 'into',
 'garden',
 'and',
 'see',
 'especially',
 'are',
 'look']

In [None]:
# Vector stored for the vocabulary word 'sunflowers'.
w2v_model.wv['sunflowers']

array([-0.01041438, -0.00093864,  0.01324417, -0.00349267, -0.02209629],
      dtype=float32)

In [None]:
# Words known to the trained model; tokens outside this set are skipped.
words = set(w2v_model.wv.index_to_key)

# Sentences differ in length, so the per-sentence matrices (one row per word) are ragged.
# Wrapping them in one np.array(...) triggers NumPy's ragged-sequence deprecation warning and
# fails outright on NumPy >= 1.24. Filling a pre-allocated object array slot by slot stores
# exactly one matrix per DataFrame row without asking NumPy to infer a common shape.
sentence_matrices = np.empty(len(df), dtype=object)
for row_pos, tokens in enumerate(df['Text_Tokenized']):
    sentence_matrices[row_pos] = np.array([w2v_model.wv[tok] for tok in tokens if tok in words])

df['Text_vect'] = sentence_matrices

  df['Text_vect'] = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [None]:
# Display the frame again, now including the Text_vect column.
df

Unnamed: 0,Rating,Text,Text_Tokenized,Text_vect
0,3,I love sunflowers,"[i, love, sunflowers]","[[-0.020006388, -0.04222982, -0.120482996, 0.1..."
1,5,Sunflowers fill my heart with joy,"[sunflowers, fill, my, heart, with, joy]","[[-0.010414377, -0.0009386352, 0.013244171, -0..."
2,1,I love to look into the garden and see the flo...,"[i, love, to, look, into, the, garden, and, se...","[[-0.020006388, -0.04222982, -0.120482996, 0.1..."
3,2,Flowers especially sunflowers are the most bea...,"[flowers, especially, sunflowers, are, the, mo...","[[0.008210084, 0.0027682872, 0.01927444, 0.008..."


In [None]:
# Word-vector matrix of the first sentence only (label slice 0:0), wrapped in a one-element list.
content_sentence1_Text_vect = df.loc[0:0, 'Text_vect'].tolist()

In [None]:
# Collapse each sentence matrix (one row per word) into a single mean vector per sentence.
text_vect_avg = []
for v in df['Text_vect']:
    if v.size:
        text_vect_avg.append(v.mean(axis=0))
    else:
        # No known word in this sentence: fall back to a zero vector of the training dimensionality.
        # Fixed: this branch referenced the undefined name `vector_size_n` (NameError); the size
        # used to train the model is stored in `vector_size_n_w2v`.
        text_vect_avg.append(np.zeros(vector_size_n_w2v, dtype=float))


df['Text_vect_avg'] = text_vect_avg
df

Unnamed: 0,Rating,Text,Text_Tokenized,Text_vect,Text_vect_avg
0,3,I love sunflowers,"[i, love, sunflowers]","[[-0.020006388, -0.04222982, -0.120482996, 0.1...","[0.00031264746, -0.011010818, -0.040938333, 0...."
1,5,Sunflowers fill my heart with joy,"[sunflowers, fill, my, heart, with, joy]","[[-0.010414377, -0.0009386352, 0.013244171, -0...","[0.014096205, 0.011501317, 0.023121616, -0.001..."
2,1,I love to look into the garden and see the flo...,"[i, love, to, look, into, the, garden, and, se...","[[-0.020006388, -0.04222982, -0.120482996, 0.1...","[-0.012118164, 0.012173792, -0.013780204, -0.0..."
3,2,Flowers especially sunflowers are the most bea...,"[flowers, especially, sunflowers, are, the, mo...","[[0.008210084, 0.0027682872, 0.01927444, 0.008...","[-0.015716232, 0.0031787956, -0.010517577, -0...."


In [None]:
# Feature matrix for the classifier: one row per sentence, one column per embedding dimension.
# Read the vectors back from the DataFrame column rather than from the global `text_vect_avg`:
# that name is rebound further down to the vectors of the *new* sentences, so re-running this
# cell afterwards would silently build the training features from the wrong data.
df_Machine_Learning = pd.DataFrame(df['Text_vect_avg'].tolist())
df_Machine_Learning

Unnamed: 0,0,1,2,3,4
0,0.000313,-0.011011,-0.040938,0.045829,-0.056768
1,0.014096,0.011501,0.023122,-0.001331,0.001796
2,-0.012118,0.012174,-0.01378,-0.011344,-0.001035
3,-0.015716,0.003179,-0.010518,-0.003246,0.024442


In [None]:
# Put the label and raw text side by side with the numeric feature columns.
labelled_text = df[['Rating', 'Text']]
final_df = pd.concat(
    [labelled_text, df_Machine_Learning],
    axis=1,
    sort=False,
)


In [None]:
# Display the combined frame: Rating, Text and the embedding feature columns.
final_df

Unnamed: 0,Rating,Text,0,1,2,3,4
0,3,I love sunflowers,0.000313,-0.011011,-0.040938,0.045829,-0.056768
1,5,Sunflowers fill my heart with joy,0.014096,0.011501,0.023122,-0.001331,0.001796
2,1,I love to look into the garden and see the flo...,-0.012118,0.012174,-0.01378,-0.011344,-0.001035
3,2,Flowers especially sunflowers are the most bea...,-0.015716,0.003179,-0.010518,-0.003246,0.024442


In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier: averaged sentence embeddings in, ratings out.
train_features = df_Machine_Learning
train_labels = final_df['Rating']

clf = SVC(kernel='linear')
clf.fit(train_features, train_labels)

SVC(kernel='linear')

In [None]:
# Unseen sentences to classify, held in a single-column frame.
new_input = [
    "Flowers I like to see in the park especially sunflowers",
    "I like flowers",
]
new_input_df = pd.DataFrame({'New_Input': new_input})

In [None]:
# Same preparation as the training text: lower-case, then NLTK word tokenisation.
lowercased_input = new_input_df['New_Input'].str.lower()
new_input_df['New_Input_Tokenized'] = lowercased_input.apply(word_tokenize)
new_input_df

Unnamed: 0,New_Input,New_Input_Tokenized
0,Flowers I like to see in the park especially s...,"[flowers, i, like, to, see, in, the, park, esp..."
1,I like flowers,"[i, like, flowers]"


In [None]:
# Words known to the trained model; tokens outside this set are skipped.
words = set(w2v_model.wv.index_to_key)

# The per-sentence matrices (one row per word) have different lengths. Wrapping them in one
# np.array(...) triggers NumPy's ragged-sequence deprecation warning and fails outright on
# NumPy >= 1.24. Filling a pre-allocated object array slot by slot stores exactly one matrix
# per DataFrame row without asking NumPy to infer a common shape.
new_sentence_matrices = np.empty(len(new_input_df), dtype=object)
for row_pos, tokens in enumerate(new_input_df['New_Input_Tokenized']):
    new_sentence_matrices[row_pos] = np.array([w2v_model.wv[tok] for tok in tokens if tok in words])

new_input_df['New_Text_vect'] = new_sentence_matrices
new_input_df

  new_input_df['New_Text_vect'] = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


Unnamed: 0,New_Input,New_Input_Tokenized,New_Text_vect
0,Flowers I like to see in the park especially s...,"[flowers, i, like, to, see, in, the, park, esp...","[[0.008210084, 0.0027682872, 0.01927444, 0.008..."
1,I like flowers,"[i, like, flowers]","[[-0.020006388, -0.04222982, -0.120482996, 0.1..."


In [None]:
# Collapse each new sentence's matrix (one row per word) into a single mean vector.
text_vect_avg = []
for v in new_input_df['New_Text_vect']:
    if v.size:
        text_vect_avg.append(v.mean(axis=0))
    else:
        # No known word in this sentence: fall back to a zero vector of the training dimensionality.
        # Fixed: this branch referenced the undefined name `vector_size_n` (NameError); the size
        # used to train the model is stored in `vector_size_n_w2v`.
        text_vect_avg.append(np.zeros(vector_size_n_w2v, dtype=float))


new_input_df['Text_vect_avg'] = text_vect_avg
new_input_df

Unnamed: 0,New_Input,New_Input_Tokenized,New_Text_vect,Text_vect_avg
0,Flowers I like to see in the park especially s...,"[flowers, i, like, to, see, in, the, park, esp...","[[0.008210084, 0.0027682872, 0.01927444, 0.008...","[-0.020084772, 0.004546546, -0.0138535565, -0...."
1,I like flowers,"[i, like, flowers]","[[-0.020006388, -0.04222982, -0.120482996, 0.1...","[-0.005898152, -0.019730767, -0.050604276, 0.0..."


In [None]:
# Wrap the averaged vectors in a frame with one named column per embedding dimension (1-based).
new_input_Machine_Learning_df = pd.DataFrame(text_vect_avg)
n_dims = new_input_Machine_Learning_df.shape[1]
new_input_Machine_Learning_df.columns = [f'Element_{dim}' for dim in range(1, n_dims + 1)]

# Side-by-side view of each new sentence and its feature vector; displayed as the cell result.
final_new_input_df = pd.concat(
    [new_input_df[['New_Input']], new_input_Machine_Learning_df],
    axis=1,
    sort=False,
)
final_new_input_df
# w2v_model_reloaded = FastText.load("fasttext_model")
# words = set(w2v_model_reloaded.wv.index_to_key )
# new_input_df['New_Input_vect'] = np.array([np.array([w2v_model_reloaded.wv[i] for i in ls if i in words])
#                                            for ls in new_input_df['New_Input_Tokenized']])

# text_vect_avg = []
# for v in new_input_df['New_Input_vect']:
#     if v.size:
#         text_vect_avg.append(v.mean(axis=0))
#     else:
#         text_vect_avg.append(np.zeros(vector_size_n_reloaded, dtype=float)) # the same vector size must be used here as for model training
        
# final_new_input_df = pd.concat([new_input_df[['New_Input']], new_input_Machine_Learning_df], axis=1, sort=False)


Unnamed: 0,New_Input,Element_1,Element_2,Element_3,Element_4,Element_5
0,Flowers I like to see in the park especially s...,-0.020085,0.004547,-0.013854,-0.006871,0.001681
1,I like flowers,-0.005898,-0.019731,-0.050604,0.085693,-0.05239


In [None]:
# `clf` was fitted on a frame whose columns are the default integers, so scikit-learn recorded no
# feature names. Passing the 'Element_*'-named frame directly makes predict() warn about a
# feature-name mismatch; handing over the raw values keeps the same column order without it.
y_pred = clf.predict(new_input_Machine_Learning_df.to_numpy())



In [None]:
# Show the ratings predicted for the new sentences.
y_pred

array([1, 3], dtype=int64)

### Classification based on Stack Overflow Questions with Quality Rating

In [None]:
### 60k Stack Overflow Questions with Quality Rating
### https://www.kaggle.com/datasets/imoore/60k-stack-overflow-questions-with-quality-rate

In [None]:
# Text Classification with fastText
# Importing libraries
import numpy as np, pandas as pd

# NLP Preprocessing
from gensim.utils import simple_preprocess #lowercases, tokenizes

# Importing the dataset
# Keep only the question body and its quality label, renamed to the names used from here on.
_wanted_columns = ['Body', 'Y']
_column_names = {'Body': 'questions', 'Y': 'category'}

dataset = pd.read_csv('stackoverflow/train.csv')[_wanted_columns].rename(columns=_column_names)
ds = pd.read_csv('stackoverflow/valid.csv')[_wanted_columns].rename(columns=_column_names)

<bound method NDFrame.tail of                                                questions  category
0      I am having 4 different tables like \r\nselect...   LQ_EDIT
1      I have two table m_master and tbl_appointment\...   LQ_EDIT
2      <p>I'm trying to extract US states from wiki U...        HQ
3      I'm so new to C#, I wanna make an application ...   LQ_EDIT
4      basically i have this array:\r\n\r\n    array(...   LQ_EDIT
...                                                  ...       ...
14995  <p>I have a menu, and I'd like the div.right-c...  LQ_CLOSE
14996  <p>I try to multiply an integer by a double bu...  LQ_CLOSE
14997      *URLS.PY*\r\n    //URLS.PY FILE\r\n    fro...   LQ_EDIT
14998  <p>I have a controller inside which a server i...  LQ_CLOSE
14999  <p>So i was recently helping someone out with ...  LQ_CLOSE

[15000 rows x 2 columns]>

### simple preprocessing

In [None]:
# NLP Preprocess
# Normalise the question text: simple_preprocess lower-cases and tokenises it, and the tokens
# are joined back into one space-separated string.
dataset['questions'] = dataset['questions'].apply(lambda x: ' '.join(simple_preprocess(x)))
ds['questions'] = ds['questions'].apply(lambda x: ' '.join(simple_preprocess(x)))

# Prefixing each row of the category column with '__label__'.
# The startswith guard makes this cell safe to re-run: without it, a second execution would
# turn '__label__HQ' into '__label____label__HQ'.
dataset['category'] = dataset['category'].apply(
    lambda x: x if x.startswith('__label__') else '__label__' + x
)
ds['category'] = ds['category'].apply(
    lambda x: x if x.startswith('__label__') else '__label__' + x
)

In [None]:
# Call the method: a bare `ds.tail` only shows the bound-method repr, not the last rows.
ds.tail()

<bound method NDFrame.tail of                                                questions           category
0      am having different tables like select from sy...   __label__LQ_EDIT
1      have two table m_master and tbl_appointment th...   __label__LQ_EDIT
2      trying to extract us states from wiki url and ...        __label__HQ
3      so new to wanna make an application that can e...   __label__LQ_EDIT
4      basically have this array array array sub comp...   __label__LQ_EDIT
...                                                  ...                ...
14995  have menu and like the div right controls and ...  __label__LQ_CLOSE
14996  try to multiply an integer by double but obtai...  __label__LQ_CLOSE
14997  urls py urls py file from django contrib impor...   __label__LQ_EDIT
14998  have controller inside which server is connect...  __label__LQ_CLOSE
14999  so was recently helping someone out with some ...  __label__LQ_CLOSE

[15000 rows x 2 columns]>

In [None]:
import csv
from gensim.models import FastText

# Saving the CSV file as a text file to train/test the classifier
# Write both splits as plain text, label first and then the question, one example per line.
# The options are shared so that the two files are guaranteed to use the same format.
_export_options = dict(
    index=False,
    sep=' ',
    header=None,
    quoting=csv.QUOTE_NONE,
    quotechar="",
    escapechar=" ",
)

for _frame, _target in ((dataset, 'train.txt'), (ds, 'test.txt')):
    _frame[['category', 'questions']].to_csv(_target, **_export_options)