# Starter Code for fastText English Word Vectors Embedding

This kernel is intended as starter code for anyone using the fastText embedding. It uses Gensim to create a `KeyedVectors` object (which behaves much like a dictionary mapping words to their vectors). An example of tokenizing the data is also given.

In [1]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import nltk
from gensim.models import KeyedVectors
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Show which datasets are available under the input directory.
input_entries = os.listdir('../input')
print(input_entries)

['quora-insincere-questions-classification', 'fasttext-wikinews']


In [3]:
# Location of the fastText word vectors, stored as a plain-text .vec file.
FILE_PATH = '../input/fasttext-wikinews/wiki-news-300d-1M.vec'

In [4]:
# Let's read the first few lines (each truncated to 80 characters).
# The encoding is given explicitly so the file is decoded as UTF-8 regardless
# of the platform's default text encoding.
with open(FILE_PATH, encoding='utf-8') as f:
    for _ in range(5):
        print(f.readline()[:80])

1000000 300

, 0.1073 0.0089 0.0006 0.0055 -0.0646 -0.0600 0.0450 -0.0133 -0.0357 0.0430 -0.0
the 0.0897 0.0160 -0.0571 0.0405 -0.0696 -0.1237 0.0301 0.0248 -0.0303 0.0174 0.
. 0.0004 0.0032 -0.0204 0.0479 -0.0450 -0.1165 0.0142 0.0068 -0.0334 -0.0504 0.0
and -0.0314 0.0149 -0.0205 0.0557 0.0205 -0.0405 0.0044 -0.0118 -0.0424 -0.0490 


## Load the embedding

In [5]:
# Parse the text-format vector file into a KeyedVectors object.
# Loading may take a few minutes.
keyed_vec = KeyedVectors.load_word2vec_format(
    FILE_PATH,
    binary=False,
)

In [6]:
# Check a handful of tokens for membership in the embedding's vocabulary.
# Membership is tested on the KeyedVectors object itself instead of its
# `.vocab` attribute, which no longer exists in gensim >= 4.0; the `in`
# operator works across gensim versions.
for word in ['hello', '!', '2', 'Turing', 'foobarz', 'hi!']:
    print(word, "is in the vocabulary:", word in keyed_vec)

hello is in the vocabulary: True
! is in the vocabulary: True
2 is in the vocabulary: True
Turing is in the vocabulary: True
foobarz is in the vocabulary: False
hi! is in the vocabulary: False


### Retrieving a vector from the KeyedVectors object

In [7]:
# Look up the vector of a single word, then show its shape and first 25 components.
sample_word = 'foobar'
word_vec = keyed_vec.get_vector(sample_word)
print(word_vec.shape, word_vec[:25], sep='\n')

(300,)
[-0.1366  0.0041  0.0905  0.0684 -0.0082  0.0175 -0.1518  0.1521  0.2366
 -0.1034  0.2554 -0.1798 -0.0465  0.2005 -0.1291  0.0709 -0.258  -0.212
 -0.0824  0.0465 -0.4044 -0.2766  0.004   0.3014  0.0622]


### Creating Keras Embeddings

In [8]:
# Build a Keras Embedding layer from the loaded vectors and display its configuration.
# NOTE(review): get_keras_embedding appears to be unavailable in gensim >= 4.0 —
# confirm the gensim version this notebook is pinned to.
keras_embedding = keyed_vec.get_keras_embedding()
embedding_config = keras_embedding.get_config()
embedding_config

Using TensorFlow backend.


{'name': 'embedding_1',
 'trainable': False,
 'batch_input_shape': (None, None),
 'dtype': 'float32',
 'input_dim': 1000000,
 'output_dim': 300,
 'embeddings_initializer': {'class_name': 'RandomUniform',
  'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}},
 'embeddings_regularizer': None,
 'activity_regularizer': None,
 'embeddings_constraint': None,
 'mask_zero': False,
 'input_length': None}

## Applied Example: Prediction with scikit-learn

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [10]:
def mean_fasttext(arr, embedding_dim=300, keyed_vectors=None, tokenizer=None):
    '''
    Create the average of the fasttext embeddings from each word in a document.

    Parameters
    ----------
    arr : iterable of str
        The documents to embed, one string per document.
    embedding_dim : int, default 300
        Length of the zero vector used for a document that has no token in
        the vocabulary. Must match the dimensionality of the word vectors.
    keyed_vectors : object, optional
        Anything supporting ``token in keyed_vectors`` and
        ``keyed_vectors.get_vector(token)``. Defaults to the notebook-level
        ``keyed_vec``.
    tokenizer : callable, optional
        Maps a document string to a sequence of tokens. Defaults to
        ``nltk.tokenize.word_tokenize``.

    Returns
    -------
    numpy.ndarray
        One row per document (the mean of its in-vocabulary token vectors,
        or zeros when there are none). An empty ``arr`` yields an array of
        shape ``(0, embedding_dim)``.

    Very slow function (plain Python loop over documents), needs to be
    optimized for larger datasets.
    '''
    # Defaults are resolved at call time so the notebook globals only need to
    # exist when the caller does not supply replacements.
    if keyed_vectors is None:
        keyed_vectors = keyed_vec
    if tokenizer is None:
        tokenizer = nltk.tokenize.word_tokenize

    mean_vectors = []
    for document in arr:
        # Membership is tested on the vectors object itself rather than on its
        # `.vocab` attribute, which no longer exists in gensim >= 4.0.
        vectors = [
            keyed_vectors.get_vector(token)
            for token in tokenizer(document)
            if token in keyed_vectors
        ]
        if vectors:
            mean_vectors.append(np.vstack(vectors).mean(axis=0))
        else:
            # No known token: fall back to an all-zero embedding.
            mean_vectors.append(np.zeros(embedding_dim))

    if not mean_vectors:
        # np.vstack raises ValueError on an empty list, so handle "no documents" explicitly.
        return np.zeros((0, embedding_dim))
    return np.vstack(mean_vectors)

In [11]:
# Read only the first 6000 rows, then split them positionally:
# the first 5000 rows for training and the remainder for testing.
data_sample = pd.read_csv(
    '../input/quora-insincere-questions-classification/train.csv',
    nrows=6000,
)
train_sample = data_sample.iloc[:5000]
test_sample = data_sample.iloc[5000:]
train_sample.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [12]:
# Features: mean fastText vector of each question; labels: the `target` column.
X_train, X_test = (
    mean_fasttext(split["question_text"].values)
    for split in (train_sample, test_sample)
)
y_train, y_test = (
    split['target'].values
    for split in (train_sample, test_sample)
)
print(X_train.shape)
print(y_train.shape)

(5000, 300)
(5000,)


In [13]:
# Fit a logistic regression on the averaged embeddings and report F1 on both splits.
model = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
train_f1 = f1_score(y_train, model.predict(X_train))
print("Train Score:", train_f1)
test_f1 = f1_score(y_test, model.predict(X_test))
print("Test Score:", test_f1)

Train Score: 0.0755813953488372
Test Score: 0.1111111111111111
