In [9]:
import os
import pandas as pd
import gzip
import json
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

def parse(path):
  """Yield one decoded JSON object per non-blank line of the file at ``path``.

  Args:
    path: Location of a JSON-lines file (one JSON document per line).

  Yields:
    The Python object decoded from each line, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle when the generator is exhausted,
  # explicitly closed, or garbage-collected, so no file descriptor leaks.
  with open(path, 'rb') as g:
    for l in g:
      # Whitespace-only lines (e.g. a trailing newline) carry no record.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Load the records produced by ``parse(path)`` into a DataFrame.

  Each record becomes one row; rows are indexed 0, 1, 2, ... in the order
  the records are yielded.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')


In [10]:
%%time
def finalDF(path, data_dir='/home/rbhavi/Desktop/NLP/fastext/data/'):
    """Build one cleaned ``(title, hsn)`` frame from several JSON-lines files.

    Args:
        path: Iterable of file names (not full paths) located in ``data_dir``.
        data_dir: Directory that contains the files. Defaults to the location
            this notebook was originally run against.

    Returns:
        A DataFrame with two columns: ``title`` (letters only, lower-cased,
        whitespace replaced by ``_``) and ``hsn`` (the name of the file the
        row came from). Rows whose title is missing or blank are dropped.
        The per-file indexes are kept as-is, so index values repeat across
        files. An empty ``path`` yields an empty frame with both columns.
    """
    frames = []
    for name in path:
        titles = getDF(os.path.join(data_dir, name))[['title']]
        # Turn blank titles into NaN so that dropna() removes them together
        # with titles that were missing in the source records.
        titles = titles.replace(r'^\s*$', np.nan, regex=True).dropna()
        # Normalise the title text only. The label is attached afterwards so
        # the regexes cannot rewrite it: file names that differ only in
        # non-letter characters must remain distinct classes.
        cleaned = (
            titles['title']
            .replace(r'[^A-Za-z]+', ' ', regex=True)
            .replace(r'\s', '_', regex=True)
            .str.lower()
        )
        frames.append(titles.assign(title=cleaned, hsn=name))
    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['title', 'hsn'])
    return pd.concat(frames)
# Load every entry listed in the data directory; finalDF tags each title with
# the name of the file it came from (column 'hsn').
data = finalDF(os.listdir('/home/rbhavi/Desktop/NLP/fastext/data'))
# Encode the per-file labels as integer ids, then render them as
# '__label__<id>' strings for the training files written further down.
le = LabelEncoder()
data['hsn'] = le.fit_transform(data['hsn'])
data['hsn'] = '__label__' + data['hsn'].astype(str)
# Keep only label and text, with the label column first.
data = data[['hsn','title']]

CPU times: user 13.4 s, sys: 1.06 s, total: 14.4 s
Wall time: 14.4 s


In [11]:
# Shuffle once with a fixed seed so the 60/20/20 split, and every metric
# computed from it, can be reproduced on a fresh kernel.
shuffled = data.sample(frac=1, random_state=42)
train_end, valid_end = int(.6*len(shuffled)), int(.8*len(shuffled))
# .copy() makes each part an independent frame, so later cells can add
# columns to `test` without writing into a slice of `shuffled`.
train = shuffled.iloc[:train_end].copy()
valid = shuffled.iloc[train_end:valid_end].copy()
test = shuffled.iloc[valid_end:].copy()
train.shape, valid.shape, test.shape

((118824, 2), (39608, 2), (39608, 2))

In [12]:
# Write each split as a headerless, tab-separated file (label, then title).
for split_frame, out_path in (
    (train, 'final.train'),
    (valid, 'final.valid'),
    (test, 'final.test'),
):
    split_frame.to_csv(out_path, index=False, header=False, sep='\t')

In [13]:
%%time
import fasttext
# Train a supervised fastText model on final.train; final.valid is passed as
# the autotune validation file.
# NOTE(review): autotunePredictions=5 is requested here, while the evaluation
# cells below use model.test / model.predict with their defaults — confirm
# that tuning for 5 predictions is intended.
model = fasttext.train_supervised(input="final.train", autotuneValidationFile='final.valid', autotunePredictions=5)
# Evaluate on the held-out file. The first element of the displayed tuple
# equals the number of test rows; the other two are presumably precision and
# recall at 1 — confirm against the fastText documentation.
model.test("final.test")

CPU times: user 10h 2min 56s, sys: 1min 31s, total: 10h 4min 28s
Wall time: 5min 7s


(39608, 0.9458695213088265, 0.9458695213088265)

In [14]:
def predict(row):
    """Return the first label the global ``model`` predicts for ``row['title']``."""
    predicted_labels = model.predict(row['title'])[0]
    return predicted_labels[0]
# Call predict() once per test row and store the returned label in a new column.
test['predictions'] = test.apply(predict,axis=1)

In [15]:
# Fraction of test rows whose predicted label equals the true label; the value
# shown matches the precision/recall reported by model.test above.
accuracy_score(test['hsn'], test['predictions'])

0.9458695213088265

In [16]:
%%time
def top5words(row):
    """Return the 5 nearest neighbours of ``row['title']`` from the global ``model``.

    Args:
        row: A mapping or DataFrame row with a ``'title'`` entry.

    Returns:
        Whatever ``model.get_nearest_neighbors`` returns for that title,
        limited to 5 entries.
    """
    # get_nearest_neighbors returns 10 entries by default; ask for 5 so the
    # result matches this function's name and the 'top5words' column.
    return model.get_nearest_neighbors(row['title'], k=5)
# Call top5words() once per test row and store the neighbours in a new column.
test['top5words'] = test.apply(top5words,axis=1)
# Persist the trained model and the annotated test frame to the working directory.
model.save_model("amzn.bin")
test.to_csv("results.csv", index=False)

CPU times: user 8min 12s, sys: 4.05 s, total: 8min 16s
Wall time: 9min 4s


In [None]:
# train['title'].to_csv('model2.train', index=False, header=False, sep='\t')
# valid['title'].to_csv('model2.valid', index=False, header=False, sep='\t')
# test['title'].to_csv('model2.test', index=False, header=False, sep='\t')

In [None]:
# %%time
# model2 =  fasttext.train_unsupervised(input="model2.train", minn=2, maxn=2, dim=200, epoch=10, lr=0.5)
# model2.save_model("model2.bin")

In [17]:
# Inspect the sentence-level vector the model produces for the text 'amazon'.
model.get_sentence_vector('amazon')

array([-0.06729136, -0.00092826, -0.07790586, -0.0044092 ,  0.05097683,
       -0.03384769, -0.05423024, -0.05745913,  0.13059792,  0.29227808,
       -0.01361836,  0.00896457,  0.15631957,  0.06047435, -0.01795117,
        0.03924633, -0.00709168,  0.05183536, -0.03886985, -0.04927716,
        0.04626067, -0.01948514, -0.06271682, -0.08577275, -0.00614547,
        0.06082454, -0.04923154,  0.04103671,  0.03868415,  0.01136865,
        0.07431151, -0.03255565,  0.02567981, -0.05055358,  0.06582287,
       -0.00829958, -0.03170949,  0.00582015, -0.04120429, -0.03589464,
        0.07349437,  0.04236813, -0.0069772 ,  0.00413841,  0.00683583,
        0.0285518 , -0.02267235,  0.05153627, -0.02544357, -0.03435444,
       -0.02221516, -0.03335093, -0.05354708,  0.01927935,  0.05318545,
        0.06520252, -0.04226614, -0.05368572], dtype=float32)

In [19]:
# Inspect the word-level vector the model produces for the token 'amazon'.
model.get_word_vector('amazon')

array([-0.03108178,  0.05534719, -0.05248678,  0.02645894,  0.00516311,
        0.11499557,  0.06020023,  0.00214001,  0.01730405,  0.15281966,
       -0.00185905,  0.00562164,  0.01843944,  0.00067251, -0.0006628 ,
        0.10186821,  0.02604912, -0.00090765, -0.05424493, -0.03687399,
        0.09187492, -0.03989654,  0.0398276 ,  0.02824774, -0.05477113,
        0.05789005, -0.02230135,  0.07162599,  0.01619327,  0.00342114,
        0.08553859, -0.031239  ,  0.03127193, -0.05484121,  0.07197117,
       -0.00218725, -0.06064975, -0.01008458, -0.05866987, -0.02315177,
        0.01700971,  0.0315505 , -0.01422148, -0.00130722, -0.02135808,
        0.00326049, -0.03769902,  0.08370962,  0.02006318, -0.01296705,
        0.00535279, -0.02528452, -0.02751887,  0.00963326, -0.00400634,
       -0.00747982, -0.01907618, -0.04501597], dtype=float32)

In [20]:
# Number of entries in the model's vocabulary.
len(model.words)

104894

In [21]:
# 'amazon' is not a vocabulary entry (this evaluates to False), yet a vector
# was returned for it above — presumably built from subword information;
# confirm against the fastText documentation.
'amazon' in model.words

False