<a href="https://colab.research.google.com/github/sayanbanerjee32/lang_detect/blob/main/analysis_of_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# mount gdrive for data
# (Colab-only: the dataset archive and the saved models configured in the
# next cell are read from paths under this mount point)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# data and model file locations
# bz2-compressed tar archive containing the sentence data
tar_file_loc = '/content/drive/MyDrive/lang_detection/data/sentences.tar.bz2'
# directory the archive is extracted into
dest_loc = '/tmp'
# name of the tab-separated file read from dest_loc after extraction
file_name = 'sentences.csv'
# directory holding the saved models; note the trailing slash - some cells
# join it with an extra '/' and others concatenate the file name directly
saved_model_path = '/content/drive/MyDrive/lang_detection/models/'
# list of language labels supported by the models (loaded with joblib)
supported_language_file = 'pt_supported_languages.pkl'
# Naive Bayes model (loaded with joblib)
nb_model_file = 'nb_133_langs.pkl'
# PyTorch state dict (loaded with torch.load) and its vocabulary (joblib)
pytorch_model_file = 'pytorch_133_langs.pt'
pytorch_vocab_file = 'pytorch_133_langs_vocab'
# exported fastai learner (loaded with load_learner)
fastai_model_file = 'fastai_133_langs_v3'

In [None]:
import pandas as pd
import tarfile
import numpy as np
from sklearn.model_selection import StratifiedKFold

from sklearn.externals import joblib
from sklearn.metrics import f1_score, accuracy_score, classification_report



In [None]:
# extract tar file
# The context manager closes the archive even when extraction fails part-way.
with tarfile.open(tar_file_loc, "r:bz2") as tar:
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only run this on an archive from a trusted source.
    tar.extractall(path = dest_loc)

In [None]:
# read data in pandas (tab-separated file without a header row)
sent_df = pd.read_csv(dest_loc +'/' + file_name, sep = '\t', names = ['id','label','text'])
# Drop rows whose label is missing or is the 'unk' placeholder.
# The mask is computed explicitly instead of calling
# sent_df['label'].fillna('unk', inplace=True): an in-place fill on a column
# selection can act on a temporary object (it always does under pandas
# copy-on-write), which would silently keep the rows with missing labels.
known_label = sent_df['label'].notna() & (sent_df['label'] != 'unk')
sent_df = sent_df.loc[known_label,:]

In [None]:
# create test split
def get_train_test_split(df):
    """Return a boolean list flagging the rows of ``df`` that form the test fold.

    The test fold is the first fold produced by an unshuffled, stratified
    5-fold split on ``df.label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing ``id`` and ``label`` columns.

    Returns
    -------
    list of bool
        One entry per row of ``df`` in positional order; ``True`` marks a
        test row.
    """
    # No random_state here: without shuffle=True it never influenced the
    # split, and newer scikit-learn releases raise a ValueError for that
    # combination. Do not enable shuffling - it would select a different
    # test fold than the one this unshuffled split has always produced.
    splitter = StratifiedKFold(n_splits=5)
    train_ids, test_ids = next(splitter.split(df.id, df.label))
    print(f"Test: {len(test_ids)}, Train: {len(train_ids)}, Total: {len(df)}")
    # test_ids are positional indices, so they are marked in a positional mask
    is_test = np.zeros(len(df), dtype=bool)
    is_test[test_ids] = True
    return is_test.tolist()

In [None]:
# build the test-row mask; the cell output is the number of rows flagged as test
train_test_bool = get_train_test_split(sent_df)
sum(train_test_bool)



Test: 1977424, Train: 7909695, Total: 9887119


1977424

In [None]:
# keep test separate: select the rows flagged True in the boolean mask
test_df = sent_df.loc[train_test_bool,:]
test_df.shape

(1977424, 3)

In [None]:
# load supported language list
# (joblib.load unpickles the file, so it must come from a trusted source)
lang_labels = joblib.load(saved_model_path + '/' + supported_language_file)
# keep only the test rows whose label is in the supported list
test_sub_sample = test_df.loc[test_df['label'].isin(lang_labels)]
test_sub_sample.shape,test_df.shape

((1972648, 3), (1977424, 3))

## Performance Comparison

In [None]:
# list to store performance figure for all models
# (one dict per model, as returned by get_prediction_performance;
# re-running this cell discards the results collected so far)
perf_dict_list = []

In [None]:
# generic function for generating performance metrics
def get_prediction_performance(model_name, model_path, predict_func, test_data):
    """Score one model on ``test_data`` and gather its metrics in a dict.

    ``predict_func(model_path, test_data)`` must return the pair
    ``(true_labels, predicted_labels)``.

    The returned dict holds, in this order: ``model_name``, ``accuracy``,
    ``f1_micro``, ``f1_macro``, ``f1_weighted`` and ``classification_report``
    (the scikit-learn report converted to a DataFrame with one row per
    report entry).
    """
    y_true, y_pred = predict_func(model_path, test_data)

    performance = {'model_name': model_name}
    performance['accuracy'] = accuracy_score(y_true, y_pred)
    # one F1 figure per averaging strategy, stored as f1_<strategy>
    for averaging in ('micro', 'macro', 'weighted'):
        performance['f1_' + averaging] = f1_score(y_true, y_pred, average = averaging)

    report = classification_report(y_true, y_pred, output_dict=True)
    performance['classification_report'] = pd.DataFrame(report).transpose()
    return performance

### NB model

In [None]:
# prediction function Naive Bayes
def get_nb_model_predicrtion(model_path, test_data):
    """Load the model stored at ``model_path`` and predict ``test_data.text``.

    Returns the pair ``(test_data.label, predictions)``.
    """
    classifier = joblib.load(model_path)
    predicted = classifier.predict(test_data.text)
    return test_data.label, predicted

In [None]:
# evaluate the Naive Bayes model on the supported-language test subset
nb_perf_dict = get_prediction_performance('NB',
                                        saved_model_path + '/' + nb_model_file,
                                          get_nb_model_predicrtion, 
                                          test_sub_sample)

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# add to final list
# NOTE: re-running this cell appends the same entry again
perf_dict_list.append(nb_perf_dict)
perf_dict_list

[{'accuracy': 0.5254409301608802,
  'classification_report':               precision    recall  f1-score       support
  afr            0.000000  0.000000  0.000000  8.060000e+02
  ara            0.076570  0.471765  0.131755  7.597000e+03
  arq            0.000000  0.000000  0.000000  4.620000e+02
  arz            0.000000  0.000000  0.000000  1.600000e+02
  asm            0.000000  0.000000  0.000000  6.390000e+02
  ...                 ...       ...       ...           ...
  zsm            0.000000  0.000000  0.000000  9.910000e+02
  zza            0.000000  0.000000  0.000000  3.390000e+02
  accuracy       0.525441  0.525441  0.525441  5.254409e-01
  macro avg      0.173999  0.198794  0.141529  1.972648e+06
  weighted avg   0.636389  0.525441  0.544557  1.972648e+06
  
  [136 rows x 4 columns],
  'f1_macro': 0.14152895068272514,
  'f1_micro': 0.5254409301608802,
  'f1_weighted': 0.5445568839850479,
  'model_name': 'NB'}]

### PyTorch Model

In [None]:
%%capture
!pip install datasets

In [None]:
import torch
import torchtext
import torch.nn as nn
from datasets import Dataset
import functools
import tqdm
import sys

In [None]:
# supporting function for pytorch model evaluation
def tokenizer(x):
    """Character-level tokenizer: return the items of ``x`` as a list."""
    return list(x)
def tokenize_data(example, tokenizer, max_length):
    """Tokenize ``example['text']`` and keep at most ``max_length`` tokens.

    Returns a dict with the truncated ``tokens`` and their count ``length``.
    """
    truncated = tokenizer(example['text'])[:max_length]
    return {'tokens': truncated, 'length': len(truncated)}

def numericalize_data(example, vocab):
    """Look up every token of ``example['tokens']`` in ``vocab``.

    Returns a dict whose ``ids`` entry lists the looked-up values in token order.
    """
    ids = []
    for token in example['tokens']:
        ids.append(vocab[token])
    return {'ids': ids}

def label_to_idx(example, lang_labels):
    """Return the example's label as its position in ``lang_labels``."""
    position = lang_labels.index(example['label'])
    return {'label': position}

class LangDetect(nn.Module):
    """Embedding -> (bi)directional LSTM -> linear classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    embedding_dim : int
        Size of each embedding vector.
    hidden_dim : int
        LSTM hidden size (per direction).
    output_dim : int
        Number of output scores per sequence.
    n_layers : int
        Number of stacked LSTM layers.
    bidirectional : bool
        Whether the LSTM reads the sequence in both directions.
    dropout_rate : float
        Dropout probability applied to the embeddings and to the final
        hidden state (and between LSTM layers when ``n_layers > 1``).
    pad_index : int
        Index of the padding token in the embedding table.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        # nn.LSTM only applies dropout between stacked layers; a non-zero
        # value with a single layer has no effect and just raises a
        # UserWarning, so it is zeroed in that case.
        lstm_dropout = dropout_rate if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                            dropout=lstm_dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        """Return one score vector per sequence.

        ids    = [batch size, seq len] padded token ids
        length = [batch size] true (unpadded) length of every sequence
        """
        embedded = self.dropout(self.embedding(ids))
        # embedded = [batch size, seq len, embedding dim]
        # Packing makes the LSTM stop at each sequence's true length, so the
        # final hidden state is not affected by padding.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, length, batch_first=True,
                                                            enforce_sorted=False)
        # Only the final hidden state is used; the per-step outputs were
        # previously unpacked but never read, so they are not materialised.
        _, (hidden, _) = self.lstm(packed_embedded)
        # hidden = [n layers * n directions, batch size, hidden dim]
        if self.lstm.bidirectional:
            # concatenate the last layer's two directions; the order must stay
            # as is because the saved fc weights depend on it
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
            # hidden = [batch size, hidden dim * 2]
        else:
            hidden = self.dropout(hidden[-1])
            # hidden = [batch size, hidden dim]
        prediction = self.fc(hidden)
        # prediction = [batch size, output dim]
        return prediction

def collate(batch, pad_index):
    """Merge a list of examples into one batch dict.

    ``ids`` are right-padded with ``pad_index`` to the longest sequence in
    the batch (batch-first layout); ``length`` and ``label`` are stacked
    into one tensor each.
    """
    padded_ids = nn.utils.rnn.pad_sequence([item['ids'] for item in batch],
                                           padding_value=pad_index, batch_first=True)
    lengths = torch.stack([item['length'] for item in batch])
    labels = torch.stack([item['label'] for item in batch])
    return {'ids': padded_ids, 'length': lengths, 'label': labels}

def evaluate(dataloader, model, device, lang_labels):
    """Run ``model`` over ``dataloader`` and return labels as strings.

    The model is switched to eval mode and gradients are disabled. For every
    batch the class with the highest score is taken as the prediction.

    Returns ``(true_labels, predicted_labels)``: two lists in dataloader
    order, with class indices translated through ``lang_labels``.
    """
    model.eval()
    true_idx, predicted_idx = [], []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            # lengths stay where the dataloader put them; only ids are moved
            scores = model(batch['ids'].to(device), batch['length'])
            predicted_idx.extend(scores.argmax(dim=-1).tolist())
            true_idx.extend(batch['label'].tolist())

    true_labels = [lang_labels[i] for i in true_idx]
    predicted_labels = [lang_labels[i] for i in predicted_idx]
    return true_labels, predicted_labels


In [None]:
# load vocab used to build pytorch model
# (joblib.load unpickles the file, so it must come from a trusted source)
vocab = joblib.load(saved_model_path + '/' + pytorch_vocab_file)
pad_index = vocab['<pad>']
# Rebind the name `collate` to a partial with pad_index pre-filled, so later
# cells can hand it to DataLoader as a one-argument collate_fn.
# NOTE(review): re-running this cell applies partial to the already-bound
# object rather than to the original function.
collate = functools.partial(collate, pad_index=pad_index)

In [None]:
# function to produce true and predicted labels on batch data with the pytorch model
def get_pt_model_predicrtion(model_path, test_data, max_length = 256,
                             vocab = vocab, lang_labels= lang_labels,
                             embedding_dim = 150, hidden_dim = 150,
                            n_layers = 1, bidirectional = True,
                             dropout_rate = 0.5, batch_size = 512,
                             pad_index = pad_index, collate_fn = collate):
    """Load the PyTorch model from ``model_path`` and predict ``test_data``.

    Parameters
    ----------
    model_path : str
        Location of the saved state dict.
    test_data : pandas.DataFrame
        Frame with ``label`` and ``text`` columns; every label must be
        present in ``lang_labels``.
    max_length : int
        Maximum number of characters kept per text.
    vocab, lang_labels
        Token-to-id mapping and ordered label list; both are bound to the
        notebook globals when this function is defined.
    embedding_dim, hidden_dim, n_layers, bidirectional, dropout_rate
        Architecture settings passed to ``LangDetect``; the state dict is
        loaded into a model built with them.
    batch_size : int
        Batch size of the evaluation DataLoader.
    pad_index
        Kept for interface compatibility; the padding index is always
        re-derived from ``vocab['<pad>']``.
    collate_fn : callable
        Collate function handed to the DataLoader.

    Returns
    -------
    tuple of two lists
        ``(true_labels, predicted_labels)`` as returned by ``evaluate``.
    """
    test_data = Dataset.from_pandas(test_data.loc[:,['label','text']])
    test_data = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer,
                                                        'max_length': max_length})
    test_data = test_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
    test_data = test_data.map(label_to_idx, fn_kwargs={'lang_labels': lang_labels})
    test_data = test_data.with_format(type='torch', columns=['ids', 'label', 'length'])

    vocab_size = len(vocab)
    pad_index = vocab['<pad>']
    output_dim = len(lang_labels)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = LangDetect(vocab_size, embedding_dim, hidden_dim, output_dim,
                    n_layers, bidirectional, dropout_rate, pad_index)
    model = model.to(device)
    # Load the weights from the path the caller passed in; the argument used
    # to be ignored in favour of the notebook-level path variables.
    model.load_state_dict(torch.load(model_path, map_location=device))
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                                  collate_fn=collate_fn)
    return evaluate(test_dataloader, model, device, lang_labels)

In [None]:
# evaluate the PyTorch model on the supported-language test subset
pt_perf_dict = get_prediction_performance('pytorch',
                                        saved_model_path + '/' + pytorch_model_file,
                                          get_pt_model_predicrtion, 
                                          test_sub_sample)

  0%|          | 0/1972648 [00:00<?, ?ex/s]

  0%|          | 0/1972648 [00:00<?, ?ex/s]

  0%|          | 0/1972648 [00:00<?, ?ex/s]

  "num_layers={}".format(dropout, num_layers))


evaluating...: 100%|██████████| 3853/3853 [09:11<00:00,  6.98it/s]


In [None]:
# add to final list
# NOTE: re-running this cell appends the same entry again
perf_dict_list.append(pt_perf_dict)

### Fastai

In [None]:
%%capture
!pip install fastai -Uq

In [None]:
from fastai.text.all import *
from fastai.callback.fp16 import *

from pathlib import Path

In [None]:
# functions to support fastai model prediction
from collections.abc import Iterable

def flatten(l):
    """Lazily yield the leaves of an arbitrarily nested iterable, depth first.

    Strings and bytes are treated as leaves and are never split.
    """
    # explicit stack of iterators, one per nesting level being walked
    pending = [iter(l)]
    while pending:
        for el in pending[-1]:
            if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
                # descend into the nested iterable before continuing this level
                pending.append(iter(el))
                break
            yield el
        else:
            # current level exhausted: resume the enclosing one
            pending.pop()
class CharTokenizer():
    """Character-level tokenizer that keeps the special fastai tokens whole."""

    def __call__(self, items):
        """Tokenize every string in ``items``.

        Each string is split on whitespace. A word that is one of the special
        tokens stays a single token, e.g. 'xxbos'; any other word is broken
        into its characters. Whitespace itself produces no token.

        Returns a generator yielding one token list per input string, e.g.
        ['xxbos', 'h', 'e', 'l', 'l', 'o'].
        """
        # We don't want to mess with the special fastai tokens
        reserved = ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj']

        tokenized = []
        for sentence in items:
            pieces = []
            for word in sentence.split():
                if word in reserved:
                    pieces.append(word)
                else:
                    pieces.append(list(word))
            # pieces mixes whole tokens and per-word character lists;
            # flatten turns it into one flat token list for this sentence
            tokenized.append(list(flatten(pieces)))

        # All sentences are processed eagerly above; the result is handed
        # back as a generator over the per-sentence token lists.
        return (tokens for tokens in tokenized)



In [None]:
# function to produce true and predicted labels on batch data with fastai models
def get_fastai_model_prediction(model_file, test_data, cpu = False):
    """Predict labels for ``test_data`` with the learner stored at ``model_file``.

    ``cpu`` is forwarded to ``load_learner``. The decoded predictions are
    looked up in ``learner.dls.vocab[1]`` to obtain label names.

    Returns the pair ``(test_data.label, predicted_labels)``.
    """
    learner = load_learner(Path(model_file), cpu = cpu)
    class_names = learner.dls.vocab[1]
    batches = learner.dls.test_dl(test_data)
    # only the decoded predictions (third item) are needed
    _, _, decoded = learner.get_preds(dl = batches, with_decoded=True)
    predicted_labels = [class_names[class_idx] for class_idx in decoded]
    return test_data.label, predicted_labels

In [None]:
# evaluate the fastai model on the supported-language test subset
fastai_perf_dict = get_prediction_performance('fastai',
                                        saved_model_path + '/' + fastai_model_file,
                                          get_fastai_model_prediction, 
                                          test_sub_sample)

In [None]:
# add to final list
# NOTE: re-running this cell appends the same entry again
perf_dict_list.append(fastai_perf_dict)

## Consolidate reports

### Metrics comparison for all models

In [None]:
# performance metrics comparison for all 3 models
# (the per-class report column is dropped so that only the scalar metrics are shown)
pd.DataFrame(perf_dict_list).drop(columns = ['classification_report'])

Unnamed: 0,model_name,accuracy,f1_micro,f1_macro,f1_weighted
0,NB,0.525441,0.525441,0.141529,0.544557
1,pytorch,0.918159,0.918159,0.75919,0.921834
2,fastai,0.918662,0.918662,0.75921,0.920472


### Top 10 languages F1 score

In [None]:
# list top 10 languages based on number of records for each language
# (value_counts orders labels by descending count, so the first 10 index
# entries are the most frequent labels)
cound_df = test_sub_sample['label'].value_counts()
top_10_langs = cound_df[:10].index.tolist()
top_10_langs

['eng', 'rus', 'ita', 'tur', 'epo', 'deu', 'ber', 'kab', 'fra', 'por']

In [None]:
def get_f1_for_langs(perf_dict_list, langs_list):
    """Build a table of F1 scores: one row per language, one column per model.

    For every entry of ``perf_dict_list`` the ``f1-score`` column of its
    ``classification_report`` is read for the labels in ``langs_list``; the
    columns of the result are named after each entry's ``model_name``.
    """
    scores = []
    model_names = []
    for perf in perf_dict_list:
        scores.append(perf['classification_report'].loc[langs_list, 'f1-score'])
        model_names.append(perf['model_name'])

    table = pd.concat(scores, axis =1)
    table.columns = model_names
    return table

In [None]:
# f1 scores for top 10 languages (one column per evaluated model)
get_f1_for_langs(perf_dict_list, top_10_langs)

Unnamed: 0,NB,pytorch,fastai
eng,0.773956,0.983598,0.980345
rus,0.373773,0.954546,0.960528
ita,0.724649,0.970603,0.965501
tur,0.684316,0.97875,0.978363
epo,0.807042,0.974055,0.979997
deu,0.732569,0.985353,0.988311
ber,0.511514,0.705455,0.70649
kab,0.312722,0.553429,0.468574
fra,0.717824,0.975776,0.978661
por,0.417648,0.952458,0.95273


### Bottom 10 languages F1 score

In [None]:
# list bottom 10 languages based on number of records for each language
# (the last 10 entries of the per-label counts held in cound_df)
bottom_10_langs = cound_df[-10:].index.tolist()
bottom_10_langs

['hrx', 'mal', 'ltz', 'pms', 'arz', 'nst', 'lij', 'jav', 'hoc', 'zlm']

In [None]:
# f1 scores for bottom 10 languages (one column per evaluated model)
get_f1_for_langs(perf_dict_list, bottom_10_langs)

Unnamed: 0,NB,pytorch,fastai
hrx,0.0,0.293706,0.363636
mal,0.0,1.0,1.0
ltz,0.0,0.525316,0.601626
pms,0.0,0.670241,0.574822
arz,0.0,0.068182,0.0
nst,0.0,0.94,0.939297
lij,0.0,0.564315,0.41769
jav,0.0,0.700935,0.687783
hoc,0.0,0.638596,0.676471
zlm,0.0,0.275,0.186335


## Inference Performance test - on CPU

In [None]:
# perf test on 1000 records
# NOTE(review): the timing cells in this section read single rows from
# test_sub_sample directly; perf_test_data is not referenced by them.
perf_test_data = test_sub_sample[:1000].copy()

### NB model

In [None]:
# load model
# (loaded once outside the timed cell so only predict() is measured)
nb_model = joblib.load(saved_model_path + '/' + nb_model_file)

In [None]:
%%timeit
nb_model.predict([test_sub_sample.text[0]])

1000 loops, best of 5: 773 µs per loop


### PyTorch model

In [None]:
# load model
device = torch.device('cpu')
# architecture settings: same values as the defaults of get_pt_model_predicrtion
embedding_dim = 150
hidden_dim = 150
n_layers = 1
bidirectional = True
dropout_rate = 0.5
vocab_size = len(vocab)
pad_index = vocab['<pad>']
output_dim = len(lang_labels)

model = LangDetect(vocab_size, embedding_dim, hidden_dim, output_dim,
                n_layers, bidirectional, dropout_rate, pad_index)
model = model.to(device)
# A freshly built module is in training mode; switch to eval mode so dropout
# is disabled while benchmarking inference.
model.eval()
# kept as the last expression so the cell still reports the key-matching result
model.load_state_dict(torch.load(saved_model_path + pytorch_model_file,
                                map_location=device))

  "num_layers={}".format(dropout, num_layers))


<All keys matched successfully>

In [None]:
# predict function for single record for pytorch model
def pt_predict(model, test_data, max_length = 256,
                             vocab = vocab, lang_labels= lang_labels,
                             collate_fn = collate, device = device, batch_size = 1):
    """Predict language labels for the rows of ``test_data`` with ``model``.

    Parameters
    ----------
    model : LangDetect
        Model with its weights already loaded. It is switched to eval mode
        by this call.
    test_data : pandas.DataFrame
        Frame with ``label`` and ``text`` columns; every label must be
        present in ``lang_labels``, because the label column is converted
        to indices.
    max_length : int
        Maximum number of characters kept per text.
    vocab, lang_labels, collate_fn, device
        Bound to the notebook globals when this function is defined.
    batch_size : int
        Batch size of the DataLoader.

    Returns
    -------
    list of str
        Predicted label for every row, in row order.
    """
    test_data = Dataset.from_pandas(test_data.loc[:,['label','text']])
    test_data = test_data.map(tokenize_data, fn_kwargs={'tokenizer': tokenizer,
                                                        'max_length': max_length})
    test_data = test_data.map(numericalize_data, fn_kwargs={'vocab': vocab})
    test_data = test_data.map(label_to_idx, fn_kwargs={'lang_labels': lang_labels})
    test_data = test_data.with_format(type='torch', columns=['ids', 'label', 'length'])

    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                                  collate_fn=collate_fn)
    # Inference must run in eval mode: a module that was only constructed and
    # loaded is still in training mode, where dropout randomly zeroes
    # activations and makes the predictions non-deterministic.
    model.eval()
    predicted_idx = []
    with torch.no_grad():
        for batch in test_dataloader:
            ids = batch['ids'].to(device)
            length = batch['length']
            prediction = model(ids, length)
            predicted_classes = prediction.argmax(dim=-1)
            predicted_idx.extend(predicted_classes.detach().cpu().numpy())

    return [lang_labels[idx] for idx in predicted_idx]

In [None]:
%%timeit
# NOTE: the timed call also rebuilds the Dataset and runs three .map passes
# inside pt_predict, so it measures more than the model's forward pass.
_ = pt_predict(model,test_sub_sample.iloc[[0],:])

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ex/s]

10 loops, best of 5: 223 ms per loop


### fastai model

In [None]:
# load fastai model
# NOTE(review): no `cpu` argument is passed here, so load_learner's default
# applies - confirm it loads on CPU, as this section's title implies.
fastai_model = load_learner(saved_model_path + '/' + fastai_model_file)

In [None]:
%%timeit
_ = fastai_model.predict(test_sub_sample.text[0])

10 loops, best of 5: 188 ms per loop
