In [1]:
    #########   ##########  Training pipeline for Lounge feedback    ###########   ##############

In [8]:
## Loading the dataset ..
import pandas as pd
import numpy as np
import ktrain
from ktrain import text
import tensorflow as tf

tf.__version__
pd.set_option("display.max_colwidth",None)

In [9]:
train_df = pd.read_csv(r'train_data.csv')[['text', 'category']]
test_df = pd.read_csv(r'test_data.csv')[['text', 'category']]
print(train_df.count())

text        5100
category    5100
dtype: int64


In [10]:
print(test_df.count())

text        900
category    900
dtype: int64


In [11]:
text.print_text_classifiers()

fasttext: a fastText-like model [http://arxiv.org/pdf/1607.01759.pdf]
logreg: logistic regression using a trainable Embedding layer
nbsvm: NBSVM model [http://www.aclweb.org/anthology/P12-2018]
bigru: Bidirectional GRU with pretrained fasttext word vectors [https://fasttext.cc/docs/en/crawl-vectors.html]
standard_gru: simple 2-layer GRU with randomly initialized embeddings
bert: Bidirectional Encoder Representations from Transformers (BERT) from keras_bert [https://arxiv.org/abs/1810.04805]
distilbert: distilled, smaller, and faster BERT from Hugging Face transformers [https://arxiv.org/abs/1910.01108]


In [15]:
(train, val, preproc) = text.texts_from_df(train_df=train_df,
                                           text_column='text',
                                           label_columns='category',
                                           val_df = test_df,
                                           maxlen = 400,
                                           preprocess_mode = 'distilbert')

['food', 'lounge', 'others']
   food  lounge  others
0   1.0     0.0     0.0
1   1.0     0.0     0.0
2   1.0     0.0     0.0
3   1.0     0.0     0.0
4   1.0     0.0     0.0
['food', 'lounge', 'others']
   food  lounge  others
0   1.0     0.0     0.0
1   1.0     0.0     0.0
2   1.0     0.0     0.0
3   1.0     0.0     0.0
4   1.0     0.0     0.0


Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

preprocessing train...
language: en
train sequence lengths:
	mean : 88
	95percentile : 232
	99percentile : 375


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 94
	95percentile : 231
	99percentile : 395


In [16]:
model = text.text_classifier(name = 'distilbert', train_data = train, preproc=preproc)
model.summary()

Is Multi-Label? False
maxlen is 400
done.
Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,779
Trainable params: 66,955,779
Non-trainable params: 0
_________________________________________________________________


In [17]:
learner = ktrain.get_learner(model = model,
                             train_data = train,
                             val_data = val,
                             batch_size = 10)

In [None]:
#learner.fit_onecycle(lr = 2e-5, epochs=20)

In [18]:
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('DistillBert')

In [19]:
data = ['Although I can not say that these are the same thing as potato chips, they are a good alternative and very tasty. The flavors are much like the potato chips you would get at the store; it is the texture that is different. They are less oil (a good thing) and lighter than potato chips almost like a rice cake except much thinner and crisper. All of the flavors are good except for the cheese one.']
predictor.predict(data, return_proba=True)



array([[0.31290114, 0.33846018, 0.34863865]], dtype=float32)