In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-0.3.8.tar.gz (66 kB)
[K     |████████████████████████████████| 66 kB 1.7 MB/s 
[?25hCollecting transformers<3.4.0,>=3.1.0
  Downloading transformers-3.3.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 7.7 MB/s 
Collecting tokenizers==0.8.1.rc2
  Downloading tokenizers-0.8.1rc2-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 54.1 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-0.3.8-py3-none-any.whl size=101994 sha256=d0c0fc21a4ee49e3a67f91302bac0daa6eaeee70228836e825a195b81db9833a
  Stored in directory: /root/.cache/pip/wheels/1c/43/65/fe0f3ea9327623e749a79eb5dfad85a809c84064b1cc4682c1
Successfully built sentence-transformers
Installing collected packa

In [2]:
import numpy as np 
import pandas as pd 
import os
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from tokenizers import BertWordPieceTokenizer
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import lightgbm as lgb
import keras.layers as L
from keras.models import Model as M
from keras.optimizers import Adam as A



In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn import preprocessing

In [4]:
# Name of the utterance text column in the data file.
TEXT_COL = 'Utterance'
# Location of the pre-trained fastText vectors; used as the default path of load_embeddings().
EMB_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
# Augmented intent data set; the 'Utterance' and 'Intent' columns are read further down.
train=pd.read_csv('../input/data-set-augment-intent/data_file.csv')

In [5]:
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 numpy vector.

    Parameters:
        word: the token (first field of an embedding line).
        *arr: the remaining fields, convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def load_embeddings(embed_dir=EMB_PATH):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Every line is expected to look like ``word v1 v2 ... vN`` (space separated).
    A leading ``<n_words> <dim>`` header line, as written by fastText ``.vec``
    files, is skipped so it is not stored as a bogus one-element "vector".

    Parameters:
        embed_dir: path of the embedding file (defaults to ``EMB_PATH``).

    Returns:
        dict mapping each word to its numpy float32 vector; if a word occurs
        more than once the last occurrence wins.
    """
    embedding_index = {}
    # The context manager guarantees the (very large) file is closed again.
    with open(embed_dir, encoding='utf-8') as embed_file:
        for line_no, line in enumerate(tqdm(embed_file)):
            fields = line.strip().split(" ")
            if line_no == 0 and len(fields) == 2:
                # fastText header: vocabulary size and dimensionality, not a word.
                continue
            word, vector = get_coefs(*fields)
            embedding_index[word] = vector
    return embedding_index

def build_embedding_matrix(word_index, embeddings_index, max_features, lower = True, verbose = True, embed_dim = 300):
    """Build a ``(max_features, embed_dim)`` embedding matrix for a vocabulary.

    Row ``i`` receives the vector of the word whose index is ``i``. Words that
    are missing from ``embeddings_index`` (or whose stored vector does not have
    ``embed_dim`` entries) fall back to the ``"unknown"`` vector; when there is
    no ``"unknown"`` entry either, the row stays all-zeros.

    Parameters:
        word_index: mapping ``word -> integer index``.
        embeddings_index: mapping ``word -> vector``.
        max_features: number of rows; words with ``index >= max_features`` are skipped.
        lower: look words up in lower case when True.
        verbose: show a tqdm progress bar when True.
        embed_dim: dimensionality of the vectors (default 300).

    Returns:
        numpy array of shape ``(max_features, embed_dim)``.
    """
    embedding_matrix = np.zeros((max_features, embed_dim))
    fallback_vector = embeddings_index.get("unknown")
    for word, i in tqdm(word_index.items(),disable = not verbose):
        if i >= max_features:
            continue
        if lower:
            word = word.lower()
        embedding_vector = embeddings_index.get(word)
        # Treat malformed vectors like missing ones instead of broadcasting or crashing.
        if embedding_vector is None or np.shape(embedding_vector) != (embed_dim,):
            embedding_vector = fallback_vector
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def build_matrix(word_index, embeddings_index, embed_dim=300):
    """Build a ``(len(word_index) + 1, embed_dim)`` embedding matrix.

    Row ``i`` receives the vector of the word whose index is ``i``; row 0 is
    left as zeros (no word maps to it when indices start at 1). Words that are
    missing from ``embeddings_index`` (or whose stored vector does not have
    ``embed_dim`` entries) fall back to the ``"unknown"`` vector; when there is
    no ``"unknown"`` entry either, the row stays all-zeros.

    Parameters:
        word_index: mapping ``word -> integer index``.
        embeddings_index: mapping ``word -> vector``.
        embed_dim: dimensionality of the vectors (default 300).

    Returns:
        numpy array of shape ``(len(word_index) + 1, embed_dim)``.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
    fallback_vector = embeddings_index.get("unknown")
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        # Treat malformed vectors like missing ones instead of broadcasting or crashing.
        if vector is None or np.shape(vector) != (embed_dim,):
            vector = fallback_vector
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix

In [6]:
from sklearn import preprocessing
# Fit a LabelEncoder on the 'Intent' column and keep the resulting integer class ids.
le = preprocessing.LabelEncoder()
labels = le.fit_transform(train['Intent'].values)

In [7]:
def build_model_lstm(embedding_matrix , word_index , verbose = False, compile = True):
    """Assemble the recurrent intent classifier on top of frozen word embeddings.

    Architecture: frozen 300-d embedding -> spatial dropout (0.2) ->
    bidirectional GRU (128 units, full sequence) -> global average pooling ->
    144-way softmax. The input length is taken from the module-level ``maxlen``.

    Parameters:
        embedding_matrix: initial weights of the embedding layer.
        word_index: vocabulary mapping; the embedding has ``len(word_index) + 1`` rows.
        verbose: print the model summary when True.
        compile: compile with categorical cross-entropy and Adam(0.0001) when True.

    Returns:
        The Keras model (compiled unless ``compile`` is False).
    """
    vocab_size = len(word_index) + 1

    token_ids = L.Input(shape=(maxlen,), dtype='int32')
    frozen_embedding = L.Embedding(
        vocab_size,
        300,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    )
    hidden = frozen_embedding(token_ids)
    hidden = L.SpatialDropout1D(0.2)(hidden)
    recurrent = L.Bidirectional(L.GRU(128, return_sequences=True))
    hidden = recurrent(hidden)
    pooled = L.GlobalAveragePooling1D()(hidden)
    class_probs = L.Dense(144, activation='softmax')(pooled)

    model = M(token_ids, class_probs)
    if verbose:
        model.summary()
    if not compile:
        return model

    model.compile(loss='categorical_crossentropy',optimizer=A(0.0001),metrics=['acc'])
    return model


def fast_encode(texts, tokenizer, chunk_size=256, maxlen=128):
    """Tokenize ``texts`` in chunks and return the token ids as one array.

    Truncation and padding are both enabled on ``tokenizer`` with length
    ``maxlen`` before encoding.

    Parameters:
        texts: sliceable collection exposing ``.tolist()`` on its slices.
        tokenizer: object providing ``enable_truncation``, ``enable_padding``
            and ``encode_batch`` (whose results expose ``.ids``).
        chunk_size: number of texts passed to ``encode_batch`` at a time.
        maxlen: truncation / padding length.

    Returns:
        numpy array built from the ids of every encoded text, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    token_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start + chunk_size].tolist()
        for encoding in tokenizer.encode_batch(batch):
            token_ids.append(encoding.ids)

    return np.array(token_ids)

def build_model_bert(transformer, max_len=512):
    """Wrap a transformer encoder with a 144-way softmax classification head.

    The first element of the transformer output is taken as the sequence
    output, and the vector at position 0 of every sequence is fed to the
    dense softmax layer.

    Parameters:
        transformer: callable Keras-compatible transformer layer/model.
        max_len: length of the integer token-id input.

    Returns:
        A compiled Keras model (Adam, learning rate 1e-5, categorical
        cross-entropy, accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Position 0 of the sequence output is used as the sentence representation.
    cls_token = sequence_output[:, 0, :]
    out = Dense(144, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Raw hyper-parameter values; some integer-valued settings are stored as floats here.
LGB_BO = dict(
    feature_fraction=0.05,
    lambda_l1=0.0,
    lambda_l2=5.0,
    learning_rate=0.01,
    max_depth=4.882082219992168,
    min_data_in_leaf=5.07479887170428,
    min_gain_to_split=0.0,
    min_sum_hessian_in_leaf=0.01,
    num_leaves=5.0,
)

# LightGBM parameter set for the 144-class model; tuned values come from LGB_BO.
param_lgb = dict(
    num_leaves=int(LGB_BO['num_leaves']),              # truncated to int
    max_bin=63,
    min_data_in_leaf=int(LGB_BO['min_data_in_leaf']),  # truncated to int
    learning_rate=LGB_BO['learning_rate'],
    min_sum_hessian_in_leaf=LGB_BO['min_sum_hessian_in_leaf'],
    bagging_fraction=1.0,
    bagging_freq=5,
    num_class=144,
    feature_fraction=LGB_BO['feature_fraction'],
    lambda_l1=LGB_BO['lambda_l1'],
    lambda_l2=LGB_BO['lambda_l2'],
    min_gain_to_split=LGB_BO['min_gain_to_split'],
    max_depth=int(LGB_BO['max_depth']),                # truncated to int
    save_binary=True,
    seed=1337,
    feature_fraction_seed=1337,
    bagging_seed=1337,
    drop_seed=1337,
    data_random_seed=1337,
    objective='multiclass',
    boosting_type='gbdt',
    verbose=1,
    metric='multi_error',
    is_unbalance=True,
    boost_from_average=False,
)


# Run configuration
EPOCHS = 20
BATCH_SIZE = 16

# Sequence length; the same value is exposed under both spellings.
MAX_LEN = 64
maxlen = MAX_LEN

# Vocabulary cap and embedding dimensionality.
max_features = 100_000
embed_size = 300



# Fetch the DistilBERT tokenizer and write its files to the working directory so
# the resulting vocab.txt can be loaded by the `tokenizers` implementation below.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer.save_pretrained('.')
# NOTE(review): the vocabulary comes from an *uncased* checkpoint but lowercasing is
# disabled here, so capitalised input is presumably not matched against the
# lower-case vocabulary. Left as is because the saved classifier weights may have
# been trained with this exact tokenization — TODO confirm before changing.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
# Pre-trained DistilBERT encoder that build_model_bert() wraps with a softmax head.
transformer_layer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [8]:
def _average_fold_predictions(model_dir, predict_from_file, skip_failures=True):
    """Average the predictions of every model file found in ``model_dir``.

    ``predict_from_file(path)`` must load the model stored at ``path`` and
    return its prediction array. With ``skip_failures`` a file that cannot be
    loaded/predicted is reported and skipped (the directory may contain
    non-model files); otherwise the error propagates.

    Raises:
        RuntimeError: if no file in ``model_dir`` produced predictions.
    """
    fold_preds = []
    for file_name in sorted(os.listdir(model_dir)):
        path = os.path.join(model_dir, file_name)
        try:
            fold_preds.append(predict_from_file(path))
        except Exception as err:
            if not skip_failures:
                raise
            # Report instead of silently swallowing, so a broken checkpoint is visible.
            print('skipping {}: {!r}'.format(path, err))
    if not fold_preds:
        # Averaging an empty list yields NaN, which would silently turn every
        # final prediction into class 0.
        raise RuntimeError('no usable model files found in ' + model_dir)
    return np.average(fold_preds, axis=0)


def predict_func(input_data, bert_weight, lgbm_weight, embeddings_index=None):
    """Predict intent class indices with a weighted ensemble of three model families.

    The ensemble members are:

    1. DistilBERT (base, uncased) classifiers,
    2. LightGBM boosters fed with sentence embeddings from a pre-trained
       sentence-transformer,
    3. a bidirectional GRU model on frozen fastText embeddings.

    For each family the class probabilities of all saved model files are
    averaged; the three averages are then blended with the given weights.

    Parameters:
        input_data: pandas Series of utterances, e.g. ``train['Utterance']``.
        bert_weight: weight of the DistilBERT average in the blend.
        lgbm_weight: weight of the LightGBM average in the blend. The
            recurrent model receives ``1 - bert_weight - lgbm_weight``.
        embeddings_index: optional pre-loaded ``{word: vector}`` dict. Pass the
            result of ``load_embeddings()`` to avoid re-reading the embedding
            file on every call; it is loaded here when omitted.

    Returns:
        numpy array with one integer class index (argmax of the blended
        probabilities) per input row. These are indices, not label strings.

    Raises:
        RuntimeError: if no DistilBERT or LightGBM model file could be used.
    """
    lstm_weight = 1 - bert_weight - lgbm_weight
    texts = input_data.astype(str)

    # --- DistilBERT inference ---
    bert_inputs = fast_encode(texts, fast_tokenizer, maxlen=MAX_LEN)
    model_bert = build_model_bert(transformer_layer, max_len=MAX_LEN)

    def _predict_bert(path):
        model_bert.load_weights(path)
        return model_bert.predict(bert_inputs)

    bert_avg_preds = _average_fold_predictions('../input/classfy-intent-distillbert/', _predict_bert)
    print('bert_inference_completed')

    # --- LightGBM inference on sentence embeddings ---
    sentence_encoder = SentenceTransformer('distilbert-base-nli-mean-tokens')
    # A plain list avoids label-based indexing on a Series with a non-default index.
    embeddings = sentence_encoder.encode(texts.tolist(), show_progress_bar=True)
    lgbm_inputs = pd.DataFrame(embeddings)

    def _predict_lgbm(path):
        return lgb.Booster(model_file=path).predict(lgbm_inputs)

    lgbm_avg_preds = _average_fold_predictions('../input/cluster-bert-lgbm/', _predict_lgbm)
    print('lgbm_inference_completed')

    # --- Recurrent model inference ---
    # The vocabulary is rebuilt from the training utterances.
    keras_tokenizer = Tokenizer(num_words=max_features, lower=True)
    train = pd.read_csv('../input/data-set-augment-intent/data_file.csv')
    keras_tokenizer.fit_on_texts(list(train['Utterance']))
    word_index = keras_tokenizer.word_index
    lstm_inputs = keras_tokenizer.texts_to_sequences(list(texts))
    lstm_inputs = pad_sequences(lstm_inputs, maxlen=maxlen)
    if embeddings_index is None:
        embeddings_index = load_embeddings()
    embedding_matrix = build_matrix(word_index, embeddings_index)
    model_lstm = build_model_lstm(embedding_matrix=embedding_matrix,word_index=word_index)

    def _predict_lstm(path):
        model_lstm.load_weights(path)
        return model_lstm.predict(lstm_inputs)

    # Every file in this directory must load, as before: failures propagate.
    lstm_avg_preds = _average_fold_predictions('../input/intent-classification-lstm/', _predict_lstm,
                                               skip_failures=False)
    print('lstm_inference_completed')

    final_average = bert_avg_preds*bert_weight + lgbm_avg_preds*lgbm_weight + lstm_avg_preds*lstm_weight
    return np.argmax(final_average,axis=1)

In [9]:
# Short example: run the ensemble on the first 20 training utterances with
# weights 0.5 (BERT) and 0.3 (LGBM); the remaining 0.2 goes to the recurrent model.
# Despite its name, the result holds the argmax class indices returned by
# predict_func, not averaged probabilities.
final_average = predict_func(train['Utterance'][:20],0.5,0.3)

100%|██████████| 1/1 [00:00<00:00, 500.81it/s]


bert_inference_completed


100%|██████████| 245M/245M [00:14<00:00, 16.9MB/s]


HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…


lgbm_inference_completed


2000001it [05:15, 6329.80it/s]


lstm_inference_completed
