In [1]:
!pip install umap-learn
!pip install -U sentence-transformers

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
Collecting sentence-transformers
  Downloading sentence-transformers-0.3.7.2.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 430 kB/s 
[?25hCollecting transformers<3.4.0,>=3.1.0
  Downloading transformers-3.3.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 1.4 MB/s 
Collecting tokenizers==0.8.1.rc2
  Downloading tokenizers-0.8.1rc2-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 5.1 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-0.3.7.2-py3-none-any.whl size=91186 sha256=0be461cee263c998155d3012b90c95b58d360e9d911c843300a8709249bcbcc0
  Stored in directory: /root/.cache/pip/wheels/df/42/15/b8329fd622

In [2]:
import numpy as np
import pandas as pd
import umap
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata
import lightgbm as lgb
from sklearn import metrics
import gc
import warnings



In [3]:
# Seed NumPy's global RNG so that NumPy-based randomness is repeatable.
np.random.seed(0)

# Parse the CSV once and take both columns from the same frame (the file was
# previously read from disk twice).
DATA_PATH = '../input/data-set-augment-intent/data_file.csv'
raw_df = pd.read_csv(DATA_PATH)
data = raw_df['Utterance'].values    # texts to embed
labels = raw_df['Intent'].values     # raw class label of each text

In [4]:
# Load the pretrained sentence encoder by name, then embed every utterance
# (a progress bar is shown while encoding).
encoder_name = 'distilbert-base-nli-mean-tokens'
model = SentenceTransformer(encoder_name)
embeddings = model.encode(data, show_progress_bar=True)

100%|██████████| 245M/245M [00:17<00:00, 14.0MB/s]


HBox(children=(FloatProgress(value=0.0, description='Batches', max=301.0, style=ProgressStyle(description_widt…




In [5]:
from sklearn import preprocessing
# Encode the raw intent labels as integer class ids. The fitted encoder is kept
# in `le` so ids can be mapped back to the original labels.
le = preprocessing.LabelEncoder()
# NOTE: `labels` is rebound here from the raw label values to their integer codes.
labels = le.fit_transform(labels)

In [6]:
# One row per utterance: the embedding values as columns, followed by a final
# 'label' column holding the encoded class id.
df = pd.DataFrame(embeddings).assign(label=pd.Series(labels))

In [7]:
# Hold out one stratified fold; the hyper-parameter search trains on the
# remaining rows and is scored on this fold.
target = 'label'
df1 = df.drop(columns=[target])      # features only
predictors = df.columns[:-1]         # every column of `df` except the last one
holdout_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# Only the first (train, validation) index pair of the 5-fold split is used.
bayesian_tr_index, bayesian_val_index = next(
    holdout_splitter.split(df1[predictors], df[target].values)
)

In [8]:
# Feature-only view of the data (the label column is not part of `df1`).
train_df = df1[predictors]
# Feature names as strings. `df1` has already had the label column dropped, so
# every one of its columns is a feature; slicing with [:-1] here (as is done
# for `df`) would silently discard the last real feature.
predictor = df1.columns.values.astype('str')
def LGB_bayesian(
    num_leaves,  # cast to int below
    min_data_in_leaf,  # cast to int below
    learning_rate,
    min_sum_hessian_in_leaf,  # float
    feature_fraction,
    lambda_l1,
    lambda_l2,
    min_gain_to_split,
    max_depth):  # cast to int below
    """Objective for the Bayesian search: hold-out accuracy of one LightGBM fit.

    Trains a multiclass LightGBM model on the rows selected by
    ``bayesian_tr_index`` and scores it on the rows selected by
    ``bayesian_val_index``. Both index arrays, the feature frame ``df1``, the
    full frame ``df`` and the label column name ``target`` are read from
    module-level variables.

    Parameters
    ----------
    num_leaves, min_data_in_leaf, max_depth : float
        Truncated to ``int`` before being handed to LightGBM.
    learning_rate, min_sum_hessian_in_leaf, feature_fraction, lambda_l1,
    lambda_l2, min_gain_to_split : float
        Passed to LightGBM unchanged under the same parameter names.

    Returns
    -------
    float
        Accuracy on the validation fold, predicted with the best iteration
        selected by early stopping.
    """
    # The optimiser proposes every value as a float; these three must be ints.
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    # Slice the fixed train / validation fold once and reuse the arrays below.
    X_train = df1.iloc[bayesian_tr_index].values
    y_train = df.iloc[bayesian_tr_index][target].values
    X_valid = df1.iloc[bayesian_val_index].values
    y_valid = df.iloc[bayesian_val_index][target].values

    param = {
        'num_leaves': num_leaves,
        'max_bin': 63,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': learning_rate,
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        # Derived from the data instead of hard-coded: the labels are integer
        # codes starting at 0, so the class count is the largest code plus one.
        'num_class': int(df[target].max()) + 1,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'min_gain_to_split': min_gain_to_split,
        'max_depth': max_depth,
        'save_binary': True,
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'multi_error',
        # NOTE(review): per the LightGBM parameter docs these two appear to
        # apply only to other objectives (binary / multiclassova / regression);
        # confirm whether they have any effect with 'multiclass'.
        'is_unbalance': True,
        'boost_from_average': False,
    }

    xg_train = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
    xg_valid = lgb.Dataset(X_valid, label=y_valid, free_raw_data=False)

    # Upper bound on boosting rounds; early stopping normally ends training sooner.
    num_round = 1500
    clf = lgb.train(param, xg_train, num_round, valid_sets=[xg_valid],
                    verbose_eval=250, early_stopping_rounds=200)

    # Per-class scores for the validation rows; argmax picks the predicted class.
    predictions = clf.predict(X_valid, num_iteration=clf.best_iteration)

    score = metrics.accuracy_score(y_valid, np.argmax(predictions, axis=1))

    return score

In [9]:
# Search space for the optimiser: one (lower, upper) pair per tunable
# hyper-parameter. Keys match the argument names of LGB_bayesian.
bounds_LGB = dict(
    num_leaves=(5, 20),
    min_data_in_leaf=(5, 20),
    learning_rate=(0.01, 0.3),
    min_sum_hessian_in_leaf=(0.00001, 0.01),
    feature_fraction=(0.05, 0.5),
    lambda_l1=(0, 5.0),
    lambda_l2=(0, 5.0),
    min_gain_to_split=(0, 1.0),
    max_depth=(3, 15),
)

In [10]:
from bayes_opt import BayesianOptimization

# Optimiser that maximises LGB_bayesian over the box described by bounds_LGB;
# the fixed random_state keeps the sequence of probed points repeatable.
LGB_BO = BayesianOptimization(
    f=LGB_bayesian,
    pbounds=bounds_LGB,
    random_state=13,
)

In [11]:
# Search budget, forwarded to maximize() as `init_points` and `n_iter`.
init_points, n_iter = 10, 5

separator = '-' * 130
print(separator)

search_options = dict(
    init_points=init_points,
    n_iter=n_iter,
    acq='ucb',
    xi=0.0,
    alpha=1e-6,
)
# Warnings raised while the search runs are silenced for this block only.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(**search_options)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_da... | min_ga... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[24]	valid_0's multi_error: 0.212987
|  1        |  0.787    |  0.4      |  1.188    |  4.121    |  0.2901   |  14.67    |  11.8     |  0.609    |  0.007758 |  14.62    |
Training until validation scores don't improve for 200 rounds
[250]	valid_0's multi_error: 0.178701
Early stopping, best iteration is:
[156]	valid_0's multi_error: 0.174026
|  2        |  0.826    |  0.3749   |  0.1752   |  1.492    |  0.02697  |  13.28    |  10.59    |  0.6798   |  0.00257  |  10.21    |
Training until validation 

In [12]:
# Hyper-parameters of the best point found by the search. The optimised target
# is the hold-out accuracy returned by LGB_bayesian (not AUC).
LGB_BO.max['params']

{'feature_fraction': 0.05,
 'lambda_l1': 0.0,
 'lambda_l2': 5.0,
 'learning_rate': 0.01,
 'max_depth': 4.882082219992168,
 'min_data_in_leaf': 5.07479887170428,
 'min_gain_to_split': 0.0,
 'min_sum_hessian_in_leaf': 0.01,
 'num_leaves': 5.0}

In [13]:
# Best point found by the search. `LGB_BO.max` is looked up once and reused
# instead of being re-evaluated for every single key.
best_params = LGB_BO.max['params']

# Queue the best point on the optimiser. With lazy=True it is not evaluated
# here; per the bayes_opt docs it is evaluated on the next maximize() call.
LGB_BO.probe(
    params=dict(best_params),
    lazy=True,
)

# Log every point evaluated so far together with its score.
for i, res in enumerate(LGB_BO.res):
    print("Iteration {}: \n\t{}".format(i, res))

# Final LightGBM configuration: the tuned values plus the same fixed settings
# that LGB_bayesian used during the search.
param_lgb = {
        'num_leaves': int(best_params['num_leaves']),  # optimiser returns floats
        'max_bin': 63,
        'min_data_in_leaf': int(best_params['min_data_in_leaf']),  # optimiser returns floats
        'learning_rate': best_params['learning_rate'],
        'min_sum_hessian_in_leaf': best_params['min_sum_hessian_in_leaf'],
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        # Derived from the data instead of hard-coded: the labels are integer
        # codes starting at 0, so the class count is the largest code plus one.
        'num_class': int(df[target].max()) + 1,
        'feature_fraction': best_params['feature_fraction'],
        'lambda_l1': best_params['lambda_l1'],
        'lambda_l2': best_params['lambda_l2'],
        'min_gain_to_split': best_params['min_gain_to_split'],
        'max_depth': int(best_params['max_depth']),  # optimiser returns floats
        'save_binary': True,
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'multi_error',
        'is_unbalance': True,
        'boost_from_average': False}



Iteration 0: 
	{'target': 0.787012987012987, 'params': {'feature_fraction': 0.3999660847582191, 'lambda_l1': 1.1877061001745615, 'lambda_l2': 4.1213926633068425, 'learning_rate': 0.2900672674324699, 'max_depth': 14.67121336685872, 'min_data_in_leaf': 11.801738711259683, 'min_gain_to_split': 0.6090424627612779, 'min_sum_hessian_in_leaf': 0.007757509880902418, 'num_leaves': 14.624200171386038}}
Iteration 1: 
	{'target': 0.825974025974026, 'params': {'feature_fraction': 0.3749082032826262, 'lambda_l1': 0.17518262050718658, 'lambda_l2': 1.492247354445897, 'learning_rate': 0.026968622645801674, 'max_depth': 13.284731311046386, 'min_data_in_leaf': 10.592810418122113, 'min_gain_to_split': 0.679847951578097, 'min_sum_hessian_in_leaf': 0.002570236693773035, 'num_leaves': 10.21371822728738}}
Iteration 2: 
	{'target': 0.8358441558441558, 'params': {'feature_fraction': 0.05423574653643624, 'lambda_l1': 1.7916689135248487, 'lambda_l2': 4.745470908391052, 'learning_rate': 0.07319071264818977, 'max_d

In [14]:
import json

# 5-fold stratified cross-validation with the tuned parameters.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)

# Out-of-fold predicted class id for every row; integer dtype because the
# entries are class indices, not scores.
oof = np.zeros(len(train_df), dtype=int)
feature_importance_df = pd.DataFrame()

i = 1  # 1-based fold number, also used in the output file names
for train_index, valid_index in skf.split(df1.values, df[target].values):
    print("\nfold {}".format(i))
    xg_train = lgb.Dataset(df1.iloc[train_index].values,
                           label=df.iloc[train_index][target].values,
                           free_raw_data=False)
    xg_valid = lgb.Dataset(df1.iloc[valid_index].values,
                           label=df.iloc[valid_index][target].values,
                           free_raw_data=False)

    # Up to 5000 rounds; stops once the validation metric has not improved
    # for 200 rounds.
    clf = lgb.train(param_lgb, xg_train, 5000, valid_sets=[xg_valid],
                    verbose_eval=250, early_stopping_rounds=200)

    # Predicted class = argmax over the per-class scores of the held-out rows.
    fold_scores = clf.predict(df1.iloc[valid_index].values,
                              num_iteration=clf.best_iteration)
    oof[valid_index] = np.argmax(fold_scores, axis=1)

    clf.save_model(str(i) + '_model.txt')

    # Write one JSON dump per fold, named like the .txt model above. A single
    # shared 'model.json' was overwritten on every fold, keeping only the last.
    model_json = clf.dump_model()
    with open(str(i) + '_model.json', 'w') as f:
        json.dump(model_json, f, indent=4)

    i = i + 1

# The reported metric is accuracy over the out-of-fold predictions.
print("\n\nCV accuracy: {:<0.2f}".format(metrics.accuracy_score(df[target].values, oof)))


fold 1
Training until validation scores don't improve for 200 rounds
[250]	valid_0's multi_error: 0.17974
[500]	valid_0's multi_error: 0.163117
[750]	valid_0's multi_error: 0.156364
[1000]	valid_0's multi_error: 0.153247
[1250]	valid_0's multi_error: 0.148052
[1500]	valid_0's multi_error: 0.143896
[1750]	valid_0's multi_error: 0.141818
[2000]	valid_0's multi_error: 0.141299
[2250]	valid_0's multi_error: 0.139221
Early stopping, best iteration is:
[2114]	valid_0's multi_error: 0.138182

fold 2
Training until validation scores don't improve for 200 rounds
[250]	valid_0's multi_error: 0.201663
[500]	valid_0's multi_error: 0.182432
[750]	valid_0's multi_error: 0.175676
[1000]	valid_0's multi_error: 0.162162
[1250]	valid_0's multi_error: 0.159044
[1500]	valid_0's multi_error: 0.156445
[1750]	valid_0's multi_error: 0.153846
[2000]	valid_0's multi_error: 0.150728
[2250]	valid_0's multi_error: 0.150208
Early stopping, best iteration is:
[2061]	valid_0's multi_error: 0.150208

fold 3
Training 