## Import libraries

In [None]:
# Install LightAutoML (upgrading if already present); the Task and
# TabularUtilizedAutoML imports in this notebook come from this package.
! pip install -U lightautoml

In [None]:
import gc
import pickle
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer

from lightautoml.tasks import Task
from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML

## Load processed datasets

In [None]:
# Load the pre-processed train/test objects, stored together as one pickled dict.
# NOTE(security): unpickling can execute arbitrary code -- only run this
# against a dataset file whose origin is trusted.
with open("../input/tps-sep-cooking-data/TPS_Sep_Dataset.txt", 'rb') as handle:
    # Unpickle straight from the file object. Reading the whole file into a
    # bytes buffer first would keep a second, never-released copy of the raw
    # payload alive for the rest of the session.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# The container dict is no longer needed once both entries have their own names.
del processed_data
gc.collect()

## Build and validate the model

In [None]:
# Run-wide settings shared by the AutoML preset below.
RANDOM_STATE = 42
N_THREADS = 4
FOLD = 5
TIMEOUT = 5 * 3600  # presumably seconds (i.e. 5 hours) -- confirm against the LightAutoML docs

# Binary-classification AutoML preset; the reader receives the CV fold count,
# the worker count and the seed defined above.
model = TabularUtilizedAutoML(
    task=Task('binary'),
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=FOLD, random_state=RANDOM_STATE),
    selection_params=dict(mode=0),
    # Optional algorithm restriction, currently disabled:
    # general_params={'use_algos': [['lgb', 'lgb_tuned', 'cb_tuned'], ['linear_l2', 'lgb']]},
)

# Fit on the training data with 'claim' as the target; the returned
# predictions are scored as out-of-fold predictions later in the notebook.
y_pred_meta_lama = model.fit_predict(
    train_df,
    roles={'target': 'claim'},
)
print("\n\ny_pred_meta_lama: {}".format(y_pred_meta_lama.shape))

In [None]:
# ROC AUC of the training-set predictions (last column of the prediction
# array) against the true 'claim' labels.
oof_score = roc_auc_score(
    y_true=train_df['claim'].values,
    y_score=y_pred_meta_lama.data[:, -1],
)
print("Aggregate OOF Score: {}".format(oof_score))

In [None]:
# Turn the predicted scores into hard 0/1 labels: strictly above 0.5 -> 1.
y_pred = np.greater(y_pred_meta_lama.data[:, -1], 0.5).astype(int)

# Per-class precision / recall / F1 on the training labels.
print(
    classification_report(
        y_true=train_df['claim'].values,
        y_pred=y_pred,
    )
)

In [None]:
def plot_confusion_matrix(cm, classes):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D confusion matrix (array-like). Integer matrices are annotated
            with whole numbers; any other numeric dtype (e.g. a normalised
            matrix) is annotated with two decimals.
        classes: Sequence of class labels, used as tick labels on both axes.

    Returns:
        None. The plot is drawn through the ``matplotlib.pyplot`` state machine.
    """
    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)
    # The 'd' format code raises ValueError on floats, so pick the cell
    # format from the dtype instead of hard-coding it.
    cell_fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix', fontweight='bold', pad=15)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text so that the label stays
    # readable on the darker end of the colour map.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontweight='bold')
    plt.xlabel('Predicted label', fontweight='bold')
    plt.tight_layout()

In [None]:
# Limit NumPy's printed float precision to two decimals.
np.set_printoptions(precision=2)

# Confusion matrix of the thresholded training predictions, with the row and
# column order pinned to [0, 1].
cnf_matrix = confusion_matrix(
    y_true=train_df['claim'].values,
    y_pred=y_pred,
    labels=[0, 1],
)

plt.figure(figsize=(12, 5))
plot_confusion_matrix(cnf_matrix, classes=[0, 1])

In [None]:
# Run the fitted AutoML model on the test set; the `.data` array of the
# result is what gets saved and written to the submission file further down.
y_pred_final_lama = model.predict(test_df)

In [None]:
# Persist the training-set predictions, their AUC and the test-set predictions
# in one compressed archive.
np.savez_compressed(
    './LAMA_Meta_Features.npz',
    **{
        'y_pred_meta_lama': y_pred_meta_lama.data,
        'oof_score': oof_score,
        'y_pred_final_lama': y_pred_final_lama.data,
    },
)

## Create submission file

In [None]:
# Start from the competition's sample file and overwrite the 'claim' column
# with the last column of the test-set predictions.
submit_df = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/sample_solution.csv"
).assign(claim=y_pred_final_lama.data[:, -1])

submit_df.to_csv("LAMA_Submission.csv", index=False)

# Last expression of the cell: shows a preview of the submission.
submit_df.head()