# Tabular Playground Series - Sep 2021

## Setup
___

In [None]:
!pip install pycaret[full]

In [None]:
!pip install shap

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
import gc

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pycaret.classification import *
import shap

## Overview
___
The data contains missing values, but PyCaret imputes numeric features with the column mean by default, so we leave them as they are.

In [None]:
# Read the competition's train and test CSVs in one pass; the order of the
# paths matches the order of the names being assigned.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
    )
)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the smallest of int8/int16/int32/int64 whose range contains the
    column's min and max (bounds inclusive). Float columns are cast to
    float16 or float32 when their min/max fit, otherwise float64.

    Note: only the value *range* is checked for floats, so downcasting to
    float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            # An empty column yields NaN min/max, matches nothing, and keeps
            # its dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite, or all-NaN columns stay at full width.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by 0.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Drop the row identifier (it is not a feature) and shrink both frames.
train = reduce_mem_usage(df_train.drop(columns=['id']))
test = reduce_mem_usage(df_test.drop(columns=['id']))

# The raw frames are no longer needed; release them before modelling.
del df_train, df_test
gc.collect()

In [None]:
# Display the memory-reduced training frame (features plus the target column).
train

In [None]:
# Display the memory-reduced test frame.
test

In [None]:
# Print the training frame's column dtypes and memory footprint.
train.info()

In [None]:
# Print the test frame's column dtypes and memory footprint.
test.info()

## Predict with PyCaret (LightGBM)
___

In [None]:
def do_pycaret(target, train, test):
    """Train, tune and finalize a LightGBM classifier with PyCaret.

    Runs PyCaret's ``setup`` on ``train``, fits a LightGBM model with 5-fold
    cross-validation, tunes it, scores the tuned model on the hold-out split
    (for the displayed metrics), refits it with ``finalize_model`` and then
    predicts class probabilities for ``test``.

    Args:
        target: Name of the label column in ``train``.
        train: Training DataFrame, including the ``target`` column.
        test: DataFrame to score; it is passed through the preprocessing
            pipeline that ``setup`` built.

    Returns:
        A tuple ``(final, pred)`` where ``final`` is the finalized model and
        ``pred`` is the result of ``final.predict_proba`` on the transformed
        ``test`` data.
    """
    setup(data=train, target=target, silent=True, session_id=42)
    #add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target='pred_proba')
    # ROC AUC is a score where larger is better, so it must not be registered
    # as a loss.
    add_metric('roc_auc', 'roc_auc', roc_auc_score, greater_is_better=True, target='pred_proba')
    lightgbm = create_model("lightgbm", fold=5)
    # Tune for AUC rather than tune_model's default (Accuracy), matching the
    # metric this notebook tracks.
    tuned = tune_model(lightgbm, fold=5, optimize='AUC')
    # Called for its displayed hold-out metrics; the returned frame is unused.
    predict_model(tuned)
    final = finalize_model(tuned)
    # Transform with the fitted preprocessing pipeline and predict separately,
    # instead of appending the model to the pipeline object held in PyCaret's
    # global config (which would mutate shared state).
    prep_pipe = get_config('prep_pipe')
    pred = final.predict_proba(prep_pipe.transform(test))
    return (final, pred)

In [None]:
# Fit on `train` with 'claim' as the target and get predicted probabilities for `test`.
final, pred = do_pycaret('claim', train, test)

In [None]:
# Open PyCaret's model-evaluation view for the finalized model.
# NOTE(review): `final` comes from finalize_model, so it has presumably already
# seen the hold-out rows; treat hold-out based figures as optimistic.
evaluate_model(final)

In [None]:
# Display the predicted probabilities returned by do_pycaret for the test set.
pred

## Model Analysis

In [None]:
# Plot the ROC / AUC curve for the finalized model.
# NOTE(review): `final` was refit by finalize_model, so this curve is likely
# optimistic if it is drawn on the hold-out split - confirm.
plot_model(final, plot="auc")

In [None]:
# Plot the feature-importance chart for the finalized model.
plot_model(final, plot="feature")

In [None]:
# Show PyCaret's model interpretation plot (SHAP-based; this is why `shap` is installed above).
interpret_model(final)

## Submission

In [None]:
# Start from the sample file so the row ids and column layout match what the
# competition expects.
submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
# Second column of predict_proba output, taken here as P(claim == 1).
# Item assignment is used instead of `submission.claim = ...`: attribute
# assignment only updates a column that already exists and would otherwise
# set a plain attribute, leaving the CSV without predictions.
submission['claim'] = pred[:, 1]
submission.to_csv('submission.csv', index=False)
submission