In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, gc, warnings
import random
import datetime
from tqdm.notebook import tqdm

from scipy import stats

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# from pandas.plotting import register_matplotlib_converters
# register_matplotlib_converters()

import sklearn
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

import optuna
import lightgbm as lgb
from optuna.integration import LightGBMPruningCallback

In [None]:
# Directory that holds the competition CSV files (note the trailing slash).
path = '../input/tabular-playground-series-sep-2021/'

# Walk `path` recursively and print the full path of every file found.
for file_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk(path)
    for name in names
):
    print(file_path)

In [None]:
def load_data(source, dtypes=None, path=path):
    """Read ``<path>/<source>.csv`` into a DataFrame indexed by its ``id`` column.

    Parameters
    ----------
    source : str
        Which table to load; must be ``'train'`` or ``'test'``.
    dtypes : optional
        Forwarded unchanged to ``pd.read_csv(dtype=...)``; ``None`` lets pandas
        infer the dtypes.
    path : str
        Directory containing the CSV files. Defaults to the value the
        module-level ``path`` had when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The table with the ``id`` column as its index.

    Raises
    ------
    AssertionError
        If ``source`` is neither ``'train'`` nor ``'test'``.
    """
    assert source in ['train', 'test'], f"unknown source: {source!r}"
    # os.path.join avoids the doubled separator that f'{path}/{source}.csv'
    # produced whenever `path` already ended with a slash.
    csv_path = os.path.join(path, f'{source}.csv')
    return pd.read_csv(csv_path, index_col="id", dtype=dtypes)

In [None]:
%%time
# Load the training table; passing None as dtypes leaves dtype inference to pandas.
train = load_data('train', None)
print(f"Data shape: {train.shape}")
# Unseeded random preview of 4 rows: the rows shown differ between runs.
train.sample(4)

In [None]:
%%time
# Load the test table; passing None as dtypes leaves dtype inference to pandas.
test = load_data('test', None)
print(f"Data shape: {test.shape}")
# Unseeded random preview of 2 rows: the rows shown differ between runs.
test.sample(2)

In [None]:
# Name of the label column in `train`.
target_name = "claim"
# Every other column of `train` is treated as a model input.
features = [column for column in train.columns if column != target_name]

# Initial analysis

## Check whether the dataset contains missing data.

In [None]:
def missing_statistics(df):
    """Summarise the missing values of every column in ``df``.

    Returns a DataFrame with one row per column of ``df`` and the columns
    'COLUMN NAME', 'MISSING VALUES', 'TOTAL ROWS' and '% MISSING'
    (the percentage is rounded to two decimals).
    """
    null_counts = df.isnull().sum()
    summary = null_counts.to_frame().reset_index()
    summary.columns = ['COLUMN NAME', "MISSING VALUES"]
    summary['TOTAL ROWS'] = len(df)
    missing_ratio = summary['MISSING VALUES'] / summary['TOTAL ROWS']
    summary['% MISSING'] = (missing_ratio * 100).round(2)
    return summary

In [None]:
# Per-column missing-value summary of the training table (target column included).
miss = missing_statistics(train)
miss

In [None]:
# Distribution of the per-column missing percentage (mean, min, max, quartiles).
miss["% MISSING"].describe()

In [None]:
# Drop the summary table; it is not referenced again in the cells below.
del miss

## Discrete features?
* **All the features are decimal; there are no categorical input parameters.**

## Distribution Check.

In [None]:
# Summary statistics per column, transposed so that each column becomes one row.
train.describe().T

# Important step
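* To keep the distribution of missing data balanced across the cross-validation folds, add a per-row "n_missing" column (the number of missing feature values in the row); the folds are later stratified on it.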

In [None]:
# Row-level helper features, derived from the raw feature columns only.
# Any previously added helper names are stripped first, so re-running this cell
# neither duplicates entries in `features` nor feeds "std"/"n_missing" back
# into their own computation.
# NOTE: run this before the NaNs are imputed, otherwise n_missing is all zeros.
extra_features = ['std', 'n_missing']
raw_features = [col for col in features if col not in extra_features]

# Spread of the feature values within each row.
train["std"] = train[raw_features].std(axis=1)
test["std"]  = test[raw_features].std(axis=1)
# Number of missing feature values in each row.
train["n_missing"] = train[raw_features].isna().sum(axis=1)
test["n_missing"]  = test[raw_features].isna().sum(axis=1)

features = raw_features + extra_features
# Independent copy of the raw counts, unaffected by later changes to the column.
n_missing = train["n_missing"].copy()

# Filling missing data with the mean
* On average, 1.597059% of the values are missing in each input column of this dataset.
* Since I have no context about each row and less than 2% of the data is missing, I decided to fill each NaN gap with the mean value of its column.

In [None]:
# Impute every NaN with the column mean learned from the training data.
# The same training means are applied to the test table so that both tables go
# through an identical transformation (previously test used its own means).
train_means = train[features].mean()
train[features] = train[features].fillna(train_means)
test[features]  = test[features].fillna(train_means)

# Feature scaling

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both tables.
scaler = RobustScaler().fit(train[features])

train[features] = scaler.transform(train[features])
test[features]  = scaler.transform(test[features])

## Fisher asymmetry (skewness)
* If the value is close to 0: the distribution is symmetric, as a normal distribution is
* If it is more positive: right-skewed distribution (long right tail)
* If it is more negative: left-skewed distribution (long left tail)

In [None]:
# Sanity check: a perfectly symmetric sample has a skewness of 0.
stats.skew([1,2,3,4,5])

## Kurtosis
* If near 0: the distribution is close to a normal one.
* If it is positive: the distribution has a sharper peak and heavier tails than a normal distribution (more extreme values).
* If it is negative: the distribution is flatter, with lighter tails than a normal distribution (fewer extreme values).

In [None]:
# Grid of per-feature density plots overlaying train and test.
fig, axes = plt.subplots(11,11,figsize=(16, 16))
axes = axes.flatten()

# The grid can have more cells than there are "f<k>" columns. Only columns that
# actually exist are plotted (instead of relying on a caught KeyError), and the
# leftover axes are hidden further down.
plot_cols = [f"f{k}" for k in range(1, len(axes) + 1) if f"f{k}" in train.columns]

for ax, col in tqdm(zip(axes, plot_cols), total=len(plot_cols)):
    values = train[col].values
    # Two densities on the same axis: train first, then test.
    sns.kdeplot(data=train, x=col,
                fill=True,
                ax=ax)
    sns.kdeplot(data=test, x=col,
                fill=True,
                ax=ax)
    ax.set_xticks([])
    ax.set_yticks([])
    # Skewness and kurtosis are computed on the training values only.
    ax.set_xlabel(f'skew:{round(stats.skew(values), 2)}, kurt:{round(stats.kurtosis(values),2)}')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(col, loc='right', weight='bold', fontsize=10)

# Hide the grid cells that have no feature to show.
for unused_ax in axes[len(plot_cols):]:
    unused_ax.set_axis_off()

fig.supxlabel('Average by class (by feature)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

# Determine which columns are skewed
* If stats.skew is higher than 1, the data is skewed with a right tail.
* If stats.skew is less than -1, the data is skewed with a left tail.

In [None]:
def determine_skewed_columns(df, skew_top_threshold, skew_low_threshold):
    """Return the names of the columns of ``df`` whose skewness is extreme.

    The last column of ``df`` is always left out of the check. A column is
    selected when its skewness is >= ``skew_top_threshold`` or
    <= ``skew_low_threshold``.
    """
    candidate_names = df.columns[:-1]
    # Column-wise skewness of the whole table; the last entry is discarded.
    skewness = stats.skew(df.values)[:-1]
    right_tailed = skewness >= skew_top_threshold
    left_tailed = skewness <= skew_low_threshold
    return candidate_names[right_tailed | left_tailed]

In [None]:
# Columns of `train` with skewness >= 1 or <= -1 (the last column is not checked).
skew_columns_train = determine_skewed_columns(train, 1, -1)

In [None]:
# Columns of `test` with skewness >= 1 or <= -1 (the last column is not checked).
skew_columns_test = determine_skewed_columns(test, 1, -1)

In [None]:
# Columns flagged as skewed in train but not in test.
list(set(skew_columns_train) - set(skew_columns_test))

In [None]:
# Keep the test-derived list as the reference set.
# NOTE(review): `skew_columns` is not referenced again in the cells below.
skew_columns = skew_columns_test

# Correlation matrix

In [None]:
%%time
fig, ax = plt.subplots(1, 1, figsize=(12 , 12))

# Pairwise correlation between all columns currently in `train`.
corr = train.corr()

# Mask the diagonal and the upper triangle, which only mirror the lower one.
mask = np.triu(np.ones_like(corr, dtype=np.bool_))

sns.heatmap(
    corr,
    ax=ax,
    mask=mask,
    square=True,
    center=0,
    linewidth=1,
    cmap=sns.diverging_palette(240, 10, as_cmap=True),
    cbar_kws={"shrink": .82},
)

ax.set_title('Correlation', loc='left', fontweight='bold')

plt.show()

In [None]:
# Number of cross-validation folds.
N_SPLITS = 5
# Upper bound on boosting rounds (used as 'n_estimators' in the model parameters).
N_ESTIMATORS = 10000
# Passed as `early_stopping_rounds` when fitting each fold model.
EARLY_STOPING_ROUND = 200
# Passed as `verbose` when fitting each fold model.
VERBOSE = 1000
# Seed shared by the fold splitter and the model ('random_state').
SEED = 2021

# NOTE(review): not referenced in the cells below.
N_BINS = 20

In [None]:
# Keyword arguments for lgb.LGBMClassifier.
# Two keys were misspelled ('reg_labmda', 'min_child_sample'); LightGBM does not
# recognise those names, so the L2 regularisation and the minimum-samples-per-leaf
# settings were never applied. They now use the documented parameter names.
best_params = {
    'objective': 'binary',
    'n_estimators' : N_ESTIMATORS,
    'random_state' : SEED,
    'learning_rate': 0.030305148136078583,
    'subsample'    : 0.5150617351169511,
    'reg_alpha'    : 0.2491671010019858,
    'reg_lambda'   : 0.03618390402626644,
    'subsample_freq': 1,
    'colsample_bytree': 0.3917166178297055,
    'min_child_weight': 2,
    'min_child_samples': 48,
    'max_depth': 7,
}

# LGBMClassifier model preparation

In [None]:
def fit_regressor(df, tr_idx, val_idx, features_arr, target_str, params):
    """Fit one LightGBM classifier on a single train/validation split.

    Despite its name, this builds an ``lgb.LGBMClassifier``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the feature columns and the target column.
    tr_idx, val_idx : array-like of int
        Positional row indices of the training and the validation rows.
    features_arr : list of str
        Names of the feature columns.
    target_str : str
        Name of the target column.
    params : dict
        Keyword arguments for ``lgb.LGBMClassifier``.

    Returns
    -------
    lgb.LGBMClassifier
        The fitted classifier.
    """
    # Features AND target are both selected by position. The previous
    # `df[target_str][tr_idx]` was a label lookup, which only matched the
    # feature rows when the index happened to be 0..n-1.
    tr_x, tr_y = df[features_arr].iloc[tr_idx], df[target_str].iloc[tr_idx]
    # Held-out rows used for validation / early stopping.
    vl_x, vl_y = df[features_arr].iloc[val_idx], df[target_str].iloc[val_idx]
    print({'df size':len(tr_x), 'eval size':len(vl_x)})

    clf = lgb.LGBMClassifier(**params)
    # AUC on the validation rows is the monitored metric.
    # NOTE(review): recent LightGBM releases (4.x) no longer accept
    # `early_stopping_rounds` / `verbose` in fit() and expect callbacks
    # instead - confirm against the installed version.
    clf.fit(tr_x, tr_y,
            eval_set=[(vl_x, vl_y)],
            early_stopping_rounds=EARLY_STOPING_ROUND,
            eval_metric="auc",
            verbose=VERBOSE)
    return clf

### Note:
**StratifiedKFold**: The folds are stratified on "n_missing", so the model is forced to train with the missing data properly distributed in each train/test sample.

In [None]:
# Fold generator. The folds are stratified on the per-row missing-value count
# (`n_missing`), not on the target.
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Train one classifier per fold and keep all of them for later averaging.
fold_splits = kf.split(X=train, y=n_missing)
models = []
for fold_train_idx, fold_valid_idx in tqdm(fold_splits, total=N_SPLITS):
    models.append(
        fit_regressor(train, fold_train_idx, fold_valid_idx, features, target_name, best_params)
    )

# Manual garbage-collection pass after training.
gc.collect()

# Evaluation on the training data (in-sample, so the scores are optimistic)

In [None]:
def evaluate(valid_targets, probs, name):
    """Print and return classification metrics for predicted probabilities.

    Parameters
    ----------
    valid_targets : array-like
        True labels.
    probs : array-like of float
        Predicted probability of the positive class, one value per label.
    name : str
        Stored under "name" in the returned dict.

    Returns
    -------
    dict
        Keys "name", "accuracy_score", "log_loss" and "auc".
    """
    from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score

    probs = np.asarray(probs)
    # Hard labels (0.5 cut-off) feed accuracy and the classification report only.
    y_pred = (probs > 0.5).astype(int)
    acc = accuracy_score(valid_targets, y_pred)
    # Log loss must be computed on the probabilities. It was previously fed the
    # thresholded 0/1 labels, which throws away the model's confidence.
    loss = log_loss(valid_targets, probs)
    auc = roc_auc_score(valid_targets, probs)
    print("Accuracy score: %.2f"%(acc))
    print("Log loss: %.2f"%(loss))
    print("AUC score:", auc)
    print("Classification report:")
    print(classification_report(valid_targets, y_pred))
    return {
        "name": name,
        "accuracy_score": acc,
        "log_loss": loss,
        "auc": auc
    }

In [None]:
# Class probabilities from every fold model on the full training table.
# Each model was fitted on a subset of these same rows, so this is an in-sample check.
probs = [model.predict_proba(train[features]) for model in models]

In [None]:
# Presumably (number of models, number of rows, number of classes) - check the output.
np.shape(probs)

In [None]:
# Average the per-model predictions, then keep column 1 of the averaged array.
# `probs` is overwritten here, so this cell cannot be re-run on its own.
probs = np.mean(probs, axis=0)[:, 1]

In [None]:
# Score the averaged predictions against the training labels.
evaluate(train[target_name], probs, "LGBMClassifier")

# Algorithm ID3 regression
1. Calculate the initial system entropy based on the **objective** variable to predict.
    * Entropy: Determine which parameters are more important than others, to get a better ordering in the tree.

In [None]:
# Split-count feature importance of the first fold's model ("gain" is the
# other importance_type that was considered). The stray trailing comma, which
# turned the assigned value into a 1-tuple, has been removed.
_ = lgb.plot_importance(models[0], importance_type='split', figsize=(20,20))

# Predictions on the test sample

In [None]:
# Class probabilities from every fold model on the test table.
test_probs = [fold_model.predict_proba(test[features]) for fold_model in models]
# Ensemble: element-wise mean over the per-model probability arrays.
out_loss = np.asarray(test_probs).mean(axis=0)

# Save submission

In [None]:
# Start from the sample submission and overwrite its 'claim' column with
# column 1 of the averaged test predictions. Values are assigned by position.
submission = pd.read_csv(f'{path}/sample_solution.csv').assign(claim=out_loss[:, 1])

In [None]:
# Display the submission table.
submission

In [None]:
# Write the submission file to the working directory, without the row index.
submission.to_csv('./submission.csv', index=False)
# Preview the first 9 rows of what was written.
submission.head(9)