This notebook will first serve as a baseline LightGBM run, which I will type out by hand, based on "LightGBM Starter" by firefliesqn. Once I've established this baseline, I'll branch out on my own and experiment with some additional feature engineering (FE) and maybe some other models. This notebook will probably NOT include any neural networks (NN); I might come back to that later.

I would ideally fork the notebook, but I would like to practice by typing out all the code myself.

In [None]:
#The initial dependencies; will be updated if needed


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datatable as dt
import optuna

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import RobustScaler #I'll look into other scaling methods in the future versions
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import warnings
warnings.simplefilter('ignore')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read each competition CSV with datatable and convert it to a pandas
# DataFrame; the three frames are unpacked in the same order as the paths.
train_df, test_df, sample_df = (
    dt.fread(csv_path).to_pandas()
    for csv_path in (
        '/kaggle/input/tabular-playground-series-sep-2021/train.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

Datasets imported, let's now check the shape of those datasets:

In [None]:
# Report (rows, columns) of each loaded frame, one per line.
print(
    f'Shape of train_df: {train_df.shape}',
    f'Shape of test_df: {test_df.shape}',
    f'Shape of sample_df: {sample_df.shape}',
    sep='\n',
)

In [None]:
# Peek at the first five rows of the training data.
train_df.head(n=5)

Now we can split the datasets into the appropriate Xs and ys:

In [None]:
# Target: an independent copy of the 'claim' column.
y_train = train_df['claim'].copy()

# Features: everything except the row identifier (and the target for train).
X_train = train_df.drop(columns=['id', 'claim'])
X_test = test_df.drop(columns='id')

Browsing through the discussions, I've learned that an important lesson from this dataset is that the missing values are not missing at random. Put another way, the number of missing values in a row is itself a feature that may help establish whether there was a claim or not. Let's add this feature, as well as the row-wise standard deviation, to our dataset to help us make a better prediction:

In [None]:
# Count the missing values of every feature in one pass, then print them
# as "<column>: <count>".
for column, n_missing in X_train.isna().sum().items():
    print(f'{column}: {n_missing}')

Approximately 15k out of nearly 1M rows; I think it's safe to impute these values with the median value and not affect the data too severely, but at least get rid of the NaNs

In [None]:
# Capture the raw feature columns first, so that the engineered columns below
# are computed from the raw features only ('n_miss' must not leak into 'std').
feature_columns = list(X_train.columns)

# Number of missing values per row.
X_train['n_miss'] = X_train[feature_columns].isna().sum(axis=1)
X_test['n_miss'] = X_test[feature_columns].isna().sum(axis=1)

# Row-wise standard deviation over the raw features. The columns are selected
# by name on purpose: a plain `X_train[:-1]` slices ROWS (it drops the last
# row) rather than columns, which would keep 'n_miss' in the computation and
# leave the last row without a value.
X_train['std'] = X_train[feature_columns].std(axis=1)
X_test['std'] = X_test[feature_columns].std(axis=1)

In [None]:
# Inspect the first five rows, including the newly added columns.
X_train.head(n=5)

In [None]:
# In the original author's notebook the NaNs were imputed with mean values;
# median values are used here instead.
# The medians are computed once, on the training data only, and reused for the
# test set so that both sets are imputed with identical values.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Sanity check: both counts should be 0 after imputation.
print(f'NaNs in X_train: {X_train.isna().sum().sum()}')
print(f'NaNs in X_test: {X_test.isna().sum().sum()}')

Great! Now that the missing values have been taken care of, we can handle some scaling efforts:

In [None]:
# Fit the scaler on the training data only, then apply that same fitted
# transformation to the test data. Calling fit_transform on the test set would
# re-estimate the centre/scale from test data, so train and test features
# would end up on different scales.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns are tried as int8, int16 and int32 (falling back to
    int64); the remaining numeric columns are tried as float16 and float32
    (falling back to float64). The first candidate whose representable range
    contains the column's minimum and maximum (bounds inclusive) is used.
    Columns of any other dtype are left untouched.

    The choice is based on the value range only: casting floats to
    float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink; its columns are re-assigned in place.
        verbose: when True, print the resulting memory usage in Mb and the
            percentage reduction.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2  # initial usage, in Mb

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        # Extract the value range that the target dtype has to cover.
        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            candidates, limits_of, fallback = (np.int8, np.int16, np.int32), np.iinfo, np.int64
        else:
            candidates, limits_of, fallback = (np.float16, np.float32), np.finfo, np.float64

        target = fallback
        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a dtype's own min/max are representable values.
            # If min/max are NaN (e.g. an all-NaN column) every comparison is
            # False and the widest dtype is kept.
            if limits.min <= c_min and c_max <= limits.max:
                target = candidate
                break
        df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2  # usage after downcasting, in Mb
    if verbose:
        print(
            "Memory usage decreased to: {:.2f} Mb - {:.1f}% reduction".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [None]:
# Wrap the scaled feature matrices in DataFrames so that reduce_memory_usage,
# which iterates over df.columns, can process them.
X_train, X_test = (pd.DataFrame(matrix) for matrix in (X_train, X_test))

In [None]:
# Downcast both feature frames; verbose=True prints the achieved reduction
# right after each heading.
print("X_train redution:")
X_train = reduce_memory_usage(X_train, verbose=True)

print("X_test reduction:")
X_test = reduce_memory_usage(X_test, verbose=True)

In [None]:
#X_train_df = pd.DataFrame(X_train)
#X_train_df.hist(bins=50, figsize=(20,15))
#plt.show()

Lots of different distributions, some normal, some bi- or multimodal; tough luck... Probably the best solution would be to apply a transformation across the board.


Below are some initial params of the LightGBM algorithm; I'll write them down for now, but I'll probably extend them into lists of hyperparameters to tune:

The initial n_estimators did not trigger early stopping, so it's probably wise to allow more estimators and let early stopping decide when to stop.

In [None]:
# Hold out 20% of the training rows for validation (shuffled, fixed seed).
# Lowercase, shortened names distinguish the split from the full train set.
x_tra, x_val, y_tra, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
def objective(trial):
    """Score one Optuna trial of LightGBM hyperparameters by hold-out ROC AUC.

    Samples a hyperparameter set from ``trial``, fits an ``LGBMClassifier`` on
    the ``x_tra``/``y_tra`` split and evaluates it on ``x_val``/``y_val``.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The ROC AUC of the predicted positive-class probabilities on the
        validation split.
    """
    num_leaves = trial.suggest_int("num_leaves", 20, 40)
    n_estimators = trial.suggest_int("n_estimators", 500, 2000)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_samples = trial.suggest_int('min_child_samples', 200, 750)
    # suggest_float replaces the deprecated suggest_uniform (same low/high bounds).
    learning_rate = trial.suggest_float('learning_rate', 0.10, 0.30)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.50, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.50, 1.0)

    model = lgb.LGBMClassifier(
        objective='binary',
        metric='auc',
        num_leaves=num_leaves,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        learning_rate=learning_rate,
        # The sampled bagging fraction must actually reach the model, otherwise
        # the study searches over a value that has no effect. LightGBM only
        # performs bagging when the frequency is non-zero, hence subsample_freq=1.
        subsample=bagging_fraction,
        subsample_freq=1,
        colsample_bytree=colsample_bytree,
        random_state=42,
    )

    model.fit(x_tra, y_tra)
    # ROC AUC needs scores rather than labels: use column 1 of predict_proba.
    score = roc_auc_score(y_val, model.predict_proba(x_val)[:, 1])
    return score

In [None]:
# Search for the hyperparameters that maximise the value returned by
# objective() over 10 trials, then keep the parameters of the best trial.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=10)
params = study.best_trial.params

In [None]:
# Hand-written LightGBM settings (same keys, values and order as before).
lgb_params = dict(
    objective='binary',
    n_estimators=20000,  # worth tuning
    random_state=42,
    learning_rate=4e-3,  # worth tuning
    subsample=0.6,
    subsample_freq=1,
    colsample_bytree=0.4,
    reg_alpha=12.0,
    reg_lambda=1e-1,
    min_child_weight=256,
    min_child_samples=20,
)

In [None]:
# x_tra / x_val / y_tra / y_val are the hold-out split created earlier
# (lowercase and shortened to distinguish them from the 'original' train sets).

# Refit with the tuned hyperparameters plus the fixed settings that were used
# while scoring the trials, so the final model matches the configuration that
# was tuned. Tuned values win if a key appears in both.
final_params = {
    'objective': 'binary',
    'metric': 'auc',
    'random_state': 42,
    **params,
}
lgb_classifier = lgb.LGBMClassifier(**final_params)

# Stop once the validation AUC has not improved for 200 rounds; log every 500.
# NOTE(review): `early_stopping_rounds` and `verbose` are fit() keywords only in
# LightGBM < 4.0; newer releases expect the lgb.early_stopping /
# lgb.log_evaluation callbacks instead.
lgb_classifier.fit(x_tra, y_tra, eval_set=[(x_val, y_val)],
                  eval_metric='auc', early_stopping_rounds=200,
                  verbose=500,
                  )

# Class probabilities for every test row (one column per class).
y_pred = lgb_classifier.predict_proba(X_test)

In [None]:
# Column 1 of predict_proba is used as the submitted score
# (presumably the probability of claim == 1; verify against classes_).
sample_df['claim'] = y_pred[:, 1]

In [None]:
# Write the submission file without the DataFrame index column.
sample_df.to_csv(path_or_buf='submission_8.csv', index=False)