In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the input directory, then print each one.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv and test.csv, using the 'id' column as the row index of each frame.
train_path = '../input/tabular-playground-series-sep-2021/train.csv'
test_path = '../input/tabular-playground-series-sep-2021/test.csv'
train_data, test_data = (
    pd.read_csv(csv_path, index_col='id') for csv_path in (train_path, test_path)
)

## EDA

In [None]:
# Preview the first rows of the training frame
train_data.head()

In [None]:
# Summary statistics for each numeric column.
# Note that features have null values: describe()'s "count" row only counts non-null entries.
train_data.describe()

In [None]:
# Total number of training rows, taken from the 'claim' column.
# This counts every entry of the column (null or not), i.e. the number of records.
total_records = train_data['claim'].size
total_records

In [None]:
# Target column (kept as a one-element list) and every remaining column as the feature set
label = ['claim']
features = train_data.columns.drop(label)
features

In [None]:
# Percentage of missing values for each feature, as '<value>%' strings in feature order
# Note low percentage of missing values
missing_per_feature = train_data[features].isnull().sum()
missing_percentages = [
    str(round(missing / total_records * 100, 2)) + '%'
    for missing in missing_per_feature
]
missing_percentages

In [None]:
# One histogram per feature, drawn on a single very large grid of subplots
# Many have right skew
histogram_options = dict(
    figsize=(120, 640),
    layout=(101, 4),
    bins=50,
    grid=False,
    xlabelsize=8,
    ylabelsize=8,
)
feature_histograms = train_data[features].hist(**histogram_options)

In [None]:
# Pairwise feature correlations, drawn as an annotated heatmap
# Roughly no correlation between features
correlations = train_data[features].corr()

# Enlarge the current figure before drawing so the annotations stay legible
fig = plt.gcf()
fig.set_size_inches(120, 120)
sns.heatmap(correlations, cmap='mako', annot=True)

In [None]:
# Are there discrete features?
# Blank array = no discrete features; all continuous
#
# A feature counts as discrete when every non-null value is a whole number.
# Nulls are dropped before the comparison: NaN never compares equal to anything
# (and has no meaningful integer cast), so leaving it in would make every
# feature that contains a null look continuous regardless of its other values.
discrete_cols = []

for col in features:
    observed = train_data[col].dropna().to_numpy()
    # An all-null column has nothing to judge, so it is not reported as discrete
    if observed.size > 0 and np.array_equal(observed, np.floor(observed)):
        discrete_cols.append(col)
print(discrete_cols)

In [None]:
# Count how many rows fall into each target class to check for class imbalance
target_frame = train_data[label]
counts = target_frame.value_counts()
counts

In [None]:
# Classes are roughly balanced
# value_counts() orders its result by frequency, not by class, so sort by class
# value first and place each bar at its own class value; otherwise the bar drawn
# at x=0 could actually hold the count of class 1.
counts_by_class = counts.sort_index()
class_values = counts_by_class.index.get_level_values(0).to_numpy()
plt.bar(class_values, counts_by_class.to_numpy())
plt.xticks(ticks=class_values);

## Data cleaning

In [None]:
# Have a column that counts nulls - https://www.kaggle.com/c/tabular-playground-series-sep-2021/discussion/270206 (TBD)
# plus row-wise standard deviation, min and max, computed the same way for train and test.
engineered_cols = ['num_of_nulls', 'standard_deviations', 'min', 'max']

# Always summarise the raw features only, so that re-running this cell (after
# `features` has been extended at the bottom) does not fold the engineered
# columns into the statistics.
raw_features = features.drop(engineered_cols, errors='ignore')

for frame in (train_data, test_data):
    # Each frame is summarised from its OWN rows: the test statistics must never
    # be read from train_data (different ids, different values).
    raw_values = frame[raw_features]
    frame['num_of_nulls'] = raw_values.isnull().sum(axis=1)
    frame['standard_deviations'] = raw_values.std(axis=1)
    frame['min'] = raw_values.min(axis=1)
    frame['max'] = raw_values.max(axis=1)

# From here on the feature set includes the engineered columns
features = train_data.columns.drop('claim')

In [None]:
# Display the training frame to inspect it after the row-wise summary columns were added
train_data

In [None]:
# Feature matrix and target for training, plus the matching feature matrix for test.
X = train_data[features].values
# `label` is a one-element list, so y is 2-D with a single column; ravel it before fitting
y = train_data[label].values
# Select the test columns by name so X_test has exactly the same columns, in the
# same order, as X (test_data.values would rely on the two frames happening to line up).
X_test = test_data[features].values

In [None]:
# Scale values to roughly be in the same range
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training matrix only, then apply that same scaling to train and test
scaler = StandardScaler().fit(X)
X, X_test = scaler.transform(X), scaler.transform(X_test)

X

In [None]:
# Fill in missing values with mean
from sklearn.impute import SimpleImputer

# Learn the per-column means from the training matrix and use them to fill gaps in both matrices
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
X_test = imputer.transform(X_test)

X

In [None]:
# Fix skew with Yeo Johnson transformation (to address positive and negative values)
from sklearn.preprocessing import PowerTransformer

# Fit the Yeo-Johnson transform on the training matrix (no re-standardising afterwards),
# then apply the fitted transform to both matrices
pt = PowerTransformer(standardize=False, method='yeo-johnson').fit(X)
X_test = pt.transform(X_test)
X = pt.transform(X)

X

In [None]:
# Wrap the transformed matrix in a DataFrame so each feature can be plotted as a histogram
unskewed_features = pd.DataFrame(X, columns=features)
# Skews not fixed completely but, outliers aren't as extreme (fatter tails)
unskewed_histograms = unskewed_features.hist(
    figsize=(120, 640),
    layout=(101, 4),
    bins=50,
    grid=False,
    xlabelsize=8,
    ylabelsize=8,
)

## Baseline models

In [None]:
from sklearn.metrics import auc, roc_curve

def get_auc(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and return its ROC AUC on the validation split.

    The ROC curve is built from continuous scores so that the AUC reflects how
    well the model ranks the validation rows. Hard class labels from
    ``predict`` would reduce the curve to a single operating point and
    understate the AUC of any model that outputs probabilities.

    Parameters
    ----------
    model : estimator with ``fit`` and at least one of ``predict_proba``,
        ``decision_function`` or ``predict``. It is fitted in place.
    X_train, y_train : training features and binary targets passed to ``model.fit``.
    X_val, y_val : validation features and true binary targets used for scoring.

    Returns
    -------
    float
        Area under the ROC curve on the validation split.
    """
    model.fit(X_train, y_train)

    if hasattr(model, 'predict_proba'):
        # Columns follow the order of the model's classes; for a binary target
        # the second column is the score of the larger (positive) class label.
        val_scores = model.predict_proba(X_val)[:, 1]
    elif hasattr(model, 'decision_function'):
        val_scores = model.decision_function(X_val)
    else:
        # No continuous score available: fall back to hard labels
        val_scores = model.predict(X_val)

    fpr, tpr, _ = roc_curve(y_val, val_scores)
    return auc(fpr, tpr)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split (and every AUC computed from it) is
# the same on each run, which keeps the model comparison reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, random_state=42)

In [None]:
# RGF (Regularized Greedy Forest) - https://www.kaggle.com/carlmcbrideellis/introduction-to-the-regularized-greedy-forest
!pip install rgf_python

In [None]:
# With results, we can decide which models are worth tuning hyperparameters on
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from rgf.sklearn import FastRGFClassifier
from catboost import CatBoostClassifier

# Candidate baseline classifiers, each evaluated with (mostly) default settings
model_list = [
    CatBoostClassifier(eval_metric='AUC'),
    FastRGFClassifier(),
    LogisticRegression(),
    RandomForestClassifier(),
    XGBClassifier(),
    LGBMClassifier(objective='binary'),
    HistGradientBoostingClassifier(),
]

# Fit every candidate on the training split and report its validation AUC
report_line = '{model_name} AUC: {auc:.3f}'
for model in model_list:
    model_name = str(model)
    # Targets are flattened to 1-D before being handed to the estimators
    validation_auc = get_auc(model, X_train, y_train.ravel(), X_val, y_val.ravel())
    print(report_line.format(model_name=model_name, auc=validation_auc))