In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Based on *"7 Cool Python Packages Kagglers Are Using Without Telling You"*
Link: https://towardsdatascience.com/7-cool-python-packages-kagglers-are-using-without-telling-you-e83298781cf4

Note: I try to improve on the original article by fixing issues and implementing cooler stuff, whenever such improvements are easy to find.

## 0 - Basic data import using conventional names for the variables



In [None]:
# Load the competition training set into the conventional `df` name, then preview it.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-sep-2021/train.csv"
)
df.head()  # head() shows the first 5 rows by default

In [None]:
# Print a concise summary of `df` (columns, dtypes, non-null counts).
df.info()

In [None]:
# Feature columns are named f1 .. f118; "claim" is the label column.
FEATURES = [f"f{i}" for i in range(1, 119)]
TARGET = "claim"

# Select the feature columns and fill every NaN with that column's median
# (basic data cleaning so downstream libraries never see missing values).
X = df[FEATURES].pipe(lambda feats: feats.fillna(feats.median()))

# The list selector keeps the target as a one-column DataFrame.
y = df.loc[:, [TARGET]]

## 1 - UMAP: 2D projection

In [None]:
import umap
subsample = 10000  # To limit computation time

# UMAP is stochastic: fix the seed so that a fresh "Restart & Run All"
# reproduces the same embedding (seeding may reduce UMAP's parallelism,
# which is acceptable on this subsample).
mapper = umap.UMAP(random_state=42)

# Supervised fit on the first `subsample` rows. The labels are flattened to a
# 1-D array, the documented form for UMAP's `y`, instead of being passed as a
# one-column DataFrame.
mapper.fit(X.head(subsample), y.head(subsample).values.ravel())

In [None]:
import umap.plot
# Enable notebook output for umap.plot so the interactive figure renders inline.
umap.plot.output_notebook()

# Labels of the first `subsample` rows, as a NumPy array, used to colour the points.
y_array = y['claim'].head(subsample).values

# Build the interactive embedding plot and display it.
p = umap.plot.interactive(mapper, labels=y_array)
umap.plot.show(p)

## 2 - Datatable: faster than pandas

In [None]:
import datatable as dt

# Read the same training CSV with datatable's fread and preview the first 5 rows.
frame = dt.fread("/kaggle/input/tabular-playground-series-sep-2021/train.csv")
frame.head(5)

> I moved #3 to the bottom because it needs to install new libraries, which creates a bit of a dependency mess!
## 4 - Optuna: Hyperparameters optimization

In [None]:
import optuna
import xgboost as xgb
subsamples = 1000

def objective(trial, X=X.head(subsamples), y=y.head(subsamples)):
    """Optuna objective: score one hyperparameter suggestion with XGBoost CV.

    Args:
        trial: Optuna trial used to sample `learning_rate` and `max_depth`.
        X: Feature rows to cross-validate on. The default is evaluated once,
            when the function is defined, from the first `subsamples` rows.
        y: Labels aligned with `X` (same default-binding behaviour).

    Returns:
        The mean test AUC found in the last row of the `xgb.cv` result.
    """
    params = {
        # suggest_float(..., log=True) is the replacement for the deprecated
        # suggest_loguniform; it samples the same log-uniform range.
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
    }

    scores = xgb.cv(
        params=params,
        dtrain=xgb.DMatrix(data=X, label=y),
        nfold=2,
        num_boost_round=10,
        # NOTE(review): with only 10 boosting rounds, a patience of 10 looks
        # unable to trigger; kept as-is. Raise num_boost_round if early
        # stopping is actually wanted.
        early_stopping_rounds=10,
        metrics="auc",  # Receiver Operating Characteristic Area under the Curve
        seed=123,
    )
    # One row per boosting round: take the mean test AUC of the final row.
    return scores["test-auc-mean"].iloc[-1]

# Seed the sampler so the sequence of suggested hyperparameters (and therefore
# the reported best trial) is reproducible across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=123),
)
study.optimize(objective, n_trials=10)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Contour plot of the objective value over the two tuned hyperparameters.
optuna.visualization.plot_contour(study, params=['max_depth', 'learning_rate'])

## 5 - SHAP: feature importance

In [None]:
import shap  # pip install shap
import xgboost as xgb

subsamples = 10000

# Fit a default XGBoost regressor on the first `subsamples` rows.
model = xgb.XGBRegressor()
model.fit(X.head(subsamples), y.head(subsamples))

# Build an explainer for the fitted model and compute SHAP values for the same rows.
explainer = shap.Explainer(model)
shap_values = explainer(X.head(subsamples))
print(shap_values)

In [None]:
# Beeswarm plot of the SHAP values held in `shap_values`.
shap.plots.beeswarm(shap_values)

In [None]:
# Initialise SHAP's JavaScript support, needed for the interactive force plot.
shap.initjs()

# Explain a single prediction: the row at position `sample` of the explained subset.
sample = 10
shap.force_plot(
    explainer.expected_value,
    shap_values.values[sample],
    X.head(subsamples).iloc[sample],
)

## 6 - Rapids cuDF: GPU managed dataframes
Not needed for this notebook.

## 7 - Automatic EDA libraries
Check https://www.kaggle.com/andreshg/automatic-eda-libraries-comparisson/notebook#6.-%F0%9F%93%8A-D-Tale-%F0%9F%93%9A 

## 3 - LazyPredict: model benchmark with one line of code

In [None]:
# Now you understand why I placed this at the bottom: it is updating the version of many other packages!
! pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split

subsample = 2000  # To stay under 30 seconds

# Seed the split as well as the models: without `random_state` every run
# evaluates on a different hold-out set, so the leaderboard is not reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.head(subsample),
    y.head(subsample),
    test_size=0.2,
    random_state=1121218,
)

# NOTE: despite its name, `reg` holds a LazyClassifier (the target is a class label).
reg = LazyClassifier(
    ignore_warnings=True,
    random_state=1121218,
    verbose=False,
)
# fit() takes all four sets and returns the score table and the predictions.
models, predictions = reg.fit(X_train, X_test, y_train, y_test)  # pass all sets

In [None]:
# Display the model comparison table returned by LazyClassifier.fit.
models