## Summary

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)
# Folder that the train/test CSVs of this competition are read from.
directory = os.path.join("/kaggle/input","tabular-playground-series-feb-2021/")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
##print('install optuna')
##!pip install optuna==2.5.0

import random

## ================================================================================
## additional imports
## ================================================================================
import seaborn as sns; sns.set_theme()
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
##import lightgbm as lgb

## use this for integrated hyperparameter search
import optuna.integration.lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression


## Load data

In [None]:
# Echo the data folder, then read the training set from it.
print(directory)
train_path = os.path.join(directory, 'train.csv')
df = pd.read_csv(train_path)
# Last expression of the cell: preview the first rows.
df.head()

## Check first for missing/bad entries

In [None]:
# Count missing values once per column, then reuse the result for the grand total.
null_counts = df.isnull().sum()
print('per column: ', null_counts)
print('total: ', null_counts.sum())
print('no additional processing needed')
# Name of the column to predict; used by the later cells.
target = 'target'

## Analyze Continuous Features
Because the features are anonymized, there isn't much qualitative information we can gather, but we can analyze how suitable the features are for modeling.

1. Look at distributions of features
2. Look at collinearity: while collinearity is well known to be undesirable for linear regression, it can also be undesirable for more complex models, such as trees, where it can confound feature importance analyses

In [None]:
# Split the column names by their naming convention: 'cont*' vs 'cat*'.
continuous_features = [col for col in df.columns if 'cont' in col]
cat_features = [col for col in df.columns if 'cat' in col]

continuous = df.filter(items=continuous_features)
# Pairwise correlation matrix of the continuous columns (pandas' default method).
corr_mat = continuous.corr()

# Boolean mask covering the upper triangle (diagonal included) so every pair
# of features is annotated only once in the heatmap.
mask = np.triu(np.ones_like(corr_mat, dtype=bool))

# Figure/axes the heatmap is drawn on (seaborn uses the current axes).
f, ax = plt.subplots(figsize=(13, 10))

# Diverging palette so positive and negative correlations get different hues.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.7,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
)
sns.heatmap(corr_mat, **heatmap_options);

## Feature Interactions
Feature interactions occur in most modelling problems. When features interact, the model cannot be decomposed into a sum of individual feature effects, because the effect of one feature depends on the concurrent value of another. It might be useful to check whether any of the features interact with respect to the target.

## Distributions of continuous features
Questions to think about
1. Outliers? Consider removal.
2. Skewness? Consider transformations like sqrt or log.
3. Multimodal?

Most of the distributions are multimodal

In [None]:
# One histogram per continuous feature; 100 bins keeps the individual modes visible.
continuous.hist(
    bins=100,
    figsize=(15, 10),
    edgecolor='black',
    linewidth=0.2,
)
plt.show()

## Feature cont1 and cont8
These features do not appear to be fully continuous: what are the gaps? More importantly, does this change how we treat them? For example, we might consider binning such a variable and making it categorical.

Agglomerative clustering is not practical at this data size: it needs $O(n^2)$ memory and up to $O(n^3)$ time, so k-means is used below instead.

In [None]:
# Scatter cont1 against the row index so the horizontal bands in its values show up.
f1 = continuous['cont1']
plt.figure(figsize=(10, 10))
plt.plot(f1, '.', markersize=0.3)

## cluster these values into bins
# Hand-picked starting centres (presumably read off the scatter plot above).
# KMeans wants a 2-D array, hence the reshape to a single column.
init = np.array([0.8, 0.7, 0.61, 0.55, 0.48, 0.42, 0.36, 0.29, 0.27, 0.1]).reshape(-1, 1)

## k-means experiment
# n_clusters is derived from `init` so the two can never disagree.
# n_init=1 because explicit centres are supplied: re-running from the same
# start is pointless, and leaving n_init at its default makes older
# scikit-learn versions emit a warning and fall back to a single run anyway.
km = KMeans(n_clusters=len(init), init=init, n_init=1).fit(f1.values.reshape(-1, 1))
print(km)

# Overlay each fitted cluster centre as a horizontal line on the scatter plot.
for center in km.cluster_centers_:
    plt.axhline(center[0], color='green')
plt.yticks(np.linspace(0, 1, 11))
plt.show()


## Feature to Target Correlations
The feature-to-target correlations are all very weak.

In [None]:
# Correlation matrix of the target together with every continuous feature.
cm = df[[target] + continuous_features].corr()
# Select the target column with the `target` variable (not a second hard-coded
# 'target' literal) and in a single .loc call instead of chained indexing.
cm.loc[continuous_features, target].plot.bar(edgecolor='black', linewidth=0.2)
# Label the figure so it can be read on its own when skimming the notebook.
plt.ylabel('correlation with ' + target)
plt.title('Feature-to-target correlation (continuous features)');

## Comparison with Mutual Information
Mutual information regression does not scale well to large data sets. We therefore need to subsample, and also reduce the variance of the MI estimate by increasing the number of nearest neighbors (`n_neighbors`).

Mutual information has one advantage over correlation: it captures relationships that can be nonlinear. Interestingly, the mutual information scores of the features generally appear weak.

Additionally, we see that four categorical features have the least mutual information: cat4, cat7, cat6, cat0. We will see in the next section, that these labels are heavily imbalanced, which likely explains the issue. We might consider removing these from the model.

In [None]:
# Utility functions from Tutorial
def make_mi_scores(X, y):
    """Score every column of ``X`` by its mutual information with ``y``.

    Object/category columns are replaced by integer codes on a copy of ``X``
    (the caller's frame is left untouched). Every column whose dtype is an
    integer type afterwards is flagged as discrete for the estimator.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : array-like
        Target values, passed straight to ``mutual_info_regression``.

    Returns
    -------
    pandas.Series
        Named "MI Scores", indexed by column name, sorted from highest to
        lowest score.
    """
    encoded = X.copy()
    label_columns = encoded.select_dtypes(["object", "category"]).columns
    for name in label_columns:
        # factorize() returns (codes, uniques); only the integer codes are kept.
        encoded[name] = encoded[name].factorize()[0]

    # All discrete features should now have integer dtypes.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]

    raw_scores = mutual_info_regression(
        encoded,
        y,
        n_neighbors=20,
        discrete_features=is_discrete,
        random_state=0,
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    Bars are drawn on the current matplotlib axes in ascending score order.
    Entries whose index label starts with "PROBE" are drawn in colour "C3";
    all others use "C0".

    Parameters
    ----------
    scores : pandas.Series
        Scores indexed by feature name (string labels).
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)

    # Default colour everywhere, then overwrite the probe rows.
    bar_colors = np.array(["C0"] * ordered.shape[0])
    probe_rows = [
        pos for pos, label in enumerate(ordered.index)
        if label.startswith("PROBE")
    ]
    bar_colors[probe_rows] = "C3"

    plt.barh(positions, ordered, color=bar_colors)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")
    
## We have to downsample the data substantially to do this
n_samples = 100000
# Draw the row positions from a privately seeded RNG so the same rows (and thus
# the same MI plot) come out on every run; the estimator itself is already
# seeded. min() keeps the draw valid when the frame has fewer than n_samples rows
# (random.sample raises ValueError when asked for more items than exist).
sample_inds = random.Random(0).sample(range(len(df)), min(n_samples, len(df)))
mi_sample = df.iloc[sample_inds]
mi_scores = make_mi_scores(mi_sample[cat_features + continuous_features], mi_sample[target])
plt.figure(figsize=(10, 6))
plot_mi_scores(mi_scores)

## Categorical Features
How do we want to encode categorical features? If we actually use a tree-based model, we can leave the categorical features as is since a tree just operates on splits

Things to check:
1. any categorical features with highly unbalanced labels (i.e. 100% one category means there's no information to help in the model).
2. Cardinality of categorical features (will inform what kind of encoding)

### Types of categoricals
Due to anonymization, we can't really tell whether a multi-level feature is nominal or ordinal.
1. Binary: one versus the other
2. Nominal: multi-group but no ordering of categories
3. Ordinal: multi-group but ordering of categories

Last time, we fit a model only on the continuous features for an RMSE of 0.86. Let's see what the categorical features add for us.

In [None]:
# Keep only the categorical columns.
categoricals = df.loc[:, cat_features]

## describe() gives a quick per-column summary, which is handier for
## categoricals than for continuous features
categoricals.describe()




## Final Processing for modeling
If we want to add in categorical features with no further processing, we have to convert their dtype from string (`object`) to pandas `category`.

In [None]:
# Drop cat4 and cat7 from the modelling features. Rebinding the name (instead
# of calling .remove() on the list) leaves the original list object untouched
# and gives the same result however often the cell is re-run.
cat_features = [col for col in cat_features if col not in ('cat4', 'cat7')]

# .copy() gives X its own data, so the dtype conversion below is an ordinary
# assignment rather than a write into a slice of df (SettingWithCopyWarning).
X = df[continuous_features + cat_features].copy()
y = df[target]

## convert all cat features to type categorical (from object)
X[cat_features] = X[cat_features].astype("category")
print(X.dtypes)

## Basic models

In [None]:
# Hold out 33% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Column order handed to LightGBM: continuous features first, then categoricals.
feature_names = continuous_features + cat_features
lgtrain = lgb.Dataset(X_train, label=y_train, feature_name=feature_names, categorical_feature=cat_features)
lgval = lgb.Dataset(X_val, label=y_val, feature_name=feature_names, categorical_feature=cat_features)

params = {
    "objective": "regression",
    "metric": "rmse",
    "num_leaves": 10,
    "learning_rate": 0.1,
    "bagging_fraction": 0.7,
    "feature_fraction": 0.7,
    # LightGBM's key is `bagging_freq`; the previous "bagging_frequency" is not a
    # recognised parameter, so bagging (and with it bagging_fraction) was never
    # switched on.
    "bagging_freq": 5,
    "verbosity": -1
}
# Placeholders that are not filled by this cell; kept because the commented-out
# inspection cell further down refers to them.
best_params, tuning_history = dict(), list()
# Filled by lgb.train with the per-iteration validation metric.
evals_result = {}
# Up to 1000 boosting rounds, stopping once the validation RMSE has not improved
# for 100 rounds; progress is logged every 20 rounds.
reg = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
                evals_result=evals_result)

## get rmse
val_errors = reg.predict(X_val) - y_val
print('final val rmse: ', np.sqrt(np.mean(val_errors ** 2)))

## Quickly look at the best hyperparams

In [None]:
# print('best: ',best_params)
# print();
# print(tuning_history)
#print(evals_result)

In [None]:
# Read test.csv from the same `directory` the training data came from, rather
# than a second hard-coded relative path that only resolves from one working dir.
test = pd.read_csv(os.path.join(directory, 'test.csv'))

# .copy() so the dtype conversion below is not a write into a slice of `test`
# (avoids SettingWithCopyWarning).
X_test = test[feature_names].copy()
# Same object -> category conversion that was applied to the training features.
X_test[cat_features] = X_test[cat_features].astype('category')

preds = reg.predict(X_test)
# Two-column submission file: row id and predicted target.
submission = pd.DataFrame({'id': test['id'], 'target': preds})
submission.to_csv('submission.csv', index=False)
submission.head()
