In [None]:
%matplotlib inline
import os, optuna
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from pandas_profiling import ProfileReport

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_selection import mutual_info_classif
from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report

from catboost import CatBoostRegressor, CatBoostClassifier

import seaborn as sns
from matplotlib import pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and newer releases no longer ship the old names. Prefer the new name when
# the installed Matplotlib knows it, and fall back to the legacy name otherwise.
_GRID_STYLE = "seaborn-v0_8-whitegrid"
if _GRID_STYLE not in plt.style.available:
    _GRID_STYLE = "seaborn-whitegrid"
plt.style.use(_GRID_STYLE)

# Figure-wide defaults: tight automatic layout and bold, larger axis text.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# 1. Read in the Data and do Exploratory Data Analysis 
- look at the nature of the target variable
- look at the other variables for any distribution skews, missing data, data types (i.e. numerical, categorical, etc.)
- See if there are variable problems like high cardinality in the categorical data or different scales for the numerical data (i.e. values ranging from 0-1, 1-100, etc.)
- look at any correlations present

In [None]:
# Read the training split for exploration and use the `id` column as the index.
df = pd.read_csv(
    "../input/tabular-playground-series-may-2022/train.csv"
).set_index('id')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# See if there is any target imbalance: count the rows per distinct target value
np.unique(df['target'], return_counts=True)

For the binary target variable, the counts of each label are very close, with there being a few more 1's than 0's.

In [None]:
ProfileReport(df)

Mix of numerical and categorical variables. It looks like some variables could be treated either as integer values or as categoricals (f_07 to f_18). The numerical variables all look to be roughly Gaussian with different means and slightly different standard deviations. f_28, f_29, and f_30 look to have some correlations with different groupings of the numerical variables. f_27 has extremely high cardinality (around 80% unique). No missing data. 

# 2. Read in and Process the Data
- Do any preprocessing necessary, including encoding categoricals

In [None]:
# Some helper functions for different types of encodings of categorical data

def label_encode(df):
    """Return a copy of ``df`` with every ``category`` column replaced by its integer codes.

    Columns of any other dtype are passed through unchanged and the input
    frame itself is not modified.
    """
    codes_by_column = {
        column: df[column].cat.codes
        for column in df.select_dtypes(["category"]).columns
    }
    encoded = df.copy()
    for column, codes in codes_by_column.items():
        encoded[column] = codes
    return encoded

def one_hot_encode(df):
    """Return a copy of ``df`` with ``category``/``object`` columns expanded into dummies.

    Each such column is removed and its indicator columns (named
    ``<column>_<level>``) are appended at the end of the frame, in the order
    the source columns appeared.
    """
    expanded = df.copy()
    columns_to_expand = list(expanded.select_dtypes(["category", "object"]).columns)
    for column in columns_to_expand:
        indicators = pd.get_dummies(expanded[column], prefix=column)
        expanded = expanded.join(indicators).drop(column, axis=1)
    return expanded

In [None]:
# categorical feature encoding

def encode(df):
    """Cast the configured columns of ``df`` to categorical dtypes.

    Reads two module-level settings: ``features_nom`` (names of nominal
    columns) and ``ordered_levels`` (mapping of column name to its ordered
    list of levels).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features_nom`` and
        ``ordered_levels``. The columns are converted on this frame directly.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with nominal columns as unordered categoricals
        (each having a ``"None"`` category available) and ordinal columns as
        ordered categoricals.
    """
    # Nominal categories
    for name in features_nom:
        df[name] = df[name].astype("category")
        # Add a "None" category for missing values. ``add_categories`` returns
        # a new Series, so the result is assigned back; the former
        # ``inplace=True`` form is not accepted by current pandas releases.
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    # Ordinal categories
    for name, levels in ordered_levels.items():
        df[name] = df[name].astype(CategoricalDtype(levels, ordered=True))
    return df

In [None]:
# Wrapper function to read in the train/test splits and encode their categoricals

def load_data(
    train_path="../input/tabular-playground-series-may-2022/train.csv",
    test_path="../input/tabular-playground-series-may-2022/test.csv",
):
    """Read both splits, encode them together, and return them separately.

    Parameters
    ----------
    train_path, test_path : str
        CSV locations; each file must have an ``id`` column, which becomes
        the index. The defaults are the paths this notebook was written for.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)`` after ``encode`` has been applied to their
        concatenation. ``df_test`` keeps only the columns that were present in
        the test file.
    """
    df_train = pd.read_csv(train_path, index_col="id")
    df_test = pd.read_csv(test_path, index_col="id")
    train_index = df_train.index
    test_index = df_test.index
    # Remember the test file's own columns: concatenating with the training
    # split adds every train-only column (such as the label) to the test rows
    # filled with NaN, and those placeholders should not be handed back.
    test_columns = df_test.columns
    # Merge the splits so we can process them together
    df = pd.concat([df_train, df_test])
    # Preprocessing
    df = encode(df)
    # Reform splits
    df_train = df.loc[train_index, :]
    df_test = df.loc[test_index, test_columns]
    return df_train, df_test

In [None]:
# Specify Categoricals

# Columns to treat as nominal (unordered) categoricals: f_07 through f_18,
# then f_27, f_29, f_30 and the label column.
features_nom = [f"f_{number:02d}" for number in range(7, 19)]
features_nom += ['f_27', 'f_29', 'f_30', 'target']

# Ordinal columns mapped to their ordered levels; none are declared here.
ordered_levels = {}

# Add a None level for missing values at the front of every ordinal column
ordered_levels = {
    column: ["None"] + levels for column, levels in ordered_levels.items()
}


In [None]:
df_train, df_test = load_data()

# 3. Establish a baseline

In [None]:
def score_dataset(X, y, model=None, l_encode=True):
    """Return the mean 5-fold cross-validated F1 score of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; categoricals are encoded here before scoring.
    y : pandas.Series
        Target. A categorical Series is converted to its integer codes; any
        other dtype is used as given.
    model : estimator, optional
        Estimator passed to ``cross_val_score``. When omitted, a fresh
        200-iteration GPU ``CatBoostClassifier`` is created for this call.
    l_encode : bool, default True
        ``True`` label-encodes categoricals via ``label_encode``; ``False``
        one-hot encodes them via ``one_hot_encode``.

    Returns
    -------
    float
        Mean of the per-fold F1 scores.
    """
    # Build the default inside the call: a default written in the signature is
    # constructed once when the function is defined and then shared by every
    # call, and it would require the GPU settings to be valid at definition time.
    if model is None:
        model = CatBoostClassifier(iterations=200, task_type="GPU", devices='0', silent=True)

    # Label encoding for categoricals
    if l_encode:
        X = label_encode(X)
    else:
        X = one_hot_encode(X)

    # Accept both a categorical target and one that is already integer-coded.
    if isinstance(y.dtype, CategoricalDtype):
        y = y.cat.codes

    score = cross_val_score(
        model, X, y, cv=5, scoring='f1'
    )

    return score.mean()

In [None]:
X = df_train.copy()
y = X.pop('target')

score_dataset(X, y)

# 4. Feature Engineering
For this section, I will try out a couple of engineered features to see if I can get better performance from a model. In particular, I'll try:
- removing uninformative features
- mathematical transforms
- binning, or clustering

In [None]:
def make_mi_scores(X, y):
    """Rank the columns of ``X`` by mutual information with the categorical target ``y``.

    ``object`` and ``category`` columns are integer-coded with ``factorize``
    on a copy of ``X``; every integer-typed column is then declared discrete
    to ``mutual_info_classif``. Returns a Series named ``"MI Scores"``,
    indexed by column name and sorted from highest to lowest score.
    """
    features = X.copy()
    for column in features.select_dtypes(["object", "category"]):
        codes, _uniques = features[column].factorize()
        features[column] = codes
    # All discrete features should now have integer dtypes
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    target_codes = y.cat.codes
    raw_scores = mutual_info_classif(
        features, target_codes, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)

In [None]:
X = df_train.copy()
y = X.pop('target')
mi_scores = make_mi_scores(X, y)
mi_scores

In [None]:
# Drop the two lowest-ranked features by mutual information and re-score,
# to see whether removing them improves the result.
uninformative_features = mi_scores.index[-2:].tolist()

X = df_train.copy()
y = X.pop('target')
X = X.loc[:, [column not in uninformative_features for column in X.columns]]

score_dataset(X, y)

Removing low information features did not help the model

### 4(a) Evaluate Interactions
Since the heart and soul of this competition is interactions, we want to invest some time in looking at all kinds of interactions
- define functions for the basic building blocks of interactions
- define a function that adds in all the interactions we want
- go through combinations of possible interactions and record the scores

In [None]:
def ratio_interaction(col_1, col_2):
    """Return ``|col_1 - col_2| / (col_1 + col_2)``, computed element-wise."""
    absolute_difference = (col_1 - col_2).abs()
    total = col_1 + col_2
    return absolute_difference / total

def levels_interaction(num_col, cat_col):
    """Multiply ``num_col`` by the indicator of each level of ``cat_col``.

    Returns a frame with one column per level, named
    ``<cat_col.name>_<level>_x_<num_col.name>``, holding the numeric value on
    rows where the level is present and zero elsewhere.
    """
    indicators = pd.get_dummies(cat_col.astype("category"), prefix=cat_col.name)
    interaction_df = indicators.mul(num_col, axis=0)
    renamed = []
    for level_name in interaction_df.columns:
        renamed.append(level_name + "_x_" + num_col.name)
    interaction_df.columns = renamed
    return interaction_df

In [None]:
def create_interactions(df):
    """Build the engineered interaction features for ``df``.

    Two families of columns are produced, in this order:

    1. ratio features ``<a>_<b>_ratio`` from ``ratio_interaction`` for the
       column pairs listed in ``col_1``/``col_2``;
    2. numeric-by-category features from ``levels_interaction`` for the pairs
       listed in ``num_col``/``cat_col``, where the numeric side may be an
       original column or one of the ratio features from step 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding all referenced ``f_XX`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Only the new interaction columns, indexed like ``df``.
    """
    interaction_df = pd.DataFrame(index=df.index)
    # create ratio interactions
    col_1 = ['f_22','f_00','f_03','f_03','f_22', 'f_00','f_05','f_20','f_01','f_21','f_22','f_00','f_20','f_01','f_06','f_05','f_25','f_00','f_01','f_00']
    col_2 = ['f_26','f_01','f_28','f_05','f_23','f_21','f_21','f_24','f_05','f_22','f_28','f_26','f_25','f_22','f_28','f_23','f_28','f_24','f_02','f_28']

    for i,j in zip(col_1, col_2):
        interaction_df[i+"_"+j+"_ratio"] = ratio_interaction(df[i], df[j])

    num_col = ['f_01','f_00_f_01_ratio','f_00_f_28_ratio','f_00_f_24_ratio','f_19','f_00_f_01_ratio','f_03','f_20_f_25_ratio','f_06_f_28_ratio','f_22_f_28_ratio',
               'f_01_f_05_ratio','f_00_f_21_ratio','f_01_f_22_ratio','f_26','f_00_f_28_ratio','f_22_f_28_ratio','f_00_f_24_ratio','f_22_f_26_ratio',
               'f_20_f_25_ratio','f_05_f_21_ratio']
    cat_col = ['f_13','f_29','f_16','f_08','f_07','f_14','f_15','f_17','f_14','f_13','f_16','f_29','f_16','f_17','f_14','f_14','f_17','f_16','f_13','f_29']

    # Lookup frame exposing both the fresh ratio columns and the original ones,
    # so either kind can be used as the numeric side below.
    for_combo_df = interaction_df.join(df)

    # Collect the per-pair frames and concatenate once; joining inside the loop
    # would re-copy the ever-growing result on every iteration.
    pieces = [interaction_df]
    for i,j in zip(num_col, cat_col):
        pieces.append(levels_interaction(for_combo_df[i], for_combo_df[j]))

    return pd.concat(pieces, axis=1)

In [None]:
# Score every unordered pair of numerical columns as a single added ratio
# feature on top of the raw training data, recording one row per pair.

num_cols = ['f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28']
results = []
for position, first in enumerate(num_cols):
    # Only pair with columns further down the list so each pair is tried once.
    for second in num_cols[position + 1:]:
        X = df_train.copy()
        y = X.pop('target')
        ratio_name = first + "_" + second + "_ratio"
        X[ratio_name] = ratio_interaction(X[first], X[second])
        score = score_dataset(X, y)
        results.append({"col_1": first, 'col_2': second, 'score': score})

results_df = pd.DataFrame(results)

In [None]:
results_df.to_csv("numerical_ratio_results.csv")

In [None]:
results_df.sort_values("score", ascending=False)[0:20]

It looks like roughly the first 20 ratio style interactions improve the results. Let's add them to our data and now consider interactions between numerical and categorical data

In [None]:
# Loop through all the possible num-cat interactions with all of the numerical columns (including ratios) and categorical columns

num_cols = ['f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28',
           'f_22_f_26_ratio', 'f_00_f_01_ratio','f_03_f_28_ratio', 'f_03_f_05_ratio', 'f_22_f_23_ratio','f_00_f_21_ratio', 'f_05_f_21_ratio', 'f_20_f_24_ratio',
            'f_01_f_05_ratio', 'f_21_f_22_ratio', 'f_22_f_28_ratio','f_00_f_26_ratio', 'f_20_f_25_ratio', 'f_01_f_22_ratio','f_06_f_28_ratio', 'f_05_f_23_ratio', 
            'f_25_f_28_ratio','f_00_f_24_ratio', 'f_01_f_02_ratio', 'f_00_f_28_ratio'
           ]
cat_cols = ['f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17',
            'f_18', 'f_29', 'f_30']

# The interaction-augmented frame is identical for every candidate pair, so it
# is built once here instead of being recomputed inside the double loop.
base_features = df_train.copy()
y = base_features.pop('target')
base_features = base_features.join(create_interactions(base_features))

results = []
for num_name in num_cols:
    for cat_name in cat_cols:
        candidate = levels_interaction(base_features[num_name], base_features[cat_name])
        # Pairs that create_interactions already produces yield columns that
        # are present in the base frame; joining them again raises on the
        # overlapping names, so those pairs are skipped.
        if candidate.columns.isin(base_features.columns).any():
            continue
        X = base_features.join(candidate)
        score = score_dataset(X, y)
        results.append({"num_col": num_name, 'cat_col': cat_name, 'score': score})

results_df = pd.DataFrame(results)

In [None]:
results_df.to_csv("nnumerical_categorical_interaction_results.csv")

In [None]:
results_df.sort_values("score", ascending=False)[0:20]

Once again, basically the first 20 seem to help a little

In [None]:
# Evaluate results with interactions
X = df_train.copy()
y = X.pop('target')
X = X.join(create_interactions(X))

score_dataset(X, y)

We are able to get some slight improvement with the new interaction features. We will probably need to investigate this area more. Finally, let's see if any of the features no longer help after creating the interactions.

In [None]:
X = df_train.copy()
y = X.pop('target')
X = X.join(create_interactions(X))
mi_scores = make_mi_scores(X, y)

In [None]:
mi_scores[-103:]

In [None]:
# Try removing some of the uninformative features to see if that improves scores.
# `mi_scores` was computed on the frame that includes the interaction features,
# so the interactions are rebuilt here before filtering. Without this join the
# frame holds only the raw columns and the comparison drops every engineered
# feature rather than just the low-scoring ones.
uninformative_features = mi_scores[-103:].index.tolist()

X = df_train.copy()
y = X.pop('target')
X = X.join(create_interactions(X))
X = X.loc[:,~X.columns.isin(uninformative_features)]

score_dataset(X, y)

Removing the uninformative features still reduces performance

### 4(b) Binning Variables
Now, let's try clustering some variables to see if the resulting binned variables get any better results.
- define some functions to create cluster labels and cluster distances
- create some possible groups of features that might make good cluster features

In [None]:
def _standardize_for_clustering(df, features):
    """One-hot encode the selected columns and z-score every resulting column."""
    X_scaled = one_hot_encode(df.loc[:, features])
    # The small constant keeps constant columns from dividing by zero.
    return (X_scaled - X_scaled.mean(axis=0)) / (X_scaled.std(axis=0)+0.000001)


def cluster_labels(df, features, n_clusters=10, random_state=0):
    """Assign each row of ``df`` to a k-means cluster fitted on ``features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    features : list of str
        Columns to cluster on (categoricals are one-hot encoded first).
    n_clusters : int, default 10
        Number of clusters.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans`` so repeated runs give the same clusters;
        pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A single categorical ``"Cluster"`` column indexed like ``df``.
    """
    X_scaled = _standardize_for_clustering(df, features)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=random_state)
    X_new = pd.DataFrame(index=df.index)
    X_new["Cluster"] = kmeans.fit_predict(X_scaled)
    X_new["Cluster"] = X_new["Cluster"].astype("category")
    return X_new


def cluster_distance(df, features, n_clusters=10, random_state=0):
    """Return the k-means ``fit_transform`` output for each row of ``df``.

    Parameters are the same as for ``cluster_labels``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Centroid_0`` .. ``Centroid_{n_clusters-1}``, indexed like
        ``df``.
    """
    X_scaled = _standardize_for_clustering(df, features)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=random_state)
    X_cd = kmeans.fit_transform(X_scaled)
    # Label features so they can be joined to the dataset
    X_cd = pd.DataFrame(
        X_cd, columns=[f"Centroid_{i}" for i in range(X_cd.shape[1])], index=df.index
    )
    return X_cd

In [None]:
# Try some clusters based on observed correlations

variables_1 = [
    'f_28', 'f_00', 'f_01', 'f_02', 'f_03', 'f_04', 'f_05', 'f_06', 'f_07'
]

variables_2 = [
    'f_30','f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26'
]

variables_3 = [
    'f_29','f_07', 'f_08','f_09', 'f_10', 'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17','f_18'
]

First set of variables

In [None]:
X = df_train.copy()
y = X.pop('target')
X = X.join(cluster_distance(X, variables_1, n_clusters=20))

score_dataset(X, y)

In [None]:
X = df_train.copy()
y = X.pop('target')
X = X.join(cluster_labels(X, variables_1, n_clusters=20))

score_dataset(X, y)

Second set of variables

In [None]:
X = df_train.copy()
y = X.pop('target')
X = X.join(cluster_distance(X, variables_2, n_clusters=20))

score_dataset(X, y)

In [None]:
X = df_train.copy()
y = X.pop('target')
X = X.join(cluster_labels(X, variables_2, n_clusters=20))

score_dataset(X, y)

Third set of variables

In [None]:
X = df_train.copy()
y = X.pop('target')
X = X.join(cluster_distance(X, variables_3, n_clusters=20))

score_dataset(X, y)

In [None]:
X = df_train.copy()
y = X.pop('target')
X = X.join(cluster_labels(X, variables_3, n_clusters=20))

score_dataset(X, y)

The clusters really don't seem to help much at all. I'll leave them out for now and revisit better possible binnings

# 5. Combine any engineered features together and check model performance
- define a function to add in all the features to a data set
- define a class to use cross-fold validation with target encoding
- try some target encoding for better performance?
- check model performance

In [None]:
class CrossFoldEncoder:
    """Apply an encoder class out-of-fold across a 5-way ``KFold`` split.

    ``encoder`` is an encoder *class* that accepts ``cols=...`` plus any extra
    keyword arguments given here, and exposes ``fit(X, y)`` / ``transform(X)``.
    """

    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=5)

    def fit_transform(self, X, y, cols):
        """Encode ``cols`` of ``X`` so each row is encoded by a model that never saw it.

        For every split, a new encoder is fitted on the larger part and used
        to transform the held-out part; the held-out pieces are concatenated.
        One fitted encoder per fold is kept on ``fitted_encoders_``. Returned
        columns are named ``<col>_encoded``.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        held_out_pieces = []
        for fit_rows, held_out_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            transformed = fold_encoder.transform(X.iloc[held_out_rows, :])
            held_out_pieces.append(transformed[cols])
            self.fitted_encoders_.append(fold_encoder)
        encoded = pd.concat(held_out_pieces)
        encoded.columns = [name + "_encoded" for name in encoded.columns]
        return encoded

    def transform(self, X):
        """Encode new data as the average of the per-fold encoders' outputs."""
        from functools import reduce

        per_fold = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]
        summed = reduce(
            lambda running, current: running.add(current, fill_value=0), per_fold
        )
        averaged = summed / len(per_fold)
        averaged.columns = [name + "_encoded" for name in averaged.columns]
        return averaged

In [None]:
def create_features(df, df_test=None, target_encode_cols=None):
    """Build the model feature matrix (and optionally the matching test matrix).

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame with a categorical ``target`` column.
    df_test : pandas.DataFrame, optional
        Test frame. When given, train and test rows are stacked so the
        engineered features are computed on both together, then split again.
    target_encode_cols : list of str, optional
        Columns to replace with cross-fold M-estimate target encodings
        (``<col>_encoded``). Off by default.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, pandas.DataFrame)
        ``X`` alone, or ``(X, X_test)`` when ``df_test`` is given.
    """
    X = df.copy()
    y = X.pop('target')
    y = y.cat.codes

    if df_test is not None:
        # A test frame may carry a placeholder 'target' column; drop it so it
        # cannot turn into a feature in the stacked frame.
        X_test = df_test.drop(columns='target', errors='ignore')
        X = pd.concat([X, X_test])

    # Add in engineered features
    X = label_encode(X)
    X = X.join(create_interactions(X))

    # Reform splits
    if df_test is not None:
        X_test = X.loc[df_test.index, :]
        X = X.drop(df_test.index)

    # Optional target encoding, fitted out-of-fold on the training rows only.
    if target_encode_cols:
        cols_to_target_encode = list(target_encode_cols)
        encoder = CrossFoldEncoder(MEstimateEncoder, m=1)
        X = X.join(encoder.fit_transform(X, y, cols=cols_to_target_encode))
        X = X.loc[:,~X.columns.isin(cols_to_target_encode)]
        if df_test is not None:
            X_test = X_test.join(encoder.transform(X_test))
            X_test = X_test.loc[:,~X_test.columns.isin(cols_to_target_encode)]

    if df_test is not None:
        return X, X_test
    else:
        return X

In [None]:
X_train = create_features(df_train)
y_train = df_train.loc[:, 'target']

score_dataset(X_train, y_train)

Interestingly, trying to target encode the high-cardinality f_27 variable really did not help performance.

# 6. Tune hyperparameters
- define an optuna study function, with associated parameter space, to find optimal hyperparameters for our model
- run the optuna study and visualize results

In [None]:
def objective(trial):
    """Optuna objective: score one sampled CatBoost configuration.

    Samples hyperparameters from ``trial``, builds a ``CatBoostClassifier``
    from them, and returns ``score_dataset`` evaluated on the notebook-level
    ``X_train`` / ``y_train``.
    """
    # Settings held fixed for every trial.
    cat_params = {
        "iterations": 1000,
        "task_type": "GPU",
        "devices": '0',
        "silent": True,
    }

    # Searched settings, suggested in a fixed order.
    cat_params["depth"] = trial.suggest_int("depth", 2, 16)
    cat_params["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.9, log=True)
    cat_params["random_strength"] = trial.suggest_int('random_strength', 0, 100)
    cat_params["l2_leaf_reg"] = trial.suggest_float("l2_leaf_reg", 1e-4, 1e2, log=True)
    cat_params["bootstrap_type"] = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])
    cat_params["boosting_type"] = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
    cat_params["od_type"] = trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])

    # Each of these bootstrap types gets one extra parameter of its own.
    chosen_bootstrap = cat_params["bootstrap_type"]
    if chosen_bootstrap == "Bayesian":
        cat_params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif chosen_bootstrap == "Bernoulli":
        cat_params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    return score_dataset(X_train, y_train, CatBoostClassifier(**cat_params))

# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run of the notebook (model training itself may still vary run to run).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=20)
# Best sampled hyperparameters; reused when fitting the final model.
cat_params = study.best_params

In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
print(cat_params)

I found the best CatBoost hyperparameters to be the following (with a cross-validation F1 score of around 0.8789):

```python
cat_params = {'depth': 8, 'learning_rate': 0.32859826626165944, 'random_strength': 75, 'l2_leaf_reg': 0.2664027110002379, 'bootstrap_type': 'Bernoulli', 'boosting_type': 'Ordered', 'od_type': 'IncToDec', 'subsample': 0.8982887767077825}
```

## 7. Fit the final model and submit Predictions
- do the final model fit
- place predictions in the submissions format
- check training predictions with the actual results as a sanity check

In [None]:
X_train, X_test = create_features(df_train, df_test)
y_train = df_train.loc[:, 'target']

In [None]:
#Train the final model on all of the available data

X_train, X_test = create_features(df_train, df_test)
y_train = df_train.loc[:, 'target'].cat.codes

final_model = CatBoostClassifier(iterations = 1000, task_type="GPU", devices='0', silent=True,**cat_params)
final_model.fit(X_train, y_train)


In [None]:
#Take a look at model performance

print(classification_report(y_train, final_model.predict(X_train)))

In [None]:
# get out final predictions.

predictions = final_model.predict(X_test)

In [None]:
# Save predictions for submission

output = pd.DataFrame({'id': X_test.index, 'target': predictions})
output.to_csv('submission.csv', index=False)