# TPS May 2022
This edition is about a binary classification problem that includes a number of different feature interactions.

## Import modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy

from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

import tensorflow as tf

## Load data

In [None]:
# Load the training set; the "id" column becomes the DataFrame index.
X = pd.read_csv("../input/tabular-playground-series-may-2022/train.csv", index_col="id")

In [None]:
# Preview the first rows of the training data.
X.head()

### Separate target

In [None]:
# Split the label off the features. pop() removes "target" from X in place
# and returns it, so re-running this cell without reloading X raises KeyError.
y = X.pop("target")

### Copy data

In [None]:
# Scratch copies for the exploratory feature engineering below, so that
# X and y stay untouched for the validation section.
X_tmp = copy.copy(X)
y_tmp = copy.copy(y)

## Exploratory analysis

In [None]:
# Column dtypes and non-null counts.
X.info()

Some of the features are continuous, others are categorical, and there is one object feature (a string).

In [None]:
# Long format: one row per (id, feature) pair with the target joined on id,
# so seaborn can draw one box per feature and class.
int_long = X.select_dtypes("int64").stack().reset_index().join(y, on='id')

fig, ax = plt.subplots(1, figsize=(10,6))
ax = sns.boxplot(data=int_long, x="level_1", y=0, hue="target")
ax.set(title="Boxplots of categorical variables", xlabel="Features", ylabel="Values")
plt.show()

In [None]:
# Summary statistics of the integer-typed columns.
X.select_dtypes("int64").describe()

The categorical variables have almost the same distribution for each target; none of these features is a strong indicator of the state by itself.

In [None]:
# Long format of the float columns (one row per id/feature pair) with the
# target joined on id. f_28 is left out of the plot.
# NOTE(review): presumably f_28 is excluded because its scale would dwarf the
# other boxes - confirm against the describe() output in the next cell.
float_long = (
    X.select_dtypes("float64")
    .drop("f_28", axis=1)
    .stack()
    .reset_index()
    .join(y, on='id')
)

fig, ax = plt.subplots(1, figsize=(10,6))
ax = sns.boxplot(data=float_long, x="level_1", y=0, hue="target")
# These are the continuous features; the title used to say "categorical".
ax.set_title("Boxplots of continuous variables")
ax.set_xlabel("Features")
ax.set_ylabel("Values")
plt.show()

In [None]:
# Summary statistics of the float-typed columns.
X.select_dtypes("float64").describe()

With the continuous variables we can see some features that have slightly different distributions between targets, so these features are helpful to differentiate between states. It seems that there are a lot of outliers, but they are very close to the whiskers, so I won't remove any of them.

In [None]:
# Number of rows per target class.
y.value_counts()

The two classes to predict are balanced.

In [None]:
# Summary of the object-typed (string) column(s).
X.select_dtypes("object").describe()

This object feature looks like some kind of encoding. I will split it into letters, encode them as numbers, and check whether they give some information about the target.

# Preprocessing

First I'm going to encode the feature f_27

In [None]:
# Separate f_27 from the working copy.
# pop() mutates X_tmp, so this cell needs a fresh X_tmp to be re-run.
f_27 = X_tmp.pop("f_27")

# Split every string into its letters: one column per letter position.
letters = [list(code) for code in f_27]

# Encode the letters of each position as ordinal numbers.
oe = OrdinalEncoder()
f_27_enc = oe.fit_transform(letters)

# Name the columns after the letter position instead of hard-coding ten names.
f_27_cols = [f"f_27_{pos}" for pos in range(f_27_enc.shape[1])]
f_27_enc = pd.DataFrame(f_27_enc, index=X_tmp.index, columns=f_27_cols)

# Add the encoded features to the working frame.
X_tmp = X_tmp.join(f_27_enc)

# Mutual information of each encoded letter position with the target.
print(mutual_info_classif(f_27_enc, y_tmp, discrete_features=True))

Using the mutual information metric we can check that these new features give some information about the target, so I will keep them.

In [None]:
# Save the original column names, grouped by dtype.
# Explicit "int64"/"float64" matches the dtype selectors used in the other
# cells; the bare "int" alias is platform-dependent.
# NOTE(review): the f_27_* columns come out of OrdinalEncoder as floats, so
# they presumably land in CON_COLS rather than CAT_COLS - confirm this is intended.
CAT_COLS = X_tmp.select_dtypes("int64").columns
CON_COLS = X_tmp.select_dtypes("float64").columns
X_COLS = X_tmp.columns

The best way to discover feature interactions and create new features based on them is domain knowledge. Since no further information about the data is provided, let's create some new features by combining existing ones and check whether they provide some information.<br><br>
Let's create a new feature clustering the previous ones.

In [None]:
# Elbow analysis: fit KMeans for a range of k on each feature group and
# compare the inertia curves to choose the number of clusters.
k = range(2, 12)
feature_groups = {
    "All features": X_COLS,
    "Categorical features": CAT_COLS,
    "Continuous features": CON_COLS,
}

# One seeded sample shared by every fit: the curves are reproducible and
# comparable across feature groups (previously each fit drew its own
# unseeded sample).
kmeans_sample = X_tmp.sample(10000, random_state=123)  # speed up

inertias = {
    title: [KMeans(i, random_state=123).fit(kmeans_sample[cols]).inertia_ for i in k]
    for title, cols in feature_groups.items()
}

# Keep the original list names available for later inspection.
inertias_all = inertias["All features"]
inertias_cat = inertias["Categorical features"]
inertias_con = inertias["Continuous features"]

# plot values to choose best k
fig, ax = plt.subplots(1,3, figsize=(15,4))

for axis, (title, values) in zip(ax, inertias.items()):
    axis.plot(values)
    # Curves are plotted against position, so label the ticks with the k values.
    axis.set_xticks(range(len(k)))
    axis.set_xticklabels(k)
    axis.set_title(title)
    axis.set_xlabel("K")
    axis.set_ylabel("Inertia")

fig.tight_layout()
plt.show()

It seems that clustering the categorical variables is a bad choice, since the graph shows no elbow and therefore no clearly best number of clusters. Using 4 or 5 clusters seems a good choice to group the continuous data.

In [None]:
# Cluster label (k=5) of every row, computed on the continuous columns.
# Seeded so the labels, and the mutual information printed below, are
# reproducible between runs.
kmeans = KMeans(5, random_state=123).fit_predict(X_tmp[CON_COLS])

# Mutual information between the cluster label and the target.
print(mutual_info_classif(kmeans.reshape(-1,1), y_tmp, discrete_features=True))

And mutual information confirms that this new feature gives some information about the target.<br><br>
Let's create some features grouping by categorical variables and using the mean of the continuous columns.

In [None]:
# groupby features
tmp = copy.copy(X_tmp.sample(10000, random_state=123))  # reduce memory usage

# Mean of every continuous column within each level of each categorical column.
f_mean = [tmp.groupby(cat_col)[CON_COLS].mean() for cat_col in CAT_COLS]

# Join each table of group means back on its grouping column; the suffix
# produces names such as "<feature>groupby<category>mean".
for group_means in f_mean:
    key = group_means.index.name
    tmp = tmp.join(group_means, on=key, rsuffix='groupby'+key+"mean")

# get most important features
mean_features = tmp.filter(regex=("mean"), axis=1)
mi = mutual_info_classif(mean_features, y_tmp[tmp.index], random_state=123)
print(pd.Series(mi, mean_features.columns).sort_values(ascending=False)[:10])

Some of these interactions give information about the target so I will use some of the best ones.<br><br>
Other features to consider are polynomial interactions.

In [None]:
# polynomial features
tmp = copy.copy(X_tmp.sample(10000, random_state=123))  # reduce memory usage

# Products of up to three distinct columns (no powers of a single column).
poly = PolynomialFeatures(degree=3, interaction_only=True)
poly_values = poly.fit_transform(tmp[X_COLS])
interactions = pd.DataFrame(poly_values, columns=poly.get_feature_names_out(X_COLS))

# use only the interactions for mutual information calculation:
# drop the original columns and the constant bias column named "1"
tmp_cols = [*X_COLS, "1"]
interactions = interactions.drop(tmp_cols, axis=1)

mi = mutual_info_classif(interactions, y_tmp[tmp.index], random_state=123)

# top 10 polynomial interactions
mi_ranked = pd.Series(mi, interactions.columns).sort_values(ascending=False)
print(mi_ranked[:10])

Again mutual information confirms that these new features give some information.

# Model validation

The new features that are going to be calculated are:
- f_27 split and encoded
- group by features:
    - f_27_5groupbyf_11mean
    - f_26groupbyf_30mean
    - f_05groupbyf_17mean
    - f_27_5groupbyf_16mean
    - f_22groupbyf_11mean
- kmeans feature with k=5 using the continuous features
- PCA features
- polynomial features:
    - f_29 f_27_8 f_27_9
    - f_17 f_29 f_27_8
    - f_21 f_27_9
    - f_27_7 f_27_9
    - f_24 f_27_7 f_27_9

In [None]:
class DatasetTransformer:
    """Compute the engineered features used by the models.

    Everything that is learned from data (the letter encoder, the group
    means, KMeans and PCA) is fitted once on the training frame and then
    reused unchanged on validation/test frames.
    """

    def __init__(self, random_state=None):
        """Create the unfitted transformer.

        random_state is forwarded to KMeans and PCA; the default (None)
        keeps their library default behaviour.
        """
        # Group-mean lookup tables, filled by getGroupbyFeatures(fit=True).
        self.groupbyFeatures = []
        # Letter encoder, created on the first getObjectTypeEncoded call.
        self.objectEncoder = None
        self.kmeans = KMeans(5, random_state=random_state)
        self.pca = PCA(random_state=random_state)

    def getObjectTypeEncoded(self, X, fit=None):
        """Split the first object column of X into letters and ordinal-encode them.

        fit=None (default): fit the encoder on the first call and reuse it on
        later calls, so the frame passed first (the training data) defines the
        letter-to-number mapping for every other frame.
        fit=True: (re)fit on X.  fit=False: transform only.

        Returns a DataFrame indexed like X with one column per letter
        position, named f_27_0, f_27_1, ...  Letters not seen during fitting
        are encoded as -1.

        Raises RuntimeError if fit=False is requested before any fit.
        """
        obj = X.select_dtypes("object")
        letters = [list(row[0]) for row in obj.values]

        encoder = getattr(self, "objectEncoder", None)
        if fit is None:
            fit = encoder is None
        if fit:
            # Previously a new encoder was fitted on every frame, so the same
            # letter could get different codes in train and validation/test.
            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
            encoder.fit(letters)
            self.objectEncoder = encoder
        elif encoder is None:
            raise RuntimeError("getObjectTypeEncoded called with fit=False before fitting")

        arr = encoder.transform(letters)
        columns = [f"f_27_{pos}" for pos in range(arr.shape[1])]
        return pd.DataFrame(arr, index=X.index, columns=columns)

    def getGroupbyFeatures(self, X, fit):
        """Return the selected group-mean features for X.

        With fit=True the group means are computed from X and stored; with
        fit=False the stored means are looked up. The result holds only the
        new columns, named "<feature>groupby<category>".
        """
        if fit:
            # Rebuild instead of appending, so fitting again does not
            # duplicate the lookup tables.
            self.groupbyFeatures = [
                X.groupby("f_11")[["f_27_5", "f_22"]].mean(),
                X.groupby("f_30")["f_26"].mean(),
                X.groupby("f_17")["f_05"].mean(),
                X.groupby("f_16")["f_27_5"].mean(),
            ]

        tmp = copy.copy(X)
        for gf in self.groupbyFeatures:
            # Joined names collide with the original columns; the suffix is
            # applied to the joined (right-hand) column.
            tmp = tmp.join(gf, on=gf.index.name, rsuffix='groupby'+gf.index.name)

        return tmp.drop(X.columns, axis=1)

    def getKmeanFeatures(self, X, fit):
        """Return the KMeans cluster label of each row, using the float64 columns of X."""
        con = X.select_dtypes("float64")
        if fit:
            self.kmeans.fit(con)

        return self.kmeans.predict(con)

    def getPCAFeatures(self, X, fit):
        """Return the PCA projection of X as columns pca_0, pca_1, ... indexed like X."""
        if fit:
            self.pca.fit(X)

        return pd.DataFrame(self.pca.transform(X), index=X.index).add_prefix('pca_')

    def getPolyFeatures(self, X):
        """Return the five selected interaction products as columns poly1..poly5."""
        products = [
            X['f_29'] * X['f_27_8'] * X['f_27_9'],
            X['f_17'] * X['f_29'] * X['f_27_8'],
            X['f_21'] * X['f_27_9'],
            X['f_27_7'] * X['f_27_9'],
            X['f_24'] * X['f_27_7'] * X['f_27_9'],
        ]
        names = ['poly1', 'poly2', 'poly3', 'poly4', 'poly5']
        # keys= names the columns directly instead of renaming 0..4 afterwards.
        return pd.concat(products, axis=1, keys=names)

In [None]:
# Hold out 20% of the rows for validation. The split is seeded so the
# validation scores below are reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=123)

# Transformer that will be fitted on the training part only.
dt = DatasetTransformer()

In [None]:
# calculate new features

# f_27: add the per-letter encoded columns, then drop the raw string column
X_train = X_train.join(dt.getObjectTypeEncoded(X_train)).drop(["f_27"], axis=1)
X_val = X_val.join(dt.getObjectTypeEncoded(X_val)).drop(["f_27"], axis=1)

# group means: learned on train, looked up for validation
gbf_train = dt.getGroupbyFeatures(X_train, fit=True)
gbf_val = dt.getGroupbyFeatures(X_val, fit=False)

# cluster labels: KMeans fitted on train only
kmf_train = dt.getKmeanFeatures(X_train, fit=True)
kmf_val = dt.getKmeanFeatures(X_val, fit=False)

# PCA projection: fitted on train only
pca_train = dt.getPCAFeatures(X_train, fit=True)
pca_val = dt.getPCAFeatures(X_val, fit=False)

# selected interaction products (nothing to fit)
poly_train = dt.getPolyFeatures(X_train)
poly_val = dt.getPolyFeatures(X_val)

In [None]:
# add new features: group means, PCA components and interaction products are
# joined on the index; the cluster labels are appended as the last column
X_train = (
    X_train.join(gbf_train)
    .join(pca_train)
    .join(poly_train)
    .assign(kmeans=kmf_train)
)
X_val = (
    X_val.join(gbf_val)
    .join(pca_val)
    .join(poly_val)
    .assign(kmeans=kmf_val)
)

I'm going to use two models to make the final predictions: LightGBM and a Deep Neural Network. The predictions will be the average of both models.

In [None]:
# tree based model
# n_estimators is the scikit-learn API name for the number of boosting
# rounds; the previous `num_iterations` keyword is an alias for the same setting.
lgbm = LGBMClassifier(num_leaves=1000, n_estimators=250)
lgbm.fit(X_train, y_train)

# AUC on the positive-class probability (second column of predict_proba).
preds = lgbm.predict_proba(X_train)
print("Train AUC: ", roc_auc_score(y_train, preds[:,1]))

preds = lgbm.predict_proba(X_val)
print("Validation AUC: ", roc_auc_score(y_val, preds[:,1]))

In [None]:
# save lgbm predictions: positive-class probabilities on the validation set,
# kept for the blend with the neural network further down
lgbm_preds = lgbm.predict_proba(X_val)[:,1]

In [None]:
# standardize data for the DNN: statistics are learned on the training part
# only and then applied to both parts (both become NumPy arrays)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_val = ss.transform(X_val)

In [None]:
# NN based model: three hidden layers of 100 ReLU units with light dropout
# and a 2-unit softmax output (one probability per class).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dropout(0.05))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dropout(0.05))
model.add(tf.keras.layers.Dense(100, activation="relu"))
model.add(tf.keras.layers.Dense(2, activation="softmax"))

# The metric is named explicitly so the training history always exposes the
# keys 'auc' / 'val_auc' that the plotting cell reads. With the default name,
# Keras may number metrics created later in the same session ('auc_1', ...),
# which breaks the plot when this cell is re-run.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
             loss=tf.keras.losses.BinaryCrossentropy(),
             metrics=[tf.keras.metrics.AUC(name="auc")])

In [None]:
# Train for 130 epochs with batches of 64. The targets are expanded with
# get_dummies into one column per class to match the 2-unit softmax output;
# the validation split is evaluated after every epoch.
history = model.fit(X_train, pd.get_dummies(y_train), epochs=130, batch_size=64, validation_data=(X_val, pd.get_dummies(y_val)))

In [None]:
# Learning curves: loss on the left, AUC on the right; training in blue,
# validation in orange.
fig, ax = plt.subplots(1,2, figsize=(16,8))

ax[0].plot(history.history['loss'], c='blue', label='train')
ax[0].plot(history.history['val_loss'], c='orange', label='validation')
ax[0].set_title("Loss")
ax[0].set_xlabel("Epoch")
ax[0].set_ylabel("Binary cross-entropy")
ax[0].legend()

ax[1].plot(history.history['auc'], c='blue', label='train')
ax[1].plot(history.history['val_auc'], c='orange', label='validation')
ax[1].set_title("AUC")
ax[1].set_xlabel("Epoch")
ax[1].set_ylabel("AUC")
ax[1].legend()

plt.show()

In [None]:
# combine DNN and LGBM: one column of positive-class probabilities per model
nn_val_preds = model.predict(X_val)[:,1]
preds = pd.DataFrame({"nn": nn_val_preds, "lbgm": lgbm_preds})

# validation AUC of the plain average of both models
blended = preds.mean(axis=1)
print(roc_auc_score(y_val, blended))

# Predictions

Now that the model has been tested, let's make the final predictions using the whole dataset for training.

In [None]:
# Reload the untouched training data (X was modified by the cells above)
# and load the test set; both are indexed by "id".
X = pd.read_csv("../input/tabular-playground-series-may-2022/train.csv", index_col="id")
X_test = pd.read_csv("../input/tabular-playground-series-may-2022/test.csv", index_col="id")

In [None]:
# Separate the target again (pop removes it from X in place) and start from
# a fresh, unfitted transformer for the full-data fit.
y = X.pop("target")
dt = DatasetTransformer()

In [None]:
# f_27: add the per-letter encoded columns, then drop the raw string column
X = X.join(dt.getObjectTypeEncoded(X)).drop(["f_27"], axis=1)
X_test = X_test.join(dt.getObjectTypeEncoded(X_test)).drop(["f_27"], axis=1)

# group means: learned on the full training set, looked up for the test set
gbf_train = dt.getGroupbyFeatures(X, fit=True)
gbf_test = dt.getGroupbyFeatures(X_test, fit=False)

# cluster labels: KMeans fitted on the training set only
kmf_train = dt.getKmeanFeatures(X, fit=True)
kmf_test = dt.getKmeanFeatures(X_test, fit=False)

# PCA projection: fitted on the training set only
pca_train = dt.getPCAFeatures(X, fit=True)
pca_test = dt.getPCAFeatures(X_test, fit=False)

# selected interaction products (nothing to fit)
poly_train = dt.getPolyFeatures(X)
poly_test = dt.getPolyFeatures(X_test)

# assemble the final frames; the cluster labels go in as the last column
X = (
    X.join(gbf_train)
    .join(pca_train)
    .join(poly_train)
    .assign(kmeans=kmf_train)
)
X_test = (
    X_test.join(gbf_test)
    .join(pca_test)
    .join(poly_test)
    .assign(kmeans=kmf_test)
)

In [None]:
# Final tree model, trained on the whole training set.
# n_estimators is the scikit-learn API name for the number of boosting
# rounds; the previous `num_iterations` keyword is an alias for the same setting.
lgbm_model = LGBMClassifier(num_leaves=1000, n_estimators=250)
lgbm_model.fit(X, y)

# Positive-class probabilities for the test set.
lgbm_preds = lgbm_model.predict_proba(X_test)[:,1]

In [None]:
# Standardize for the DNN with statistics learned on the training set.
# X becomes a NumPy array; X_test stays a DataFrame because its index (id)
# is needed later to build the submission.
ss = StandardScaler()
X = ss.fit_transform(X)
X_test_scaled = ss.transform(X_test)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [None]:
# Same architecture as in the validation section: three hidden layers of
# 100 ReLU units with light dropout and a 2-unit softmax output.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dropout(0.05),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dropout(0.05),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax"),
])

adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
bce = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=adam, loss=bce, metrics=[tf.keras.metrics.AUC()])

In [None]:
# Train on the whole training set (no validation data this time); the targets
# are expanded with get_dummies into one column per class for the softmax output.
history = model.fit(X, pd.get_dummies(y), epochs=130, batch_size=64)

In [None]:
# One column of positive-class test probabilities per model.
nn_test_preds = model.predict(X_test)[:,1]
preds = pd.DataFrame({"nn": nn_test_preds, "lbgm": lgbm_preds})

In [None]:
# Load the submission template (no index_col, so "id" stays a regular column)
# and preview it.
submission = pd.read_csv("../input/tabular-playground-series-may-2022/sample_submission.csv")
submission.head()

In [None]:
# Final prediction = row-wise mean of the two model columns. .values drops
# the index of preds so the assignment is positional against X_test's rows.
X_test["target"] = preds.mean(axis=1).values

In [None]:
# Remove the template's target column; it is replaced by the join in the
# next cell. In-place, so re-running this cell alone raises KeyError.
submission.drop("target", axis=1, inplace=True)

In [None]:
# Attach the predictions by matching the submission's "id" column against
# the index of X_test, then preview the result.
submission = submission.join(X_test["target"], on="id")
submission.head()

In [None]:
# Write the submission file without the row index.
submission.to_csv("./submission.csv", index=False)