# Introduction

For this challenge, you are given (simulated) manufacturing control data and are tasked to predict whether the machine is in state `0` or state `1`. 

The data has various feature interactions that may be important in determining the machine state.

## Files

* `train.csv` - the training data, which includes normalized continuous data and categorical data
* `test.csv` - the test set; your task is to predict the binary target variable, which represents the state of a manufacturing process
* `sample_submission.csv` - a sample submission file in the correct format


## Setup

In [None]:
import os
import warnings

warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model

from IPython.display import display
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import classification_report, accuracy_score

import catboost
import ipywidgets

from catboost import CatBoostClassifier, Pool

In [None]:
print(catboost.__version__)

In [None]:
# seaborn
# NOTE: sns.set() resets matplotlib's rcParams (font sizes included), so it
# has to run before the matplotlib overrides below; otherwise they are lost.
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")
sns.set_palette("rocket", 8, .75)

# matplotlib
plt.rc('font', size=14)
plt.rc('axes', titlesize=18)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

In [None]:
# Input files of the competition and the name of the generated submission.
TRAIN_DATA_FILE = '../input/tabular-playground-series-may-2022/train.csv'
TEST_DATA_FILE = '../input/tabular-playground-series-may-2022/test.csv'
SAMPLE_SUBMISSION_FILE = '../input/tabular-playground-series-may-2022/sample_submission.csv'
SUBMISSION_FILE = 'submission.csv'

# Fraction of the training data used for modeling and the share of it
# held out by the train/validation split.
SAMPLE_SIZE = 1
TEST_SIZE = 0.25

# Column names: row identifier, label and the raw features f_00 .. f_30.
INDEX = 'id'
TARGET = 'target'
FEATURES = [f'f_{number:02d}' for number in range(31)]

___
# Explore data

## Import data

At first, we import the data and look at some statistical values.

In [None]:
def read_data():
    """Load the train, test and sample-submission CSV files.

    Every file is read with the ``INDEX`` column as the frame index.

    Returns:
        A ``(train, test, submission)`` tuple of DataFrames.
    """
    paths = (TRAIN_DATA_FILE, TEST_DATA_FILE, SAMPLE_SUBMISSION_FILE)
    train_df, test_df, submission_df = (
        pd.read_csv(path, index_col=INDEX) for path in paths
    )
    return train_df, test_df, submission_df

In [None]:
train, test, submission = read_data()

In [None]:
print(f'Train set size: {train.shape}')
print(f'Test set size: {test.shape}')

In [None]:
display(train.head())
display(test.head())

display(train.describe().T)

In [None]:
pd.DataFrame({
    'feature': train.columns,
    'dtype': train.dtypes
}).set_index('feature')

In [None]:
# Split the training columns by dtype; the label is an integer column too,
# so it is taken out of the integer feature list.
NUM_FEATURES = train.select_dtypes(include='float').columns.tolist()
INT_FEATURES = train.select_dtypes(include='int').columns.tolist()
INT_FEATURES.remove(TARGET)

**Insight**

* We have 900,000 training samples and 700,000 test samples.
* Both data sets have an `id` column (index).
* The variable to be predicted is `target`.
* There are 31 features in both data sets
    * 16 numerical features, and 14 with an integer type
    * `f_27` is of type string.

## Missing values

In [None]:
def total_missing_values(train, test):
    """Summarise the missing values of the train and test data sets.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.

    Returns:
        A DataFrame indexed by ``data_set`` (``'train'``, ``'test'``) with
        two columns: ``missing``, the number of missing cells, and
        ``missing_%``, the percentage of cells of that data set that are
        missing (``0.0`` for a data set without any cells).
    """
    data_sets = {'train': train, 'test': test}
    missing = [int(frame.isna().sum().sum()) for frame in data_sets.values()]
    # Each data set is normalised by its own number of cells: the counts
    # above are per cell, and train and test differ in size.
    cells = [frame.size for frame in data_sets.values()]

    df = pd.DataFrame({
        'data_set': list(data_sets),
        'missing': missing,
    }).set_index('data_set')

    df['missing_%'] = [
        100.0 * count / total if total else 0.0
        for count, total in zip(missing, cells)
    ]
    return df
    
total_missing_values(train, test) 

**Insight**

* There are no missing values.

___
# Exploratory data analysis (EDA)

## Target

In [None]:
def plot_count(data, feature, target=TARGET, ax=None, percent=True):
    """Draw a count plot of ``feature``, optionally split by ``target``.

    Args:
        data: DataFrame holding the column(s) to plot.
        feature: Column shown on the x axis.
        target: Column used as hue; pass ``None`` for no split.
        ax: Axes to draw on; the current axes are used when omitted.
        percent: Accepted for compatibility; not used by the function.
    """
    axes = plt.gca() if ax is None else ax

    count_options = dict(hue=target, palette='rocket', alpha=0.75)
    sns.countplot(data=data, x=feature, ax=axes, **count_options)

    axes.set_title(f'Count {feature}')

In [None]:
fig, ax = plt.subplots(figsize=(5, 4))
plot_count(train, TARGET, target=None, ax=ax)

plt.tight_layout()
plt.show()

In [None]:
pd.DataFrame((train.target.value_counts() / len(train)).round(2))

**Insight**

* The target is binary, class `0` and `1`. 
* There are 49% samples of class `1` and 51% of class `0`.

## Unique values

In [None]:
unique_values = pd.DataFrame({
    'feature': train[FEATURES].columns,
    'train': train[FEATURES].nunique(),
    'test': test[FEATURES].nunique(),
}).set_index('feature')

unique_values

**Insight**

* The numerical features `f_00`-`f_06`, `f_19`-`f_26` and `f_28` are continuous.
* The integer features `f_07`-`f_16` have between 13 and 16 unique values.
* The feature `f_29` has 2 unique values, and `f_30` has 3 unique values.

## Continuous features

The 16 continuous features are all normally distributed.

In [None]:
CONTINUOUS_FEATURES = list(train.select_dtypes(include='float').columns)

In [None]:
pd.DataFrame({
    'feature': CONTINUOUS_FEATURES,
    'mean': train[CONTINUOUS_FEATURES].mean(),
    'std': train[CONTINUOUS_FEATURES].std()
}).set_index('feature')

In [None]:
def plot_hist(data, feature, target=TARGET, ax=None):
    """Draw a 30-bin histogram with KDE of ``feature``, split by ``target``.

    Args:
        data: DataFrame holding the column(s) to plot.
        feature: Column shown on the x axis.
        target: Column used as hue.
        ax: Axes to draw on; the current axes are used when omitted.
    """
    axes = plt.gca() if ax is None else ax

    hist_options = dict(
        legend=True,
        hue=target,
        kde=True,
        bins=30,
        palette='rocket',
        alpha=.60,
    )
    sns.histplot(data=data, x=feature, ax=axes, **hist_options)

    axes.set_title(f'Distribution {feature}')

In [None]:
data = train.sample(frac=0.01).reset_index()

fig, axis = plt.subplots(nrows=4, ncols=4, figsize=(15, 13))
for feature, ax in zip(CONTINUOUS_FEATURES, axis.flatten()):
    plot_hist(data, feature, ax=ax)

plt.tight_layout()
plt.show()

**Insight**

* The histograms of the numerical features `f_00` - `f_06` show that these features have a standard normal distribution (std=1, mean=0).
* The standard deviation of the features `f_19` - `f_26` is between about 2.3 and 2.5.
* The std of `f_28` is almost 240. 

## Discrete features

In [None]:
DISCRETE_FEATURES = list(train.select_dtypes(include='int').columns)
DISCRETE_FEATURES.remove(TARGET)

In [None]:
data = train.sample(frac=0.01).reset_index()

fig, axis = plt.subplots(nrows=2, ncols=7, figsize=(24, 6))
for feature, ax in zip(DISCRETE_FEATURES, axis.flatten()):
    plot_count(data, feature, ax=ax)

plt.tight_layout()
plt.show()

**Insight**

* The discrete features `f_07` - `f_18` have values between 0 and 14.
* The feature `f_29` is binary. 
* The feature `f_30` is ternary. 

## Outlier detection

In [None]:
fig, axis = plt.subplots(nrows=4, ncols=4, figsize=(15, 8))
for feature, ax in zip(CONTINUOUS_FEATURES, axis.flatten()):
    sns.boxplot(
        data=train,
        x=feature,
        ax=ax,
        boxprops=dict(alpha=.75),
        palette='rocket')

plt.tight_layout()
plt.show()

### Categorical features

The string feature `f_27` is special. It always has a length of 10 and contains uppercase letters.

In [None]:
train[['f_27']].head()

In [None]:
unique_values_f_27 = pd.DataFrame({
    'pos': range(0, 10),
    'train': [train['f_27'].map(lambda x: x[i]).nunique() for i in range(10)],
    'test': [test['f_27'].map(lambda x: x[i]).nunique() for i in range(10)]
}).set_index('pos')

unique_values_f_27    

In [None]:
# One categorical column per character position of `f_27`, plus the name of
# the column holding the number of distinct characters in the string.
CATEGORICAL_FEATURES = [f'p_{i}' for i in range(10)]
F_27_UNIQUE = 'f_27_unique'

def create_f27_data(data):
    """Derive per-position and uniqueness features from the `f_27` string.

    Args:
        data: DataFrame with a string column ``f_27``; the first ten
            characters of each value are used.

    Returns:
        A copy of ``data`` (same rows, order and index) extended by the
        categorical columns ``p_0`` .. ``p_9`` (the character at each
        position, with the categories ``'A'`` .. ``'T'``) and the integer
        column ``f_27_unique`` (number of distinct characters). Characters
        outside the category list become missing values. ``data`` itself
        is not modified.
    """
    categories = [chr(code) for code in range(ord('A'), ord('T') + 1)]
    f_27 = data['f_27']

    # The new columns are attached positionally to a copy instead of
    # building a second frame and merging it back on the index: the merge
    # multiplies rows whenever an index value occurs more than once.
    result = data.copy()
    for position, column in enumerate(CATEGORICAL_FEATURES):
        result[column] = pd.Categorical(f_27.str[position], categories=categories)

    result[F_27_UNIQUE] = f_27.map(lambda text: len(set(text))).to_numpy()
    return result

In [None]:
f27_train = create_f27_data(train.sample(frac=0.1))
display(f27_train[CATEGORICAL_FEATURES].head())

In [None]:
fig, axis = plt.subplots(nrows=4, ncols=3, figsize=(15, 10))
for feature, ax in zip(CATEGORICAL_FEATURES + [F_27_UNIQUE], axis.flatten()):
    plot_count(f27_train, feature, target=None, ax=ax)

plt.tight_layout()
plt.show()

## Correlation

In [None]:
def corr_rank(data, p=0.5):
    """Rank feature pairs by absolute correlation.

    Args:
        data: DataFrame whose columns are correlated with ``DataFrame.corr``.
        p: Minimum absolute correlation a pair needs to be reported.

    Returns:
        A tuple ``(df, features)``. ``df`` has one row per column pair
        (indexed by the two column names) and a ``Correlation`` column with
        the absolute correlation, sorted in descending order and limited to
        values ``>= p``. ``features`` lists every column that occurs in one
        of these pairs, without duplicates.
    """
    corr_matrix = data.corr().abs()
    # Keep only the strict upper triangle so each pair appears once and the
    # diagonal (self-correlation) is ignored.
    upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    corr = (corr_matrix.where(upper)
                       .stack()
                       .dropna()  # masked cells survive stack() in newer pandas
                       .sort_values(ascending=False))

    df = corr.to_frame(name='Correlation')
    df = df[df['Correlation'] >= p]

    # Series.append was removed in pandas 2.0; the pair members are read
    # from the index levels instead (first members, then second members),
    # and dict.fromkeys de-duplicates while keeping first-seen order.
    first = df.index.get_level_values(0)
    second = df.index.get_level_values(1)
    features = list(dict.fromkeys([*first, *second]))
    return df, features

In [None]:
corr_df, feature_high_corr  = corr_rank(train[CONTINUOUS_FEATURES], p=0.175)
corr_df

In [None]:
data = train[feature_high_corr + [TARGET]].sample(frac=.0001)
sns.pairplot(data=data, corner=True, palette='rocket', hue=TARGET, aspect=1.5, height=1.4)

plt.show()

**Insight**

* None of the features show a strong linear correlation with the `target`.

In [None]:
corr = train[feature_high_corr].corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    corr, 
    mask=mask, 
    cmap='rocket', 
    vmax=.3,
    vmin=-.3,
    center=0,
    square=True,
    annot=True,
    fmt='.1f',
    linewidths=.5, 
    cbar_kws={"shrink": .5})

plt.show()

## Nonlinear dependencies 

See notebook: [TPSMAY22 EDA which makes sense](https://www.kaggle.com/code/ambrosm/tpsmay22-eda-which-makes-sense#The-integer-features)

In [None]:
data = train.sample(frac=0.3).reset_index()

fig, axis = plt.subplots(nrows=4, ncols=4, figsize=(12, 10))
for feature, ax in zip(CONTINUOUS_FEATURES, axis.flatten()):
    df = pd.DataFrame({
        feature: data[feature].values, 
        'state': data[TARGET].values})
    df = df.sort_values(feature)
    df.reset_index(inplace=True)
     
    sns.scatterplot(
        x=df[feature], 
        y=df.state.rolling(5000, center=True).mean(),
        palette='rocket',
        alpha=0.4,
        markers='o',
        s=0.3,
        ax=ax)

plt.tight_layout()
plt.show()

**Insight**

* There are many non-linear dependencies between `target` and the continuous features.


___

# Feature engineering (FE)

In [None]:
NUMERICAL_FEATURES = CONTINUOUS_FEATURES + DISCRETE_FEATURES

In [None]:
def feature_engineering(data):
    """Build the feature-creation function used for train and test data.

    Args:
        data: Not used by the current implementation.

    Returns:
        A function ``create_features(X, is_train=True)``. It extends ``X``
        with the `f_27` features from ``create_f27_data`` and selects every
        column except the target and the raw ``f_27`` string (in the sorted
        order produced by ``Index.difference``). With ``is_train=True`` it
        returns ``(features, X[TARGET])``, otherwise only the features.
    """
    def create_features(X, is_train=True):
        enriched = create_f27_data(X)
        excluded = [TARGET, 'f_27']
        feature_columns = list(enriched.columns.difference(excluded))
        features = enriched[feature_columns]

        if not is_train:
            return features

        return features, X[TARGET]

    return create_features

In [None]:
data = train.sample(frac=SAMPLE_SIZE)

fe = feature_engineering(data)
X_data, y_data = fe(data)

___
# Modeling

## Splitting data

In [None]:
# Hold out TEST_SIZE of the samples as a validation split; the seed is
# fixed so the split is reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_data, 
    y_data, 
    test_size=TEST_SIZE, 
    random_state=42)

print(f'Train size: {X_train.shape[0]}')
# This is the validation split, not the competition test set.
print(f'Validation size: {X_validation.shape[0]}')

## Model

In [None]:
# CatBoost settings; progress is logged every tenth of the iterations.
iterations = 3500
params = dict(
    iterations=iterations,
    learning_rate=0.02,
    early_stopping_rounds=150,
    max_depth=5,
    eval_metric='Accuracy',
    loss_function='Logloss',
    random_seed=2022,
    use_best_model=True,
    train_dir='tsp_may_2022',
    verbose=iterations // 10,
)

# Both pools mark the per-position `f_27` columns as categorical features.
train_pool = Pool(X_train, y_train, cat_features=CATEGORICAL_FEATURES)

validate_pool = Pool(X_validation, y_validation, cat_features=CATEGORICAL_FEATURES)

In [None]:
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool);

In [None]:
y_pred = model.predict(X_validation)
y_proba = model.predict_proba(X_validation)

print(classification_report(y_validation, y_pred))

print(f'Best iteration: {model.get_best_iteration()}')
print(f'Learning rate: {model.learning_rate_}')

___
# Model analysis

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Three diagnostics for the validation split, side by side: confusion
# matrix, ROC curve and the distribution of the predicted probabilities.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(16, 5))
ConfusionMatrixDisplay.from_predictions(
    y_validation,
    y_pred,
    cmap='Reds',
    colorbar=False, 
    ax=ax1)
ax1.set_title('Confusion matrix')

# A ROC curve needs continuous scores. Use the probability of class 1
# (second column of predict_proba) rather than the hard 0/1 labels, which
# would reduce the curve to a single operating point.
RocCurveDisplay.from_predictions(
    y_validation, 
    y_proba[:, 1], 
    ax=ax2)
ax2.set_title('ROC')

# y_proba has one column per class, so one histogram is drawn per class.
sns.histplot(
    data=y_proba,
    palette='rocket',
    stat='probability',
    legend=True,
    bins=100,
    kde=True,
    ax=ax3)
ax3.set_title('Prediction probapility')

plt.tight_layout()
plt.show()

In [None]:
metrics = ['AUC', 'Logloss', 'Precision', 'Recall', 'Accuracy', 'F1']

# Metric values on the validation pool, one row per boosting iteration.
eval_metrics = model.eval_metrics(validate_pool, metrics, plot=False)
df = pd.DataFrame(eval_metrics)

# Plot roughly 60 evenly spaced iterations. The step is clamped to 1 because
# a model with fewer than 60 iterations would otherwise produce a slice
# step of 0, which raises ValueError.
step = max(1, len(df) // 60)

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df[::step], ax=ax)

ax.set_title('Model metrics')
ax.set_xlabel('Number of iterations')
ax.set_ylabel('Metrics')

plt.tight_layout()
plt.show()

## Feature Importance 

In [None]:
feature_names = X_data.columns

feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': model.get_feature_importance(train_pool)
}).set_index('feature')

feature_importance.sort_values(by='importance', ascending=False, inplace=True)
feature_importance.head(15)

In [None]:
fig, ax = plt.subplots(figsize=(10, 12))

df = feature_importance.head(50)
sns.barplot(
    data=df,
    y=df.index,
    x='importance',
    palette='rocket',
    orient='h',
    alpha=0.75,
    ax=ax)

ax.set_title('Feature Importance')

plt.tight_layout()
plt.show()

# Submission

In [None]:
X_test = fe(test, is_train=False)
y_pred_submission = model.predict_proba(X_test)

In [None]:
submission_data = pd.DataFrame({
    INDEX: X_test.index,
    TARGET: y_pred_submission[:, 1],
}).set_index(INDEX)

submission_data

In [None]:
# save submission file
submission_data.to_csv(SUBMISSION_FILE)

<h4>Thanks for reading. If this notebook was helpful for you, please vote for it.</h4>