# TPS - Feb 2022

In this notebook I am trying several approaches.  

Many rows in the dataset are duplicated, and there is still a debate over whether to remove them or not.  
Check discussions,   
https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/305364  
https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/305733  

According to the suggestions of [AmbrosM](https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants), when the duplicated rows are removed, it is better to use a sample weight.

Also, in this notebook there is a postprocessing step which tunes the probabilities.

So, I have tried to combine all those approaches and collect their scores, so that it is possible to decide which one is the best.

<table>
  <caption>Submissions of model run with 300 estimators, Version 18:</caption>
  <tr>
    <td></td>
    <th scope="col">Postprocessing with predetermined tune value</th>
    <th scope="col">Postprocessing with calculation of the tune value</th>
  </tr>
  <tr>
    <th scope="row">Without duplicates (using sample weight)</th>
    <td align="center">0.98599</td>
    <td align="center">0.98674</td>
  </tr>
  <tr>
    <th scope="row">Without duplicates and sample weight</th>
    <td align="center">0.98659</td>
    <td align="center">0.98729</td>
  </tr>
  <tr>
    <th scope="row">With duplicates</th>
    <td align="center">0.98624</td>
    <td align="center">0.98709</td>
  </tr>
</table>

# Importing Libraries and Loading datasets

In [None]:
import os

import numpy as np
import pandas as pd

# Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Encoding
from sklearn.preprocessing import LabelEncoder

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Cross-Validation
from sklearn.model_selection import StratifiedKFold

# Ensemble
from scipy import stats

In [None]:
# Load the competition files; for train and test the first CSV column becomes the row index.
train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv', index_col=0)
test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv', index_col=0)
# The sample submission is read with the default integer index (no index_col).
sub = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

# Explore Data

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# List every column name of the training frame.
print("Columns: \n{0}".format(train.columns.tolist()))

# Basic Data Check

In [None]:
# (rows, columns) of both frames.
print('Train data shape:', train.shape)
print('Test data shape:', test.shape)

## Missing values

In [None]:
# Count missing values per column and report only the columns that have any.
# `isna().any().sum()` would collapse everything into one scalar, leaving the
# `> 0` filter below without any per-column information to select from.
missing_values_train = train.isna().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0]))

missing_values_test = test.isna().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0]))

## Duplicates

In [None]:
# Number of rows that repeat an earlier row (all columns of the frame are compared).
duplicates_train = train.duplicated().sum()
print('Duplicates in train data: {0}'.format(duplicates_train))

# Same check for the test data.
duplicates_test = test.duplicated().sum()
print('Duplicates in test data: {0}'.format(duplicates_test))

## Sample weight
### Credits to [AmbrosM](https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants)

In [None]:
# Check out the discussions,
# https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/305364
# https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/305733
#
# One weight per unique row: how often that exact row (features + target)
# occurs in the raw training data.
#
# `train.value_counts().values` is ordered by descending count, whereas the
# deduplicated frame produced by `drop_duplicates(keep='first')` keeps the rows
# in order of first appearance, so those counts would be attached to the wrong
# rows. Instead, compute the group size for every original row and keep the
# value at each first occurrence, which is exactly the set and order of rows
# that `drop_duplicates(keep='first')` retains.
#
# This cell must run while `train` still contains its duplicates.
row_keys = list(train.columns)
occurrences = train.groupby(row_keys, sort=False)['target'].transform('size')
sample_weight = occurrences[~train.duplicated(keep='first')].to_numpy()

## Dropping duplicated rows

In [None]:
# Preserve the raw frame so that both variants (with and without duplicates)
# can be modelled later on.
train_with_duplicates = train.copy()

# From here on `train` holds only the first occurrence of every distinct row.
train = train_with_duplicates.drop_duplicates(keep='first')
duplicates_train = train.duplicated().sum()

print('Train data shape:', train.shape)
print('Duplicates in train data: {0}'.format(duplicates_train))

# Features

## Numerical Features

In [None]:
# Every column except the last one (the target) is treated as a numerical feature.
numerical_features = train.columns[:-1]
print("Numerical Columns: \n{0}".format(numerical_features.tolist()))

In [None]:
# Summary statistics restricted to the feature columns selected above.
train[numerical_features].describe()

## Target Distribution

In [None]:
def plot(train, title):
    """Draw a count plot of ``train['target']`` and print the class shares.

    Args:
        train: frame with a ``target`` column.
        title: title shown above the figure.

    The share of each target value (in percent of ``len(train)``), ordered by
    target label, is printed after the figure has been drawn.
    """
    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xticks(rotation=30, ha='right')
    sns.countplot(x=train['target'], data=train)

    target_share = train['target'].value_counts().sort_index() / len(train) * 100
    print(target_share)

### Without duplicates

In [None]:
# `train` has already been deduplicated at this point.
plot(train, "Target distribution without duplicates")

### With duplicates

In [None]:
# `train_with_duplicates` is the copy taken before the duplicates were dropped.
plot(train_with_duplicates, "Target distribution with duplicates")

# Reduce memory usage

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` in place to smaller dtypes.

    * Integer columns (signed or unsigned) are cast to the smallest of
      int8/int16/int32/int64 whose range contains the column's min and max
      (bounds inclusive). Columns that fit none of them, or that are empty,
      are left unchanged.
    * Float columns are cast to float16 or float32 when min and max fit the
      finite range of that type, otherwise to float64. Note that float16 and
      float32 store fewer significant digits, so values may be rounded.
    * Every other column is cast to ``category``.

    Calling the function again on its own output keeps the numeric dtypes
    (already-downcast int8 columns are recognised as integers rather than
    being turned into ``category``).

    Args:
        df: frame to shrink; it is modified in place.
        verbose: when true, print the resulting memory usage and reduction.

    Returns:
        The same ``df`` object.
    """
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are downcast numerically; pandas extension
        # dtypes fall through to the `category` branch.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u'):
            if len(df[col]) == 0:
                continue  # no values to derive a range from; keep the dtype
            # Python ints avoid signed/unsigned comparison pitfalls.
            c_min, c_max = int(df[col].min()), int(df[col].max())
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        elif kind == 'f':
            c_min, c_max = df[col].min(), df[col].max()
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # NaN-only, infinite or very large values end up here.
                df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Each frame is downcast in place; the return values of the first three calls are discarded.
reduce_mem_usage(train_with_duplicates)
reduce_mem_usage(train)
reduce_mem_usage(test)
# Last expression of the cell, so the returned `sub` frame is displayed.
reduce_mem_usage(sub)

# Modelling
### Credits to [Maxence Fuzellier](https://www.kaggle.com/maxencefzr/tps-feb22-eda-extratrees)

In [None]:
N_SPLITS = 10
ESTIMATORS = 300

target_encoder = LabelEncoder()
def run_model(train, sample_weight=None):
    """Cross-validate an ExtraTreesClassifier and predict the test set per fold.

    Args:
        train: training frame containing the features and a ``target`` column.
        sample_weight: optional per-row weights, one entry per row of ``train``
            and in the same row order. ``None`` or an empty sequence trains
            and scores without weights.

    Returns:
        A list with one ``predict_proba`` array per fold, computed on the
        module-level ``test`` frame.

    Raises:
        ValueError: if ``sample_weight`` is given but its length differs from
            the number of rows in ``train``.

    Side effects:
        Refits the module-level ``target_encoder`` on ``train['target']`` and
        prints the accuracy of every fold plus the mean accuracy.
    """
    df = train.copy()
    df["target"] = target_encoder.fit_transform(df["target"])

    X = df.drop(["target"], axis=1)
    y = df["target"]

    use_weights = sample_weight is not None and len(sample_weight) > 0
    if use_weights:
        # An ndarray is required for the fancy indexing with fold indices below.
        sample_weight = np.asarray(sample_weight)
        if len(sample_weight) != len(df):
            raise ValueError(
                "sample_weight has {0} entries but train has {1} rows".format(
                    len(sample_weight), len(df)))

    scores = []
    y_probs = []
    folds = StratifiedKFold(n_splits=N_SPLITS, random_state=1, shuffle=True)
    for fold, (train_id, test_id) in enumerate(folds.split(X, y)):
        X_train, y_train = X.iloc[train_id], y.iloc[train_id]
        X_valid, y_valid = X.iloc[test_id], y.iloc[test_id]

        model = ExtraTreesClassifier(
            n_estimators=ESTIMATORS,
            random_state=1,
            n_jobs=-1
        )

        if use_weights:
            model.fit(X_train, y_train, sample_weight=sample_weight[train_id])
        else:
            model.fit(X_train, y_train)

        valid_pred = model.predict(X_valid)

        if use_weights:
            # Weighted accuracy: each validation row counts with its weight.
            valid_score = accuracy_score(y_valid, valid_pred, sample_weight=sample_weight[test_id])
        else:
            valid_score = accuracy_score(y_valid, valid_pred)

        print("Fold:", fold + 1, "Accuracy:", valid_score)
        scores.append(valid_score)
        y_probs.append(model.predict_proba(test))

    print("Mean accuracy score:", np.array(scores).mean())
    return y_probs

Let's run the model with and without the duplicates, and also try a sample weight when the duplicates are removed, to see the difference between all those approaches.

## Without duplicates (using sample weight)

In [None]:
# Deduplicated training rows, each weighted by its entry in `sample_weight`.
y_probs = run_model(train, sample_weight)

## Without duplicates and sample weight

In [None]:
# Deduplicated training rows, no sample weight passed.
y_probs_without_sample_weight = run_model(train)

## With duplicates

In [None]:
# Training data as copied before the duplicates were dropped, no sample weight passed.
y_probs_with_duplicates = run_model(train_with_duplicates)

# Postprocessing

In [None]:
def post_processing(y_probs, train, tune=None, tolerance=0.1, step=0.001, max_iter=100000):
    """Average the fold probabilities and shift them so the predicted class
    distribution approaches the target distribution of ``train``.

    Args:
        y_probs: list of ``predict_proba`` arrays (one per fold), all of shape
            (rows, classes).
        train: frame whose ``target`` column defines the reference class
            distribution.
        tune: optional per-class offsets added to the averaged probabilities.
            ``None`` or an empty sequence triggers a search for the offsets.
        tolerance: largest accepted absolute difference, in percentage points,
            between the reference and the predicted share of any class.
        step: amount by which one offset is changed per search iteration.
        max_iter: upper bound on search iterations, so that a search which
            oscillates instead of converging still terminates.

    Returns:
        Array of predicted labels, decoded with the module-level
        ``target_encoder``.

    The offsets that were used and the resulting predicted distribution (in
    percent) are printed.
    """
    import warnings

    y_prob = sum(y_probs) / len(y_probs)
    n_rows, n_classes = y_prob.shape

    # Reference distribution as a plain array in encoder class order, so that
    # position i always refers to encoded class i.
    classes = target_encoder.classes_
    target_counts = pd.Series(np.asarray(train['target'])).value_counts()
    target_distribution = (
        target_counts.reindex(classes, fill_value=0).to_numpy(dtype=float) / len(train) * 100
    )

    def get_diff(offsets):
        # bincount with minlength keeps never-predicted classes at 0% rather
        # than dropping them, so they can still be tuned upwards.
        predicted = np.argmax(y_prob + offsets, axis=1)
        predicted_distribution = np.bincount(predicted, minlength=n_classes) / n_rows * 100
        return target_distribution - predicted_distribution

    if tune is None or len(tune) == 0:
        tune = [0] * n_classes
        diff = get_diff(tune)
        iterations = 0
        while np.abs(diff).max() > tolerance:
            if iterations >= max_iter:
                warnings.warn(
                    "post_processing: tune search stopped after {0} iterations "
                    "without reaching the tolerance".format(max_iter))
                break
            iterations += 1
            # Adjust only the first class that is out of tolerance, then re-evaluate.
            for i in range(n_classes):
                if diff[i] > tolerance:
                    tune[i] += step
                    break
                if diff[i] < -tolerance:
                    tune[i] -= step
                    break
            diff = get_diff(tune)

    # Credits to https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants
    print(tune)
    y_pred_tuned = target_encoder.inverse_transform(np.argmax(y_prob + tune, axis=1))
    print(pd.Series(y_pred_tuned).value_counts().sort_index() / n_rows * 100)
    return y_pred_tuned

Apply the postprocessing step either with a predetermined tune value or by searching for a tune value, so that the difference between those two approaches becomes visible.

It also makes it possible to compare the three models (with duplicates, without duplicates, and without duplicates but with a sample weight) when the tune value is the same.

In [None]:
# Value from AmbrosM's notebook, used to check whether my own
# approach to finding a tuning value is right or wrong.
tune = [0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0]

## Without duplicates (using sample weight)

In [None]:
# First call searches for the offsets; second call applies the predetermined `tune` list.
y_pred_tuned = post_processing(y_probs, train)
y_pred_tuned_with_tune = post_processing(y_probs, train, tune)

## Without duplicates and sample weight

In [None]:
# First call searches for the offsets; second call applies the predetermined `tune` list.
y_pred_tuned_without_sample_weight = post_processing(y_probs_without_sample_weight, train)
y_pred_tuned_without_sample_weight_with_tune = post_processing(y_probs_without_sample_weight, train, tune)

## With duplicates

In [None]:
# Here the reference distribution comes from the training data that still contains duplicates.
y_pred_tuned_with_duplicates = post_processing(y_probs_with_duplicates, train_with_duplicates)
y_pred_tuned_with_duplicates_with_tune = post_processing(y_probs_with_duplicates, train_with_duplicates, tune)

# Intersection between Training and Test sets
### Credits to [XYZT](https://www.kaggle.com/thexyzt/intersection-between-training-and-test-sets)

It appears that some rows are identical in the training and test sets.  
So, let's at least make sure that we are not predicting them wrong. :)

In [None]:
# According to XYZT, 1521 rows occur in both the training and the test data,
# 486 of them being unique rows contained in the test data. Find those rows
# and remember their test index in `copy_index`, so that the submissions can
# be corrected wherever the prediction differs from the known target.
intersection = (
    test.assign(copy_index=test.index)
        .merge(train, on=list(test.columns), how="inner")
)
print("There are {0} rows which are included in both training and test sets.".format(len(intersection)))

In [None]:
def update_target(data, intersection, index_offset=200000):
    """Overwrite predictions in place with the targets listed in ``intersection``.

    Args:
        data: mutable sequence of predicted labels, addressed by position.
        intersection: frame with a ``copy_index`` column (row identifier) and
            a ``target`` column (label to enforce).
        index_offset: value subtracted from ``copy_index`` to obtain the
            position inside ``data``. Defaults to 200000, the offset this
            notebook has always used for the test index.

    Rows are processed in frame order; every time the current value at a
    position differs from the row's target it is replaced and counted. The
    number of replacements is printed.
    """
    row_ids = intersection['copy_index'].to_numpy()
    labels = intersection['target'].to_numpy()

    count = 0
    for row_id, label in zip(row_ids, labels):
        position = int(row_id) - index_offset
        if data[position] != label:
            count += 1
            data[position] = label
    print("Updated {0} rows.".format(count))

In [None]:
# Correct every prediction array in place, in the same order as before.
for predictions in (
    y_pred_tuned,
    y_pred_tuned_with_tune,
    y_pred_tuned_without_sample_weight,
    y_pred_tuned_without_sample_weight_with_tune,
    y_pred_tuned_with_duplicates,
    y_pred_tuned_with_duplicates_with_tune,
):
    update_target(predictions, intersection)

# Submission

In [None]:
def submission(name, y_pred_tuned):
    """Write a submission file.

    Overwrites the ``target`` column of the module-level ``sub`` frame with
    ``y_pred_tuned`` and saves that frame to ``name`` without the index.
    """
    sub["target"] = y_pred_tuned
    sub.to_csv(name, index=False)

# Without duplicates (using sample weight)
submission("submission.csv", y_pred_tuned)
submission("submission_with_tune.csv", y_pred_tuned_with_tune)

# Without duplicates and sample weight
submission("submission_without_sample_weight.csv", y_pred_tuned_without_sample_weight)
submission("submission_without_sample_weight_with_tune.csv", y_pred_tuned_without_sample_weight_with_tune)

# With duplicates
submission("submission_with_duplicates.csv", y_pred_tuned_with_duplicates)
submission("submission_with_duplicates_with_tune.csv", y_pred_tuned_with_duplicates_with_tune)

# Ensemble
### Credits to [Sy-Tuan Nguyen](https://www.kaggle.com/sytuannguyen/early-ensemble?scriptVersionId=87338628)

In [None]:
# Majority vote over the submission files found in the working directory.
preds = []
for dirname, _, filenames in os.walk('/kaggle/working'):
    for filename in filenames:
        # Skip the ensemble's own output so that re-running this cell does not
        # let a previous ensemble vote on the new one.
        if filename.endswith('.csv') and filename != 'ensemble.csv':
            df = pd.read_csv(os.path.join(dirname, filename))
            preds.append(df['target'])

if not preds:
    raise FileNotFoundError("No submission CSV files found under /kaggle/working")

# The labels are strings, which scipy.stats.mode does not support in current
# SciPy releases. Encode them as integer codes and count the votes per class.
votes = np.array(preds)                                   # (n_files, n_rows)
labels, codes = np.unique(votes, return_inverse=True)     # labels are sorted
codes = np.asarray(codes).reshape(votes.shape)
vote_counts = np.stack(
    [(codes == k).sum(axis=0) for k in range(len(labels))], axis=1
)                                                         # (n_rows, n_labels)
# argmax returns the first maximum, so ties go to the smallest label.
submission("ensemble.csv", labels[vote_counts.argmax(axis=1)])