# 1. Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

from scipy.stats import mode

# 2. Additional functions

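1. reduce_mem_usage(df): --> df
Reduce dataframe memory usage by downcasting column dtypes.
2. get_diff(bias): --> diff
Compute, for each class, the difference between its share (%) in the training data and its share (%) among the bias-adjusted test predictions. Uses the notebook-level averaged prediction probabilities and the target distribution table.
3. custom_bias(diff, bias): --> bias
Iteratively adjust the per-class bias values until no class share differs by more than 0.1 percentage points.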

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Conversion rules, applied column by column:

    * ``object`` and pandas string columns become ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      minimum and maximum. Bounds are inclusive, so 127 still fits ``int8``.
    * Floating-point columns wider than 32 bits are cast to ``float32`` when
      their value range fits. This loses precision. Narrower floats are
      never widened.
    * Every other dtype (bool, datetime, category, nullable extension types,
      ...) is left untouched instead of being coerced to float.

    Memory usage before and after the conversion is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Text-like columns: store as categories.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Bool,
        # datetime, category and extension dtypes would either fail on the
        # numeric comparisons below or be silently turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Comparisons are False for NaN (empty / all-NaN column), which
            # leaves the column unchanged.
            fits_float32 = float32_limits.min <= c_min and c_max <= float32_limits.max
            if col_type.itemsize > 4 and fits_float32:
                df[col] = df[col].astype(np.float32)
            continue

        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        # Guard against a zero-sized frame, where the ratio is undefined.
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def get_diff(bias, proba=None, train_share=None):
    """Return the per-class gap between training share and predicted share.

    Parameters
    ----------
    bias : array-like of shape (n_classes,)
        Additive offsets applied to the class probabilities before argmax.
    proba : array-like of shape (n_samples, n_classes), optional
        Class probabilities for the test rows. Defaults to the notebook-level
        ``y_proba_list`` (expected to already be the fold-averaged array).
    train_share : array-like of shape (n_classes,), optional
        Class shares in percent, ordered by class. Defaults to
        ``target_distrib['share_w_drop']`` sorted by index.

    Returns
    -------
    pandas.Series
        ``train_share - predicted_share`` in percentage points, indexed by
        encoded class ``0 .. n_classes - 1``.

    Raises
    ------
    ValueError
        If the predictions contain more classes than ``train_share``.
    """
    if proba is None:
        proba = y_proba_list
    if train_share is None:
        train_share = target_distrib['share_w_drop'].sort_index().values

    proba = np.asarray(proba)
    share_train = np.asarray(train_share, dtype=float)
    n_classes = share_train.size

    y_pred_tuned = np.argmax(proba + bias, axis=1)

    # bincount with minlength keeps a zero entry for classes that were never
    # predicted; value_counts() would drop them and misalign the subtraction.
    counts = np.bincount(y_pred_tuned, minlength=n_classes)
    if counts.size != n_classes:
        raise ValueError(
            'predictions cover {} classes but train_share has {}'.format(counts.size, n_classes)
        )

    share_pred = pd.Series(counts / len(proba) * 100, index=range(n_classes))
    diff = share_train - share_pred

    return diff

In [None]:
def custom_bias(diff, bias, step=0.0001, tol=0.1, max_iter=100_000):
    """Nudge per-class biases until every class-share gap is within ``tol``.

    On each iteration the first class whose gap exceeds ``tol`` in absolute
    value has its bias moved by ``step`` (up for a positive gap, down for a
    negative one), and the gaps are recomputed with ``get_diff(bias)``.

    Parameters
    ----------
    diff : array-like of shape (n_classes,)
        Current per-class gaps in percentage points, ordered by class.
    bias : numpy.ndarray of shape (n_classes,)
        Bias vector to adjust. It is modified in place.
    step : float, default 0.0001
        Amount added to or subtracted from one bias entry per iteration.
    tol : float, default 0.1
        Largest absolute gap that is considered acceptable.
    max_iter : int, default 100_000
        Upper bound on the number of adjustments, so the search cannot loop
        forever when the gaps oscillate instead of converging.

    Returns
    -------
    numpy.ndarray
        The same ``bias`` array, after adjustment.
    """
    import warnings

    for _ in range(max_iter):
        # Positional array access: does not depend on the index labels of diff.
        gaps = np.asarray(diff, dtype=float)
        offenders = np.flatnonzero(np.abs(gaps) > tol)
        if offenders.size == 0:
            return bias

        first = offenders[0]
        bias[first] += step if gaps[first] > 0 else -step
        diff = get_diff(bias)

    if (np.abs(np.asarray(diff, dtype=float)) > tol).any():
        warnings.warn(
            'custom_bias stopped after {} iterations without reaching the tolerance'.format(max_iter)
        )

    return bias

# 3. Data download and simple review 

In [None]:
# Read the competition files. train/test use their first column as the index;
# the sample submission keeps the default index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
    )
)
subs = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

In [None]:
# Downcast all three frames, in the order train -> test -> subs.
train, test, subs = (reduce_mem_usage(frame) for frame in (train, test, subs))

In [None]:
# Report (rows, columns) for both frames.
for label, frame in (('Train data size:', train), ('Test data size:', test)):
    print(label, frame.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Count how often each distinct full row (all columns of train) occurs.
# The counts are used below as per-row sample weights.
vc = train.value_counts()

In [None]:
# One row per distinct training row, plus how many times it occurred.
dup_train = pd.DataFrame(
    list(map(list, vc.index.values)),
    columns=train.columns,
).assign(sample_weight=vc.values)
sample_weight = dup_train['sample_weight']

# 4. Modelling 

In [None]:
# The target is the (first) column present in train but absent from test;
# every other train column is a feature.
train_only_cols = train.columns.difference(test.columns)
TARGET = train_only_cols[0]

FEATURES = train.columns[train.columns != TARGET]

In [None]:
# Features of the de-duplicated rows and their integer-encoded target.
le = LabelEncoder()
encoded_target = le.fit_transform(dup_train[TARGET])

X = dup_train[FEATURES]
y = pd.DataFrame({TARGET: encoded_target})

In [None]:
# Disabled experiment, kept as an inert string: it sweeps the number of
# StratifiedKFold splits from 2 to 14 and stores the per-fold accuracies of
# each setting in final_res.
'''
final_res = []


for i in range(2, 15):
    y_pred_list, y_proba_list, scores = [], [], []
    split = i # Need to check different values

    skf = StratifiedKFold(n_splits=split, random_state=42, shuffle=True)


    for fold, (train_id, test_id) in enumerate(tqdm(skf.split(X, y), total=split)):

        # Splitting
        X_train, y_train, sample_weight_train = X.iloc[train_id], y.iloc[train_id], sample_weight.iloc[train_id]
        X_test, y_test, sample_weight_test = X.iloc[test_id], y.iloc[test_id], sample_weight.iloc[test_id]

        # Model
        params = {'n_estimators': 500,}

        model = ExtraTreesClassifier( **params, n_jobs=-1, random_state=42 )

        # Training
        model.fit(X_train,  np.ravel(y_train), sample_weight_train)

        # Testing
        y_pred = model.predict(X_test)
        test_score = accuracy_score(y_test, y_pred, sample_weight=sample_weight_test)
        print(f'Accuracy score: {test_score:5f}\n')
        scores.append(test_score)

        # Prediction
        y_pred_list.append(model.predict(test))
        y_proba_list.append(model.predict_proba(test))
    
    final_res.append(scores)
'''

In [None]:
# Disabled report for the fold-count sweep, kept as an inert string.
# final_res[i] holds the scores for i + 2 folds because the sweep starts at
# 2 splits, so the label uses i+2 (it previously printed i+5).
'''
for i in range(len(final_res)):
    score = np.array(final_res[i]).mean()
    print(f'Folds: {i+2}, mean accuracy score: {score:6f}')
'''

Folds: 2, mean accuracy score: 0.943448 \
Folds: 3, mean accuracy score: 0.952939 \
Folds: 4, mean accuracy score: 0.954864\
Folds: 5, mean accuracy score: 0.956265\
Folds: 6, mean accuracy score: 0.957851 \
Folds: 7, mean accuracy score: 0.957731\
Folds: 8, mean accuracy score: 0.957987\
Folds: 9, mean accuracy score: 0.957468\
Folds: 10, mean accuracy score: 0.958801\
Folds: 11, mean accuracy score: 0.958243\
Folds: 12, mean accuracy score: 0.959242\
Folds: 13, mean accuracy score: 0.959600\
Folds: 14, mean accuracy score: 0.959287 

In [None]:
# Train one ExtraTrees model per stratified fold, score it on the held-out
# fold (weighted by row multiplicity) and collect its test-set predictions.
y_pred_list, y_proba_list, scores = [], [], []
split = 20

skf = StratifiedKFold(n_splits=split, random_state=42, shuffle=True)

# Hyper-parameters are the same for every fold, so they are defined once.
params = {'n_estimators': 500,}


for fold, (train_id, test_id) in enumerate(tqdm(skf.split(X, y), total=split)):

    # Splitting
    X_train, y_train, sample_weight_train = X.iloc[train_id], y.iloc[train_id], sample_weight.iloc[train_id]
    X_test, y_test, sample_weight_test = X.iloc[test_id], y.iloc[test_id], sample_weight.iloc[test_id]

    # Model
    model = ExtraTreesClassifier( **params, n_jobs=-1, random_state=42 )

    # Training
    model.fit(X_train, np.ravel(y_train), sample_weight=sample_weight_train)

    # Testing
    y_pred = model.predict(X_test)
    test_score = accuracy_score(y_test, y_pred, sample_weight=sample_weight_test)
    print(f'Accuracy score: {test_score:5f}\n')
    scores.append(test_score)

    # Prediction: score the test set once and derive the hard labels from the
    # probabilities. For a single-output forest this is what predict() does
    # internally, so the labels are unchanged but the test set is not
    # traversed twice per fold.
    test_proba = model.predict_proba(test)
    y_proba_list.append(test_proba)
    y_pred_list.append(model.classes_[np.argmax(test_proba, axis=1)])

# 5. Ensembling & Postprocessing

In [None]:
# Majority vote over the folds for every test row.
# Depending on the SciPy version, mode(...).mode has shape (1, n_rows) or
# (n_rows,). Flattening works for both, whereas indexing with [0] would pick
# a single scalar on versions that return the 1-D form.
fold_votes = mode(np.asarray(y_pred_list), axis=0).mode
inverse_pred_list = np.ravel(fold_votes)
inverse_pred_list = le.inverse_transform(inverse_pred_list)

In [None]:
# Class frequencies over the de-duplicated training rows: absolute count and
# share in percent. The column is addressed through TARGET rather than a
# hard-coded attribute name so it stays consistent with the rest of the
# notebook.
class_counts = dup_train[TARGET].value_counts()
target_distrib = pd.DataFrame({
    'count': class_counts,
    'share': class_counts / dup_train.shape[0] * 100
})
target_distrib.sort_index()

In [None]:
# Class counts and shares (%) over the de-duplicated rows, addressed through
# TARGET instead of a hard-coded attribute name.
# NOTE(review): these are computed from dup_train exactly like the 'count'
# and 'share' columns, so the values coincide - confirm whether 'count' was
# meant to come from the full train frame.
target_distrib['count_w_drop'] = dup_train[TARGET].value_counts()
target_distrib['share_w_drop'] = target_distrib['count_w_drop'] / dup_train.shape[0] * 100

target_distrib.sort_index()

In [None]:
# Add how often each class was predicted for the test rows, as an absolute
# count and as a percentage of the test set.
voted_counts = pd.Series(inverse_pred_list, index=test.index).value_counts()
target_distrib['pred_count'] = voted_counts
target_distrib['pred_share'] = target_distrib['pred_count'] / len(test) * 100
target_distrib.sort_index()

In [None]:
# Average the per-fold probability matrices into one (rows x classes) array.
# The isinstance guard makes the cell safe to re-run: once the name holds the
# averaged array, averaging again would sum over test rows instead of folds.
if isinstance(y_proba_list, list):
    y_proba_list = sum(y_proba_list) / len(y_proba_list)

# Start from a zero bias for every class and measure the initial share gaps.
bias = np.zeros(train[TARGET].nunique())

diff = get_diff(bias)

In [None]:
# Adjust the per-class biases until the predicted class shares are close to
# the training shares (see custom_bias for the stopping rule).
bias = custom_bias(diff, bias)

In [None]:
# Apply the tuned bias on a temporary array instead of mutating y_proba_list
# in place, so re-running this cell does not add the bias a second time.
y_pred_tuned = le.inverse_transform(np.argmax(y_proba_list + bias, axis=1))

# 6. Data upload

In [None]:
# Replace the sample-submission target column with the bias-tuned predictions.
# NOTE(review): assumes the rows of subs are in the same order as test - confirm.
subs[TARGET] = y_pred_tuned
# Write the submission file without the DataFrame index.
subs.to_csv('./submission.csv', index=False)
# Show the final submission frame as the cell output.
subs