## 1. Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold


## 2. Load The Data

In [None]:
# Competition files. For train and test the first CSV column becomes the index.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col=0,
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col=0,
)
# The sample submission keeps the default integer index.
subs = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/sample_submission.csv'
)


## 2. Exploring the Data

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
train.describe(include = 'all')


In [None]:
# Concise structural summary of the training frame (dtypes, non-null counts).
train.info()

#### Explore Test Data

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# (rows, columns) of each frame.
print('Train data size:', train.shape)
print('Test data size:', test.shape)


### Searching for missing values:


In [None]:
# Total number of missing cells in each frame.
# `isna().sum()` gives per-column NaN counts and the second `.sum()` totals them;
# `isna().any().sum()` would count the columns that contain a NaN, not the values.
print('Missing Values in Train data: ', train.isna().sum().sum())
print('Missing Values in Test data: ',test.isna().sum().sum())


### Searching for Duplicates:

In [None]:
# Number of rows that repeat an earlier row in every column.
print('Duplicate in Train data: ', train.duplicated().sum())
print('Duplicate in Test data: ', test.duplicated().sum())


## 3. Handling Duplicates:

We shouldn't simply drop the duplicates, because this would change the probability distribution.
- If one specific result has been observed multiple times, it ought to carry a higher weight than a result which has been observed just a single time. To handle this, we can use the optional `sample_weight` parameter of scikit-learn estimators.

In [None]:
# Collapse identical rows into one row each and remember how often each occurred.
# `DataFrame.value_counts()` returns a Series indexed by the unique rows
# (most frequent first) whose values are the occurrence counts.
vc = train.value_counts()

# `reset_index(name=...)` turns the row-valued index back into ordinary columns
# (named after the train columns) and stores the counts in `sample_weight`,
# without building a Python list per unique row.
dup_train = vc.reset_index(name='sample_weight')
sample_weight = dup_train['sample_weight']

dup_train

In [None]:
# The label column is the one that exists in train but not in test.
TARGET = train.columns.difference(test.columns)[0]

# Every other train column is a model input.
FEATURES = train.columns.drop(TARGET)


In [None]:
# Integer-encode the class labels (the target, not the features).
# `le` is kept so that predictions can be decoded back to the original labels later.
# LabelEncoder is already imported in the import cell at the top of the notebook.
le = LabelEncoder()

# Feature matrix and encoded target; both share the row order of `dup_train`
# and therefore line up with `sample_weight`.
X = dup_train[FEATURES]
y = pd.DataFrame(le.fit_transform(dup_train[TARGET]), columns=[TARGET])


In [None]:
from tqdm import tqdm

# Per-fold outputs: hard predictions and class probabilities on the test set,
# plus the sample-weighted validation accuracy of each fold.
# ExtraTreesClassifier and accuracy_score come from the import cell at the top.
y_pred_list, y_proba_list, scores = [], [], []
split = 15

skf = StratifiedKFold(n_splits=split, random_state=42, shuffle=True)

# Hyper-parameters are the same for every fold, so they are defined once.
params = {'n_estimators': 500,}

for fold, (train_id, test_id) in enumerate(tqdm(skf.split(X, y), total=split)):

    # Splitting (the weights follow the same row selection as X and y)
    X_train, y_train, sample_weight_train = X.iloc[train_id], y.iloc[train_id], sample_weight.iloc[train_id]
    X_test, y_test, sample_weight_test = X.iloc[test_id], y.iloc[test_id], sample_weight.iloc[test_id]

    # Model
    model = ExtraTreesClassifier( **params, n_jobs=-1, random_state=42 )

    # Training: each unique row is weighted by its number of occurrences in `train`
    model.fit(X_train, np.ravel(y_train), sample_weight=sample_weight_train)

    # Testing on the held-out fold, again weighting rows by their multiplicity
    y_pred = model.predict(X_test)
    test_score = accuracy_score(y_test, y_pred, sample_weight=sample_weight_test)
    # `.5f` = five decimals (a bare `5f` would only set a minimum field width)
    print(f'Accuracy score: {test_score:.5f}\n')
    scores.append(test_score)

    # Prediction: run the forest over the test set once. A forest's hard prediction
    # is the class with the highest mean probability, so it is derived from the
    # probabilities instead of calling `predict` for a second full pass.
    test_proba = model.predict_proba(test)
    y_pred_list.append(model.classes_.take(np.argmax(test_proba, axis=1), axis=0))
    y_proba_list.append(test_proba)
    


In [None]:
# Unweighted mean of the per-fold accuracies.
score = np.mean(scores)
# `.6f` requests six decimals explicitly (a bare `6f` is a width, not a precision).
print(f'Mean accuracy score: {score:.6f}')


In [None]:
# Per-fold encoded predictions on the test set (one array per fold).
y_pred_list


## 4. Ensembling

In [None]:
from scipy.stats import mode

# Using Majority vote: for every test row take the label predicted by most folds.
# The per-fold predictions are stacked into a (n_folds, n_test_rows) array and the
# vote is taken along axis 0. Depending on the SciPy version, `.mode` comes back
# either with a leading length-1 axis or already 1-D; `np.ravel` yields the 1-D
# vote vector in both cases, whereas `.mode[0]` picks a single scalar when the
# result is already 1-D.
inverse_pred_list = np.ravel(mode(np.asarray(y_pred_list), axis=0).mode)
# Decode the integer labels back to the original class names.
inverse_pred_list = le.inverse_transform(inverse_pred_list)


In [None]:
# Majority-vote predictions decoded back to the original class labels.
inverse_pred_list

### Let's check the distribution of classes in training and our predictions.

In [None]:
# Class distribution in the raw training data, duplicates included.
# `train` is used on purpose: `dup_train` holds unique rows only, and its
# distribution is what the `*_w_drop` columns describe.
target_distrib = pd.DataFrame({
    'count': train[TARGET].value_counts(),
    'share': train[TARGET].value_counts() / train.shape[0] * 100
})
target_distrib.sort_index()



In [None]:
# Class distribution on the de-duplicated frame (one row per unique observation).
# The column is addressed through TARGET, as everywhere else in the notebook,
# rather than through a hard-coded attribute name.
target_distrib['count_w_drop'] = dup_train[TARGET].value_counts()
target_distrib['share_w_drop'] = target_distrib['count_w_drop'] / dup_train.shape[0] * 100

target_distrib.sort_index()


In [None]:
# How often each class appears among the majority-vote predictions, and the
# corresponding percentage of the test set.
target_distrib['pred_count'] = (
    pd.Series(inverse_pred_list, index=test.index).value_counts()
)
target_distrib['pred_share'] = target_distrib['pred_count'].div(len(test)).mul(100)
target_distrib.sort_index()


#### As we can see, the distribution of classes in the training data differs from the distribution in our predictions.

## 5. Postprocessing

- The probabilities in [y_proba_list] are very important because our classifier predicts both classes and probabilities. We will tune these probabilities manually by adding a small bias.
- You can try submitting without tuning and compare the accuracy.

In [None]:
def get_diff(bias):
    """Return the per-class gap between training and predicted class shares.

    Reads the module-level `y_proba_list` and `target_distrib`.
    Assumes `y_proba_list` is a 2-D (rows x classes) probability matrix at call
    time, i.e. the per-fold matrices have already been averaged.

    Parameters
    ----------
    bias : array-like with one entry per class
        Offset added to every row of the probability matrix before the arg-max.

    Returns
    -------
    pandas.Series
        `share_w_drop` (train, in percent) minus the predicted share (in percent),
        indexed by encoded class id 0..n_classes-1.
    """
    y_pred_tuned = np.argmax(y_proba_list + bias, axis=1)
    share_train = target_distrib['share_w_drop'].sort_index().values

    # value_counts() omits classes that were never predicted. Reindex over every
    # class id so the result always has one entry per class and the subtraction
    # below stays aligned with `share_train`.
    pred_counts = (
        pd.Series(y_pred_tuned)
        .value_counts()
        .reindex(range(len(share_train)), fill_value=0)
    )
    share_pred = pred_counts / len(y_pred_tuned) * 100
    diff = share_train - share_pred

    return diff

def custom_bias(diff, bias, max_iter=100_000):
    """Tune a per-class bias until every class-share gap is within 0.1.

    On each pass the first class whose gap exceeds +/-0.1 has its bias moved by
    0.001 in the direction that reduces the gap, then the gaps are recomputed
    with `get_diff`.

    Parameters
    ----------
    diff : array-like
        Initial per-class gaps, as returned by `get_diff`.
    bias : array-like
        Starting bias, one entry per class. It is copied, not modified in place.
    max_iter : int, optional
        Upper bound on the number of adjustment steps. It stops the search from
        looping forever if the gaps oscillate instead of converging.

    Returns
    -------
    numpy.ndarray
        The tuned bias vector.
    """
    import warnings

    # Work on a float copy so the caller's array is left untouched.
    bias = np.array(bias, dtype=float)

    for _ in range(max_iter):
        if not abs(diff).max() > 0.1:
            break

        # Positional view, so indexing does not depend on the index labels of `diff`.
        gaps = np.asarray(diff)
        for i in range(len(gaps)):
            if gaps[i] > 0.1:
                bias[i] += 0.001
                break
            if gaps[i] < -0.1:
                bias[i] -= 0.001
                break

        diff = get_diff(bias)
    else:
        warnings.warn(
            f'custom_bias stopped after {max_iter} iterations without reaching the tolerance'
        )

    return bias


In [None]:
# Average the per-fold probability matrices into a single matrix.
# The guard keeps the cell idempotent: once `y_proba_list` has been replaced by
# the averaged array, running the cell again must not average it a second time.
if isinstance(y_proba_list, list):
    y_proba_list = sum(y_proba_list) / len(y_proba_list)

# Start from a zero bias, one entry per class.
bias = np.zeros(train[TARGET].nunique())

diff = get_diff(bias)
print(f'\033[1;31;43m Difference: \033[0;0m \n{diff}')


In [None]:
# Adjust the per-class bias until every class-share gap reported by get_diff is within 0.1.
bias = custom_bias(diff, bias)
print(f'\033[1;31;43m Bias to add: \033[0;0m \n{bias}')


In [None]:
# Apply the tuned bias and decode the arg-max class of every test row.
# The bias is added to a temporary instead of in place, so re-running this cell
# does not stack the bias on top of itself.
y_pred_tuned = le.inverse_transform(np.argmax(y_proba_list + bias, axis=1))


## 6. Submission

In [None]:
# Store the tuned predictions in the sample-submission frame and save it.
# NOTE(review): presumably the rows of `subs` are in the same order as `test` — confirm.
subs[TARGET] = y_pred_tuned
subs.to_csv('./submission.csv', index=False)
subs
