# Tweaking Class Probabilities

In this final notebook, we attempt to fix a shortcoming of the ExtraTrees-based models by manually tweaking the class probabilities predicted by our models. Our goal is to make our predicted label distribution more similar to the training label distribution. This is based on this [Kaggle notebook](https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants).

In [1]:
# Notebook-wide settings; change them here to try variations quickly
RANDOM_SEED = 0
NUM_FOLDS = 10
EXT_PARAMS = {
    'n_estimators': 300,
    'random_state': RANDOM_SEED,
    'n_jobs': -1,
}

In [2]:
# Generic
import numpy as np
import pandas as pd
import pyarrow
import time
import gc

# Optimized scikit-learn
from sklearnex import patch_sklearn
patch_sklearn()

# Sklearn
from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import accuracy_score

# Other stuff
from math import factorial
from random import choices, setstate
from collections import Counter
from itertools import product
import matplotlib.pyplot as plt
from IPython.display import Image
import seaborn as sns

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Helper Functions

In [3]:
# Helper functions from https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense/
from math import factorial
from random import choices, setstate
from collections import Counter
from itertools import product

def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10) for the four given counts."""
    numerator = factorial(10)
    denominator = 4**10
    for count in (w, x, y, z):
        denominator *= factorial(count)
    return numerator / denominator

def bias_of(column):
    """Parse the four counts out of a column name and return their ``bias``.

    The name is split at the first 'T', 'G' and 'C'; the text between the
    first character (which is skipped, presumably an 'A') and 'T', between
    'T' and 'G', between 'G' and 'C', and after 'C' is read as four integers.
    """
    t_at = column.index('T')
    w = int(column[1:t_at])
    g_at = column.index('G')
    x = int(column[t_at + 1:g_at])
    c_at = column.index('C')
    y = int(column[g_at + 1:c_at])
    z = int(column[c_at + 1:])
    return bias(w, x, y, z)

def get_histograms(input_df):
    """Convert each feature column of ``input_df`` into integer counts.

    For every name in the global ``features`` list the column has
    ``bias_of(name)`` added, is multiplied by 1,000,000, rounded and cast to
    ``int``. The result is a new DataFrame with one column per feature.
    """
    counts = {}
    for col in features:
        shifted = input_df[col] + bias_of(col)
        counts[col] = (shifted * 1000000).round().astype(int)
    return pd.DataFrame(counts)

def gcd_of_all(df_i):
    """Return the row-wise greatest common divisor across all feature columns.

    Columns are taken in the order of the global ``features`` list and folded
    together with ``np.gcd``.
    """
    running = df_i[features[0]]
    for position in range(1, len(features)):
        running = np.gcd(running, df_i[features[position]])
    return running

def get_target_bins():
    """Build one string per training row: its histogram gcd followed by its target.

    The gcd is recomputed from the global ``train`` frame's feature columns
    via ``get_histograms`` and ``gcd_of_all``; both parts are cast to ``str``
    and concatenated without a separator.
    """
    row_gcd = gcd_of_all(get_histograms(train[features]))
    labelled = train[['target']].copy()
    labelled.insert(0, 'row_id', row_gcd)
    return labelled['row_id'].astype(str) + labelled['target'].astype(str)

# Training Function

In [4]:
# Scoring/Training Baseline Function
def score_model(sklearn_model):
    """Cross-validate ``sklearn_model`` and average its test-set probabilities.

    For every fold produced by the global splitter ``SKF`` (stratified on
    ``target_bins``) a fresh clone of the estimator is fitted on the feature
    columns plus ``'gcd'`` using the rows' ``sample_weight``. The clone then
    predicts the held-out rows and the whole test set. Finally the
    sample-weighted out-of-fold accuracy is printed.

    Relies on the notebook globals ``train``, ``test``, ``features``, ``SKF``
    and ``target_bins``.

    Parameters
    ----------
    sklearn_model : estimator
        Unfitted estimator exposing ``fit(X, y, sample_weight=...)`` and
        ``predict_proba(X)``. It is cloned, never fitted in place.

    Returns
    -------
    oof_preds : numpy.ndarray of shape (len(train),)
        Out-of-fold prediction per training row, stored as the argmax column
        index of ``predict_proba`` (float dtype).
    test_proba : numpy.ndarray of shape (len(test), n_classes)
        Mean over folds of ``predict_proba`` on the test set.
    """
    columns = features + ['gcd']
    # Average with the splitter's own fold count so the mean stays correct
    # even if SKF is rebuilt with a different number of splits.
    n_folds = SKF.get_n_splits()

    # Store the holdout predictions
    oof_preds = np.zeros((len(train),))
    test_proba = np.zeros((len(test), len(train['target'].unique())))
    print('')

    # These selections do not depend on the fold, so build them once
    X_all = train[columns]
    X_test = test[columns]

    # Stratified k-fold cross-validation
    for train_idx, valid_idx in SKF.split(train, target_bins):

        # Train/Validation Sets
        X_train, y_train = X_all.iloc[train_idx], train['target'].iloc[train_idx]
        X_valid = X_all.iloc[valid_idx]
        train_weights = train['sample_weight'].iloc[train_idx]

        # Train Model
        model = clone(sklearn_model)
        model.fit(X_train, y_train, sample_weight = train_weights)
        gc.collect()

        # Save Predictions
        oof_preds[valid_idx] = np.argmax(model.predict_proba(X_valid), axis = 1)
        test_proba += model.predict_proba(X_test) / n_folds

    print("\nAverage Accuracy:", round(accuracy_score(train['target'], oof_preds, sample_weight = train['sample_weight']), 5))
    return oof_preds, test_proba

# Load Data

In [5]:
%%time
# Training data; every column that is not an id, label, weight or gcd is a feature
train = pd.read_feather('../data/train.feather')
features = [col for col in train.columns if col not in ('row_id', 'target', 'sample_weight', 'gcd')]

# Test data and the submission template
test = pd.read_feather('../data/test.feather')
submission = pd.read_csv('../data/sample_submission.csv')

# Cross-validation splitter shared by every model in this notebook
SKF = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)

# Share of each label in the training data, in percent. Taken before the
# labels are integer-encoded so the index keeps the original label names.
label_dist = train['target'].value_counts().sort_index().div(len(train)).mul(100)

# Swap the string labels for integer codes
encoder = LabelEncoder()
train['target'] = encoder.fit_transform(train['target'])

# Stratification key: encoded label and gcd joined as text
target_bins = train['target'].astype(str) + train['gcd'].astype(str)

print(f'Training Samples: {len(train)}')
pd.Series(encoder.classes_)

Training Samples: 123993
CPU times: total: 1.67 s
Wall time: 421 ms


0        Bacteroides_fragilis
1        Campylobacter_jejuni
2          Enterococcus_hirae
3            Escherichia_coli
4      Escherichia_fergusonii
5       Klebsiella_pneumoniae
6         Salmonella_enterica
7       Staphylococcus_aureus
8    Streptococcus_pneumoniae
9      Streptococcus_pyogenes
dtype: object

# Original Labels

In [6]:
# Percentage of training rows per label, computed before the labels were integer-encoded
label_dist

Bacteroides_fragilis        10.098957
Campylobacter_jejuni        10.056213
Enterococcus_hirae           9.978789
Escherichia_coli             9.917495
Escherichia_fergusonii       9.907817
Klebsiella_pneumoniae       10.016694
Salmonella_enterica          9.992500
Staphylococcus_aureus       10.012662
Streptococcus_pneumoniae    10.013469
Streptococcus_pyogenes      10.005404
Name: target, dtype: float64

# ExtraTrees

In [7]:
# Baseline: cross-validated ExtraTrees with no probability adjustment
oof_preds, test_proba = score_model(ExtraTreesClassifier(**EXT_PARAMS))

# Percentage of test rows assigned to each class by the fold-averaged probabilities
y_preds = encoder.inverse_transform(test_proba.argmax(axis = 1))
pd.Series(y_preds, index = test.index).value_counts().sort_index().div(len(test)).mul(100)



Average Accuracy: 0.9563


Bacteroides_fragilis        10.035
Campylobacter_jejuni        10.236
Enterococcus_hirae           9.676
Escherichia_coli             8.369
Escherichia_fergusonii      11.077
Klebsiella_pneumoniae       10.181
Salmonella_enterica         10.284
Staphylococcus_aureus        9.951
Streptococcus_pneumoniae    10.078
Streptococcus_pyogenes      10.113
dtype: float64

In [8]:
# Add a constant to the scores of two classes before taking the argmax:
# +0.01 for index 2 (Enterococcus_hirae) and +0.03 for index 3
# (Escherichia_coli), the classes the baseline assigns a smaller share of
# test rows than their share of the training labels.
new_proba = test_proba + np.array([0.0, 0.0, 0.01, 0.03, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
y_preds = encoder.inverse_transform(new_proba.argmax(axis = 1))

# Save the adjusted predictions as a submission file
submission['target'] = y_preds
submission.to_csv('../submissions/extratrees_tweaked.csv', index = False)

# Percentage of test rows per class after the adjustment
pd.Series(y_preds, index = test.index).value_counts().sort_index().div(len(test)).mul(100)

Bacteroides_fragilis        10.007
Campylobacter_jejuni        10.227
Enterococcus_hirae           9.762
Escherichia_coli             9.793
Escherichia_fergusonii      10.034
Klebsiella_pneumoniae       10.094
Salmonella_enterica         10.029
Staphylococcus_aureus        9.924
Streptococcus_pneumoniae    10.060
Streptococcus_pyogenes      10.070
dtype: float64

# ExtraTrees w/ Bagging

In [9]:
# Baseline: cross-validated bagging of ExtraTrees with no probability adjustment
oof_preds, test_proba1 = score_model(
    BaggingClassifier(ExtraTreesClassifier(**EXT_PARAMS), random_state = RANDOM_SEED)
)

# Percentage of test rows assigned to each class by the fold-averaged probabilities
y_preds = encoder.inverse_transform(test_proba1.argmax(axis = 1))
pd.Series(y_preds, index = test.index).value_counts().sort_index().div(len(test)).mul(100)



Average Accuracy: 0.95896


Bacteroides_fragilis        10.033
Campylobacter_jejuni        10.228
Enterococcus_hirae           9.680
Escherichia_coli             8.508
Escherichia_fergusonii      10.936
Klebsiella_pneumoniae       10.258
Salmonella_enterica         10.208
Staphylococcus_aureus        9.982
Streptococcus_pneumoniae    10.063
Streptococcus_pyogenes      10.104
dtype: float64

In [10]:
# Add a constant to the scores of two classes before taking the argmax:
# +0.01 for index 2 (Enterococcus_hirae) and +0.022 for index 3
# (Escherichia_coli), the classes the bagged baseline assigns a smaller share
# of test rows than their share of the training labels.
new_proba1 = test_proba1 + np.array([0.0, 0.0, 0.01, 0.022, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
y_preds = encoder.inverse_transform(new_proba1.argmax(axis = 1))

# Save the adjusted predictions as a submission file
submission['target'] = y_preds
submission.to_csv('../submissions/bagging_tweaked.csv', index = False)

# Percentage of test rows per class after the adjustment
pd.Series(y_preds, index = test.index).value_counts().sort_index().div(len(test)).mul(100)

Bacteroides_fragilis        10.016
Campylobacter_jejuni        10.213
Enterococcus_hirae           9.797
Escherichia_coli             9.651
Escherichia_fergusonii      10.096
Klebsiella_pneumoniae       10.146
Salmonella_enterica         10.036
Staphylococcus_aureus        9.941
Streptococcus_pneumoniae    10.044
Streptococcus_pyogenes      10.060
dtype: float64

# Separate High/Low Resolution

In [11]:
class FourResolutions(BaseEstimator):
    """Train one copy of ``base_estimator`` per read resolution and route rows to it.

    Each row is assigned to a resolution by the greatest common divisor (gcd)
    of its integer histogram (see ``get_histograms``). A gcd of 1, 10, 1000
    or 10000 selects ``model1``, ``model2``, ``model3`` or ``model4``
    respectively. Rows whose gcd is none of those values are never passed to
    a model: ``predict_proba`` leaves their probabilities at zero and
    ``predict`` returns 0 for them.

    ``predict_proba`` assumes every fitted sub-model outputs ``num_labels``
    columns, i.e. that each resolution saw every label during ``fit``.

    The class relies on the notebook-level ``features`` list and ``bias_of``
    helper being defined.

    Parameters
    ----------
    base_estimator : estimator
        Classifier that is cloned four times at the start of ``fit``.
    """

    def __init__(self, base_estimator = ExtraTreesClassifier(**EXT_PARAMS)):
        self.base_estimator = base_estimator

    def clone_models(self):
        """Create four unfitted clones of ``base_estimator``, one per resolution."""
        self.model1 = clone(self.base_estimator) # Model for 1,000,000 BOC Reads
        self.model2 = clone(self.base_estimator) # Model for 100,000 BOC Reads
        self.model3 = clone(self.base_estimator) # Model for 1,000 BOC Reads
        self.model4 = clone(self.base_estimator) # Model for 100 BOC Reads

    def gcd_of_all(self, df_i):
        """Compute the row-wise gcd over all columns of ``df_i`` and store the masks.

        Sets ``gcd1`` .. ``gcd4`` to boolean masks marking the rows whose gcd
        equals 1, 10, 1000 and 10000 respectively.
        """
        columns = list(df_i.columns)
        gcd = df_i[columns[0]]
        for col in columns[1:]:
            gcd = np.gcd(gcd, df_i[col])
        self.gcd1 = (gcd == 1)
        self.gcd2 = (gcd == 10)
        self.gcd3 = (gcd == 1000)
        self.gcd4 = (gcd == 10000)

    def get_histograms(self, input_df):
        """Return the feature columns of ``input_df`` as integer counts.

        Each column named in the global ``features`` list has ``bias_of(col)``
        added, is multiplied by 1,000,000, rounded and cast to ``int``.
        """
        return pd.DataFrame({
            col: ((input_df[col] + bias_of(col)) * 1000000).round().astype(int) for col in features
        })

    def _resolution_groups(self):
        """Pair each current row mask with the model responsible for it."""
        return (
            (self.gcd1, self.model1),
            (self.gcd2, self.model2),
            (self.gcd3, self.model3),
            (self.gcd4, self.model4),
        )

    def fit(self, X, y, sample_weight = None):
        """Fit the four sub-models, each on the rows of its own resolution.

        Parameters
        ----------
        X : DataFrame containing at least the ``features`` columns.
        y : labels, indexable by a boolean mask aligned with ``X``.
        sample_weight : optional per-row weights, indexable like ``y``.

        Returns
        -------
        self : the fitted estimator, as the scikit-learn estimator API expects.
        """
        self.clone_models()
        self.gcd_of_all(self.get_histograms(X))
        self.num_labels = len(np.unique(y))
        for mask, model in self._resolution_groups():
            if sample_weight is None:
                model.fit(X[mask], y[mask])
            else:
                model.fit(X[mask], y[mask], sample_weight[mask])
        return self

    def predict_proba(self, X):
        """Return an array of shape (len(X), num_labels) of class probabilities.

        Each row is filled by the sub-model matching its resolution. A
        resolution with no rows in ``X`` is skipped rather than handing the
        sub-model an empty frame, which scikit-learn estimators reject.
        """
        self.gcd_of_all(self.get_histograms(X))
        proba = np.zeros((len(X), self.num_labels))
        for mask, model in self._resolution_groups():
            if mask.any():
                proba[np.asarray(mask)] = model.predict_proba(X[mask])
        return proba

    def predict(self, X):
        """Return the predicted label of every row of ``X`` as an integer array.

        Resolutions with no rows in ``X`` are skipped, for the same reason as
        in ``predict_proba``.
        """
        self.gcd_of_all(self.get_histograms(X))
        labels = np.zeros((len(X),))
        for mask, model in self._resolution_groups():
            if mask.any():
                labels[np.asarray(mask)] = model.predict(X[mask])
        return labels.astype(int)

In [12]:
# Baseline: cross-validated FourResolutions model with no probability adjustment
oof_preds, test_proba2 = score_model(FourResolutions())

# Percentage of test rows assigned to each class by the fold-averaged probabilities
y_preds = encoder.inverse_transform(test_proba2.argmax(axis = 1))
pd.Series(y_preds, index = test.index).value_counts().sort_index().div(len(test)).mul(100)



Average Accuracy: 0.95578


Bacteroides_fragilis        10.052
Campylobacter_jejuni        10.238
Enterococcus_hirae           9.664
Escherichia_coli             8.507
Escherichia_fergusonii      10.929
Klebsiella_pneumoniae       10.214
Salmonella_enterica         10.257
Staphylococcus_aureus        9.949
Streptococcus_pneumoniae    10.091
Streptococcus_pyogenes      10.099
dtype: float64

In [13]:
# Add a constant to the scores of two classes before taking the argmax:
# +0.015 for index 2 (Enterococcus_hirae) and +0.035 for index 3
# (Escherichia_coli), the classes this baseline assigns a smaller share of
# test rows than their share of the training labels.
new_proba2 = test_proba2 + np.array([0.0, 0.0, 0.015, 0.035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
y_preds = encoder.inverse_transform(new_proba2.argmax(axis = 1))

# Save the adjusted predictions as a submission file
submission['target'] = y_preds
submission.to_csv('../submissions/four_models_tweaked.csv', index = False)

# Percentage of test rows per class after the adjustment
pd.Series(y_preds, index = test.index).value_counts().sort_index().div(len(test)).mul(100)

Bacteroides_fragilis        10.014
Campylobacter_jejuni        10.204
Enterococcus_hirae           9.814
Escherichia_coli             9.874
Escherichia_fergusonii       9.993
Klebsiella_pneumoniae       10.092
Salmonella_enterica         10.001
Staphylococcus_aureus        9.921
Streptococcus_pneumoniae    10.046
Streptococcus_pyogenes      10.041
dtype: float64