In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from tqdm import tqdm
from math import factorial
import gc
import joblib

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from scipy.stats import mode

import math

from datetime import datetime

from warnings import simplefilter
# Suppress all warnings for the rest of the notebook so that library notices
# do not clutter the cell output.
simplefilter("ignore")

# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old aliases, so use whichever name this
# installation actually ships (new name first, legacy name as fallback).
_WHITEGRID_STYLE = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    None,
)
if _WHITEGRID_STYLE is not None:
    plt.style.use(_WHITEGRID_STYLE)

# Figure-level defaults: wide figures with bold titles.
plt.rc(
    "figure",
    autolayout=True,
    figsize=(24, 6),
    titlesize=18,
    titleweight='bold',
)
# Axes-level defaults: bold labels and titles.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=1,
)
# Keyword arguments kept for line plots.
plot_params = dict(
    color=".75",
    style=".-",
    markeredgecolor=".25",
    markerfacecolor=".25",
)

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from scipy.stats import norm, skew

# Display options: never hide columns, but cap displayed frames at 5 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)

# The idea
* Split the dataset by the GCD of each row's feature counts and train a separate model on each subset

# Read the data
* Read the train and test sets
* Label-encode the target (bacteria name -> integer)
* Calculate the GCD of each row's integer feature counts

In [None]:
# Competition data; `row_id` becomes the index of both frames.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col='row_id',
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col='row_id',
)

# Distinct class names, in order of first appearance in the training set.
bacteria = list(train['target'].unique())
# Feature columns: everything that is neither the row id nor the label.
elements = [col for col in train.columns if col not in ('row_id', 'target')]

# Convert the 10 bacteria names to the integers 0 .. 9
target_encoder = LabelEncoder()
train['target'] = target_encoder.fit_transform(train['target'])

def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10).

    When ``w + x + y + z == 10`` this is the multinomial probability of
    observing the counts ``(w, x, y, z)`` in 10 draws from four equally
    likely outcomes.
    """
    denominator = 4**10
    for count in (w, x, y, z):
        denominator *= factorial(count)
    # Single true division of two exact integers, as in the closed formula.
    return factorial(10) / denominator

def bias_of(s):
    """Return the bias term for a column name such as ``'A3T3G2C2'``.

    The name is read as ``A<w>T<x>G<y>C<z>``: the four integers are taken
    from between the letter markers and plugged into
    ``10! / (w! * x! * y! * z! * 4**10)``.
    """
    # Positions of the three separators after the leading character.
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    counts = (
        int(s[1:t_pos]),
        int(s[t_pos + 1:g_pos]),
        int(s[g_pos + 1:c_pos]),
        int(s[c_pos + 1:]),
    )
    denominator = 4**10
    for n in counts:
        denominator *= factorial(n)
    return factorial(10) / denominator

def n_samples(df):
    """Turn the bias-shifted feature values into integers.

    For every column listed in the module-level ``elements`` the column's
    bias (derived from its name by ``bias_of``) is added back, the result is
    scaled by 1,000,000, rounded and cast to ``int``. Returns a new
    DataFrame holding only those columns.
    """
    counts = {}
    for col in elements:
        restored = df[col] + bias_of(col)
        counts[col] = (restored * 1000000).round().astype(int)
    return pd.DataFrame(counts)

def gcd_of_all(df_i):
    """Return the row-wise greatest common divisor over all ``elements`` columns.

    ``df_i`` is expected to hold integer values (see ``n_samples``); the
    result is one GCD per row, folded column by column with ``np.gcd``.
    """
    columns = iter(elements)
    running = df_i[next(columns)]  # start from the first feature column
    for name in columns:
        running = np.gcd(running, df_i[name])
    return running

# Integer-valued versions of the train and test feature tables
train_ns = n_samples(train[elements])
test_ns = n_samples(test[elements])

# Row-wise GCD of those integers, stored next to the integer features ...
train_ns['gcd'] = gcd_of_all(train_ns)
test_ns['gcd'] = gcd_of_all(test_ns)

# ... and copied onto the original frames (aligned on the row_id index)
train['gcd'] = train_ns['gcd']
test['gcd'] = test_ns['gcd']

# The integer tables are only needed for the GCD; release them
del train_ns, test_ns
gc.collect()

In [None]:
# Preview the training frame, now carrying the encoded target and the gcd column.
train

In [None]:
# Preview the test frame, now carrying the gcd column.
test

In [None]:
# One train/test subset per GCD value, with the helper column removed again.
TRAIN = {}
TEST = {}
for gcd in [1, 10, 1000, 10000]:
    TRAIN[gcd] = train.loc[train['gcd'] == gcd].drop(columns=['gcd'])
    TEST[gcd] = test.loc[test['gcd'] == gcd].drop(columns=['gcd'])
    print(f'{gcd}: train {TRAIN[gcd].shape}, test {TEST[gcd].shape}')

In [None]:
# Label frequencies per GCD group: {gcd: {encoded class: number of rows}}
tc = {gcd: dict(TRAIN[gcd].target.value_counts()) for gcd in [1, 10, 1000, 10000]}

# One record per (class, gcd) pair with the absolute count and the share
# of that class inside its GCD group.
t_counts = []
for bn in range(10):
    bacterium = target_encoder.inverse_transform([bn])[0]

    for gcd in [1, 10, 1000, 10000]:
        count = tc[gcd][bn]
        t_counts.append({
            'bacterium': bacterium,
            'b_num': bn,
            'gcd': gcd,
            'count': count,
            'delta': count / TRAIN[gcd].shape[0],
            'df': 'train',
        })
target_counts = pd.DataFrame(t_counts)
target_counts

In [None]:
# Class shares of the training labels, one bar per GCD group
sns.barplot(
    data=target_counts,
    x='bacterium',
    y='delta',
    hue='gcd',
)

In [None]:
# Fixed seed so that fold assignment and the randomised trees are the same on
# every Restart & Run All.
SEED = 42

# Per-GCD results:
#   SCORES     - mean validation accuracy over the folds
#   Y_PREDS    - list of per-fold hard predictions on the test subset
#   FOLD_PROBS - list of per-fold class probabilities on the test subset
#   Y_PRED     - majority vote over the folds, decoded back to bacteria names
SCORES, Y_PREDS, FOLD_PROBS, Y_PRED = {}, {}, {}, {}
for gcd in [1, 10, 1000, 10000]:

    X = TRAIN[gcd].drop("target", axis=1).astype(np.float32)
    y = TRAIN[gcd]['target']
    # Same dtype as the training features; converted once instead of per fold
    X_test = TEST[gcd].astype(np.float32)

    FOLD_PROBS[gcd] = []
    Y_PREDS[gcd] = []
    scores = []

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    # The two larger GCD groups get three times as many trees
    estimators = 1000 if gcd < 100 else 3000

    for fold, (train_id, test_id) in enumerate(folds.split(X, y)):
        X_train = X.iloc[train_id]
        y_train = y.iloc[train_id]
        X_valid = X.iloc[test_id]
        y_valid = y.iloc[test_id]

        # A different but deterministic seed per fold keeps the fold models diverse
        model = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1, random_state=SEED + fold)

        start = datetime.now()
        model.fit(X_train, y_train)
        end = datetime.now()

        valid_pred = model.predict(X_valid)
        valid_score = accuracy_score(y_valid, valid_pred)
        print("Fold:", fold + 1, "Accuracy:", valid_score, 'Time:', end - start)
        scores.append(valid_score)

        # Save predictions to later submit the mean values
        Y_PREDS[gcd].append(model.predict(X_test))
        FOLD_PROBS[gcd].append(model.predict_proba(X_test))

    mean_score = np.array(scores).mean()
    print('gcd',gcd, "Mean accuracy score:", mean_score)
    SCORES[gcd] = mean_score

    # Majority vote across folds. Depending on the SciPy version, `.mode` has
    # shape (1, n_rows) or (n_rows,); flattening handles both, whereas indexing
    # with [0] would return a single scalar on newer releases.
    fold_votes = np.vstack(Y_PREDS[gcd])  # one row of predictions per fold
    majority = np.asarray(mode(fold_votes, axis=0).mode).ravel()
    Y_PRED[gcd] = target_encoder.inverse_transform(majority)

In [None]:
# Mean cross-validation accuracy for each GCD group
print(SCORES)

In [None]:
# Copy of each test subset with the majority-vote label attached as 'target'
PRED = {}
for gcd in [1, 10, 1000, 10000]:
    PRED[gcd] = TEST[gcd].assign(target=Y_PRED[gcd])
PRED[10]

In [None]:
# Predicted-label frequencies per GCD group: {gcd: {bacterium name: number of rows}}.
# The predictions were decoded back to names, so the keys are strings here.
tc = {}
for gcd in [1, 10, 1000, 10000]:
    tc[gcd] = dict(PRED[gcd].target.value_counts())

# One record per (class, gcd) pair with the predicted count and its share
# inside the GCD group's test subset.
t_counts_p = []
for bn in range(10):
    bacterium = target_encoder.inverse_transform([bn])[0]

    for gcd in [1, 10, 1000, 10000]:
        # A class that was never predicted in this group is absent from
        # value_counts(); count it as 0 instead of raising KeyError.
        count = tc[gcd].get(bacterium, 0)
        item = {}
        item['bacterium'] = bacterium
        item['b_num'] = bn
        item['gcd'] = gcd
        item['count'] = count
        item['delta'] = count / PRED[gcd].shape[0]
        item['df'] = 'test'
        t_counts_p.append(item)
target_counts_p = pd.DataFrame(t_counts_p) #.transpose()
target_counts_p

In [None]:
# Class shares of the predicted test labels, one bar per GCD group
sns.barplot(
    data=target_counts_p,
    x='bacterium',
    y='delta',
    hue='gcd',
)

In [None]:
# Stack the train and predicted-test class shares; the 'df' column tells them apart
df = pd.concat(
    [target_counts, target_counts_p],
    axis=0,
)

In [None]:
# One panel per GCD group: train share vs. predicted share for every class
fig, ax = plt.subplots(1, 4, figsize=(24, 4))
for panel, gcd in zip(ax, [1, 10, 1000, 10000]):
    sns.barplot(data=df[df.gcd == gcd], x='b_num', y='delta', hue='df', ax=panel)

In [None]:
# Rows of `df` as plain lists: [bacterium, b_num, gcd, count, delta, df]
cnt = df.values.tolist()
d = {}   # (b_num, gcd) -> train count and share
dv = []  # train-minus-prediction differences, one record per (class, gcd)

# Per-GCD list of share differences, indexed by class number
A = {gcd: [0] * 10 for gcd in [1, 10, 1000, 10000]}

for name, b_num, group, count, delta, source in cnt:
    key = (b_num, group)
    if source == 'train':
        # Train rows come first in `df`, so they are available for lookup below
        d[key] = {'count': count, 'delta': delta}
        continue

    diff = {
        'bacterium': name,
        'b_num': b_num,
        'gcd': group,
        'count': d[key]['count'] - count,
        'delta': d[key]['delta'] - delta,
    }

    # Only differences larger than half a percentage point are recorded in A
    if abs(diff['delta']) > 0.005:
        A[group][b_num] = round(diff['delta'], 3)

    dv.append(diff)
divs = pd.DataFrame(dv)

In [None]:
# One panel per GCD group: train share minus predicted share for every class
fig, ax = plt.subplots(1, 4, figsize=(24, 4))
for panel, gcd in zip(ax, [1, 10, 1000, 10000]):
    sns.barplot(data=divs[divs.gcd == gcd], x='b_num', y='delta', ax=panel)

In [None]:
# Recorded share differences per class, followed by the GCD they belong to
for gcd in (1, 10, 1000, 10000):
    deltas = A[gcd]
    print(deltas, gcd)

In [None]:
# Hand-tuned per-class probability offsets, added to the averaged fold
# probabilities of every GCD group. Column i is treated as encoded class i
# (see the argmax -> inverse_transform step below).
# NOTE(review): the original trailing comment suggests np.array(A[gcd]) was
# considered as a per-group alternative - confirm which one is intended.
PROB_OFFSETS = np.array([0, 0, 0.011, 0.049, -0.032, 0, -0.007, 0, 0, 0])

MEAN_PROB = {}
TARGET_DISTRIB = {}  # per-GCD comparison table, kept for inspection
for gcd in [1, 10, 1000, 10000]:
    mean_prob = sum(FOLD_PROBS[gcd]) / len(FOLD_PROBS[gcd]) # Mean probability for each row

    mean_prob += PROB_OFFSETS

    # Decode once and reuse for both the statistics and the final column
    mean_pred = target_encoder.inverse_transform(np.argmax(mean_prob, axis=1))
    mean_counts = pd.Series(mean_pred, index=TEST[gcd].index).value_counts()

    # value_counts() on the training target is keyed by encoded integers while
    # the prediction counts are keyed by bacteria names; re-key the train
    # shares by name so the columns below align instead of becoming all-NaN.
    train_share = TRAIN[gcd]["target"].value_counts() / TRAIN[gcd].shape[0] * 100
    train_share.index = target_encoder.inverse_transform(train_share.index.to_numpy())

    target_distrib = pd.DataFrame({
        'train_share': train_share
    })

    target_distrib['pred_count'] = mean_counts
    target_distrib['mod_share'] = pd.Series(Y_PRED[gcd], index=TEST[gcd].index).value_counts() / len(TEST[gcd])* 100
    target_distrib['mean_share'] = mean_counts / len(TEST[gcd])* 100

    target_distrib['mod_diff'] = (target_distrib['mod_share'] - target_distrib['train_share'])
    target_distrib['mean_diff'] = (target_distrib['mean_share'] - target_distrib['train_share'])

    TARGET_DISTRIB[gcd] = target_distrib
    MEAN_PROB[gcd] = mean_prob
    PRED[gcd]['target_corrected'] = mean_pred

In [None]:
# Record which GCD group every predicted row came from
for gcd in (1, 10, 1000, 10000):
    group_pred = PRED[gcd]
    group_pred['gcd'] = gcd

In [None]:
# Recombine the four groups and restore the original row_id order
pred = pd.concat([PRED[gcd] for gcd in [1, 10, 1000, 10000]], axis=0).sort_index()
pred

In [None]:
# Rows whose label changed after the probability correction
changed = pred['target'] != pred['target_corrected']
corrected = pred[changed]
corrected

In [None]:
# Submit the corrected labels under the column name 'target'
sub = pred['target_corrected'].rename('target')
sub.to_csv('submission.csv')

# The next step

* ExtraTreesClassifier with CV and clustering  
    Notebook: [ET + CV + clustering](https://www.kaggle.com/martynovandrey/et-cv-clustering)  
    public score **0.98875**

Thanks to [Luca Massaron](https://www.kaggle.com/lucamassaron), [AmbrosM](https://www.kaggle.com/ambrosm), and [ŞAFAK TÜRKELI](https://www.kaggle.com/sfktrkl).