### 0. Import libraries

In [None]:
%%capture

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from scipy.stats import mode
from math import factorial
import gc
import sys

!pip install scikit-learn-intelex

## 1. Loading the data

In [None]:
# Read the competition CSVs; index_col=0 turns the first CSV column into the DataFrame index.
train_df = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv", index_col=0)
test_df = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv", index_col=0)

# Sanity check: report the row and column counts of both frames.
print(f"Nb samples in train: {train_df.shape[0]}\nNb columns in train: {train_df.shape[1]}\nNb samples in test: {test_df.shape[0]}\nNb columns in test: {test_df.shape[1]}\n")

Let's take a look at the data:

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

The dataframes are quite large so let's reduce the precision of some columns

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive, so a column
    whose maximum is exactly 127 still fits int8). Float columns are cast to
    float32 when their range fits, otherwise to float64. Columns with any
    other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Casting float64 to float32 loses precision; only the value range is
    checked here, not whether the values survive the cast exactly.
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Try the integer types from narrowest to widest and stop at the first fit.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            f32 = np.finfo(np.float32)
            # NaN/inf bounds fail both comparisons and therefore fall back to float64.
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# Downcast both frames; reduce_mem_usage reassigns the columns of the frame it is
# given and returns that same frame.
print("Train data")
train_df = reduce_mem_usage(train_df)
print("\nTest data")
test_df = reduce_mem_usage(test_df)

## 2. Cleaning and feature extraction

As some people pointed out, there are many duplicated rows in both sets that could cause overfitting and data leakage, so we should remove them before training

In [None]:
# Report how many training rows are exact duplicates of an earlier row, then drop them.
# NOTE(review): duplicated() is evaluated twice inside the f-string below.
print(f"Total number of duplicated rows: {train_df.duplicated().sum()} out of {train_df.shape[0]} ({train_df.duplicated().sum()/train_df.shape[0]*100:.2f}%)")
train_df = train_df.drop_duplicates()
print(f"Total number of rows after removal: {train_df.shape[0]}")

In [None]:
# Encode the 'target' column as integer class ids; `le` is kept so predictions can be
# decoded later with le.inverse_transform. The feature columns are not encoded here.
le = LabelEncoder()
y = train_df['target']
y_le = le.fit_transform(y)

# Feature columns = every column of train_df except the target.
feat = [col for col in train_df.columns if col != 'target']

As proved in [this kernel](https://www.kaggle.com/hamzaghanmi/train-test-286), some samples in the test set are already in the train set, so let's save those labels for later.

In [None]:
#https://www.kaggle.com/hamzaghanmi/train-test-286
# Inner-join train and test on all feature columns: each resulting row is a test sample
# whose feature values also occur in the training set. reset_index() turns both indices
# into ordinary columns so they survive the merge.
common_labels = pd.merge(train_df.reset_index(), test_df.reset_index(), how='inner', on=feat)
common_labels.head()

In this part I copied the code from the [great kernel](http://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense) made by AmbrosM to convert the features to integers and tag them according to their resolution

In [None]:
def bias_of(s):
    """Return the multinomial bias term encoded by a feature name.

    ``s`` is a column name of the form ``A<w>T<x>G<y>C<z>``. The four counts
    are parsed from the name and the result is
    ``10! / (w! * x! * y! * z! * 4**10)``.
    """
    t_pos = s.index('T')
    g_pos = s.index('G')
    c_pos = s.index('C')
    counts = (
        int(s[1:t_pos]),
        int(s[t_pos + 1:g_pos]),
        int(s[g_pos + 1:c_pos]),
        int(s[c_pos + 1:]),
    )

    # Build the denominator with exact integer arithmetic before the single division.
    denominator = 4**10
    for count in counts:
        denominator *= factorial(count)
    return factorial(10) / denominator

def gcd_of_all(df_i, elements):
    """Return the element-wise greatest common divisor across several columns.

    ``elements`` lists the column names of ``df_i`` to combine; the result has
    one value per row. With a single name the column itself is returned.
    """
    first_name, remaining = elements[0], elements[1:]
    running = df_i[first_name]
    for name in remaining:
        # Fold the next column into the running row-wise gcd.
        running = np.gcd(running, df_i[name])
    return running


In [None]:
# Shift every feature by the bias_of() value derived from its column name, scale by 1e6
# and round to integers.
train_int = pd.DataFrame({col: ((train_df[col] + bias_of(col)) * 1000000).round().astype(int) for col in feat})
test_int = pd.DataFrame({col: ((test_df[col] + bias_of(col)) * 1000000).round().astype(int) for col in feat})

#train_df['res'] = gcd_of_all(train_int, feat)
#test_df['res'] = gcd_of_all(test_int, feat)
# Attach the integer-encoded target to the integer training frame.
train_int['target'] = y_le

# 'res' is the row-wise gcd over all feature columns; it is used below as the
# "resolution" of a sample.
train_int['res'] = gcd_of_all(train_int, feat)
test_int['res'] = gcd_of_all(test_int, feat)

In [None]:
# credits: https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/308876
def get_gc(s, v):
    """Scale ``v`` by the G+C share encoded in the feature name ``s``.

    ``s`` has the form ``A<a>T<t>G<g>C<c>``; the result is ``((g + c) / 10) * v``.
    """
    t_at = s.index('T')
    g_at = s.index('G')
    c_at = s.index('C')

    # The A and T counts do not enter the result, but they are still parsed so
    # that a malformed name raises exactly as before.
    int(s[1:t_at])
    int(s[t_at + 1:g_at])

    gc_share = (int(s[g_at + 1:c_at]) + int(s[c_at + 1:])) / 10
    return gc_share * v

# Weight each bias-shifted, 1e6-scaled feature by the (G + C) / 10 share of its column
# name, round to integers, sum the weighted columns per row and scale the sum back by 1e6.
df = pd.DataFrame({col: get_gc(col, (train_df[col] + bias_of(col)) * 1000000).round().astype(int) for col in feat})
train_int['gc_content'] = df.sum(axis=1)/1000000
df = pd.DataFrame({col: get_gc(col, (test_df[col] + bias_of(col)) * 1000000).round().astype(int) for col in feat})
test_int['gc_content'] = df.sum(axis=1)/1000000
# NOTE(review): this cell is not re-runnable. After the line below, feat contains
# 'gc_content', a name that get_gc/bias_of cannot parse (it has no 'T'/'G'/'C' markers).
feat = feat + ['gc_content']

In [None]:
# Preview the integer frame, including the added 'target', 'res' and 'gc_content' columns.
train_int.head()

## 3. Modeling: Supervised Learning

In [None]:
def sizeof_fmt(num, suffix='B'):
    '''Format a byte count with binary prefixes (Ki, Mi, ...), one decimal place.

    by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified
    '''
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        # Too large for this prefix: move on to the next one.
        num /= 1024.0
        position += 1
    # Anything beyond 'Zi' is reported in 'Yi'.
    return "%.1f %s%s" % (num, 'Yi', suffix)

# List the ten largest objects of the current namespace according to sys.getsizeof,
# biggest first (the sort key is the negated size).
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [None]:
# release some memory: the raw float frames and the temporary `df` are not used below,
# only train_int / test_int are.
del train_df, test_df, df
gc.collect()

In [None]:
# Stage 1: train one ExtraTrees model per resolution level ('res' value), evaluate it with
# K-fold cross-validation and predict the test rows of the same resolution by majority
# vote over the fold models.
res    = [1, 10, 1000, 10000]
nFolds = 5

etc_params = {
        'n_estimators': 300,
        'n_jobs': -1,
        'bootstrap': False,
        'verbose': 0
        }

#sca = StandardScaler()

y_pred_parts = []   # one frame of decoded test predictions per resolution
acc_avg = 0         # sum of (mean CV accuracy * number of training rows) over resolutions
n_valid = 0         # total number of training rows that were validated
for res_i in res:
    print(f"\nResolution = {res_i}")
    X_train = train_int.loc[train_int['res'] == res_i][feat]
    y_train = train_int.loc[train_int['res'] == res_i]['target']
    X_test = test_int.loc[test_int['res'] == res_i][feat]

    y_preds = []
    y_probs = []
    perf = []
    cv = KFold(n_splits=nFolds, shuffle=True, random_state=2022)
    for fold, (train_idx, valid_idx) in enumerate(cv.split(X_train, y_train)):
        X_train_cv = X_train.iloc[train_idx]
        y_train_cv = y_train.iloc[train_idx]
        X_valid = X_train.iloc[valid_idx]
        y_valid = y_train.iloc[valid_idx]

        #X_train_cv = sca.fit_transform(X_train_cv)
        #X_valid = sca.transform(X_valid)

        # train
        clf = ExtraTreesClassifier(**etc_params)
        clf.fit(X_train_cv, y_train_cv.values.ravel())

        # predict
        y_pred_val = clf.predict(X_valid)
        acc = accuracy_score(y_valid.values.ravel(),  y_pred_val)
        perf.append(acc)

        y_preds.append(clf.predict(X_test))
        y_probs.append(clf.predict_proba(X_test))
        print(f"CV - FOLD {fold+1} | Samples train: {len(train_idx)} | Samples validation: {len(valid_idx)} | acc = {acc:.4f}")


    print(f">>> Average across folds for res = {res_i} : acc = {np.mean(perf):.2f}")
    # Every training row of this resolution is validated exactly once across the folds,
    # so weight the mean accuracy by the subset size (not by the size of the last fold).
    acc_avg += np.mean(perf)*len(X_train)
    n_valid += len(X_train)

    # Majority vote over the fold models. Depending on the SciPy version, mode() either
    # keeps the reduced axis (shape (1, n)) or drops it (shape (n,)); np.ravel yields a
    # 1-D vector of n labels in both cases.
    y_pred_res = np.ravel(mode(np.stack(y_preds), axis=0).mode)
    test_ind_res = test_int[test_int['res'] == res_i].index
    # DataFrame.append no longer exists in pandas 2.x: collect the parts, concatenate once.
    y_pred_parts.append(pd.DataFrame(le.inverse_transform(y_pred_res), index=test_ind_res, columns=['target']))
    test_int.loc[test_ind_res, 'target'] = y_pred_res.astype(int)
    # The former `y_pred.loc[test_ind_res]['res'] = res_i` wrote to a temporary copy and had
    # no effect; it is dropped so that `sub` keeps exactly one 'target' column.
y_pred = pd.concat(y_pred_parts)
print(f"\n>>> Weighted avg across folds and resolutions: {acc_avg/n_valid:.4f}")
sub = y_pred.copy()

We get perfect accuracy for the first two resolution levels. It is pretty clear, though, that the main problem here is the samples with lower resolution, i.e., those with a gcd value of 1000 and 10000, which are noisier and, therefore, more difficult to classify. Moreover, those two subsets have only a third of the number of samples we have for the other two.
We know from other kernels that there are replicated rows in the test set too, so we should check on that. 

SPOILER ALERT: I bet those samples correspond to the lower resolution sets, those with res=1000 and res=10000.

In [None]:
# convert float labels to int to make things easier
# NOTE(review): assumes every test row received a prediction above (its 'res' is one of
# the trained resolutions); a missing value would make this cast fail. TODO confirm.
test_int = test_int.astype({'target': 'int32'})

In [None]:
# Split the test rows into high resolution (res 1 or 10) and low resolution (res 1000 or 10000).
test_high = test_int[(test_int['res']==1) | (test_int['res']==10)]
test_low = test_int[(test_int['res']==1000) | (test_int['res']==10000)]

# Count rows that repeat an earlier row within each split. The comparison covers all
# columns, i.e. the features plus 'res', 'gc_content' and the predicted 'target'.
print(f"Total number of duplicated samples in high res test: {test_high.duplicated().sum()} out of {test_high.shape[0]} ({test_high.duplicated().sum()/test_high.shape[0]*100:.2f}%)")
print(f"Total number of duplicated samples in low res test: {test_low.duplicated().sum()} out of {test_low.shape[0]} ({test_low.duplicated().sum()/test_low.shape[0]*100:.2f}%)")

And that's exactly what happened. So from now on I'm going to **assume** that the high resolution samples are so easy to classify that all the predictions I've got for them are correct. Based on that **assumption** I could use those samples to artificially increase the training set and try to get better predictions on the remaining low-resolution test set samples. 

![img](https://media.giphy.com/media/Y2mXijj144TeY630d0/giphy.gif)

## 4. Modeling: Semi-supervised Learning

Now we are going to use the high-resolution samples as pseudo-labeled data to retrain the model. To do that, we just extend the train dataframe by adding the samples in the test set that correspond to resolutions 1 and 10

In [None]:
# Pseudo-labelling: append the de-duplicated high-resolution test rows (which carry their
# predicted 'target') to the training frame.
train_ext = pd.concat([train_int, test_high.drop_duplicates()])
# 'res' becomes a model feature from here on; the guard keeps a re-run of this cell from
# adding it to the feature list a second time.
if 'res' not in feat:
    feat.append('res')
# The increase is reported relative to the ORIGINAL training size.
print(f"Nb samples in train set extended: {len(train_ext)} ({(len(train_ext)-len(train_int))/len(train_int)*100:.2f}% increase)")

In [None]:
# Preview the extended (train + pseudo-labelled test) frame.
train_ext.head()

In [None]:
# Stage-2 inputs: fit on the extended frame, predict only the low-resolution test rows.
# Copies are taken because the source frames are deleted further down.
X_train = train_ext[feat].copy()
y_train = train_ext['target'].copy()
X_test = test_low[feat].copy()
# Remember which submission rows the stage-2 predictions belong to.
index_test_low = test_low.index.copy()

# Downcast the feature matrices before the memory-hungry training loop.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

In [None]:
# Same report as earlier in the notebook: the ten largest objects of the current
# namespace according to sys.getsizeof, biggest first.
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [None]:
# Drop the intermediate frames; everything needed from them was copied into
# X_train / y_train / X_test / index_test_low beforehand.
del train_ext, train_int, test_int, test_high, test_low
gc.collect()

This time I'm not training four different models, but the same one with the *resolution* as a feature. I increase the number of estimators and perform 10 fold CV to do ensembling by averaging the predictions.

In [None]:
#credits: https://www.kaggle.com/alexandreayari/tps-02-22-extratrees-gcd-memory-opti
# Stage 2: a single ExtraTrees configuration trained on the extended set with stratified
# K-fold CV. Each fold model predicts the low-resolution test rows; the per-fold class
# probabilities are averaged in a later cell.
nFolds = 10

SEED = 2022
N_ESTIMATORS = 2000
MAX_DEPTH = 3691
MIN_SAMPLES_SPLIT = 3
MIN_SAMPLES_LEAF = 1
CRITERION  = 'gini'
VERBOSE = 0

#sca = StandardScaler()

# Hyper-parameters shared by all folds; only random_state changes from fold to fold.
etc_base_params = {
    'n_estimators': N_ESTIMATORS,
    'max_depth': MAX_DEPTH,
    'min_samples_split': MIN_SAMPLES_SPLIT,
    'min_samples_leaf': MIN_SAMPLES_LEAF,
    'criterion': CRITERION,
    'bootstrap': False,
    'n_jobs': -1,
    'verbose': VERBOSE,
}

y_preds_ext = []   # hard label predictions on X_test, one array per fold
y_probs_ext = []   # class probabilities on X_test, one array per fold
perf = []          # validation accuracy per fold
# Use SEED (same value as the literal used before) so the split and the models share one knob.
cv = StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=SEED)
print("Starting training...")
for fold, (train_idx, valid_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_cv = X_train.iloc[train_idx]
    y_train_cv = y_train.iloc[train_idx]
    X_valid = X_train.iloc[valid_idx]
    y_valid = y_train.iloc[valid_idx]

    #X_train_cv = sca.fit_transform(X_train_cv)
    #X_valid = sca.transform(X_valid)

    # A different seed per fold decorrelates the fold models that get averaged later.
    etc_params = dict(etc_base_params, random_state=SEED + fold + 1)

    # train
    clf = ExtraTreesClassifier(**etc_params)
    clf.fit(X_train_cv, y_train_cv.values.ravel())

    # predict
    y_pred_val = clf.predict(X_valid)
    acc = accuracy_score(y_valid.values.ravel(),  y_pred_val)
    perf.append(acc)

    y_preds_ext.append(clf.predict(X_test))
    y_probs_ext.append(clf.predict_proba(X_test))
    # Folds are reported 1-based, matching the stage-1 log lines.
    print(f"CV - FOLD {fold+1} | acc = {acc:.4f}")

print(f">>> Average across folds: {np.mean(perf):.4f}")

PS: You might be wondering how accurate the model would be using the initial train set and adding the resolution as a feature. The answer is 0.9786 (avg for 10-fold CV), which is slightly better than using 4 separate models, but still worse than using the pseudo-labelling approach, as we just saw.

## 5. Ensembling predictions

In [None]:
# Average prob
#y_probs_avg = sum(y_probs_ext) / len(y_probs_ext)
# The explanations for these numbers are in AMBROSM's code
#y_probs_avg += np.array([0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0])
#y_pred_tuned = le.inverse_transform(np.argmax(y_probs_avg, axis=1))

In [None]:
# credits: https://www.kaggle.com/max1mum/extra-trees-cv-voting
# Average the per-fold class-probability arrays predicted for X_test.
mean_prob = sum(y_probs_ext) / len(y_probs_ext)

# The distribution of bacteria types
# (percentage of each encoded class in y_train, ordered by class id; y_train here is the
# target of the extended training frame, pseudo-labelled rows included)
target_dist = pd.Series(y_train).value_counts().sort_index() / len(y_train) * 100

# Finds the difference in percent between the normal and tuned target distributions
def get_diff(deltas, distribution):
    """Return ``distribution`` minus the class shares of the tuned predictions.

    Parameters
    ----------
    deltas : sequence of float
        Per-class offsets added to the global ``mean_prob`` before the argmax.
    distribution : pandas.Series
        Reference share (in percent) of every class, indexed by class id.

    Returns
    -------
    pandas.Series
        Difference in percentage points, with the same index as ``distribution``.

    Reads the globals ``mean_prob`` and ``X_test``.
    """
    tuned_predictions = pd.Series(np.argmax(mean_prob + deltas, axis=1))
    # A class that is never predicted must count as 0 %. Without the reindex it would be
    # missing from value_counts() and turn into NaN during the subtraction, hiding the
    # largest gap from the tuning loop.
    tuned_share = tuned_predictions.value_counts().reindex(distribution.index, fill_value=0) / len(X_test) * 100
    return distribution - tuned_share

# The list of probability deltas to match distributions
# (one entry per class column of mean_prob instead of a hard-coded length of 10)
deltas = [0] * mean_prob.shape[1]

diff = get_diff(deltas, target_dist)
print("Mean difference before tuning:", diff.abs().mean(), "%")

# Finding the optimal probability deltas
# Greedy search: repeatedly nudge the class with the largest gap by 0.001 until the largest
# gap is within 0.1 percentage points, or 1000 iterations have been spent.
for i in range(1000):
    diff_max_id = np.argmax(diff.abs())
    # np.argmax returns a POSITION, so read the value positionally rather than by label.
    largest_gap = diff.iloc[diff_max_id]

    if largest_gap > 0.1:
        deltas[diff_max_id] += 0.001
    elif largest_gap < -0.1:
        deltas[diff_max_id] -= 0.001
    else:
        break
    diff = get_diff(deltas, target_dist)

print("Mean difference after tuning:", diff.abs().mean(), "%")
# Bake the tuned offsets into the averaged probabilities used for the final argmax.
mean_prob += deltas

In [None]:
# Pick the most probable class per low-resolution test row and decode the class ids back
# to the original target labels.
y_pred_tuned = le.inverse_transform(np.argmax(mean_prob, axis=1))
y_pred_tuned

In [None]:
#y_pred_res = mode(y_preds_ext).mode[0]
# Overwrite the stage-1 predictions of the low-resolution rows with the tuned stage-2
# predictions, then order the submission by its index.
sub.loc[index_test_low, 'target'] = y_pred_tuned
sub.sort_index(inplace=True)

## 6. Correcting labels

In [None]:
# credits: https://www.kaggle.com/hamzaghanmi/train-test-286
#sub.loc[common_labels.row_id_y, 'target'] = common_labels.target

## 7. Submission

In [None]:
#sub_samp = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv", index_col=0)
#sub_samp

In [None]:
# Each index value may appear only once in the submission file.
assert not sub.index.duplicated(keep='first').any()
sub.to_csv('submission.csv')
print(sub)

### Future steps:

I think that there are two main problems to solve in this competition:

1. The first is what we saw: the classification of noisy samples with resolution of 1000 and 10000. This is something that can be addressed just using CV as we did here and trying to tune hyper-parameters, generate more data, create more useful features. 
2. The second problem is the deviation between train and test samples that AmbrosM showed when plotting both datasets using PCA. He suggested a correction in the probabilities to fix the difference in the number of samples per class between sets. 

With regards to the first problem, I strongly believe that the most efficient approach is to split it into two more specific (and maybe smaller) problems: one easy and one hard. The easy problem consists simply in predicting the high-resolution samples, the ones with res=1 and res=10 (and it's very easy indeed as we saw earlier). The hard problem, though, is to somehow use the information extracted from the data and also from the easy problem to address the classification of the low-resolution samples. 

Thanks for reading!