In [None]:
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import gc
from math import factorial
from datetime import datetime
from scipy.stats import mode
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering

import os

# Previous attempt
* In a previous notebook, [Four models, one for each GCD](https://www.kaggle.com/martynovandrey/four-models-one-for-each-gcd), I tried splitting the dataset into four subsets by GCD and modelling each subset separately.

# Read data

In [None]:
# Load the competition data; `row_id` becomes the index of both frames.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col="row_id",
)
# The test frame is cast to float32 right after loading.
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col="row_id",
).astype(np.float32)

# Add GCD feature

In [None]:
# Feature columns: everything in `train` except the id and the label.
elements = [name for name in train.columns if name not in ('row_id', 'target')]

def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10) as a float.

    When the four counts sum to 10 this is the multinomial probability of
    observing exactly those counts in 10 draws from four equally likely
    outcomes.
    """
    # Build the denominator with exact integer arithmetic so that only one
    # floating-point division happens at the very end.
    denominator = 4**10
    for count in (w, x, y, z):
        denominator *= factorial(count)
    return factorial(10) / denominator

def bias_of(s):
    """Return the bias encoded in a column name.

    The name is parsed as four integers delimited by the letters 'T', 'G'
    and 'C'; the first character is skipped (presumably 'A', e.g.
    'A1T2G3C4' -> 1, 2, 3, 4). The result is
    10! / (w! * x! * y! * z! * 4**10).

    Raises ValueError if a delimiter is missing or a count is not an integer.
    """
    # Locate each delimiter once.
    t_pos, g_pos, c_pos = (s.index(letter) for letter in 'TGC')
    counts = (
        int(s[1:t_pos]),
        int(s[t_pos + 1:g_pos]),
        int(s[g_pos + 1:c_pos]),
        int(s[c_pos + 1:]),
    )
    # Exact integer denominator, single float division at the end.
    denominator = 4**10
    for count in counts:
        denominator *= factorial(count)
    return factorial(10) / denominator

def _as_scaled_ints(frame):
    """Add each column's bias back, scale by 1e6 and round to integers."""
    return pd.DataFrame({
        col: ((frame[col] + bias_of(col)) * 1000000).round().astype(int)
        for col in elements
    })

train_i = _as_scaled_ints(train)
test_i = _as_scaled_ints(test)

def gcd_of_all(df_i, columns=None):
    """Return the row-wise greatest common divisor across several columns.

    Parameters
    ----------
    df_i : pandas.DataFrame
        Frame holding integer-valued columns.
    columns : sequence of column labels, optional
        Columns to combine. Defaults to the module-level `elements` list,
        which keeps existing one-argument calls working unchanged.

    Returns
    -------
    pandas.Series
        One GCD per row, indexed like `df_i`.

    Raises
    ------
    IndexError
        If `columns` is empty.
    """
    if columns is None:
        columns = elements
    columns = list(columns)
    # Fold np.gcd pairwise over the columns, starting from the first one.
    gcd = df_i[columns[0]]
    for col in columns[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd

# Attach the row-wise GCD of the integer counts as a new feature column.
train['gcd'] = gcd_of_all(train_i)
test['gcd'] = gcd_of_all(test_i)

# The integer frames are only needed for the GCD; drop them and reclaim memory.
del train_i, test_i
gc.collect()

In [None]:
# Feature matrix: every column except the label, cast to float32.
X = train.drop(columns="target").astype(np.float32)

# Encode the string labels as integers; the encoder is kept so that
# predictions can be mapped back to the original label names later.
target_encoder = LabelEncoder().fit(train["target"])
y = pd.Series(target_encoder.transform(train["target"]))

# Model

In [None]:
fold_probs = []  # per-fold class-probability matrices for the test set
y_preds = []     # per-fold hard predictions for the test set
scores = []      # per-fold validation accuracy

# Fixed seed: without it the shuffled fold assignment and the randomized
# trees differ on every run, so neither the scores nor the submission
# could be reproduced.
seed = 42
folds = StratifiedKFold(n_splits=20, shuffle=True, random_state=seed)
estimators = 2500

for fold, (train_id, valid_id) in enumerate(folds.split(X, y)):
    X_train = X.iloc[train_id]
    y_train = y.iloc[train_id]
    X_valid = X.iloc[valid_id]
    y_valid = y.iloc[valid_id]

    model = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1,
                                 random_state=seed)

    # Time only the fit, which dominates the cost of a fold.
    start = datetime.now()
    model.fit(X_train, y_train)
    end = datetime.now()

    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)

    print("Fold:", fold + 1, "Accuracy:", valid_score, 'Time:', end - start)

    scores.append(valid_score)

    # Keep both the hard predictions and the probabilities of every fold so
    # they can be combined across folds afterwards.
    y_preds.append(model.predict(test))
    fold_probs.append(model.predict_proba(test))

print("Mean accuracy score:", np.array(scores).mean())

In [None]:
# Majority vote over the per-fold hard predictions.
# scipy.stats.mode returns `.mode` with shape (1, n) on older SciPy releases
# and shape (n,) on newer ones; np.ravel yields a 1-D vector in both cases
# (indexing with [0] would return a single scalar on newer releases).
y_pred = target_encoder.inverse_transform(
    np.ravel(mode(np.asarray(y_preds), axis=0).mode)
)

mean_prob = sum(fold_probs) / len(fold_probs) # Mean probability for each row
# NOTE(review): constant per-class offsets added to the averaged
# probabilities before the argmax; they look hand-tuned to favour the
# classes encoded as 2, 3 and 7 - confirm where these values come from.
mean_prob += np.array([0, 0, 0.03, 0.036, 0, 0, 0, 0.027, 0, 0])

mean_pred = target_encoder.inverse_transform(np.argmax(mean_prob, axis=1))

In [None]:
submission = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")
# Most probable class per row, mapped back to the original label names.
best_class = mean_prob.argmax(axis=1)
submission["target"] = target_encoder.inverse_transform(best_class)
# Nothing is written to disk here; submission.csv is saved in the final cell.

# Clustering

In [None]:
def pca_gcd10_full(df, target):
    """Project all rows of df with gcd = 10 onto 10 TruncatedSVD components.

    `target` is encoded with `target_encoder` and restricted to the same
    rows. Both the encoded labels and the projection are currently
    discarded: the function draws nothing and returns None.
    """
    gcd10_rows = df[df.gcd == 10]
    # Encoded labels of the selected rows (not used further at the moment).
    pred_subset = target_encoder.transform(target)[df.gcd == 10]

    # Fit the decomposition on the feature columns of the selected rows ...
    features = gcd10_rows[elements]
    pca3 = TruncatedSVD(n_components=10, random_state=1)
    pca3.fit(features)

    # ... and project the same rows onto the fitted components.
    Xt_tr3 = pca3.transform(features)
    
# Run the gcd=10 projection for the training data (true labels) and for the
# test data (labels taken from the current submission).
for frame, labels in ((train, train.target), (test, submission.target)):
    pca_gcd10_full(frame, labels)

In [None]:
def pca_gcd10_selection(df, target, title, clustering, innermost, plot=False):
    """Select a ring of gcd = 10 samples and optionally relabel it by clustering.

    The rows of df with gcd = 10 are taken, their Euclidean norm over the
    `elements` columns ("radius") is compared with the mean radius of their
    class, and only rows whose encoded label is 3 or 4 are kept:

    * innermost=True:  radius < 0.388 * class mean radius
    * innermost=False: 0.388 * class mean radius <= radius < 0.64 * class mean radius

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the `elements` columns and a `gcd` column. Modified in
        place when `clustering` is true (see below).
    target : array-like of labels
        Labels in the same row order as df; encoded with `target_encoder`.
    title : str
        Title of the scatter plot; only used when `plot` is true.
    clustering : bool
        If true, split the selection into two KMeans clusters, give every
        member the most frequent label of its cluster, print how many samples
        changed label, and store the result in df['new_pred'] for the
        selected rows.
    innermost : bool
        Chooses which of the two radius bands is selected.
    plot : bool, default False
        If true, draw the selection on the current matplotlib axes, projected
        onto TruncatedSVD components 2 and 1 and colored by the (possibly
        relabeled) class. The default keeps the function silent, as before.

    Returns
    -------
    None
    """
    is_gcd10 = df.gcd == 10

    # Get the subset
    subset = df[is_gcd10].copy()
    subset['radius'] = np.sqrt(np.square(subset[elements]).sum(axis=1))
    subset['pred'] = target_encoder.transform(target)[is_gcd10.to_numpy()]
    mean_radius = subset.radius.groupby(subset.pred).mean()
    mean_radius.name = 'mean_radius'
    # The merge reorders the rows; sort_index restores the original order.
    subset = subset.merge(mean_radius, left_on='pred', right_index=True).sort_index()

    in_class_3_or_4 = subset.pred.isin([3, 4])
    if innermost:
        selection = (subset.radius < subset.mean_radius * 0.388) & in_class_3_or_4
    else:
        selection = ((subset.radius >= subset.mean_radius * 0.388) &
                     (subset.radius < subset.mean_radius * 0.64) &
                     in_class_3_or_4)
    subset = subset[selection]

    if clustering:
        # Cluster the data into two clusters
        km = KMeans(n_clusters=2, random_state=1)
        km.fit(subset[elements])
        # For every cluster, predict the most frequent label for all cluster
        # members. Series.mode() is sorted, so ties resolve to the smallest
        # label, and it does not depend on the SciPy version the way indexing
        # the result of scipy.stats.mode does.
        new_pred = subset.pred.groupby(km.labels_).transform(lambda s: s.mode().iloc[0])
        print(f"Relabeled {(new_pred != subset.pred).sum()} samples")

    if plot:
        # Project the selection with a truncated SVD and draw a scattergram
        # of two of the components.
        pca3 = TruncatedSVD(n_components=10, random_state=1)
        pca3.fit(subset[elements])
        Xt_tr3 = pca3.transform(subset[elements])
        d0, d1 = 2, 1
        plt.scatter(Xt_tr3[:, d0], Xt_tr3[:, d1],
                    cmap=ListedColormap(plt.get_cmap('tab10').colors[3:5]),
                    c=(new_pred if clustering else subset.pred),
                    s=25)
        plt.title(title)

    if clustering:
        # Expand the selection mask from the gcd = 10 rows to all rows of df,
        # then write the cluster-based labels back (aligned on the index).
        selected = pd.Series(False, index=df.index)
        selected.loc[is_gcd10] = selection
        df.loc[selected, 'new_pred'] = new_pred

In [None]:
# Inner band (innermost=True): one two-panel figure for the training data and
# one for the test data. The first call of each pair only inspects the given
# labels; the second one (clustering=True) also writes `new_pred` into the frame.
inner_panels = (
    (train, train.target,
     'Training, gcd=10: True labels',
     'Training, gcd=10: Labeled by clustering'),
    (test, submission.target,
     'Test, gcd=10: Labels of top submission',
     'Test, gcd=10: Labeled by clustering'),
)
for frame, labels, plain_title, clustered_title in inner_panels:
    plt.figure(figsize=(12,6))
    plt.subplot(1, 2, 1)
    pca_gcd10_selection(frame, labels, plain_title,
                        clustering=False, innermost=True)
    plt.subplot(1, 2, 2)
    pca_gcd10_selection(frame, labels, clustered_title,
                        clustering=True, innermost=True)
    plt.show()

In [None]:
# Second band (innermost=False): same layout as the previous cell, one
# two-panel figure for the training data and one for the test data. The
# clustering=True calls add their labels to `new_pred` in the frame.
outer_panels = (
    (train, train.target,
     'Training, gcd=10: True labels',
     'Training, gcd=10: Labeled by clustering'),
    (test, submission.target,
     'Test, gcd=10: Labels of top submission',
     'Test, gcd=10: Labeled by clustering'),
)
for frame, labels, plain_title, clustered_title in outer_panels:
    plt.figure(figsize=(12,6))
    plt.subplot(1, 2, 1)
    pca_gcd10_selection(frame, labels, plain_title,
                        clustering=False, innermost=False)
    plt.subplot(1, 2, 2)
    pca_gcd10_selection(frame, labels, clustered_title,
                        clustering=True, innermost=False)
    plt.show()

# Submissions

In [None]:
# Give `test` a positional index so its rows line up with `submission`.
test = test.reset_index()
has_cluster_label = test.new_pred.notna()

# Start from the model predictions and overwrite only the rows that received
# a label from the clustering step.
submission['new_pred'] = submission.target
submission.loc[has_cluster_label, 'new_pred'] = target_encoder.inverse_transform(
    test.new_pred[has_cluster_label].astype(int)
)

print(f"Relabeled predictions: {(submission.new_pred != submission.target).sum()}")
final_submission = (
    submission[['row_id', 'new_pred']]
    .rename(columns={'new_pred': 'target'})
)
final_submission.to_csv('submission.csv', index=False)
final_submission

# The next step
* blending

Thanks to [Luca Massaron](https://www.kaggle.com/lucamassaron), [AmbrosM](https://www.kaggle.com/ambrosm), and [ŞAFAK TÜRKELI](https://www.kaggle.com/sfktrkl).