In [None]:
import numpy as np
import pandas as pd
from math import factorial
# Plot
from scipy.stats import mode

from sklearn.cluster import KMeans, AgglomerativeClustering
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from matplotlib.colors import ListedColormap
# Encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, TruncatedSVD
# Cross-Validation
from sklearn.model_selection import StratifiedKFold

Split the training and test sets by their GCD value and train a separate model for each GCD group.

In [None]:
# Load the competition files; the first CSV column becomes the index of the train/test frames.
train_df = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col=0,
)
test_df = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col=0,
)
subs = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

# Snapshots of the frames exactly as loaded, taken before any column is added or re-encoded.
train1, test1 = train_df.copy(), test_df.copy()

In [None]:
# Feature columns: every column of the training frame except the id and the label.
elements = [name for name in train_df.columns if name not in ('row_id', 'target')]


Calculating the GCD value of each row

In [None]:
def bias(w, x, y, z):
    """Return ``10! / (w! * x! * y! * z! * 4**10)``.

    When the four counts sum to 10 this is the multinomial probability of
    drawing that combination from four equally likely outcomes.

    Parameters
    ----------
    w, x, y, z : int
        Non-negative counts (``math.factorial`` raises ``ValueError`` for
        negative values).

    Returns
    -------
    float
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)

def bias_of(s):
    """Parse a column name shaped like ``A<w>T<x>G<y>C<z>`` and return ``bias(w, x, y, z)``.

    The first character of ``s`` is skipped (presumably the leading ``A``);
    the integers between it and ``T``, between ``T`` and ``G``, between ``G``
    and ``C``, and after ``C`` are the four counts.

    Raises
    ------
    ValueError
        If a separator letter is missing or a count is not an integer.
    """
    # Locate each separator once instead of searching the string repeatedly.
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    w = int(s[1:t_pos])
    x = int(s[t_pos + 1:g_pos])
    y = int(s[g_pos + 1:c_pos])
    z = int(s[c_pos + 1:])
    # Delegate to bias() so the formula lives in exactly one place.
    return bias(w, x, y, z)

def gcd_of_all(df_i, columns=None):
    """Return the row-wise greatest common divisor across integer columns.

    Parameters
    ----------
    df_i : pandas.DataFrame
        Frame holding the integer-valued columns to combine.
    columns : sequence of str, optional
        Names of the columns to fold over. Defaults to the module-level
        ``elements`` list, which is what the original implementation used.

    Returns
    -------
    pandas.Series
        One GCD per row of ``df_i``, obtained by folding ``np.gcd`` over the
        selected columns from left to right.

    Raises
    ------
    ValueError
        If there are no columns to combine.
    """
    if columns is None:
        columns = elements
    columns = list(columns)
    if not columns:
        raise ValueError("gcd_of_all needs at least one column")

    gcd = df_i[columns[0]]
    for col in columns[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd

In [None]:
def _scaled_integer_frame(df, scale=1000000):
    """Build an integer frame from the feature columns of ``df``.

    For every column listed in ``elements`` the value of ``bias_of(column)`` is
    added, the result is multiplied by ``scale``, rounded and cast to ``int``.
    """
    return pd.DataFrame(
        {col: ((df[col] + bias_of(col)) * scale).round().astype(int) for col in elements}
    )


# Same transformation for both frames; the row-wise GCD is stored in a new 'gcd' column.
test_i = _scaled_integer_frame(test_df)
test_df['gcd'] = gcd_of_all(test_i)

train_i = _scaled_integer_frame(train_df)
train_df['gcd'] = gcd_of_all(train_i)
# Uncomment to inspect which GCD values occur and how often:
# np.unique(train_df['gcd'], return_counts=True), np.unique(test_df['gcd'], return_counts=True)

In [None]:
# Encode the class labels as integers.
# The encoder is fitted on the untouched snapshot ``train1`` instead of ``train_df`` so this cell
# can be re-run safely: fitting on the already-encoded column would make the encoder learn the
# integers as its classes, and inverse_transform would then return integers instead of labels.
target_encoder = LabelEncoder()
train_df['target'] = target_encoder.fit_transform(train1['target'])

## ExtraTrees training function

In [None]:
def _tune_class_offsets(y_prob, target_pct, step=0.001, tolerance=0.1, max_steps=100000):
    """Find per-class offsets that pull the predicted class shares towards ``target_pct``.

    Starting from all-zero offsets, one offset is nudged by ``step`` per iteration (the first
    class whose share is off by more than ``tolerance`` percentage points) until every class is
    within tolerance, an adjustment no longer changes the shares, or ``max_steps`` is reached.

    Parameters
    ----------
    y_prob : numpy.ndarray of shape (n_rows, n_classes)
        Averaged class probabilities.
    target_pct : numpy.ndarray of shape (n_classes,)
        Desired share of each class in percent, in the same column order as ``y_prob``.

    Returns
    -------
    list
        One offset per class, to be added to ``y_prob`` before taking the argmax.
    """
    n_rows, n_classes = y_prob.shape

    def get_diff(tune):
        predicted = np.argmax(y_prob + tune, axis=1)
        # bincount keeps a zero entry for classes that are never predicted, so such classes
        # still show up as under-represented instead of turning into NaN.
        predicted_pct = np.bincount(predicted, minlength=n_classes) / n_rows * 100
        return target_pct - predicted_pct

    tune = [0] * n_classes
    diff = get_diff(tune)
    steps = 0
    while np.abs(diff).max() > tolerance:
        if steps >= max_steps:
            # Safety net against an adjustment that keeps flipping back and forth.
            print("offset tuning stopped after", max_steps, "steps without converging")
            break
        steps += 1

        for i in range(n_classes):
            if diff[i] > tolerance:
                tune[i] += step
                break
            if diff[i] < -tolerance:
                tune[i] -= step
                break

        new_diff = get_diff(tune)
        if np.array_equal(new_diff, diff):
            print("equal-----------------")
            break
        diff = new_diff
    return tune


def trainingFunc(train, test, gcd, random_state=None):
    """Cross-validate an ExtraTrees classifier and predict distribution-matched test labels.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature columns plus an integer-encoded ``target`` column. Duplicated rows are ignored;
        the caller's frame itself is not modified.
    test : pandas.DataFrame
        Feature columns only, in the same layout as ``train`` without ``target``.
    gcd : str
        Label of the GCD group. Not used by the function; kept so existing calls keep working.
    random_state : int or None, optional
        Seed forwarded to the fold splitter and to every classifier. ``None`` (the default)
        leaves both unseeded.

    Returns
    -------
    tuple
        ``(y_pred_tuned, model)``: the predicted labels for ``test`` decoded with the
        module-level ``target_encoder``, and the classifier fitted in the last fold.
    """
    # Work on a de-duplicated copy so the caller's frame is left untouched.
    train = train.drop_duplicates(keep='first')
    N_SPLITS = 10
    ESTIMATORS = 1300

    X = train.drop(["target"], axis=1)
    y = train["target"]

    scores = []
    y_probs = []
    folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=random_state)
    for fold, (train_id, valid_id) in enumerate(folds.split(X, y)):
        X_train = X.iloc[train_id]
        y_train = y.iloc[train_id]
        X_valid = X.iloc[valid_id]
        y_valid = y.iloc[valid_id]

        model = ExtraTreesClassifier(
            n_estimators=ESTIMATORS,
            n_jobs=-1,
            random_state=random_state
            # max_features=None
        )

        model.fit(X_train, y_train)
        valid_pred = model.predict(X_valid)
        valid_score = accuracy_score(y_valid, valid_pred)

        print("Fold:", fold + 1, "Accuracy:", valid_score)
        scores.append(valid_score)
        y_probs.append(model.predict_proba(test))

    print("Mean accuracy score:", np.array(scores).mean())
    # Average the test probabilities over all folds.
    y_prob = sum(y_probs) / len(y_probs)

    # Probability columns follow model.classes_, so align the training class shares with it.
    classes = model.classes_
    target_pct = (
        train['target'].value_counts().reindex(classes, fill_value=0) / len(train) * 100
    ).to_numpy()

    # Credits to https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants
    tune = _tune_class_offsets(y_prob, target_pct)
    print(tune)
    y_pred_tuned = target_encoder.inverse_transform(classes[np.argmax(y_prob + tune, axis=1)])
    print(pd.Series(y_pred_tuned, index=test.index).value_counts().sort_index() / len(test) * 100)
    return y_pred_tuned, model




## Split the data by GCD value

In [None]:
# One training subset per GCD value; the helper 'gcd' column is removed from each subset.
gcd1 = train_df.loc[train_df['gcd'] == 1].drop(columns='gcd')
gcd10 = train_df.loc[train_df['gcd'] == 10].drop(columns='gcd')
gcd1000 = train_df.loc[train_df['gcd'] == 1000].drop(columns='gcd')
gcd10000 = train_df.loc[train_df['gcd'] == 10000].drop(columns='gcd')

In [None]:
# Every test row has to fall into one of the four subsets below; a row with any other GCD value
# would silently get no prediction and be missing from the submission.
assert test_df['gcd'].isin([1, 10, 1000, 10000]).all(), "test rows with an unexpected gcd value"

# One test subset per GCD value; the helper 'gcd' column is removed from each subset.
gcd1_test = test_df[test_df.gcd == 1].drop(['gcd'], axis=1)
gcd10_test = test_df[test_df.gcd == 10].drop(['gcd'], axis=1)
gcd1000_test = test_df[test_df.gcd == 1000].drop(['gcd'], axis=1)
gcd10000_test = test_df[test_df.gcd == 10000].drop(['gcd'], axis=1)

## Training a separate model for each GCD category

In [None]:
# Model for the rows whose GCD is 1.
gcd1_sub, gcd1_model = trainingFunc(train=gcd1, test=gcd1_test, gcd="1")

In [None]:
# Model for the rows whose GCD is 10.
gcd10_sub, gcd10_model = trainingFunc(train=gcd10, test=gcd10_test, gcd="10")

In [None]:
# Model for the rows whose GCD is 1000.
gcd1000_sub, gcd1000_model = trainingFunc(train=gcd1000, test=gcd1000_test, gcd="1000")

In [None]:
# Model for the rows whose GCD is 10000.
gcd10000_sub, gcd10000_model = trainingFunc(train=gcd10000, test=gcd10000_test, gcd="10000")

In [None]:
# Attach each subset's predicted labels as a 'target' column.
gcd1_test = gcd1_test.assign(target=gcd1_sub)
gcd10_test = gcd10_test.assign(target=gcd10_sub)
gcd1000_test = gcd1000_test.assign(target=gcd1000_sub)
gcd10000_test = gcd10000_test.assign(target=gcd10000_sub)

In [None]:
# Stack the four labelled subsets and order the rows by row_id.
labelled_subsets = [gcd1_test, gcd10_test, gcd1000_test, gcd10000_test]
temp = pd.concat(labelled_subsets).sort_values(by='row_id')

In [None]:
# Turn the first index level into a regular column so it can be written out below.
temp = temp.reset_index(level=0)

In [None]:
# Only the id and the predicted label go into the submission file.
temp.loc[:, ['row_id', 'target']].to_csv("submission.csv", index=False)

Ensemble