In [None]:
import numpy as np 
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
import warnings
from math import factorial
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")

# Introduction
The basic idea of this notebook originated from AmbrosM's notebook: 
https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense/notebook

In this notebook he mentioned that maybe it would be good to make classifiers based on the gcd values. 

Unfortunately the results are not as good as I expected.

In [None]:
# Load the competition train and test tables from the Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/train.csv')
df_test = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/test.csv')

In [None]:
# The training row identifier is not used as a feature, so remove it.
# errors='ignore' keeps this cell safe to re-run: once 'row_id' is gone,
# a second execution is a no-op instead of raising KeyError.
df_train = df_train.drop(columns='row_id', errors='ignore')

In [None]:
def encodeTarget(data):
    """Replace the species names in ``data['target']`` with integer codes 0-9.

    The frame is modified in place and the same object is returned.
    Values that are not one of the ten species names are left as they are.
    """
    # Position in this tuple is the integer code assigned to the species.
    species_in_code_order = (
        'Streptococcus_pyogenes',
        'Salmonella_enterica',
        'Enterococcus_hirae',
        'Escherichia_coli',
        'Campylobacter_jejuni',
        'Streptococcus_pneumoniae',
        'Staphylococcus_aureus',
        'Escherichia_fergusonii',
        'Bacteroides_fragilis',
        'Klebsiella_pneumoniae',
    )
    name_to_code = {name: code for code, name in enumerate(species_in_code_order)}

    data['target'] = data['target'].replace(name_to_code)
    return data

In [None]:
def decodeTarget(data):
    """Replace the integer codes 0-9 in ``data['target']`` with species names.

    This is the inverse of the integer encoding applied to the training
    target. The frame is modified in place and the same object is returned.
    Values outside 0-9 are left as they are.
    """
    # Position in this tuple is the integer code that maps to the species.
    species_in_code_order = (
        'Streptococcus_pyogenes',
        'Salmonella_enterica',
        'Enterococcus_hirae',
        'Escherichia_coli',
        'Campylobacter_jejuni',
        'Streptococcus_pneumoniae',
        'Staphylococcus_aureus',
        'Escherichia_fergusonii',
        'Bacteroides_fragilis',
        'Klebsiella_pneumoniae',
    )
    code_to_name = dict(enumerate(species_in_code_order))

    data['target'] = data['target'].replace(code_to_name)
    return data

In [None]:
# Turn the species names in the training target into integer class labels
# (encodeTarget modifies df_train in place and returns the same frame).
df_train = encodeTarget(df_train)

# Calculating the gcd values
I copied these functions from AmbrosM's notebook.

In [None]:
def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10) as a float.

    w, x, y, z are non-negative integer counts.
    """
    denominator = 4**10
    for count in (w, x, y, z):
        denominator *= factorial(count)
    return factorial(10) / denominator

def bias_of(s):
    """Return the bias value for a column name such as ``'A1T2G3C4'``.

    The four integers are read from ``s`` as follows: after the first
    character (which is skipped; presumably 'A'), up to 'T', between 'T'
    and 'G', between 'G' and 'C', and after 'C'. With those counts
    w, x, y, z the result is 10! / (w! * x! * y! * z! * 4**10).
    """
    t_at = s.index('T')
    g_at = s.index('G')
    c_at = s.index('C')
    counts = (
        int(s[1:t_at]),
        int(s[t_at + 1:g_at]),
        int(s[g_at + 1:c_at]),
        int(s[c_at + 1:]),
    )

    denominator = 4**10
    for count in counts:
        denominator *= factorial(count)
    return factorial(10) / denominator

def gcd_of_all(df_i):
    """Return, for each row of ``df_i``, the GCD of its values over ``elements``.

    ``elements`` is the notebook-level list of feature column names. The
    columns are folded pairwise with ``np.gcd``, starting from the first one.
    """
    running = df_i[elements[0]]
    for position in range(1, len(elements)):
        running = np.gcd(running, df_i[elements[position]])
    return running


In [None]:
# Feature columns only. 'gcd' and 'sample_weight' are bookkeeping columns that
# later cells add; excluding them here means re-running this cell after those
# cells still yields the same feature list (bias_of cannot parse such names).
elements = [c for c in df_train.columns
            if c not in ('row_id', 'target', 'gcd', 'sample_weight')]

In [None]:
# Shift every feature column by its bias, scale by 1,000,000 and round,
# giving an all-integer version of the training features.
train_i = pd.DataFrame({
    name: df_train[name].add(bias_of(name)).mul(1000000).round().astype(int)
    for name in elements
})
train_i.head()

In [None]:
# Same integer conversion as for the training features, applied to the test set.
test_i = pd.DataFrame({
    name: df_test[name].add(bias_of(name)).mul(1000000).round().astype(int)
    for name in elements
})
test_i.head()

In [None]:
# Per-row GCD of the integer-scaled feature values, stored as a new 'gcd'
# column; the rows are later partitioned on this column.
df_train['gcd'] = gcd_of_all(train_i)
df_test['gcd'] = gcd_of_all(test_i)

In [None]:
# Number of training rows for each GCD value.
gcd_count = df_train.groupby('gcd')['gcd'].count()
gcd_count

In [None]:
# Number of test rows for each GCD value.
gcd_count_test = df_test.groupby('gcd')['gcd'].count()
gcd_count_test

# Dropping the duplicates
I dropped the duplicated rows based on AmbrosM's other notebook: https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants

In [None]:
# Collapse identical rows: value_counts() yields one entry per distinct row
# (most frequent first) with its number of occurrences. reset_index turns the
# row values back into columns and stores the count as 'sample_weight', which
# avoids rebuilding the frame tuple-by-tuple in a Python loop.
vc = df_train.value_counts()
dedup_train = vc.reset_index(name='sample_weight')
dedup_train.head()

In [None]:
# Number of distinct training rows for each GCD value after de-duplication.
gcd_count_dedup_train = dedup_train.groupby('gcd')['gcd'].count()
gcd_count_dedup_train

# Split the data based on GCD values

In [None]:
# One training subset per GCD value; the 'gcd' column is constant within each
# subset, so it is removed.
df_train_1 = dedup_train.loc[dedup_train['gcd'] == 1].drop(columns='gcd')
df_train_10 = dedup_train.loc[dedup_train['gcd'] == 10].drop(columns='gcd')
df_train_1000 = dedup_train.loc[dedup_train['gcd'] == 1000].drop(columns='gcd')
df_train_10000 = dedup_train.loc[dedup_train['gcd'] == 10000].drop(columns='gcd')

# Matching test subsets, selected the same way.
df_test_1 = df_test.loc[df_test['gcd'] == 1].drop(columns='gcd')
df_test_10 = df_test.loc[df_test['gcd'] == 10].drop(columns='gcd')
df_test_1000 = df_test.loc[df_test['gcd'] == 1000].drop(columns='gcd')
df_test_10000 = df_test.loc[df_test['gcd'] == 10000].drop(columns='gcd')


# Define the model
I created the makePredictions function because I will fit the ExtraTreesClassifier 4 times and I wanted to avoid code repetition.

In [None]:
def makePredictions(train, test, n_splits, random_state=None):
    """Cross-validate an ExtraTreesClassifier on ``train`` and predict ``test``.

    For every stratified fold, a StandardScaler and a 300-tree
    ExtraTreesClassifier are fitted on the training part (using the row
    multiplicities as sample weights), the weighted accuracy on the
    validation part is printed and recorded, and class probabilities for
    ``test`` are computed. The test probabilities are averaged over the folds
    and the most probable class label is returned for each test row.

    Parameters
    ----------
    train : DataFrame
        Must contain the notebook-level ``elements`` feature columns plus
        'target' and 'sample_weight'.
    test : DataFrame
        Must contain the ``elements`` feature columns plus 'row_id'.
    n_splits : int
        Number of StratifiedKFold folds.
    random_state : int or None, optional
        Seed for the fold shuffle. The default (None) keeps the previous
        behaviour of an unseeded shuffle; pass an int for reproducible folds.

    Returns
    -------
    (float, DataFrame)
        Mean validation accuracy over the folds, and a frame with columns
        'row_id' and 'target' (predicted class labels) for ``test``.
    """
    X = train[elements]
    y = train['target']
    sample_weight = train['sample_weight']

    # Loop-invariant test inputs: select them once instead of once per fold.
    x_test_raw = test[elements]
    test_row_ids = test['row_id']

    # Sorted distinct labels of this subset. Column j of the accumulated
    # probabilities always refers to all_classes[j], even if a fold's model
    # has seen only some of the classes.
    all_classes = np.unique(y)
    proba_sum = np.zeros((len(x_test_raw), len(all_classes)))
    valid_scores = []

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_id, valid_id) in enumerate(folds.split(X, y)):
        print("Fold: " + str(fold))

        # Fit the scaler on the training part only, then apply it everywhere.
        sc = StandardScaler()
        x_train = sc.fit_transform(X.iloc[train_id])
        x_valid = sc.transform(X.iloc[valid_id])
        x_test = sc.transform(x_test_raw)

        model = ExtraTreesClassifier(n_estimators=300, n_jobs=-1, verbose=0, random_state=1)
        model.fit(x_train, y.iloc[train_id], sample_weight=sample_weight.iloc[train_id])

        valid_pred = model.predict(x_valid)
        valid_score = accuracy_score(
            y.iloc[valid_id], valid_pred, sample_weight=sample_weight.iloc[valid_id]
        )
        print("Fold: " + str(fold) + " Score: " + str(valid_score))
        valid_scores.append(valid_score)

        # predict_proba columns follow model.classes_ (a sorted subset of
        # all_classes); scatter them into the matching global columns so the
        # per-fold probabilities are added class by class.
        class_columns = np.searchsorted(all_classes, model.classes_)
        proba_sum[:, class_columns] += model.predict_proba(x_test)

    y_proba = proba_sum / len(valid_scores)
    # argmax yields a column position; translate it back to the class label.
    y_pred = all_classes[np.argmax(y_proba, axis=1)]
    y_pred_df = pd.DataFrame({'row_id': test_row_ids, 'target': y_pred})

    return np.mean(valid_scores), y_pred_df

# Make the predictions

In [None]:
# Number of stratified cross-validation folds passed to every makePredictions call.
N_SPLITS = 5

## Prediction for data where GCD = 1

In [None]:
# (mean CV accuracy, test predictions) for the rows whose GCD is 1.
gcd_1_result = makePredictions(df_train_1, df_test_1, N_SPLITS)
print(f"gcd1 mean score: {gcd_1_result[0]}")

## Prediction for data where GCD = 10

In [None]:
# (mean CV accuracy, test predictions) for the rows whose GCD is 10.
gcd_10_result = makePredictions(df_train_10, df_test_10, N_SPLITS)
print(f"gcd10 mean score: {gcd_10_result[0]}")

## Prediction for data where GCD = 1000

In [None]:
# (mean CV accuracy, test predictions) for the rows whose GCD is 1000.
gcd_1000_result = makePredictions(df_train_1000, df_test_1000, N_SPLITS)
print(f"gcd1000 mean score: {gcd_1000_result[0]}")

## Prediction for data where GCD = 10000

In [None]:
# (mean CV accuracy, test predictions) for the rows whose GCD is 10000.
gcd_10000_result = makePredictions(df_train_10000, df_test_10000, N_SPLITS)
print(f"gcd10000 mean score: {gcd_10000_result[0]}")

# Create the final prediction

I concatenated the predictions and created the final dataframe for the submission.

In [None]:
# Stack the per-GCD prediction frames and restore the row_id order.
partial_predictions = [
    gcd_1_result[1],
    gcd_10_result[1],
    gcd_1000_result[1],
    gcd_10000_result[1],
]
final_result = pd.concat(partial_predictions, axis=0).sort_values(by=['row_id'])
# decodeTarget converts the integer labels back to species names in place,
# so final_result and final_result_decoded are the same frame.
final_result_decoded = decodeTarget(final_result)
final_result_decoded.head()

In [None]:
# Write the submission file. final_result already holds species names because
# decodeTarget modified it in place; index=False keeps the pandas index out of the CSV.
final_result.to_csv('submission.csv', index=False)

# Final remark
Unfortunately the CV scores and the public leaderboard score are very bad and I don't know exactly why. I will continue the investigation and I hope I can improve the model.