In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from tqdm import tqdm
from math import factorial
import gc
import joblib

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

import math

from warnings import simplefilter
simplefilter("ignore")

# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer accept the old names, so use whichever name this
# installation actually provides (both names refer to the same style sheet).
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc(
    "figure",
    autolayout=True,
    figsize=(12, 6),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Keyword arguments for line plots; defined here but not used in the visible cells.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from scipy.stats import norm, skew

# Disable pandas' column/row truncation so displayed frames are shown in full.
# Every display below should therefore be limited explicitly (e.g. with .head()).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
from sklearn.feature_selection import mutual_info_classif

def make_mi_scores(X, y, discrete_features='auto'):
    """Rank the columns of X by their mutual information with the labels y.

    Args:
        X: DataFrame of features (its column names become the result index).
        y: class labels, one per row of X.
        discrete_features: forwarded unchanged to `mutual_info_classif`.

    Returns:
        A Series named "MI Scores", indexed by column name and sorted from the
        highest score to the lowest.
    """
    raw_scores = mutual_info_classif(X, y, discrete_features=discrete_features)
    ranked = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return ranked.sort_values(ascending=False)

from sklearn.model_selection import cross_val_score
def score_dataset(X, y, model=ExtraTreesClassifier()):
    """Return the mean accuracy of `model` over 5 cross-validation folds.

    Args:
        X: feature matrix.
        y: target labels.
        model: estimator passed to `cross_val_score`; defaults to an
            ExtraTreesClassifier with default settings.

    Returns:
        The average of the per-fold accuracy scores.
    """
    fold_accuracies = cross_val_score(model, X, y, scoring="accuracy", cv=5)
    return fold_accuracies.mean()

def plot_variance(pca, width=12, dpi=100):
    """Plot explained and cumulative explained variance of a fitted PCA.

    Args:
        pca: fitted PCA object exposing `n_components_` and
            `explained_variance_ratio_`.
        width: figure width passed to the figure as `figwidth`.
        dpi: figure resolution passed to the figure as `dpi`.

    Returns:
        The array of the two Axes (top: per-component bars, bottom: cumulative line).
    """
    # Create figure
    fig, axs = plt.subplots(2, 1)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    # Keep the original 0..0.2 window when everything fits into it, but grow
    # the axis when a component explains more than 20% so its bar is not clipped.
    upper = max(0.2, float(np.max(evr)) * 1.05)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, upper)
    )
    # Cumulative Variance
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure; honour the caller's dpi instead of a hard-coded 100.
    fig.set(figwidth=width, dpi=dpi)
    return axs

# Read the data
* read train, test
* label encode target -> target num
* calculate GCD 

In [None]:
# Both files use the `row_id` column as the DataFrame index.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col='row_id',
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col='row_id',
)

# Distinct target labels, and every column that is a feature rather than an id/label.
bacteria = list(train['target'].unique())
elements = [e for e in train.columns if e not in ('row_id', 'target')]

# Integer-encode the target labels into a new `target_num` column.
le = LabelEncoder()
train['target_num'] = le.fit_transform(train['target'])

def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10) for the four given counts."""
    denominator = 4**10
    for count in (w, x, y, z):
        denominator *= factorial(count)
    return factorial(10) / denominator

def bias_of(s):
    """Return the bias term for a column name of the form 'A<w>T<x>G<y>C<z>'.

    The four integers are read from between the letters (the first character
    of `s` is skipped) and combined as 10! / (w! * x! * y! * z! * 4**10).
    """
    t_at, g_at, c_at = s.index('T'), s.index('G'), s.index('C')
    counts = (
        int(s[1:t_at]),
        int(s[t_at + 1:g_at]),
        int(s[g_at + 1:c_at]),
        int(s[c_at + 1:]),
    )
    denominator = 4**10
    for count in counts:
        denominator *= factorial(count)
    return factorial(10) / denominator

def integer(float_df):
    """Return a copy of `float_df` with every element column scaled to integers.

    Each column listed in the module-level `elements` is multiplied by
    1,000,000, rounded to the nearest whole number and cast to int. Rounding
    before the cast matters: a float product such as 56.99999999999999 would
    otherwise be truncated to 56. `n_samples` rounds the same way.

    Args:
        float_df: DataFrame containing the `elements` columns.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    df = float_df.copy()
    for col in elements:
        # Vectorised arithmetic instead of a per-element Python lambda.
        df[col] = (df[col] * 1000000).round().astype(int)
    return df

def n_samples(df):
    """Build an integer frame from the element columns of `df`.

    For every column in `elements`, the column's bias term (`bias_of`) is added,
    the result is multiplied by 1,000,000, rounded and cast to int.
    """
    converted = {}
    for col in elements:
        shifted = df[col] + bias_of(col)
        converted[col] = (shifted * 1000000).round().astype(int)
    return pd.DataFrame(converted)

def gcd_of_all(df_i):
    """Return, per row, the greatest common divisor of all element columns of `df_i`."""
    remaining = iter(elements)
    # Seed the running GCD with the first element column, then fold in the rest.
    running = df_i[next(remaining)]
    for name in remaining:
        running = np.gcd(running, df_i[name])
    return running

# Keep handles on the label column and its integer encoding.
target = train['target']
target_num = train['target_num']

# Build temporary integer frames (train_ns / test_ns) from the element columns
# and compute the row-wise GCD of each.
train_ns, test_ns = n_samples(train[elements]), n_samples(test[elements])
train_ns['gcd'], test_ns['gcd'] = gcd_of_all(train_ns), gcd_of_all(test_ns)

# add 'gcd' to train and test
train['gcd']   = train_ns['gcd']
test['gcd']    = test_ns['gcd']

# The integer frames were only needed for the GCD; drop them and reclaim memory.
del train_ns
del test_ns
gc.collect()

## Uniques
* Calculate unique values counts for elements
* Select categoricals (uniques count < C)

In [None]:
C = 50
# Number of distinct values per element, ordered from fewest to most.
uni = {e: len(train[e].unique()) for e in elements}
uniques = dict(sorted(uni.items(), key=lambda item: item[1]))

# Elements with fewer than C distinct values.
categories = [e for e, n_unique in uniques.items() if n_unique < C]
print('categories', categories)

# Bar chart of the sorted counts; the x axis is the rank position, not the element name.
df = pd.DataFrame({'count': list(uniques.values())})
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
sns.barplot(x=df.index, y=df['count'], ax=ax)
ax.xaxis.set_major_locator(ticker.MultipleLocator(10))

## Sample
* For performance, work on a smaller random sample of `train`.

In [None]:
# Fixed seed: the sample, and every plot and MI score derived from it, is the
# same on each run. The size is capped so a shorter frame does not raise.
sample = train.sample(n=min(10000, len(train)), axis=0, random_state=42)
# NOTE: from here on `target` holds the *column name*; it replaces the Series
# that was assigned to `target` in the data-loading cell.
target = 'target'

# Mutual Information
Locate features with the most potential.

In [None]:
mi_scores = make_mi_scores(sample[elements], sample['target_num'])

# One-row frame of MI scores with the columns put back into `elements` order.
df = pd.DataFrame(mi_scores).transpose()[elements]
display(df.head(10))

# One-row frame holding the bias term of every element, column-aligned with `df`.
BIAS = pd.DataFrame([[bias_of(f) for f in elements]], columns=elements)

# MI scores as bars, with the bias (multiplied by 30) drawn on top as a red line.
fig, ax = plt.subplots(1, 1, figsize=(24, 4))
sns.barplot(x=elements, y=df.iloc[0, :], ax=ax, label='MI Scores')
sns.lineplot(x=elements, y=BIAS.iloc[0, :] * 30, color='r', ax=ax, label='Bias')
ax.xaxis.set_major_locator(ticker.MultipleLocator(20))
ax.legend()
fig.show()

# Baseline
* features == elements

In [None]:
# Baseline: the raw element columns only.
X, y = train[elements], train['target_num']
base_score = score_dataset(X, y)
print(base_score)

In [None]:
# Baseline features plus the row-wise GCD; the second printed value is the gain over the baseline.
X = train.loc[:, elements + ['gcd']]
base_gcd_score = score_dataset(X, y)
print(
    base_gcd_score,
    base_gcd_score - base_score,
)

## Unique GCD values

In [None]:
# Sorted distinct GCD values that occur in the sample.
scales = np.unique(sample['gcd'])
scales

In [None]:
# Seaborn's "bright" colour list; later cells pass it (or its first four colours) as `palette=`.
palette = sns.color_palette("bright")

### Let's make some plots and try to find patterns
* first of all it's beautiful

# Elements vs Target

In [None]:
# Scatter the first 8 elements against the target, nc panels per figure row.
nf, nc = len(elements[:8]), 4
for f in range(nf):
    c = f % nc
    if c == 0:
        # First panel of a row: open a new figure.
        fig, ax = plt.subplots(1, nc, figsize=(24, 4))
    sns.scatterplot(x=sample[elements[f]], y=sample[target], ax=ax[c])
    if c == nc - 1:
        # Row is full: show it.
        fig.show()

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left: rows per target, split by GCD value.
sns.histplot(data=sample, x=target, hue='gcd', palette=palette[:4], ax=ax1)
ax1.tick_params(axis='x', rotation=-30)

# Right: rows per GCD value (log-scaled axis), split by target.
sns.histplot(data=sample, x='gcd', hue=target, palette=palette, log_scale=True, ax=ax2)

## Feature values distributions by GCD

In [None]:
plots = 3 # change to see more

nf, ns = len(elements[:plots]), len(scales)
# One panel per distinct GCD value. The panel count used to be hard-coded to 4
# while the loop indexed by `ns`, which raises IndexError as soon as the sample
# contains more than four GCD values (and leaves empty panels with fewer).
nc = ns

for f in range(nf):
    # squeeze=False keeps `ax` two-dimensional even when there is a single panel.
    fig, ax = plt.subplots(1, nc, figsize=(24, 4), squeeze=False)
    for s in range(ns):
        # Rows of the sample that share this GCD value.
        Xs = sample[sample['gcd'] == scales[s]]
        p = sns.kdeplot(data=Xs, x=elements[f], hue='target', palette=palette, ax=ax[0, s])
        p.set_title(f'scale {scales[s]}')

# Creating features

* Calculate sum of A T G C and pairs GC TC TG in samples (train rows)

In [None]:
# add gene quantities
def gene_count(snippet, genes):
    """Sum the counts of the requested letters in a name like 'A<w>T<x>G<y>C<z>'.

    Args:
        snippet: column name; the number after each of A, T, G, C is that
            letter's count (the first character of `snippet` is skipped).
        genes: iterable of single letters, e.g. 'TG'; letters other than
            A/T/G/C contribute nothing.

    Returns:
        The integer sum of the counts of the letters in `genes`.
    """
    t_at, g_at, c_at = snippet.find('T'), snippet.find('G'), snippet.find('C')
    # Where each letter's number sits inside the snippet.
    segment = {
        'A': slice(1, t_at),
        'T': slice(t_at + 1, g_at),
        'G': slice(g_at + 1, c_at),
        'C': slice(c_at + 1, None),
    }
    return sum(int(snippet[segment[gene]]) for gene in genes if gene in segment)
    
G = ['A', 'T', 'G', 'C', 'TG', 'TC', 'GC']

# For every letter group in G: its count in each element name, in `elements` order.
gene_counts = {g: [gene_count(f, g) for f in elements] for g in G}

# Rows are the groups in G, columns are the elements.
df_gene_counts = pd.DataFrame.from_dict(gene_counts, orient='index', columns=elements)

def add_ATGC(df):
    """Add one column per group in `G` to `df` (in place) and return `df`.

    Each new column is the row-wise sum of the element columns, each multiplied
    by that element's count for the group (taken from `df_gene_counts`).
    """
    element_values = df[elements]
    for g in G:
        weights = df_gene_counts.loc[g, :]
        df[g] = element_values.multiply(weights).sum(axis=1)
    return df

train = add_ATGC(train)
features = elements + G
# Re-draw the sample so it carries the new columns. The seed is fixed so the
# plots and MI scores below are reproducible, and the size is capped so a
# shorter frame does not raise.
sample = train.sample(n=min(10000, len(train)), axis=0, random_state=42)

In [None]:
# MI scores for the elements plus the added gene-count columns (sorted, highest first).
mi_scores = make_mi_scores(sample[features], sample['target_num'])
# mi_scores[::3]  # show a few features with their MI scores
df = pd.DataFrame(mi_scores)
display(df.head(10))

# Bar chart of all scores in sorted order.
fig, ax = plt.subplots(1, 1, figsize=(16, 4))
sns.barplot(x=df.index, y=df['MI Scores'], ax=ax)
# Thin the x ticks first, then re-apply the current tick labels rotated by 45 degrees.
ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
ax.set_xticklabels(ax.get_xticklabels(),rotation = 45)
fig.show()

In [None]:
# Elements + GCD + the four single-letter totals; second value is the gain over the baseline.
X = train.loc[:, elements + ['gcd', 'A', 'T', 'G', 'C']]
base_gcd_ATGC_score = score_dataset(X, y)
print(
    base_gcd_ATGC_score,
    base_gcd_ATGC_score - base_score,
)

In [None]:
# Elements + GCD + single-letter totals + the three pair totals.
X = train.loc[:, elements + ['gcd', 'A', 'T', 'G', 'C', 'TC', 'TG', 'GC']]
base_gcd_ATGCTCTGGC_score = score_dataset(X, y)
print(
    base_gcd_ATGCTCTGGC_score,
    base_gcd_ATGCTCTGGC_score - base_score,
)

In [None]:
# Scatter every gene-count feature against the target, nc panels per figure row.
nf, nc = len(G), 4
for start in range(0, nf, nc):
    fig, ax = plt.subplots(1, nc, figsize=(24, 4))
    row = G[start:start + nc]
    for c, name in enumerate(row):
        sns.scatterplot(x=sample[name], y=sample[target], ax=ax[c])
    # len(G) is not a multiple of nc, so the last row is only partly filled:
    # hide its unused panels, and show every row (the old loop only showed
    # rows that were completely full).
    for unused in ax[len(row):]:
        unused.set_visible(False)
    fig.show()

In [None]:
nf, ns = len(G), len(scales)
# One panel per distinct GCD value. The panel count used to be hard-coded to 4
# while the loop indexed by `ns`, which raises IndexError with more than four
# GCD values (and leaves empty panels with fewer).
nc = ns

for f in range(nf):
    # squeeze=False keeps `ax` two-dimensional even when there is a single panel.
    fig, ax = plt.subplots(1, nc, figsize=(24, 4), squeeze=False)
    for s in range(ns):
        # Rows of the sample that share this GCD value.
        Xs = sample[sample['gcd'] == scales[s]]
        p = sns.kdeplot(data=Xs, x=G[f], hue='target', ax=ax[0, s])
        p.set_title(f'scale {scales[s]}')

In [None]:
nf, nb, nc = len(G), len(bacteria), 10
palette = sns.color_palette("bright")
sns.set_palette("bright")
# One figure per bacteria; inside it one KDE panel per gene-count feature, coloured by GCD.
for b in range(nb):
    Xs = sample[sample[target] == bacteria[b]]
    fig, ax = plt.subplots(1, nf, figsize=(24, 4))
    for f, gene in enumerate(G):
        p = sns.kdeplot(data=Xs, x=gene, hue='gcd', palette=palette[:4], ax=ax[f])
        p.set_title(f'{bacteria[b]}')

# PCA

## Elements

In [None]:
N_COMPONENTS = 3

# Take features and target straight from `train`; copying the whole frame just
# to pop one column out of it is unnecessary.
X = train.loc[:, elements]
y = train['target_num'].copy()

# Standardize
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)

from sklearn.decomposition import PCA

# Create principal components
pca = PCA(n_components=N_COMPONENTS)
X_pca = pca.fit_transform(X_scaled)

# Convert to dataframe. Reuse the row index of `X` (i.e. of `train`): later
# cells combine X_pca with `train` via pd.concat(axis=1) and column assignment,
# both of which align on the index. A fresh RangeIndex only lines up when
# `row_id` happens to be exactly 0..n-1.
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)

X_pca.head(3)

In [None]:
# Loadings table: one row per original feature, one column per principal component.
# pca.components_ has one row per component, so label it that way and transpose.
loadings = pd.DataFrame(
    pca.components_,
    index=component_names,
    columns=X.columns,
).T
loadings.head(3)

In [None]:
# Explained / cumulative variance of the fitted PCA; the trailing `;` hides the returned axes repr.
plot_variance(pca, width=8);

In [None]:
# MI of each principal component with the target; components are passed as non-discrete features.
mi_scores = make_mi_scores(X_pca, y, discrete_features=False)
mi_scores

In [None]:
# Elements + GCD with the principal components appended as extra columns.
X = pd.concat(
    [train[elements + ['gcd']], X_pca],
    axis=1,
)
base_gcd_PCA3_score = score_dataset(X, y)
print(
    base_gcd_PCA3_score,
    base_gcd_PCA3_score - base_score,
)

* Add GCD and select gcd == 1

In [None]:
def plotly_scatter(X, cols=['a', 'b', 'c']):
    """Show three interactive plotly scatter plots of the columns in `cols`.

    The plots are, in order (x vs y): cols[0] vs cols[1], cols[0] vs cols[2],
    and cols[2] vs cols[1]; points are coloured by X['target'].
    """
    for x_idx, y_idx in ((0, 1), (0, 2), (2, 1)):
        x_col, y_col = cols[x_idx], cols[y_idx]
        fig = px.scatter(x=X[x_col], y=X[y_col], color=X['target'], width=800, height=450)
        fig.update_xaxes(title=x_col)
        fig.update_yaxes(title=y_col)
        fig.show()

import matplotlib.colors as mcolors
# Colour values of matplotlib's Tableau palette as a list; plots_polar indexes it by target_num.
colors = list(mcolors.TABLEAU_COLORS.values())

def plots_polar(df, cols=['a', 'b', 'c']):
    """Draw three polar scatter plots of the columns in `cols`, coloured by target_num.

    For each panel the first scatter argument (the angle on a polar axis) and
    the second (the radius) are: cols[1]/cols[0], cols[2]/cols[0], cols[2]/cols[1].
    `df` itself is not modified.
    """
    X = df.copy()
    X['color'] = [colors[k] for k in X['target_num']]
    fig = plt.figure(figsize=(24, 8))
    panels = ((131, 1, 0), (132, 2, 0), (133, 2, 1))
    for position, angle_idx, radius_idx in panels:
        ax = fig.add_subplot(position, projection='polar')
        ax.scatter(X[cols[angle_idx]], X[cols[radius_idx]], c=X['color'])
    
def plots_scatter(X, cols=['a', 'b', 'c']):
    """Draw the three pairwise seaborn scatter plots of `cols`, coloured by 'target'.

    Panels, left to right (x vs y): cols[0] vs cols[1], cols[0] vs cols[2],
    cols[1] vs cols[2].
    """
    fig, ax = plt.subplots(1, 3, figsize=(24, 8))
    for panel, (x_idx, y_idx) in zip(ax, ((0, 1), (0, 2), (1, 2))):
        sns.scatterplot(data=X, x=cols[x_idx], y=cols[y_idx], hue='target', size=1, ax=panel)

In [None]:
# NOTE: this adds a 'gcd' column to X_pca itself (in place), so X_pca is no
# longer just PC1..PC3 for any cell that runs after this one.
X_pca['gcd'] = train['gcd']
# Keep only the rows whose GCD equals 1, as an independent copy.
X_gcd = X_pca[X_pca['gcd'] == 1].copy()
# Attach the labels; pandas aligns these assignments on the row index.
X_gcd['target'] = train['target']
X_gcd['target_num'] = train['target_num']

In [None]:
# Pairwise views of the three principal components for the gcd == 1 rows:
# Cartesian scatter plots first, then the polar versions.
plots_scatter(X_gcd, cols=['PC1', 'PC2', 'PC3'])
plots_polar(X_gcd, cols=['PC1', 'PC2', 'PC3'])

### Let's zoom-in with Plotly

In [None]:
from plotly.offline import init_notebook_mode
# Initialise plotly's notebook mode before the interactive figures below are shown.
init_notebook_mode(connected=True)

In [None]:
# Interactive PC1 vs PC3 scatter for the gcd == 1 rows, coloured by target.
fig = (
    px.scatter(x=X_gcd['PC1'], y=X_gcd['PC3'], color=X_gcd['target'], width=800, height=450)
    .update_xaxes(title='PC1')
    .update_yaxes(title='PC3')
)
fig.show()

* Unfortunately, a viewer cannot keep "too many" interactive plotly plots visible at once: `WARNING: Too many active WebGL contexts. Oldest context will be lost.`
#### To see more, uncomment the `plotly_*` calls in the cell below and in later cells, then play with zoom and selection.

In [None]:
# uncomment in edit mode to see more plotly interactive plots
# plotly_scatter(X_gcd, ['PC1', 'PC2', 'PC3'])

### Looking for patterns with transforms

### PCi / PCj

In [None]:
# Take an explicit copy: adding columns to a bare column selection of X_gcd is
# chained assignment (pandas' SettingWithCopyWarning, hidden here only because
# warnings are silenced) and leaves it ambiguous which object is modified.
Xa = X_gcd[['PC1', 'PC2', 'PC3']].copy()
# Pairwise ratios of the principal components.
Xa['a'] = Xa["PC1"]/Xa["PC3"]
Xa['b'] = Xa["PC1"]/Xa["PC2"]
Xa['c'] = Xa["PC2"]/Xa["PC3"]
# Labels are matched to the rows of Xa by index.
Xa['target'] = train['target']
Xa['target_num'] = train['target_num']

plots_scatter(Xa)
plots_polar(Xa)

In [None]:
# uncomment in edit mode to see more plotly interactive plots
# plotly_scatter(Xa)

### arctan

In [None]:
# Take an explicit copy: adding columns to a bare column selection of X_gcd is
# chained assignment (pandas' SettingWithCopyWarning, hidden here only because
# warnings are silenced) and leaves it ambiguous which object is modified.
Xa = X_gcd[['PC1', 'PC2', 'PC3']].copy()
# Arctangent of the pairwise component ratios.
Xa['a'] = np.arctan(Xa["PC1"]/Xa["PC3"])
Xa['b'] = np.arctan(Xa["PC1"]/Xa["PC2"])
Xa['c'] = np.arctan(Xa["PC2"]/Xa["PC3"])
# Labels are matched to the rows of Xa by index.
Xa['target'] = train['target']
Xa['target_num'] = train['target_num']

plots_scatter(Xa)
plots_polar(Xa)

In [None]:
# uncomment in edit mode to see more plotly interactive plots
# plotly_scatter(Xa)

In [None]:
# Interactive 3-D view of the three derived columns, coloured by target, with small markers.
fig = px.scatter_3d(
    Xa, x='a', y='b', z='c', color='target', width=1000, height=800
).update_traces(marker_size=2)
fig.show()

In [None]:
# Select PC1..PC3 explicitly: by this point X_pca also carries a 'gcd' column
# (added when X_gcd was built), so concatenating all of X_pca would give the
# model a duplicated 'gcd' feature and skew the comparison with the other runs.
X = pd.concat([train[elements + ['gcd']], X_pca[['PC1', 'PC2', 'PC3']]], axis=1)

# Replace the raw components with the arctangent of their pairwise ratios.
X['a'] = np.arctan(X["PC1"]/X["PC3"])
X['b'] = np.arctan(X["PC1"]/X["PC2"])
X['c'] = np.arctan(X["PC2"]/X["PC3"])
X = X.drop(['PC1', 'PC2', 'PC3'], axis=1)

base_gcd_PCA3_arctan_score = score_dataset(X, y)
print(base_gcd_PCA3_arctan_score, base_gcd_PCA3_arctan_score - base_score)

### arctan2

In [None]:
# Take an explicit copy: adding columns to a bare column selection of X_gcd is
# chained assignment (pandas' SettingWithCopyWarning, hidden here only because
# warnings are silenced) and leaves it ambiguous which object is modified.
Xa = X_gcd[['PC1', 'PC2', 'PC3']].copy()
# Two-argument arctangent of each component pair.
Xa['a'] = np.arctan2(Xa["PC1"], Xa["PC3"])
Xa['b'] = np.arctan2(Xa["PC1"], Xa["PC2"])
Xa['c'] = np.arctan2(Xa["PC2"], Xa["PC3"])
# Labels are matched to the rows of Xa by index.
Xa['target'] = train['target']
Xa['target_num'] = train['target_num']

plots_scatter(Xa)
plots_polar(Xa)

In [None]:
# uncomment in edit mode to see more plotly interactive plots
# plotly_scatter(Xa)

In [None]:
# Select PC1..PC3 explicitly: by this point X_pca also carries a 'gcd' column
# (added when X_gcd was built), so concatenating all of X_pca would give the
# model a duplicated 'gcd' feature and skew the comparison with the other runs.
X = pd.concat([train[elements + ['gcd']], X_pca[['PC1', 'PC2', 'PC3']]], axis=1)

# Replace the raw components with the two-argument arctangent of each pair.
X['a'] = np.arctan2(X["PC1"],X["PC3"])
X['b'] = np.arctan2(X["PC1"],X["PC2"])
X['c'] = np.arctan2(X["PC2"],X["PC3"])
X = X.drop(['PC1', 'PC2', 'PC3'], axis=1)

base_gcd_PCA3_arctan2_score = score_dataset(X, y)
print(base_gcd_PCA3_arctan2_score, base_gcd_PCA3_arctan2_score - base_score)

## A T G C TG TC GC features

In [None]:
# Take features and target straight from `train`; copying the whole frame just
# to pop one column out of it is unnecessary.
X = train.loc[:, G]
y = train['target_num'].copy()

# Standardize
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
# Create principal components (all of them: no n_components limit here)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)
# Convert to dataframe. Reuse the row index of `X` (i.e. of `train`): later
# cells combine X_pca with `train` via pd.concat(axis=1) and column assignment,
# both of which align on the index. A fresh RangeIndex only lines up when
# `row_id` happens to be exactly 0..n-1.
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)
X_pca.head(3)

In [None]:
# Loadings table: one row per original feature, one column per principal component.
# pca.components_ has one row per component, so label it that way and transpose.
loadings = pd.DataFrame(
    pca.components_,
    index=component_names,
    columns=X.columns,
).T
loadings.head(3)

In [None]:
# Explained / cumulative variance of the fitted PCA; the trailing `;` hides the returned axes repr.
plot_variance(pca, width=8);

In [None]:
# Elements + GCD + the first three principal components.
X = pd.concat(
    [train[elements + ['gcd']], X_pca[['PC1', 'PC2', 'PC3']]],
    axis=1,
)
base_gcd_PCA3_G_score = score_dataset(X, y)
print(
    base_gcd_PCA3_G_score,
    base_gcd_PCA3_G_score - base_score,
)

In [None]:
# GCD + the first three principal components only (no raw elements).
X = pd.concat(
    [train[['gcd']], X_pca[['PC1', 'PC2', 'PC3']]],
    axis=1,
)
gcd_PCA3_G_score = score_dataset(X, y)
print(
    gcd_PCA3_G_score,
    gcd_PCA3_G_score - base_score,
)

In [None]:
# The first three principal components on their own.
X = X_pca.loc[:, ['PC1', 'PC2', 'PC3']]
PCA3_G_score = score_dataset(X, y)
print(
    PCA3_G_score,
    PCA3_G_score - base_score,
)

In [None]:
# NOTE: these two assignments add 'gcd' and 'target' columns to X_pca itself
# (in place); pandas aligns them on the row index.
X_pca['gcd'] = train['gcd']
X_pca['target'] = train['target']
# Keep only the rows whose GCD equals 1, as an independent copy, and attach the numeric label.
X_gcd = X_pca[X_pca['gcd'] == 1].copy()
X_gcd['target_num'] = train['target_num']

In [None]:
# Pairwise views of the first three principal components for the gcd == 1 rows:
# Cartesian scatter plots first, then the polar versions.
plots_scatter(X_gcd, cols=['PC1', 'PC2', 'PC3'])
plots_polar(X_gcd, cols=['PC1', 'PC2', 'PC3'])

In [None]:
# uncomment in edit mode to see more plotly interactive plots
# plotly_scatter(X_gcd, ['PC1', 'PC2', 'PC3'])

In [None]:
# Take an explicit copy: adding columns to a bare column selection of X_gcd is
# chained assignment (pandas' SettingWithCopyWarning, hidden here only because
# warnings are silenced) and leaves it ambiguous which object is modified.
Xa = X_gcd[['PC1', 'PC2', 'PC3']].copy()
# Pairwise ratios of the principal components.
Xa['a'] = Xa["PC1"]/Xa["PC3"]
Xa['b'] = Xa["PC1"]/Xa["PC2"]
Xa['c'] = Xa["PC2"]/Xa["PC3"]
# Labels are matched to the rows of Xa by index.
Xa['target'] = train['target']
Xa['target_num'] = train['target_num']

plots_scatter(Xa)
plots_polar(Xa)

In [None]:
# uncomment in edit mode to see more plotly interactive plots
# plotly_scatter(Xa)

In [None]:
pc_columns = ['PC1', 'PC2', 'PC3']
X = pd.concat([train[elements + ['gcd']], X_pca[pc_columns]], axis=1)

# Swap the raw components for their pairwise ratios a, b, c.
X = X.assign(
    a=X["PC1"] / X["PC3"],
    b=X["PC1"] / X["PC2"],
    c=X["PC2"] / X["PC3"],
).drop(columns=pc_columns)

base_gcd_PCA3_div_G_score = score_dataset(X, y)
print(
    base_gcd_PCA3_div_G_score,
    base_gcd_PCA3_div_G_score - base_score,
)

In [None]:
# Take an explicit copy: adding columns to a bare column selection of X_gcd is
# chained assignment (pandas' SettingWithCopyWarning, hidden here only because
# warnings are silenced) and leaves it ambiguous which object is modified.
Xa = X_gcd[['PC1', 'PC2', 'PC3']].copy()
# Two-argument arctangent of each component pair.
Xa['a'] = np.arctan2(Xa["PC1"], Xa["PC3"])
Xa['b'] = np.arctan2(Xa["PC1"], Xa["PC2"])
Xa['c'] = np.arctan2(Xa["PC2"], Xa["PC3"])
# Labels are matched to the rows of Xa by index.
Xa['target'] = train['target']
Xa['target_num'] = train['target_num']

plots_scatter(Xa)
plots_polar(Xa)

In [None]:
# uncomment in edit mode to see more plotly interactive plots
# plotly_scatter(Xa)

# Dataset scores

In [None]:
# Accuracy gain of every feature set over the baseline (elements only).
scores = {
    'base': 0,
    '+GCD': base_gcd_score - base_score,
    '+ATGC': base_gcd_ATGC_score - base_score,
    '+ATGC TC TG GC': base_gcd_ATGCTCTGGC_score - base_score,
    '+PCA 3': base_gcd_PCA3_score - base_score,
    '+PCA 3G': base_gcd_PCA3_G_score - base_score,
    'GCD PCA 3G': gcd_PCA3_G_score - base_score,
    'PCA 3G': PCA3_G_score - base_score,
    '+PCA 3G div': base_gcd_PCA3_div_G_score - base_score,
    '+PCA 3 arctan': base_gcd_PCA3_arctan_score - base_score,
    # The arctan2 variant was scored above but was missing from this summary.
    '+PCA 3 arctan2': base_gcd_PCA3_arctan2_score - base_score,
}

# A Series shows one row per feature set directly; the previous version built a
# square DataFrame of repeated values only to take its first row.
display(pd.Series(scores, name='accuracy gain vs. base'))

fig, ax = plt.subplots(1, 1, figsize=(10, 4))
sns.barplot(x=list(scores), y=list(scores.values()), ax=ax)
# Rotate the labels so the longer feature-set names do not overlap.
ax.tick_params(axis='x', rotation=45)

# Conclusion.
* So, the golden feature was not found. Only the addition of the GCD increases the accuracy.

# The next step
* I tried to split the dataset into four by GCD value and model each part separately.  
    See the result in [Four models, one for each GCD](https://www.kaggle.com/martynovandrey/four-models-one-for-each-gcd)  
    public score **0.98694**  
* ExtraTreesClassifier with CV and clustering  
    Notebook: [ET + CV + clustering](https://www.kaggle.com/martynovandrey/et-cv-clustering)  
    public score **0.98875**

### Consider upvoting if you find the notebook interesting.

Thanks to [Luca Massaron](https://www.kaggle.com/lucamassaron), [AmbrosM](https://www.kaggle.com/ambrosm), [ŞAFAK TÜRKELI](https://www.kaggle.com/sfktrkl)