# <span style='color:#A80808'>Motivation</span>

This notebook examines some ideas originally proposed by @ambrosm.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from math import factorial

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

# <span style='color:#A80808'>Data</span>

In [None]:
# Read the competition train and test tables.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv'
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv'
)

# Every train column except the first and the last is treated as a feature
# (presumably the row id and the target column -- confirm against the CSV header).
features = train.columns[1:-1]

# Replace the class labels with integer codes; the fitted encoder is kept
# so the codes can be mapped back to the original labels later.
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

# The test table gets its 'target' from an existing submission file, encoded
# with the same fitted encoder, so it can be analysed like train.
# NOTE(review): assumes the submission rows are in the same order as test.csv -- confirm.
ensemble_submission = pd.read_csv('../input/early-ensemble/submission.csv')
test['target'] = le.transform(ensemble_submission['target'])

In [None]:
#https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense
def bias_of(s):
    """Return the bias term encoded by a feature name of the form 'A<w>T<x>G<y>C<z>'.

    The four integers following the letters A, T, G and C are parsed from
    the name, and the result is

        10! / (w! * x! * y! * z! * 4**10)

    Raises ValueError when one of the letters T, G, C is missing or when a
    count is not an integer literal.
    """
    # Locate each separator once; the first character (the 'A') is skipped.
    t_pos = s.index('T')
    g_pos = s.index('G')
    c_pos = s.index('C')

    counts = (
        int(s[1:t_pos]),
        int(s[t_pos + 1:g_pos]),
        int(s[g_pos + 1:c_pos]),
        int(s[c_pos + 1:]),
    )

    # Build the denominator in exact integer arithmetic before dividing.
    denominator = 4**10
    for count in counts:
        denominator *= factorial(count)

    return factorial(10) / denominator

def _to_integer_counts(frame):
    """Return an integer DataFrame with one column per entry of `features`.

    For each feature column the matching `bias_of` value is added, the sum
    is multiplied by 1,000,000, rounded and cast to int.
    """
    scaled = {}
    for col in features:
        shifted = frame[col] + bias_of(col)
        scaled[col] = (shifted * 1000000).round().astype(int)
    return pd.DataFrame(scaled)


train_i = _to_integer_counts(train)
test_i = _to_integer_counts(test)

# Row-wise greatest common divisor across all integer feature columns,
# stored as a new 'gcd' column on the original tables.
train['gcd'] = np.gcd.reduce(train_i[features], axis=1)
test['gcd'] = np.gcd.reduce(test_i[features], axis=1)

# <span style='color:#A80808'>GCD=1</span>

In [None]:
gcd = 1

# One row per encoded class (0-9): train histogram on the left, test on the right.
fig, axes = plt.subplots(10, 2, figsize=(18, 24))

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments on old and new versions.
palette = plt.get_cmap('tab10', 10)

for idx in range(10):
    label = le.inverse_transform([idx])[0]

    # Per-row sum of absolute feature values for this gcd value and class.
    features_sum = train.loc[(train['gcd'] == gcd) & (train['target'] == idx), features].abs().sum(axis=1)
    features_sum_test = test.loc[(test['gcd'] == gcd) & (test['target'] == idx), features].abs().sum(axis=1)

    # Both panels share bin edges derived from the train maximum so their
    # x-axes line up; test sums above that maximum fall outside the bins.
    bins = np.linspace(0, features_sum.max(), 200)

    ax_train, ax_test = axes[idx]

    # Train panel shows raw counts.
    ax_train.hist(features_sum, bins=bins, color=palette(idx))
    ax_train.set(title=f'Train dataset, {label}', xlabel='Feature sum', ylabel='Frequency')

    # Test panel is normalised (density=True), so its y-axis is a density.
    ax_test.hist(features_sum_test, bins=bins, color=palette(idx), density=True)
    ax_test.set(title=f'Test dataset, {label}', xlabel='Feature sum', ylabel='Density')

# The layout only has to be solved once, after every subplot is populated.
fig.tight_layout()
plt.show()

# <span style='color:#A80808'>GCD=10</span>

In [None]:
gcd = 10

# One row per encoded class (0-9): train histogram on the left, test on the right.
fig, axes = plt.subplots(10, 2, figsize=(18, 24))

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments on old and new versions.
palette = plt.get_cmap('tab10', 10)

for idx in range(10):
    label = le.inverse_transform([idx])[0]

    # Per-row sum of absolute feature values for this gcd value and class.
    features_sum = train.loc[(train['gcd'] == gcd) & (train['target'] == idx), features].abs().sum(axis=1)
    features_sum_test = test.loc[(test['gcd'] == gcd) & (test['target'] == idx), features].abs().sum(axis=1)

    # Both panels share bin edges derived from the train maximum so their
    # x-axes line up; test sums above that maximum fall outside the bins.
    bins = np.linspace(0, features_sum.max(), 200)

    ax_train, ax_test = axes[idx]

    # Train panel shows raw counts.
    ax_train.hist(features_sum, bins=bins, color=palette(idx))
    ax_train.set(title=f'Train dataset, {label}', xlabel='Feature sum', ylabel='Frequency')

    # Test panel is normalised (density=True), so its y-axis is a density.
    ax_test.hist(features_sum_test, bins=bins, color=palette(idx), density=True)
    ax_test.set(title=f'Test dataset, {label}', xlabel='Feature sum', ylabel='Density')

# The layout only has to be solved once, after every subplot is populated.
fig.tight_layout()
plt.show()

# <span style='color:#A80808'>GCD=1000</span>

In [None]:
gcd = 1000

# One row per encoded class (0-9): train histogram on the left, test on the right.
fig, axes = plt.subplots(10, 2, figsize=(18, 24))

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments on old and new versions.
palette = plt.get_cmap('tab10', 10)

for idx in range(10):
    label = le.inverse_transform([idx])[0]

    # Per-row sum of absolute feature values for this gcd value and class.
    features_sum = train.loc[(train['gcd'] == gcd) & (train['target'] == idx), features].abs().sum(axis=1)
    features_sum_test = test.loc[(test['gcd'] == gcd) & (test['target'] == idx), features].abs().sum(axis=1)

    # Both panels share bin edges derived from the train maximum so their
    # x-axes line up; test sums above that maximum fall outside the bins.
    bins = np.linspace(0, features_sum.max(), 200)

    ax_train, ax_test = axes[idx]

    # Train panel shows raw counts.
    ax_train.hist(features_sum, bins=bins, color=palette(idx))
    ax_train.set(title=f'Train dataset, {label}', xlabel='Feature sum', ylabel='Frequency')

    # Test panel is normalised (density=True), so its y-axis is a density.
    ax_test.hist(features_sum_test, bins=bins, color=palette(idx), density=True)
    ax_test.set(title=f'Test dataset, {label}', xlabel='Feature sum', ylabel='Density')

# The layout only has to be solved once, after every subplot is populated.
fig.tight_layout()
plt.show()

# <span style='color:#A80808'>GCD=10000</span>

In [None]:
gcd = 10000

# One row per encoded class (0-9): train histogram on the left, test on the right.
fig, axes = plt.subplots(10, 2, figsize=(18, 24))

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments on old and new versions.
palette = plt.get_cmap('tab10', 10)

for idx in range(10):
    label = le.inverse_transform([idx])[0]

    # Per-row sum of absolute feature values for this gcd value and class.
    features_sum = train.loc[(train['gcd'] == gcd) & (train['target'] == idx), features].abs().sum(axis=1)
    features_sum_test = test.loc[(test['gcd'] == gcd) & (test['target'] == idx), features].abs().sum(axis=1)

    # Both panels share bin edges derived from the train maximum so their
    # x-axes line up; test sums above that maximum fall outside the bins.
    bins = np.linspace(0, features_sum.max(), 200)

    ax_train, ax_test = axes[idx]

    # Train panel shows raw counts.
    ax_train.hist(features_sum, bins=bins, color=palette(idx))
    ax_train.set(title=f'Train dataset, {label}', xlabel='Feature sum', ylabel='Frequency')

    # Test panel is normalised (density=True), so its y-axis is a density.
    ax_test.hist(features_sum_test, bins=bins, color=palette(idx), density=True)
    ax_test.set(title=f'Test dataset, {label}', xlabel='Feature sum', ylabel='Density')

# The layout only has to be solved once, after every subplot is populated.
fig.tight_layout()
plt.show()