In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import gc

from math import factorial
from scipy.stats import mode

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering

In [None]:
# Read the train and test tables from the competition input directory.
train_df = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')

# Every training column except the row identifier and the label is a feature.
oligomers = [column for column in train_df.columns if column not in ('row_id', 'target')]

# Fit the label encoder on the training targets and store the integer codes;
# `le` is reused further down to encode and decode labels.
le = LabelEncoder()
le.fit(train_df['target'])
train_df['target_le'] = le.transform(train_df['target'])

In [None]:
def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10).

    When the four counts sum to 10 this is the multinomial probability of
    observing exactly those counts in 10 draws from four equally likely
    outcomes.
    """
    # Accumulate the denominator in exact integer arithmetic so that only the
    # final division produces a float.
    denominator = 4 ** 10
    for count in (w, x, y, z):
        denominator *= factorial(count)
    return factorial(10) / denominator

def bias_of(oligomer):
    """Parse a name shaped like ``'A<w>T<x>G<y>C<z>'`` and return ``bias(w, x, y, z)``.

    The first character is skipped; the integers are read from the text
    between it and 'T', between 'T' and 'G', between 'G' and 'C', and after
    'C'.
    """
    t_at = oligomer.index('T')
    g_at = oligomer.index('G')
    c_at = oligomer.index('C')

    pieces = (
        oligomer[1:t_at],
        oligomer[t_at + 1:g_at],
        oligomer[g_at + 1:c_at],
        oligomer[c_at + 1:],
    )
    return bias(*(int(piece) for piece in pieces))

In [None]:
# Preview the raw (float) training features.
train_df.loc[:, oligomers].head()

In [None]:
# Add each column's bias back, scale by one million and round to integers.
train_int = pd.DataFrame({
    name: train_df[name].add(bias_of(name)).mul(1000000).round().astype(int)
    for name in oligomers
})
train_int.head()

In [None]:
# Preview the raw (float) test features.
test_df.loc[:, oligomers].head()

In [None]:
# Same integer reconstruction as for the training frame, applied to the test frame.
test_int = pd.DataFrame({
    name: test_df[name].add(bias_of(name)).mul(1000000).round().astype(int)
    for name in oligomers
})
test_int.head()

In [None]:
def GCD(df_int, columns=None):
    """Return the row-wise greatest common divisor across integer columns.

    Parameters
    ----------
    df_int : pandas.DataFrame
        Frame holding integer-valued columns.
    columns : sequence of str, optional
        Columns to fold over. Defaults to the notebook-level ``oligomers``
        list, which is what the original one-argument call used.

    Returns
    -------
    pandas.Series
        One GCD per row, indexed like ``df_int``.

    Raises
    ------
    ValueError
        If there are no columns to reduce.
    """
    names = list(oligomers if columns is None else columns)
    if not names:
        raise ValueError('GCD needs at least one column to reduce over')

    # Fold np.gcd over the columns; the accumulator has its own name so it
    # does not shadow the function.
    running = df_int[names[0]]
    for name in names[1:]:
        running = np.gcd(running, df_int[name])

    return running
# Store each row's GCD (computed on the integer frames) on the matching source frame.
for frame, frame_int in ((train_df, train_int), (test_df, test_int)):
    frame['gcd'] = GCD(frame_int)

In [None]:
# How many training rows fall into each GCD value.
train_df['gcd'].value_counts()

In [None]:
# Load a previously produced submission; its `target` column is used below as the test labels.
best_submission = pd.read_csv(
    filepath_or_buffer='../input/tpsfeb22-03-clustering-improves-the-predictions/submission.csv'
)

In [None]:
# Preview the first rows of the loaded submission.
best_submission.head(n=5)

In [None]:
def svd_gcd_10(df, target, title):
    """Scatter two SVD components of the ``gcd == 10`` rows, coloured by label.

    ``target`` is encoded with the notebook-level ``le`` and filtered with the
    same ``gcd == 10`` mask as ``df``. A 10-component TruncatedSVD is fitted on
    the oligomer columns of the selected rows, and components 1 and 2 are drawn
    on the current matplotlib axes. Intermediate values are printed along the
    way.
    """
    print(f'\n====={title}======\n')

    # Build the mask once and apply it to both the frame and the labels.
    is_gcd_10 = df.gcd == 10
    gcd_10 = df[is_gcd_10]
    print(f'\nGCD_10:\n{gcd_10}\n')

    pred_gcd_10 = le.transform(target)[is_gcd_10]
    print(f'\nPRED_GCD_10/TARGET_LE:\n{pred_gcd_10}')

    features = gcd_10[oligomers]
    svd = TruncatedSVD(n_components=10, random_state=1)
    svd.fit(features)
    print(f'\nSVD.FIT:\n{svd}\n')

    X_gcd_10 = svd.transform(features)
    print(f'\nX_GCD_10:\n{X_gcd_10}\n')
    print(f'\nX_GCD_10[:, 1]:\n{X_gcd_10[:, 1]}\n')
    print(f'\nX_GCD_10[:, 2]:\n{X_gcd_10[:, 2]}\n')
    print(f'\nX_GCD_10.SHAPE:\n{X_gcd_10.shape}\n')

    # Component 1 on the x axis, component 2 on the y axis.
    x_component, y_component = 1, 2
    plt.scatter(
        X_gcd_10[:, x_component],
        X_gcd_10[:, y_component],
        c=pred_gcd_10,
        cmap='tab10',
        s=1,
    )
    plt.title(title)
    
# Side-by-side comparison: training rows with true labels (left) and test
# rows with the loaded submission's labels (right).
plt.figure(figsize=(12, 6))
panels = (
    (train_df, train_df.target, 'TRAINING_DF/GCD = 10 TARGET'),
    (test_df, best_submission.target, 'BEST SUBMISSION/GCD = 10 TARGET'),
)
for position, (frame, labels, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    svd_gcd_10(frame, labels, heading)
plt.show()

In [None]:
def svd_gcd_10_bacteria_3_4(df, target, title, innermost, clustering):
    """Plot, and optionally re-cluster, one radial band of the ``gcd == 10`` rows labelled 3 or 4.

    Each ``gcd == 10`` row gets a radius (Euclidean norm over the oligomer
    columns) and the mean radius of its encoded label. Rows are kept when their
    encoded label is 3 or 4 and their radius lies in the requested band:

    * ``innermost=True``:  ``radius < 0.388 * mean_radius``
    * ``innermost=False``: ``0.388 * mean_radius <= radius < 0.64 * mean_radius``

    The kept rows are projected with a 10-component TruncatedSVD and scattered
    on the current matplotlib axes (component 2 on x, component 1 on y).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the oligomer columns and a ``gcd`` column. When
        ``clustering`` is true, ``df['pred_lbl']`` is written in place for the
        kept rows.
    target : array-like
        Labels aligned with the rows of ``df``; encoded with the notebook-level
        ``le``.
    title : str
        Title of the scatter plot.
    innermost : bool
        Selects which radial band is kept (see above).
    clustering : bool
        When true, the kept rows are split into two KMeans clusters and every
        row takes the most frequent encoded label of its cluster; those labels
        colour the plot and are stored in ``df['pred_lbl']``.

    Returns
    -------
    None
        Diagnostics are printed at every step.
    """
    # Band limits as fractions of the per-label mean radius.
    # NOTE(review): the values look empirically chosen; their origin is not visible here.
    inner_ratio, outer_ratio = 0.388, 0.64

    is_gcd_10 = df.gcd == 10

    gcd_10 = df[is_gcd_10].copy()
    print(f'\nGCD_10:\n{gcd_10}\n')

    gcd_10['radius'] = np.sqrt(np.square(gcd_10[oligomers]).sum(axis=1))
    print(f"\nGCD_10[RADIUS]:\n{gcd_10['radius']}\n")

    gcd_10['pred_le'] = le.transform(target)[is_gcd_10]
    print(f"\nGCD_10[PRED_LE]:\n{gcd_10['pred_le']}\n")

    print(f"\nGCD_10.RADIUS.GROUPBY(GCD_10[PRED_LE]):\n{gcd_10.radius.groupby(gcd_10['pred_le'])}\n")

    mean_radius = gcd_10.radius.groupby(gcd_10['pred_le']).mean()
    print(f'\nMEAN_RADIUS:\n{mean_radius}\n')

    # The Series name becomes the column name after the merge below.
    mean_radius.name = 'mean_radius'

    print(f'\nGCD_10:\n{gcd_10}\n')
    print(f"\nGCD_10.MERGE:\n{gcd_10.merge(mean_radius, left_on='pred_le', right_index=True)}\n")
    print(f'\nGCD_10 AFTER MERGING:\n{gcd_10}\n')
    # Attach each row's per-label mean radius, then sort the index so the
    # boolean masks below line up with the frame.
    gcd_10 = gcd_10.merge(mean_radius, left_on='pred_le', right_index=True).sort_index()
    print(f'\nGCD_10=MERGE.SORT_INDEX:\n{gcd_10}\n')

    is_label_3_or_4 = gcd_10.pred_le.isin([3, 4])
    if innermost:
        bacteria_3_4 = ((gcd_10.radius < gcd_10.mean_radius * inner_ratio) & is_label_3_or_4)
        print(f'\nINNERMOST BACTERIA_3_4:\n{bacteria_3_4}\n')
    else:
        bacteria_3_4 = ((gcd_10.radius >= gcd_10.mean_radius * inner_ratio) &
                        (gcd_10.radius < gcd_10.mean_radius * outer_ratio) &
                        is_label_3_or_4)
        print(f'\nBACTERIA_3_4:\n{bacteria_3_4}\n')

    gcd_10 = gcd_10[bacteria_3_4]
    print(f'\nGCD_10[BACTERIA_3_4]:\n{gcd_10}\n')

    if clustering:
        km = KMeans(n_clusters=2, random_state=1)
        km.fit(gcd_10[oligomers])
        print(f'\nKM.FIT:\n{km}\n')
        print(f'\nGCD_10.PRED_LE:\n{gcd_10.pred_le}\n')
        # Majority vote per cluster. Series.mode() returns the modes in sorted
        # order, so .iloc[0] picks the smallest label on a tie, and transform()
        # broadcasts the scalar to every row of the group. This replaces
        # scipy.stats.mode(...)[0][0], which fails on SciPy versions where
        # mode() returns scalars for 1-D input.
        pred_most_freq_lbl = gcd_10.pred_le.groupby(km.labels_).transform(lambda lbl: lbl.mode().iloc[0])
        print(f'\nPRED_MOST_FREQ_LBL:\n{pred_most_freq_lbl}\n')
        print(f'\nPRED_MOST_FREQ_LBL != GCD_10.PRED_LE:\n{pred_most_freq_lbl != gcd_10.pred_le}\n')
        print(f'\nRELABELED SAMPLES: (PRED_MOST_FREQ_LBL != GCD_10.PRED_LE).SUM\n{(pred_most_freq_lbl != gcd_10.pred_le).sum()}\n')

    svd = TruncatedSVD(n_components=10, random_state=1)
    svd.fit(gcd_10[oligomers])

    X_gcd_10 = svd.transform(gcd_10[oligomers])
    oligomer_1, oligomer_2 = 1, 2
    # Two-colour map built from tab10 entries 2 and 3, one per plotted label.
    plt.scatter(X_gcd_10[:, oligomer_2],
                X_gcd_10[:, oligomer_1],
                cmap=ListedColormap(plt.get_cmap('tab10').colors[2:4]),
                c=(pred_most_freq_lbl if clustering else gcd_10.pred_le),
                s=25)
    plt.title(title)

    if clustering:
        # Lift the band mask from the gcd == 10 subset back to the full index
        # of df, then write the cluster labels; assignment aligns on index.
        clust_bacteria_3_4 = pd.Series(False, index=df.index)
        print(f'\nCLUST_BACTERIA_3_4:\n{clust_bacteria_3_4}\n')
        clust_bacteria_3_4.loc[is_gcd_10] = bacteria_3_4
        print(f'\nCLUST_BACTERIA_3_4.LOC[DF.GCD == 10]:\n{clust_bacteria_3_4.loc[df.gcd == 10]}\n')
        df.loc[clust_bacteria_3_4, 'pred_lbl'] = pred_most_freq_lbl
        print(f"\nDF.LOC[CLUST_BACTERIA_3_4, PRED_LBL]:\n{df.loc[clust_bacteria_3_4, 'pred_lbl']}")

In [None]:
# Innermost radial band: for the training and then the test frame, draw the
# given labels (left panel) next to the cluster-derived labels (right panel).
# The training titles now read "GCD_10", matching the testing titles.
innermost_figures = (
    (train_df, train_df.target,
     'TRAINING GCD_10 WITH TRUE LABELS',
     'TRAINING GCD_10 WITH CLUSTER LABELING'),
    (test_df, best_submission.target,
     'TESTING GCD_10 WITH BEST_SUBMISSION LABELS',
     'TESTING GCD_10 WITH CLUSTER LABELING'),
)
for frame, labels, plain_title, clustered_title in innermost_figures:
    plt.figure(figsize=(12, 6))
    for position, (panel_title, use_clustering) in enumerate(
            ((plain_title, False), (clustered_title, True)), start=1):
        plt.subplot(1, 2, position)
        svd_gcd_10_bacteria_3_4(frame,
                                labels,
                                panel_title,
                                innermost=True,
                                clustering=use_clustering)
    plt.show()

In [None]:
# Second radial band (innermost=False): same two-panel comparison as for the
# innermost band, for the training and then the test frame.
# The training titles now read "GCD_10", matching the testing titles.
second_band_figures = (
    (train_df, train_df.target,
     'TRAINING GCD_10 WITH TRUE LABELS',
     'TRAINING GCD_10 WITH CLUSTER LABELING'),
    (test_df, best_submission.target,
     'TESTING GCD_10 WITH BEST_SUBMISSION LABELS',
     'TESTING GCD_10 WITH CLUSTER LABELING'),
)
for frame, labels, plain_title, clustered_title in second_band_figures:
    plt.figure(figsize=(12, 6))
    for position, (panel_title, use_clustering) in enumerate(
            ((plain_title, False), (clustered_title, True)), start=1):
        plt.subplot(1, 2, position)
        svd_gcd_10_bacteria_3_4(frame,
                                labels,
                                panel_title,
                                innermost=False,
                                clustering=use_clustering)
    plt.show()

In [None]:
# Start from the loaded predictions, then overwrite the rows for which the
# clustering step stored an encoded label in test_df.pred_lbl.
best_submission['pred_lbl'] = best_submission['target']

has_cluster_label = test_df.pred_lbl.notna()
encoded_cluster_labels = test_df.pred_lbl[has_cluster_label].astype(int)
best_submission.loc[has_cluster_label, 'pred_lbl'] = le.inverse_transform(encoded_cluster_labels)
print(f'\nBEST_SUBMISSION:\n{best_submission}\n')
print(f'\nBEST_SUBMISSION.PRED_LBL != BEST_SUBMISSION.TARGET:\n{best_submission.pred_lbl != best_submission.target}\n')
print(f'\nRELABELED PREDICTIONS (BEST_SUBMISSION.PRED_LBL != BEST_SUBMISSION.TARGET).SUM:\n{(best_submission.pred_lbl != best_submission.target).sum()}\n')
print(f'\nBEST_SUBMISSION:\n{best_submission}\n')

In [None]:
# Keep the row id and the relabelled prediction, exposing the latter as `target`.
submission = best_submission[['row_id', 'pred_lbl']].rename(columns={'pred_lbl': 'target'})
# The extension was misspelled as ".scv"; write a proper .csv file.
submission.to_csv('submission_6.csv', index=False)
submission