In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from math import factorial

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import RadiusNeighborsClassifier

In [None]:
# Class probabilities of two slightly different 10-class distributions
# (each list sums to 1).
prob_train = [0.10, 0.10, 0.10, 0.10, 0.1, 0.10, 0.08, 0.12, 0.10, 0.10]
prob_test  = [0.12, 0.09, 0.09, 0.15, 0.1, 0.05, 0.10, 0.10, 0.09, 0.11]

# Draw 30 class labels from each distribution; both draws use a fresh
# generator started from the same seed.
for caption, class_probs in (('Train:', prob_train), ('Test: ', prob_test)):
    sampler = np.random.RandomState(seed=231)
    print(caption, sampler.choice(10, size=30, p=class_probs))

In [None]:
# Load the competition's training and test tables
train_df = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')

# Feature columns: every training column except the row identifier and the label
non_feature_columns = ('row_id', 'target')
elements = [name for name in train_df.columns if name not in non_feature_columns]

# Encode the 10 bacteria names in `target` as the integers 0 .. 9
le = LabelEncoder()
train_df['target_num'] = le.fit_transform(train_df['target'])

train_df.shape, test_df.shape


In [None]:
# Compute gcd and integer representations
def bias_of(s):
    """Return the multinomial bias for a column named like 'A<w>T<x>G<y>C<z>'.

    The bias is the probability of seeing w A's, x T's, y G's and z C's when
    n = w + x + y + z bases are drawn uniformly at random:
    n! / (w! * x! * y! * z! * 4**n).

    For the 10-mer column names of this dataset the bias is between 9.5e-7
    and 2.4e-2, and the sum of all biases is 1.

    Parameters
    ----------
    s : str
        Column name of the form 'A<w>T<x>G<y>C<z>' with non-negative
        integer counts.

    Returns
    -------
    float
        The multinomial probability described above.

    Raises
    ------
    ValueError
        If one of the marker letters 'T', 'G', 'C' is missing, or a count
        is not a non-negative integer.
    """
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    w = int(s[1:t_pos])  # skip the leading 'A'
    x = int(s[t_pos + 1:g_pos])
    y = int(s[g_pos + 1:c_pos])
    z = int(s[c_pos + 1:])
    # Derive the k-mer length from the counts instead of hard-coding 10, so
    # the result is a valid probability for any length (identical for 10-mers).
    n = w + x + y + z
    return factorial(n) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**n)

# One bias value per feature column, in the same order as `elements`
bias_vector = np.array(list(map(bias_of, elements)))

# Add the bias to every feature, scale by 1e6 and round to the nearest integer
train_scaled = (train_df[elements].values + bias_vector) * 1000000
test_scaled = (test_df[elements].values + bias_vector) * 1000000
train_i = pd.DataFrame(train_scaled.round().astype(int), index=train_df.index, columns=elements)
test_i = pd.DataFrame(test_scaled.round().astype(int), index=test_df.index, columns=elements)

# Row-wise greatest common divisor of the integer features
train_df['gcd'] = np.gcd.reduce(train_i[elements], axis=1)
test_df['gcd'] = np.gcd.reduce(test_i[elements], axis=1)

In [None]:
# Select training samples with gcd=10000 (i.e. num_reads=100), drop duplicates and convert to integer
train_mask = train_df.gcd == 10000
Z_tr = train_i[train_mask].drop_duplicates(elements) // 10000
# Take the labels of exactly the rows kept in Z_tr. De-duplicating train_df
# separately (on its un-rounded feature values) could keep a different set of
# rows than the integer table and leave y_tr misaligned with Z_tr.
y_tr = train_df.loc[Z_tr.index, 'target_num']

# Select test samples with gcd=10000 (i.e. num_reads=100) and convert to integer
Z_te = test_i[test_df.gcd == 10000] // 10000
Z_tr.shape, y_tr.shape, Z_te.shape

In [None]:
def plot_pca(tr, te, title):
    """Scatter-plot train and test samples in a shared 2-D PCA projection.

    The PCA is fitted on `tr` only; `te` is projected with the same
    components. Train points are drawn in blue and test points in yellow on
    a grey background, and the figure is shown immediately.

    Parameters
    ----------
    tr : array-like of shape (n_train, n_features)
    te : array-like of shape (n_test, n_features)
    title : str
        Title of the figure.
    """
    projector = PCA(n_components=2)
    train_xy = projector.fit_transform(tr)
    test_xy = projector.transform(te)

    plt.figure(figsize=(18, 8))
    ax = plt.gca()
    ax.set_facecolor((0.7, 0.7, 0.7))
    # Training points could instead be coloured by class (c=<labels>, cmap='tab10').
    ax.scatter(train_xy[:, 0], train_xy[:, 1], s=3, c='#0057b8', label='Train')  # train: blue
    ax.scatter(test_xy[:, 0], test_xy[:, 1], s=3, c='#ffd700', label='Test')  # test: yellow
    ax.legend()
    ax.set_title(title)
    plt.show()
    
# PCA view of the integer count vectors Z_tr / Z_te as they are
plot_pca(tr=Z_tr, te=Z_te, title='The untransformed data')

In [None]:
%%time
# Convert Z_tr and Z_te to arrays X_tr and X_te of shape (n_samples, 100)
def transform(Z):
    """Expand each row of counts into an ascending list of column indices.

    For every row i, column index j is repeated Z[i, j] times, so a row whose
    counts sum to m becomes a length-m vector of column indices in ascending
    order. All rows must have the same sum so that the result is a
    rectangular 2-D array.

    Parameters
    ----------
    Z : pandas.DataFrame or array-like of shape (n_samples, n_columns)
        Non-negative integer counts.

    Returns
    -------
    numpy.ndarray of shape (n_samples, m)
        Integer array of column indices. An input with no rows gives an
        array of shape (0, 0).

    Raises
    ------
    ValueError
        If Z is not 2-D, the rows do not all have the same sum, or a count
        is negative.
    """
    counts = np.asarray(Z)
    if counts.ndim != 2:
        raise ValueError('Z must be two-dimensional')
    n_rows = counts.shape[0]
    if n_rows == 0:
        return np.empty((0, 0), dtype=int)

    row_totals = counts.sum(axis=1)
    row_length = int(row_totals[0])
    if (row_totals != row_length).any():
        raise ValueError('all rows of Z must have the same total count')

    # np.nonzero walks the matrix in row-major order, so repeating each
    # column index by its count yields the rows back to back, each already
    # sorted by column. This replaces the per-cell Python loops.
    row_idx, col_idx = np.nonzero(counts)
    expanded = np.repeat(col_idx, counts[row_idx, col_idx])
    return expanded.reshape(n_rows, row_length)

# Expand the train and test count matrices with transform()
X_tr, X_te = (transform(counts) for counts in (Z_tr, Z_te))
X_tr.shape, X_te.shape

In [None]:
# Same PCA plot, now on the expanded arrays X_tr / X_te
plot_pca(tr=X_tr, te=X_te, title='The transformed data makes the pairs visible')

In [None]:
# Predict with RadiusNeighborsClassifier
outlier_label = -1  # label assigned to test samples with no training neighbour within the radius
rnc = RadiusNeighborsClassifier(radius=18, weights='distance',
                                p=1, outlier_label=outlier_label, n_jobs=-1)
rnc.fit(X_tr, y_tr)
y_pred = rnc.predict(X_te)

predicted_labels, label_counts = np.unique(y_pred, return_counts=True)
print('Unique predictions:', predicted_labels)
print('Frequencies:', label_counts)
print('Samples:', len(y_pred))
# Count the non-outliers directly. Subtracting the first frequency is only
# correct when -1 actually occurs in y_pred; with no outliers it would
# subtract the count of the smallest real class instead.
print('Predicted samples:', 
      np.count_nonzero(y_pred != outlier_label))

In [None]:
# Read the top public submission and merge it with our predictions
top_submission = pd.read_csv('../input/extrablenderadditionv12/submission.csv')
# NOTE(review): assumes top_submission rows are in the same order as test_df — confirm
baseline_num = le.transform(top_submission.target)

# Start from the baseline labels (copied so later writes cannot touch
# baseline_num), then overwrite the gcd=10000 rows with our predictions
test_df['target_num'] = baseline_num.copy()
is_100_reads = test_df.gcd == 10000
test_df.loc[is_100_reads, 'target_num'] = y_pred # can contain -1

# Wherever our prediction is -1, fall back to the baseline label
test_df['target_num'] = np.where(test_df.target_num == -1,
                                 baseline_num,
                                 test_df.target_num)
test_df['target'] = le.inverse_transform(test_df.target_num.astype(int))

submission = test_df[['row_id', 'target']]
print('Modified:', (submission.target != top_submission.target).sum())
submission.to_csv('submission_radiusneighbors.csv', index=False)
submission
