In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# The code here is a copy of Ambrosm's notebook. I only created the pseudo-labelled data using the model in this notebook and re-ran the notebook after adding that data to the training data.

#### Pseudo-labelled data was created by taking all rows from the test data where the predicted probability is >=0.99. We find that these are exclusively from rows with gcd = 1 or 10 which is not surprising.

#### Code for creating the pseudo-labelled data is commented out in this notebook, as we do not need to re-create it.

### We see an improvement in the CV score by 0.00653 and LB by 0.00015 when using Pseudo Labelling.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import gc

from scipy.stats import mode
from math import factorial
from tqdm import tqdm

# Install Intelex

In [None]:
# Install the scikit-learn-intelex package into the notebook environment.
# NOTE(review): no version is pinned, so a fresh run may install a different release.
!pip install scikit-learn-intelex

# NOTE(review): presumably patch_sklearn() has to run before the scikit-learn
# imports in the next cell for the patching to take effect — confirm against
# the sklearnex documentation before reordering these cells.
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

# Read train, test and pseudo-labelled data

In [None]:
# Competition training data, indexed by its `row_id` column.
train_df = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv', index_col='row_id')

In [None]:
# Competition test data, indexed by its `row_id` column.
test_df = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv', index_col='row_id')

In [None]:
# Load the pseudo-labelled rows and strip the helper columns 'proba' and 'gcd'
# before the frame is appended to the training data ('gcd' is recomputed later
# for the combined frame).
pseudo_train = (
    pd.read_csv('../input/pseudo-labelled-data-for-tps-2022/test_pseudo.csv', index_col='row_id')
    .drop(columns=['proba', 'gcd'])
)
pseudo_train.head()

In [None]:
# Append the pseudo-labelled rows below the original training rows.
# NOTE(review): this rebinds `train_df`, so re-running this cell on its own appends
# the pseudo-labelled rows a second time; re-run from the train.csv load cell instead.
train_df = pd.concat([train_df, pseudo_train], axis=0)

In [None]:
# Number of training rows that repeat an earlier row in every column (the index is not compared).
train_df.duplicated().sum()

In [None]:
# Number of test rows that repeat an earlier row in every column (the index is not compared).
test_df.duplicated().sum()

In [None]:
# Feature column names: every column of train_df at this point except the label column 'target'.
hist_bins=train_df.columns.drop('target')

# Calculate bias and add it

In [None]:
def bias(w, x, y, z):
    """Return ``10! / (w! * x! * y! * z! * 4**10)`` for four non-negative counts.

    When the counts sum to 10 this is the multinomial probability of drawing
    exactly ``w``, ``x``, ``y`` and ``z`` of four equally likely symbols in
    10 draws. The function itself does not check that the counts sum to 10.

    Raises:
        ValueError: if any count is negative (raised by ``factorial``).
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)


def bias_of(s):
    """Return :func:`bias` for the four counts encoded in the column name ``s``.

    ``s`` is expected to look like ``<letter><w>T<x>G<y>C<z>``, e.g.
    ``'A0T0G0C10'``. The first character is skipped without being checked
    (presumably it is always ``'A'``); the integers between/after the letters
    ``T``, ``G`` and ``C`` are the four counts.

    Raises:
        ValueError: if ``s`` lacks one of the letters ``T``, ``G``, ``C`` or a
            count is not a valid integer.
    """
    # Locate each separator once instead of calling str.index repeatedly.
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    w = int(s[1:t_pos])
    x = int(s[t_pos + 1:g_pos])
    y = int(s[g_pos + 1:c_pos])
    z = int(s[c_pos + 1:])
    # Single source of truth for the formula: delegate to bias().
    return bias(w, x, y, z)

def _scaled_int_frame(frame):
    """Add each column's bias_of() value, multiply by 1,000,000 and round to int."""
    scaled = {}
    for col in hist_bins:
        scaled[col] = ((frame[col] + bias_of(col)) * 1000000).round().astype(int)
    return pd.DataFrame(scaled)


# Integer-valued versions of the feature columns for train and test.
train_i = _scaled_int_frame(train_df)
test_i = _scaled_int_frame(test_df)
train_i

# Add gcd feature

In [None]:
def gcd_of_all(df_i, cols=None):
    """Return the row-wise greatest common divisor over several integer columns.

    Args:
        df_i: DataFrame holding integer-valued columns.
        cols: sequence of column names to combine. Defaults to the
            notebook-level ``hist_bins`` so existing calls keep working.

    Returns:
        The element-wise gcd of the selected columns, computed by folding
        ``np.gcd`` over them one column at a time; it is indexed like ``df_i``.
    """
    if cols is None:
        cols = hist_bins  # notebook-level default
    gcd = df_i[cols[0]]
    # Fold column by column rather than materialising one large 2-D array.
    for col in cols[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd

# Attach the row-wise gcd of the integer-scaled features to each raw frame.
for raw_frame, int_frame in ((train_df, train_i), (test_df, test_i)):
    raw_frame['gcd'] = gcd_of_all(int_frame)

# Distinct gcd values and their row counts, first for train, then for test.
tuple(np.unique(raw_frame['gcd'], return_counts=True) for raw_frame in (train_df, test_df))

# Remove duplicates and create sample weight for train data

In [None]:
# Count how often each distinct row (all columns of train_df, i.e. features,
# target and gcd) occurs.
vc = train_df.value_counts()

# Turn the MultiIndex of distinct rows back into columns in one vectorised step
# (instead of looping over every tuple in Python) and keep the occurrence count
# as `sample_weight`.
dedup_train = vc.reset_index(name='sample_weight')
dedup_train

In [None]:
# Count the rows of train_df whose feature values all equal those of the first
# de-duplicated row (presumably meant to be compared with that row's sample_weight).
first_dedup_row = dedup_train[hist_bins].iloc[0].values
(train_df[hist_bins].values == first_dedup_row[np.newaxis, :]).all(axis=1).sum()

In [None]:
# Encoder shared by the cells below: fitted on the target labels and later used
# to map predicted integer codes back to the original labels.
le = LabelEncoder()

In [None]:
# Integer-encoded target; used below to colour the PCA scatter plots.
# Note: this adds a column to train_df, which was already de-duplicated above.
train_df['target_num'] = le.fit_transform(train_df.target)

# PCA

In [None]:
for scale in np.sort(train_df['gcd'].unique()):
    # Row selectors for the current gcd value, built once per iteration
    train_mask = train_df['gcd'] == scale
    test_mask = test_df['gcd'] == scale
    train_part = train_i[hist_bins][train_mask]

    # Fit a whitening PCA on the training rows of this gcd group only
    pca = PCA(whiten=True, random_state=1)
    pca.fit(train_part)

    # Project train and test rows of this group onto the fitted components
    # (Xt_te is computed but not used by the plot below)
    Xt_tr = pca.transform(train_part)
    Xt_te = pca.transform(test_i[hist_bins][test_mask])

    # Scatter plot of the first two components, coloured by the encoded target
    fig, ax = plt.subplots(figsize=(6,6))
    ax.scatter(Xt_tr[:,0], Xt_tr[:,1], c=train_df.target_num[train_mask], s=1)
    ax.set_title(f"{1000000 // scale} decamers ({(train_df['gcd'] == scale).sum()} samples with gcd = {scale})")
    plt.show()

## Observation: We can see that some of the deviation between train and test data is captured by Pseudo labelling

# Model

In [None]:
# Model inputs taken from the de-duplicated training frame:
# feature matrix, integer-encoded target and per-row occurrence counts.
X = dedup_train.loc[:, hist_bins]
encoded_target = le.fit_transform(dedup_train['target'])
y = pd.DataFrame({'target': encoded_target})
sample_weight = dedup_train.loc[:, 'sample_weight']

In [None]:
# Select the test features with the same column list (and order) that defines X,
# rather than by dropping 'gcd': any additional column later added to test_df
# (e.g. 'target'/'proba' from the pseudo-labelling cell) cannot leak into X_test.
X_test = test_df[hist_bins]

In [None]:
#%%time

# Weighted, stratified K-fold cross-validation of an ExtraTreesClassifier.
# Each fold's model also predicts the test set; the per-fold labels and class
# probabilities are collected for ensembling in the cells below.
N_SPLITS = 10
# shuffle=True needs a fixed seed, otherwise the fold assignment (and therefore
# the reported CV score) changes from run to run.
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=1)
y_pred_list, y_proba_list, scores = [], [], []

for fold, (train_id, valid_id) in enumerate(tqdm(folds.split(X, y), total=N_SPLITS)):
    print('####### Fold: ', fold)

    # Splitting
    X_train, y_train, sample_weight_train = X.iloc[train_id], y.iloc[train_id], sample_weight.iloc[train_id]
    X_valid, y_valid, sample_weight_valid = X.iloc[valid_id], y.iloc[valid_id], sample_weight.iloc[valid_id]

    # Model
    model = ExtraTreesClassifier(
        n_estimators=1300,
        n_jobs=-1,
        verbose=0,
        random_state=1
    )

    # Training: y is a one-column DataFrame, so flatten it to the 1-D shape the
    # estimator expects (avoids the column-vector conversion warning).
    model.fit(X_train, np.ravel(y_train), sample_weight=sample_weight_train)

    # Validation: accuracy weighted by how often each de-duplicated row occurred
    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(np.ravel(y_valid), valid_pred, sample_weight=sample_weight_valid)
    print(f'Accuracy score: {valid_score:5f}\n')
    scores.append(valid_score)

    # Prediction for submission
    y_pred_list.append(model.predict(X_test))
    y_proba_list.append(model.predict_proba(X_test))

score = np.array(scores).mean()
print(f'Mean accuracy score: {score:6f}')

In [None]:
# Majority vote over the per-fold label predictions (one row per fold).
# Older SciPy returns the mode with shape (1, n_samples), newer SciPy (keepdims
# defaulting to False) returns shape (n_samples,); np.ravel yields the 1-D vector
# of per-sample votes in both cases, whereas `.mode[0]` would pick a single
# scalar on newer SciPy.
y_pred = np.ravel(mode(np.asarray(y_pred_list), axis=0).mode)
y_pred = le.inverse_transform(y_pred)

In [None]:
# Average the class probabilities predicted by the individual folds.
n_fold_models = len(y_proba_list)
y_proba = sum(y_proba_list) / n_fold_models

# Hand-set additive offsets for the classes encoded as 2 and 3.
# NOTE(review): the rationale is not documented here — presumably it nudges the
# predicted class distribution. The vector has 10 entries, so exactly 10 classes
# are assumed.
class_offsets = np.array([0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0])
y_proba += class_offsets

# Pick the class with the highest adjusted score and map it back to its label.
tuned_codes = np.argmax(y_proba, axis=1)
y_pred_tuned = le.inverse_transform(tuned_codes)

# Share of test rows (in percent) assigned to each label.
tuned_labels = pd.Series(y_pred_tuned, index=test_df.index)
tuned_labels.value_counts().sort_index() / len(test_df) * 100

In [None]:
# test_df['target'] = y_pred_tuned
# test_df['proba'] = np.max(y_proba, axis=1)

# test_pseudo_2 = test_df[test_df['proba']>=0.99]
# test_pseudo_2.to_csv('test_pseudo2.csv')

In [None]:
# np.unique(test_pseudo['gcd'], return_counts=True)

# On running the above cell, we find that all the pseudo labelled data is from gcd = 1 or 10 which is not surprising.

In [None]:
# Fill the sample submission's 'target' column with the tuned predictions
# (assigned by position) and write the result to disk.
submission = (
    pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')
    .assign(target=y_pred_tuned)
)
submission.to_csv('submission2.csv', index=False)
submission