In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn import ensemble
from sklearn import model_selection
from sklearn import naive_bayes
import seaborn as sns
import os
from lightgbm import LGBMClassifier

# Apply seaborn's theme with its default arguments before any figure is drawn.
sns.set_theme()

In [None]:
# Directory holding the competition files; train.csv and test.csv are read from here.
root = '/kaggle/input/tabular-playground-series-feb-2022/'

## Data Input

In [None]:
# Build both file paths from `root`, then read the training and test sets.
train_path, test_path = (os.path.join(root, name) for name in ('train.csv', 'test.csv'))
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Show the first rows of the training data as the cell output.
train_df.head()

In [None]:
# Feature columns: every training column except the row identifier and the label.
non_feature_cols = ['row_id', 'target']
cols = train_df.columns.drop(non_feature_cols)

## Visualization

### Histogram

In [None]:
# One histogram per feature column on a 72 x 4 grid, with a log-scaled count axis.
feature_frame = train_df[cols]
feature_frame.hist(log=True, layout=(72, 4), figsize=(20, 250))
plt.show()

### Most Correlated Columns

In [None]:
# Pairwise correlation matrix of the feature columns.
r = train_df[cols].corr()

# Entries with |r| == 1 (e.g. the diagonal) become NaN, then the matrix is
# flattened into a Series indexed by (column, column) pairs.
pairwise = r[r.abs() < 1].unstack()

# Order by absolute strength and collapse repeated coefficient values, which
# also removes the mirrored (a, b) / (b, a) entries.
ranked = pairwise.sort_values(key=np.abs, ascending=False).drop_duplicates()

top_pairs = ranked.reset_index().rename(columns={
    'level_0': 'Sample 1',
    'level_1': 'Sample 2',
    0: 'Correlation Coefficient',
})

# Display the ten strongest pairs.
top_pairs[:10]

## Cleaning Data

### Removing duplicate rows

There are several duplicated rows in the training set. They are removed so that repeated samples do not bias the model towards them.

In [None]:
# Number of training rows before duplicates are removed.
len(train_df)

In [None]:
# Drop the row identifier before de-duplicating, so rows are compared on the
# remaining columns only (features and target).
# errors='ignore' keeps this cell re-runnable: on a second execution `row_id`
# is already gone and a plain drop() would raise KeyError.
train_df = train_df.drop(columns=['row_id'], errors='ignore').drop_duplicates()

# Number of training rows left after de-duplication.
train_df.shape[0]

### Remove Anomalies

In [None]:
# Score every training row with a small isolation forest.
# random_state is fixed so the same rows are removed on every run; without it
# the filtered training set (and everything downstream) changes between runs.
anomaly_model = ensemble.IsolationForest(n_estimators=10, random_state=42)
anomalies = anomaly_model.fit_predict(train_df[cols])

# Keep only the rows whose prediction equals 1
# (presumably the inlier label -- confirm against the scikit-learn docs).
train_df = train_df.loc[anomalies == 1]

# Number of training rows left after filtering.
train_df.shape[0]

## Feature Engineering

### Aggregate individual ATGC values

In [None]:
# Each feature column name is parsed as 'A<n>T<n>G<n>C<n>'; pull out the four
# integers per column and scale them by 1/10 to get per-base weight vectors.
A = np.array([int(name.split('A')[1].split('T')[0]) for name in cols], dtype=float) / 10
T = np.array([int(name.split('T')[1].split('G')[0]) for name in cols], dtype=float) / 10
G = np.array([int(name.split('G')[1].split('C')[0]) for name in cols], dtype=float) / 10
C = np.array([int(name.split('C')[1]) for name in cols], dtype=float) / 10

# Materialise the feature matrix once; `cols` never contains the new columns,
# so it is the same matrix for all four products.
feature_matrix = train_df[cols].to_numpy()

# Weighted sum of each row's feature values, one new column per base.
for base, weights in (('A', A), ('T', T), ('G', G), ('C', C)):
    train_df[base] = np.matmul(feature_matrix, weights[np.newaxis].T)

# Display the new columns next to the label.
train_df[['A', 'T', 'G', 'C', 'target']]

### Aggregate attributes

In [None]:
# Row-wise summary statistics over the original feature columns only
# (`cols` does not include the engineered columns).
row_features = train_df[cols]
for stat in ('sum', 'mean', 'std', 'min', 'max'):
    train_df[stat] = getattr(row_features, stat)(axis=1)

# Display the new columns next to the label.
train_df[['sum', 'mean', 'std', 'min', 'max', 'target']]

In [None]:
# Separate the label from the model inputs: everything except 'target' is a feature.
label_col = 'target'
y = train_df[label_col]
X = train_df.drop(columns=[label_col])

## Model Evaluation

In [None]:
# Shared 10-fold stratified splitter for all model comparisons.
# With shuffle=True and no random_state, every cross_val_score call would draw
# a different shuffle, so the models would be scored on different folds.
# A fixed seed gives each model the same splits and makes the scores repeatable.
kfold = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Number of trees / boosting rounds used by every model below.
estimators = 300

### Random Forest

In [None]:
# Random forest baseline. random_state is fixed so the cross-validation scores
# are repeatable from run to run instead of varying with the tree randomness.
rf = ensemble.RandomForestClassifier(
    max_depth=12,
    max_leaf_nodes=63,
    n_estimators=estimators,
    n_jobs=-1,
    random_state=42,
)

# One score per fold of `kfold`, using the estimator's default scorer.
rf_performance = model_selection.cross_val_score(rf, X, y=y, cv=kfold, n_jobs=-1)

### Extra Trees

In [None]:
# Extra-trees baseline with the same size limits as the random forest.
# random_state is fixed so the cross-validation scores are repeatable.
et = ensemble.ExtraTreesClassifier(
    max_depth=12,
    max_leaf_nodes=63,
    n_estimators=estimators,
    n_jobs=-1,
    random_state=42,
)

# One score per fold of `kfold`, using the estimator's default scorer.
et_performance = model_selection.cross_val_score(et, X, y=y, cv=kfold, n_jobs=-1)

### LightGBM

In [None]:
# LightGBM hyper-parameters.
# The key was previously misspelled 'baggin_freq', which LightGBM does not
# recognise, so the bagging frequency was never set. Per the LightGBM parameter
# docs, bagging_fraction only takes effect when bagging_freq is non-zero.
params = {
    'n_estimators': estimators,
    'bagging_freq': 10,        # re-sample the bagging subset every 10 iterations
    'bagging_fraction': 0.8,   # fraction of rows used per bagging subset
    'max_depth': 12,
    'num_leaves': 63,
    'learning_rate': 0.04,
    'n_jobs': -1
}

lgbm = LGBMClassifier(**params)

# One score per fold of `kfold`, using the estimator's default scorer.
lgbm_performance = model_selection.cross_val_score(lgbm, X, y=y, cv=kfold, n_jobs=-1)

### Visualization

In [None]:
# Box plot of the per-fold cross-validation scores, one box per model.
fold_scores = [rf_performance, et_performance, lgbm_performance]
model_names = ['Random Forest', 'Extra Trees', 'LightGBM']

fig, ax = plt.subplots()
ax.boxplot(fold_scores, labels=model_names)
ax.set_title('Model Performance')
ax.set_ylabel('Multi-Class Accuracy')
fig.tight_layout()
plt.show()

Based on our experiment, it looks like LightGBM is the strongest classifier for this application.

## Training

In [None]:
# Fit the LightGBM classifier on the full engineered training set; the value
# returned by fit() is kept as `model` and used for inference below.
model = lgbm.fit(X, y=y)

## Inference

In [None]:
# Rebuild the per-column base weights in this cell rather than writing into
# arrays allocated by an earlier training cell, so the cell does not depend on
# leftover kernel state. Column names are parsed as 'A<n>T<n>G<n>C<n>' and the
# counts are scaled by 1/10.
A = np.array([int(name.split('A')[1].split('T')[0]) for name in cols], dtype=float) / 10
T = np.array([int(name.split('T')[1].split('G')[0]) for name in cols], dtype=float) / 10
G = np.array([int(name.split('G')[1].split('C')[0]) for name in cols], dtype=float) / 10
C = np.array([int(name.split('C')[1]) for name in cols], dtype=float) / 10

# Original feature columns only; `cols` never contains the engineered columns.
test_features = test_df[cols]
test_matrix = test_features.to_numpy()

# Weighted sums per base. The matmul expression and the column order
# (A, T, G, C, then sum, mean, std, min, max) follow the training set so the
# test frame lines up with what the model was fitted on.
for base, weights in (('A', A), ('T', T), ('G', G), ('C', C)):
    test_df[base] = np.matmul(test_matrix, weights[np.newaxis].T)

# Row-wise summary statistics over the original feature columns.
for stat in ('sum', 'mean', 'std', 'min', 'max'):
    test_df[stat] = getattr(test_features, stat)(axis=1)

In [None]:
# Select exactly the training feature columns, in training order, instead of
# relying on "everything except row_id" happening to line up with X. A missing
# feature now raises KeyError here rather than silently feeding misaligned
# columns to the model.
x = test_df[X.columns]
inference = model.predict(x)

In [None]:
# Pair each test row id with its predicted label.
submission = test_df[['row_id']].assign(target=inference)

# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the submission as a quick check of what was written.
submission.head()