# Introduction

Understanding a dataset in detail is crucial for making good decisions when building a classifier for it. I hope what is presented here is useful to the reader. If there is anything more you feel I should have explored, please let me know in the comments.

In [None]:
import numpy as np 
import pandas as pd
from dateutil.parser import parse
from typing import List
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Data

I will immediately drop the `row_id` column, as it would only cause issues in the analysis and modelling below.

In [None]:
# Read the training set and discard the row_id column in a single chained step.
df = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/train.csv"
).drop(columns="row_id")

Turn the target column into categorical codes and add them to the dataframe. I'll then drop the original target column from the dataframe and store the mapping between labels and codes in a separate dataframe for later use.

In [None]:
# Integer code for every row's target label (codes come from the categorical dtype).
target_codes = df["target"].astype("category").cat.codes

# One row per distinct (label, code) pair, kept so predictions can be decoded later.
saved_link_df = pd.DataFrame(
    {"target": df["target"], "target_code": target_codes}
).drop_duplicates()

# Swap the string label for its code; target_code ends up as the last column.
df = df.drop(columns="target").assign(target_code=target_codes)
saved_link_df

# Exploratory Data Analysis

I will now take a quick look at the data with a multiplot:

In [None]:
# Work on a renamed copy: column names are truncated to 10 characters so the
# per-subplot legends stay compact.
pltdf = df.rename(columns=lambda cname: cname[0:10])

# First 100 rows of the first 24 columns, one subplot per column.
plot_subset = pltdf.iloc[:100, :24]

# Size the grid from the number of plotted columns (6 x 4 for 24 columns).
# A fixed 20 x 4 layout reserves 80 axes for 24 series, which leaves most of
# the figure blank and squashes the real plots into the top few rows.
n_grid_cols = 4
n_grid_rows = -(-plot_subset.shape[1] // n_grid_cols)  # ceiling division
plot_subset.plot(subplots=True, layout=(n_grid_rows, n_grid_cols), figsize=(25, 20))

plt.show()

Now I will plot the class balance

In [None]:
# Bar chart of the number of rows in each encoded target class.
sns.countplot(data=df, x="target_code")

Great to see balanced target classes.

Now I will find the features most correlated with the target and plot their correlations as a heat map.

In [None]:
# Correlation of every column with the encoded target (target_code itself
# scores 1.0, so it is always kept and appears in the heat map).
# NOTE: target_code is a categorical code, so a linear correlation with it is
# only a rough screening signal.
target_corr = df.corrwith(df["target_code"])

# Keep columns whose correlation is strong in either direction; comparing the
# signed value against the threshold would silently drop every negatively
# correlated feature.
cols = target_corr[target_corr.abs() > 0.07].index.tolist()

# Pairwise correlation matrix of the selected columns only.
corrdf = df[cols].corr()

sns.heatmap(corrdf, cmap="Blues")

Not much correlation to speak of with the target.

I find a decision tree useful for looking at how the data splits.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Restrict to non-object columns, then split into labels and features.
numeric_df = df.select_dtypes(exclude="object")
target = numeric_df["target_code"]
tree_set = numeric_df.drop(columns="target_code")

# A shallow tree (depth 3) keeps the printed rules short enough to read.
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=1).fit(tree_set, target)

# Text dump of the learned split rules.
text_representation = tree.export_text(tree_clf, feature_names=list(tree_set.columns))
print(text_representation)

# Scored on the same rows the tree was fitted on.
print(f"accuracy: {tree_clf.score(tree_set, target)}")

Notice how A3T3G2C2 was the highest correlated feature and also appears in our decision tree split. 

Now we can plot the split variables.

In [None]:
f, ax = plt.subplots(2, 2, figsize=(20, 10))

# (column, upper bound used to zoom in on the low end, bar colour) for each
# panel, listed in row-major order of the 2 x 2 grid.
panels = [
    ("A2T1G4C3", 0.02, "blue"),
    ("A4T6G0C0", 0.005, "darkblue"),
    ("A4T0G4C2", 0.005, "blue"),
    ("A1T1G4C4", 0.02, "darkblue"),
]

for panel_ax, (col, upper, colour) in zip(ax.flat, panels):
    # Histogram of the values below the bound only.
    sns.histplot(x=df.loc[df[col] < upper, col], ax=panel_ax, color=colour)
    panel_ax.set_title(col)

plt.show()

Looks like there are very clear spikes in the data around certain values.

# Principal Component Analysis

Now we can use Principal Component Analysis to see how many components are needed to explain the variance in the data. First we need to scale the data. Then we will cut it down to the components that represent 95% of the variance:

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Separate out the features by dropping the label column by name; slicing off
# "the last column" by position would silently pick the wrong columns if the
# frame's column order ever changed.
x = df.drop(columns="target_code")

# Standardise the features before PCA.
x = StandardScaler().fit_transform(x)

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (95% here).
pca = PCA(.95)
principalComponents = pca.fit_transform(x)

In [None]:
# Wrap the projected data in a frame and report how many components were kept.
principalDf = pd.DataFrame(principalComponents)
print(principalDf.shape[1])

237 components account for 95% of the variance in the data, compared with the 287 feature columns we began with.

Now I will try a more extreme PCA down to 5 components and then visualise it to see if separate groups form. This can be useful in a classification task if there is separation at low dimensions.

In [None]:
# Project the scaled features onto the first 5 principal components.
# The seed is fixed because, for a small n_components on a large matrix, PCA
# may select its randomized SVD solver, whose output would otherwise vary
# from run to run.
pca = PCA(n_components=5, random_state=42)
pairedComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=pairedComponents)

In [None]:
sns.set_theme(style="white")

# Only the first 200 rows are plotted.
pair_sample = principalDf.head(200)

# Scatter plots above the diagonal, 2-D densities below, 1-D densities on it.
g = sns.PairGrid(pair_sample, diag_sharey=False)
g.map_upper(sns.scatterplot, s=15)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=2)

This is interesting: there are separate groups within the data, but the separation isn't particularly significant.

# Prediction

For this notebook I will use PyCaret to create a simple Extra Trees model.

In [None]:
!pip install pycaret

In [None]:
# Import only the PyCaret functions this notebook uses, instead of a star
# import that floods the namespace and hides where names come from.
from pycaret.classification import (
    setup,
    create_model,
    tune_model,
    predict_model,
    plot_model,
    finalize_model,
)

# Initialise the experiment on a copy of the training frame:
#   - mean imputation for numeric columns,
#   - feature normalisation,
#   - silent=True to skip the interactive dtype confirmation prompt,
#   - a fixed session_id so results are reproducible.
# The result is assigned so the cell does not echo setup's return value; the
# former argument-less display() call showed nothing and has been removed.
experiment = setup(
    data=df.copy(),
    target="target_code",
    numeric_imputation="mean",
    silent=True,
    normalize=True,
    session_id=42,
)

In [None]:
# Train a model for the estimator ID "et" within the experiment set up above.
et = create_model(estimator="et")

In [None]:
# Hyper-parameter search starting from the model above; kept under a separate
# name so the untuned model stays available.
tuned_extratrees = tune_model(estimator=et)

The tuned model is considerably worse so we will ignore it

In [None]:
# Run the untuned model without passing new data; the trailing semicolon
# hides the returned frame from the cell output.
predict_model(estimator=et);

# Look at Model Metrics

In [None]:
# 'error' plot for the model.
plot_model(estimator=et, plot="error")

In [None]:
# 'boundary' plot for the model.
plot_model(estimator=et, plot="boundary")

I will plot feature importance here. You might notice that some of the features match up with what we saw earlier in the decision tree.

In [None]:
# 'feature' (feature importance) plot for the model.
plot_model(estimator=et, plot="feature")

In [None]:
# 'auc' plot for the model.
plot_model(estimator=et, plot="auc")

# Train Model on Full Dataset

In [None]:
# Replace `et` with its finalized version (this section refits the model on
# the full dataset), then run it once more with no new data.
# NOTE(review): if the finalized model has seen the hold-out rows, the scores
# printed below are no longer an unbiased estimate - confirm before quoting them.
et = finalize_model(estimator=et)
predict_model(estimator=et);

# Create Submission

In [None]:
# Load the test set without its row_id column.
test_data = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/test.csv"
).drop(columns="row_id")

# Score the test rows with the model and preview the result.
predictions = predict_model(estimator=et, data=test_data)
predictions.head()

In [None]:
# Lookup table from integer target code back to the original target label.
label_dict = dict(zip(saved_link_df["target_code"], saved_link_df["target"]))

In [None]:
# Decode each predicted code back to its target label.
# Only the "Label" column is needed, so iterate over that column alone rather
# than materialising every full prediction row with iterrows().
# The codes are rounded and cast to int so they match the integer dictionary
# keys; an unknown code still raises KeyError, as before.
predicted_codes = predictions["Label"].round().astype(int).tolist()
labels = [label_dict[code] for code in predicted_codes]

In [None]:
# Start from the sample submission and overwrite its target column with the
# decoded predictions.
submissiondf = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/sample_submission.csv"
).assign(target=labels)
submissiondf.head()

In [None]:
# Write the submission file without the dataframe index.
submissiondf.to_csv(path_or_buf="submission.csv", index=False)