In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Please ask or give me suggestions in the comment section, I'd really appreciate it.

In [None]:
# importing useful packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()  # apply seaborn's default theme to the plots drawn below

## First dataset overview

**Importing Datasets**

In [None]:
# Load the competition's train and test tables.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'../input/tabular-playground-series-feb-2022/train.csv',
        r'../input/tabular-playground-series-feb-2022/test.csv',
    )
)

In [None]:
# Report the size of each dataset, one per line (message spelling fixed).
print(f'Training set is composed of {train.shape[0]} rows and {train.shape[1]} columns',
      f'Test set is composed of {test.shape[0]} rows and {test.shape[1]} columns',
      sep='\n')

- Test set has half of training set rows.
- There are 287 predictors!

It's a very challenging problem for me.

In [None]:
# Summarise column dtypes, non-null counts and memory usage of the training frame.
train.info()

All the predictors are numeric (float and only one integer). The only qualitative column is the target one.
Let's see the cardinality of the integer

In [None]:
# Peek at the first rows of the integer-typed columns only.
train.select_dtypes(include='integer').head()

It is the `row_id` column; I delete it from both the train and test datasets.

In [None]:
# `row_id` is dropped from both frames.
# Re-assigning (instead of `inplace=True`) together with `errors='ignore'` keeps this cell
# idempotent: running it a second time no longer raises KeyError.
train = train.drop(columns='row_id', errors='ignore')
test = test.drop(columns='row_id', errors='ignore')

In [None]:
# Show the first rows of the training frame.
train.head()

So now we have 286 predictors - each one is the sampled histogram count of a particular base composition in the corresponding DNA snippet - and the target column - the corresponding bacteria species.

**Duplicates**

In [None]:
# Count fully duplicated rows in each dataset and report both numbers.
train_dupes = train.duplicated().sum()
test_dupes = test.duplicated().sum()
print(f'Duplicate rows in train dataset: {train_dupes}',
      f'Duplicate rows in test dataset: {test_dupes}',
      sep='\n')

It's better to drop them.

In [None]:
# Remove duplicated training rows, then re-check the shapes and the remaining duplicate counts.
train = train.drop_duplicates()
remaining_dupes = (train.duplicated().sum(), test.duplicated().sum())
print(train.shape, test.shape, *remaining_dupes)

## EDA

#### Target variable

Unique bacteria species proportion to predict:

In [None]:
# Absolute and relative frequency of each species, formatted as strings for display.
class_counts = train['target'].value_counts().map('{:,}'.format)
class_shares = train['target'].value_counts(normalize=True).map('{:.4f}'.format)
target_proportion = pd.concat([class_counts, class_shares], axis=1)
target_proportion

Fortunately the species proportions are balanced across the dataset, so we won't have to resample it.
A graph to show clearly classes counts:

In [None]:
# One horizontal bar per species, ordered like the rows of `target_proportion`.
sns.countplot(y=train['target'], order=target_proportion.index)

- There are 10 bacteria species to classify
- The dataset is balanced across the classes

#### Predictors

As we saw earlier all the predictors are float numbers.

###### **1. Missing Values**

In [None]:
# Total number of missing cells in the training frame (summed over all columns).
train.isna().sum().sum()

In [None]:
# Total number of missing cells in the test frame (summed over all columns).
test.isna().sum().sum()

Fortunately there are no NAs in either the train set or the test set.

###### **2. Deep dive in numerical predictors**

Let's see if there is any discrete predictor. I don't think I will find any since they are all floats.

*Note: I usually treat a numeric feature with at most 25 unique values as discrete; otherwise I consider it continuous.*

**2.1 Discrete features**

In [None]:
# Split off the predictors and flag as "discrete" every column with at most 25 distinct values.
train_predictors = train.drop(columns='target')
distinct_counts = train_predictors.nunique(dropna=False)
discrete_features = distinct_counts[distinct_counts <= 25].index.tolist()
print(f'Discrete variables count: {len(discrete_features)}')

Unexpectedly there are 8 discrete variables. Let's analyze them first, and afterwards we will look at the continuous ones.

In particular I want to see their relationship with target column.

In [None]:
# Per-class mean of every discrete feature, one horizontal bar chart per feature (4x2 grid).
fig, axes = plt.subplots(4, 2, figsize=(32, 25))
panels = axes.ravel()

# Aggregate once for all features. groupby sorts the classes, so the bar labels are taken
# from the aggregated index: pairing the widths with `train['target'].unique()` (order of
# first appearance) would attach the bars to the wrong species.
class_means = train.groupby('target')[discrete_features].mean()

for position, feature in enumerate(discrete_features):
    current_axes = panels[position]
    current_axes.barh(y=class_means.index, width=class_means[feature])
    # Bar length is the feature mean (x axis); the classes sit on the y axis.
    current_axes.set_xlabel(f'{feature} (mean)')
    current_axes.set_ylabel('Target')
    current_axes.set_title(feature)

We can see that some of these DNA snippets could be very important for classifying the bacteria species:
- if `A0T0G0C10` > 0 or `A0T0G10C0` > 0 the bacteria can be a `Campylobacter_jejuni`
- if `A0T0G1C9` > 0 or `A0T0G9C1` is very close to 0 the bacteria can be a `Klebsiella_pneumoniae`
- if `A0T10G0C0` > 0 the bacteria can be a `Enterococcus_hirae`
- if `A0T10G0C0` > 0  the bacteria can be a `Enterococcus_hirae` or a `Streptococcus_pneumoniae`


In [None]:
# Box plot of every discrete feature, split by target class, laid out on a 4x2 grid.
fig, axes = plt.subplots(4, 2, figsize=(32,25))
panels = axes.ravel()
for position, feature in enumerate(discrete_features):
    sns.boxplot(y='target', x=feature, data=train, ax=panels[position])

There are few outliers that may have distorted my deductions on bar plots.

Later I'll have to understand how to deal with them, but for now let's look at the same plots using median instead of mean values.

In [None]:
# Per-class median of every discrete feature, one horizontal bar chart per feature (4x2 grid).
fig, axes = plt.subplots(4, 2, figsize=(32,25))
panels = axes.ravel()

# Aggregate once for all features. groupby sorts the classes, so the bar labels are taken
# from the aggregated index: pairing the widths with `train['target'].unique()` (order of
# first appearance) would attach the bars to the wrong species.
class_medians = train.groupby('target')[discrete_features].median()

for position, feature in enumerate(discrete_features):
    current_axes = panels[position]
    current_axes.barh(y=class_medians.index, width=class_medians[feature])
    # Bar length is the feature median (x axis); the classes sit on the y axis.
    current_axes.set_xlabel(f'{feature} (median)')
    current_axes.set_ylabel('Target')
    current_axes.set_title(feature)

From these graphs it's clear that I was wrong, but some features still seem to clearly separate the bacteria species.

**2.2 Continuous features**

Continuous features distribution:

In [None]:
# Every predictor that was not flagged as discrete, kept in the original column order.
# (A set difference iterates strings in a different order on each kernel start, which
# would move the histograms around between runs.)
discrete_lookup = set(discrete_features)
continuos_features = [col for col in train_predictors.columns if col not in discrete_lookup]

# One 50-bin histogram per continuous feature on a 70x4 grid.
fig, axs = plt.subplots(70, 4, figsize=(16, 300))
panels = axs.ravel()
for position, feature in enumerate(continuos_features):
    current_axs = panels[position]
    # Read the column straight from `train`; copying the whole frame per feature is not needed.
    current_axs.hist(train[feature], bins=50)
    current_axs.set_xlabel(feature)
    current_axs.set_ylabel('Count')

# Hide the grid cells that did not receive a histogram.
for unused_axs in panels[len(continuos_features):]:
    unused_axs.set_visible(False)

We see that most of the continuous features are skewed. It would be good practice to transform them towards a more symmetric, normal-like distribution before modelling.

**2.3 Outliers**

## Preprocessing

There are too many predictors in the dataset. We have to perform a preprocessing algorithm to reduce dimensionality in the data.
I will try with:
- LDA: linear discriminant analysis
- PCA: principal component analysis

**LDA**

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Fit a 2-component LDA on a reproducible 10,000-row sample and plot the projection coloured by target.
train_sample = train.sample(10000, random_state=123)
sample_X = train_sample.drop(columns='target')
sample_y = train_sample.target
lda_data = LDA(n_components=2).fit_transform(sample_X, sample_y)
plt.figure(figsize=(10,10))
sns.scatterplot(data=train_sample, x=lda_data[:, 0], y=lda_data[:, 1], hue='target')

We can see that there is a clear pattern in different bacteria species values.

**PCA**

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler # the first thing to do when you run a PCA is to use a standard scaler or a min max scaler on data

# Standardise every predictor with StandardScaler's default settings.
scaler = StandardScaler().fit(train_predictors)
scaled_data = scaler.transform(train_predictors)

In [None]:
# Project the standardised predictors onto the first 30 principal components.
# random_state pins the result whenever scikit-learn selects its randomized SVD solver,
# so the explained-variance figures below are reproducible across runs.
pca = PCA(n_components=30, random_state=123)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

In [None]:
# Percentage of variance explained by each principal component, rounded to one decimal.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
labels = ['PC' + str(x) for x in range(1,len(per_var)+1)]

# Scree plot: one bar per component (axis label spelling fixed).
plt.figure(figsize=(15, 10))
plt.bar(x=range(1, len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of explained variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()

In [None]:
# Running total of the explained-variance percentages, component by component.
per_var.cumsum()

Even taken together, the 30 principal components do not explain enough of the original data's variance. I think that this algorithm doesn't work well in this particular case.

###### Please comment below if you know why, or if you want to discuss about PCA topic, or if you have any suggestions or you know useful articles about this topic/algorithm

In [None]:
# Wrap the PCA-transformed array in a DataFrame.
x_pca = pd.DataFrame(x_pca)

## Modelling

**Baseline ExtraTreesClassifier Model**



In [None]:
# Keyword arguments for the baseline ExtraTreesClassifier.
# `max_features='auto'` was deprecated in scikit-learn 1.1 and removed in 1.3; for
# classifiers it meant sqrt(n_features), so 'sqrt' keeps the same behaviour on every version.
params = dict(n_estimators=1000,
              criterion='gini', 
              max_depth=None, 
              min_samples_split=2, 
              min_samples_leaf=1, 
              min_weight_fraction_leaf=0.0, 
              max_features='sqrt', 
              max_leaf_nodes=None, 
              min_impurity_decrease=0.0, 
              bootstrap=False, 
              oob_score=False, 
              n_jobs=-1,           # use all available cores
              random_state=123,    # fixed seed for reproducible trees
              verbose=0, 
              warm_start=False, 
              class_weight=None, 
              ccp_alpha=0.0, 
              max_samples=None
             )

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the species names in `target` as integer class labels.
le = LabelEncoder().fit(train['target'])
y = le.transform(train['target'])

In [None]:
from sklearn.ensemble import ExtraTreesClassifier as et
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score

# Hold out 25% of the rows to validate the baseline model.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_data, y, test_size=0.25, random_state=123)

model = et(**params)
model_fit = model.fit(X_train, y_train)  # fit() returns the fitted estimator itself
pred = model_fit.predict(X_test)

# The hold-out predictions were previously computed and then discarded; score them so
# the baseline has a validation figure.
print(f'Hold-out accuracy: {accuracy_score(y_test, pred):.4f}')

In [None]:
# Load the sample submission file.
sub = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")

In [None]:
# The model was trained on standardised predictors (`scaled_data`), so the test predictors
# must go through the same fitted scaler before prediction; feeding the raw frame would
# evaluate the trees' split thresholds on a different scale.
sub['target'] = model_fit.predict(scaler.transform(test))
# Map the integer class labels back to the species names.
sub['target_le'] = le.inverse_transform(sub['target'])
sub.head()

In [None]:
# Write the decoded species names under the header `target`, the column name used in
# `sub` for predictions, instead of the helper name `target_le`.
(sub[['row_id', 'target_le']]
 .rename(columns={'target_le': 'target'})
 .to_csv("baseline-submission-new.csv", index=False))