# Overview

Wow, this dataset has a lot of rows! Moreover, many of the values appear in exponential notation, both positive and negative. This seems like a good use case for applying PCA to visually explore the bacteria x gene dataset.

After the EDA, I also compare a model trained on the PCA components against one trained on the original features.

### Credits
I'm leveraging the pre-processing steps and drawing some inspiration from the following notebooks. Huge thanks!
- https://www.kaggle.com/hasanbasriakcay/tps-feb22-eda-ignore-important-cols
- https://www.kaggle.com/odins0n/tps-feb-22-eda-modelling

# Data Loading

In [None]:
# Install joypy, which is imported below and used for the per-component joyplots.
!pip install joypy

In [None]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import joypy
from matplotlib import cm 
%matplotlib inline

warnings.simplefilter("ignore")

In [None]:

# Read the three competition CSVs in a fixed order and bind them to
# train / test / submission respectively.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
        '../input/tabular-playground-series-feb-2022/sample_submission.csv',
    )
)

In [None]:
# Print the (rows, columns) shape of each loaded table, in load order.
for frame in (train, test, submission):
    print(frame.shape)

In [None]:
# Show the column labels, then render the first rows of the training table.
print(train.columns)
display(train.head())

In [None]:
# Remove the 'row_id' column in place. errors='ignore' keeps this cell
# re-runnable: without it, a second execution raises KeyError because the
# column has already been dropped.
train.drop('row_id', axis=1, inplace=True, errors='ignore')

# Partition the remaining columns by dtype. select_dtypes keeps the
# DataFrame's column order, whereas a set difference yields arbitrary order.
numeric_cols = train.select_dtypes(include=np.number).columns.tolist()
object_cols = train.select_dtypes(exclude=np.number).columns.tolist()

# Dimensionality Reduction - PCA

## Start with 20 components, and use Scree Plot to determine a good number of components

In [None]:
from sklearn.decomposition import PCA

# Fit a 20-component PCA on every column except 'target', project the same
# rows onto those components, and re-attach the label for later grouping.
pca20_input = train.drop(columns='target')
PCA20 = PCA(n_components=20, random_state=1)
PCA20.fit(pca20_input)
npPCA20 = PCA20.transform(pca20_input)

dfPCA20 = pd.DataFrame(npPCA20)
dfPCA20['target'] = train['target']
dfPCA20.head()

In [None]:
# Scree plot: explained-variance ratio against the 1-based component number,
# with horizontal guide lines at 0.02 and 0.01.
PCA_values = np.arange(1, PCA20.n_components_ + 1)

plt.plot(PCA_values, PCA20.explained_variance_ratio_, 'o-')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
for guide_level, guide_color in ((0.02, 'orange'), (0.01, 'yellow')):
    plt.axhline(guide_level, color=guide_color)
plt.show()

In [None]:
# Tabulate the explained-variance ratio per component and show the first 10.
scree_table = pd.DataFrame(
    {
        'PC': PCA_values,
        'Expl Variance': PCA20.explained_variance_ratio_,
    }
)
scree_table.head(10)

## Based on the Scree Plot above, I would use 5 components for EDA, as they already achieve > 98% explained variance


In [None]:
# Same procedure as the 20-component fit, reduced to 5 components: fit on
# every column except 'target', project, then re-attach the label.
pca5_input = train.drop(columns='target')
PCA5 = PCA(n_components=5, random_state=1)
PCA5.fit(pca5_input)
npPCA5 = PCA5.transform(pca5_input)

dfPCA5 = pd.DataFrame(npPCA5)
dfPCA5['target'] = train['target']
dfPCA5.head()

### Profiling of each bacteria based on PCA Values Distribution

We can see that the incremental variance is very small for PC #5

In [None]:
# One joyplot per principal component: the distribution of that component's
# values, split into one ridge per 'target' value.
for i in range(5):
    dfSubset = dfPCA5[[i, 'target']]
    fig, axes = joypy.joyplot(
        dfSubset,
        by='target',
        colormap=cm.tab10,
        overlap=3,
        figsize=(15, 5),
        fill=True,
    )
    # Column labels are 0-based; the title shows the 1-based component number.
    plot_title = 'Distribution of PC Values per Species | Principal Component # {}'.format(i+1)
    axes[0].set_title(plot_title)

#### ..or a simplified version, which just takes the mean and shows it as a bar plot

In [None]:
# Mean of each principal component per 'target' value (one row per target).
# The result already has a flat, single-level column index, so the previous
# `columns.get_level_values(0)` reassignment was a no-op and has been removed.
dfPCA5mean = dfPCA5.groupby('target').mean()
dfPCA5mean

In [None]:
# Grouped bar chart: x axis = principal component (0-based label),
# one bar per 'target' value within each group.
fig, axes = plt.subplots(figsize=(20, 7))
pc_means_by_component = dfPCA5mean.T
pc_means_by_component.plot(
    y=pc_means_by_component.columns,
    kind='bar',
    ax=axes,
    title="Comparing the Avg Value of PC for each Bacteria",
    xlabel="Principal Component (minus 1)",
)

## Showing which 10-mers make up each of the 5 principal components

In [None]:
# Table of PCA5.components_ with columns labelled by the input feature names,
# drawn transposed (features as heatmap rows) with the numeric value
# annotated in every cell.
dfPCA5components = pd.DataFrame(
    PCA5.components_,
    columns=train.drop(columns='target').columns,
)
fig, axes = plt.subplots(figsize=(16, 60))
sns.heatmap(dfPCA5components.T, cmap='Greens', annot=True, ax=axes)

#### ..and the shrunken version (numberless) to see the pattern better

In [None]:
# Rebuild the components table (so this cell runs on its own) and draw a
# small, un-annotated heatmap to show the overall pattern.
component_columns = train.drop(columns='target').columns
dfPCA5components = pd.DataFrame(PCA5.components_, columns=component_columns)

fig, axes = plt.subplots(figsize=(4, 8))
sns.heatmap(dfPCA5components.T, annot=False, cmap='Greens', ax=axes)

### Simplified correlation based only on the average PCA(5) values

In [None]:
# Correlate the 'target' values with each other using their mean PC values
# (transposing puts one column per target), then cluster the matrix.
mean_pc_per_target = dfPCA5mean.T
dfCorr = mean_pc_per_target.corr()
sns.clustermap(data=dfCorr, annot=True)

# Modeling Section

In [None]:
# Install pycaret.
# NOTE(review): pycaret is not imported or used in any cell visible here —
# confirm this install is still needed.
!pip install pycaret

In [None]:
# Preview the 5-component PCA table (component columns plus 'target').
dfPCA5.head()

# Compare PCA and non-PCA on Random Forest

### Start with PCA data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'target' labels in dfPCA5 with integer codes; the fitted
# encoder is kept so the codes can be mapped back to the original labels.
encoder = LabelEncoder()
encoded_pca_target = encoder.fit_transform(dfPCA5['target'])
dfPCA5['target'] = encoded_pca_target

In [None]:
# Preview every column except the last one ('target' was appended last).
dfPCA5.iloc[:,:-1].head()

In [None]:
# 5-fold stratified cross-validation of a random forest on the PCA features.
# NOTE(review): PCA5 was fitted on all rows before the folds were split, so
# validation rows influenced the projection — confirm this is acceptable.

# Select features by name rather than by position so the split does not
# depend on 'target' being the last column.
X = dfPCA5.drop(columns='target')
y = dfPCA5['target']

listAcc_scores = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
for fold, (train_idx, valid_idx) in enumerate(skf.split(X,y)):
    print('********Fold= ', fold)
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    # Seed the forest: an unseeded forest gives different accuracies on every
    # run, which makes the PCA vs non-PCA comparison unreliable.
    # n_jobs=-1 only parallelises tree building; results are unchanged.
    modelRF = RandomForestClassifier(random_state=123, n_jobs=-1)
    modelRF.fit(X_train, y_train)
    preds_valid = modelRF.predict(X_valid)
    acc = accuracy_score(y_valid, preds_valid)
    print(f'Accuracy for fold {fold} is {acc:.4f}')
    listAcc_scores.append(acc)
print(f'Mean Accuracy: {np.mean(listAcc_scores):.4f}')

### Next, the non-PCA

In [None]:
# Preview the original (non-PCA) training table before modelling on it.
train.head()

In [None]:
# Replace the 'target' labels in the original training table with integer
# codes, using a separate encoder from the one fitted on dfPCA5.
encoder2 = LabelEncoder()
encoded_train_target = encoder2.fit_transform(train['target'])
train['target'] = encoded_train_target

In [None]:
# 5-fold stratified cross-validation of a random forest on the original
# (non-PCA) features, using the same fold seed as the PCA run.

# Select features by name: `iloc[:, :-1]` silently assumed 'target' is the
# last column of `train`.
X = train.drop(columns='target')
y = train['target']

listAcc_scores_nonPCA = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
for fold, (train_idx, valid_idx) in enumerate(skf.split(X,y)):
    print('********Fold= ', fold)
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    # Seed the forest so the fold accuracies are reproducible and comparable
    # with the PCA run. n_jobs=-1 only parallelises tree building.
    modelRF = RandomForestClassifier(random_state=123, n_jobs=-1)
    modelRF.fit(X_train, y_train)
    preds_valid = modelRF.predict(X_valid)
    acc = accuracy_score(y_valid, preds_valid)
    print(f'Accuracy for fold {fold} is {acc:.4f}')
    listAcc_scores_nonPCA.append(acc)
print(f'Mean Accuracy: {np.mean(listAcc_scores_nonPCA):.4f}')

In [None]:
# Summarise both cross-validation runs and report the gap between them.
mean_acc_pca = np.mean(listAcc_scores)
mean_acc_non_pca = np.mean(listAcc_scores_nonPCA)
print(f'Mean Accuracy of PCA model: {mean_acc_pca:.4f}')
print(f'Mean Accuracy of non-PCA model: {mean_acc_non_pca:.4f}')

# Accuracies are fractions, so multiply the difference by 100 to express it
# in percentage points, matching the "pp" label.
diff_accuracy_pp = (mean_acc_pca - mean_acc_non_pca) * 100

# Word the message according to the sign of the difference; the original
# always claimed the PCA model was better.
if diff_accuracy_pp > 0:
    print(f'PCA model has better accuracy by: {diff_accuracy_pp:.2f} pp')
elif diff_accuracy_pp < 0:
    print(f'PCA model has worse accuracy by: {-diff_accuracy_pp:.2f} pp')
else:
    print('PCA and non-PCA models have the same mean accuracy')
