# TPS FEB 2022 EDA and Baseline

![@3dparadise Unsplash](https://images.unsplash.com/photo-1628595351029-c2bf17511435?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1032&q=50)

### Objective

Here the task is to classify 10 different bacteria species using data from a genomic analysis technique that has some data compression and data loss.

### Evaluation

`Accuracy` will be used as the metric to evaluate submissions.

$$ Accuracy = \frac{Total\ Correct\ Predictions}{Total\ Number\ of\ Observations} $$

### About Data

Each row of data contains a spectrum of histograms generated by repeated measurements of a sample, each row containing the output of all 286 histogram possibilities (e.g.,  `A0G0T0B10 to A10G0T0B0` ), which then has a bias spectrum (of totally random ATGC) subtracted from the results.

The data (both train and test) also contains simulated measurement errors (of varying rates) for many of the samples, which makes the problem more challenging.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as prep
import sklearn.model_selection as ms
import sklearn.metrics as metrics
import sklearn.ensemble as esm
import catboost as ctb
import sklearn.impute as imputer
import scipy.stats as stats
from sklearn.experimental import enable_iterative_imputer
from skopt import BayesSearchCV

import warnings
# Suppress every warning for the rest of the notebook (this also hides
# deprecation warnings raised by the libraries used below).
warnings.filterwarnings("ignore")
# Use the ggplot style for all matplotlib figures.
plt.style.use("ggplot")
# Render figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the competition tables from the Kaggle input directory: the training
# data, the test data, and the sample submission file.
train_csv = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv")
test_csv = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv")
sample = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv")

# EDA (exploratory data analysis)

### Basic EDA

In [None]:
# Report the row counts and the (rows, columns) shapes of both tables.
print("Number of samples in train_csv", len(train_csv))
print("Number of samples in test_csv", len(test_csv))
print("train_csv shape", train_csv.shape)
print("test_csv shape", test_csv.shape)

Lets take a high level look at train data

In [None]:
# Print the column summary pandas produces for the training table.
train_csv.info()

Lets visualize some samples from train csv file to know its structure

In [None]:
# Display the first 20 training rows.
train_csv.head(20)

`row_id` is just a row index and can be left out of the feature list. The target we have to predict is a string, so we will convert it to integers with an encoder.

In [None]:
# Model inputs: every training column except the row identifier and the label.
FEATURES = [col for col in train_csv.columns if col not in ["row_id", "target"]]
# Name of the label column.
TARGET = "target"

Now lets briefly take a glimpse of values range, mean and standard deviation of every column in train data and test data

In [None]:
# Summary statistics of the training columns.
train_csv.describe()

In [None]:
# Summary statistics of the test columns.
test_csv.describe()

all `286` features are float, lets also take a look if data contains any null values in both train and test csv

In [None]:
# Count missing values over all columns of each table.
print("Number of NaN values in train_csv:", train_csv.isnull().sum().sum())
print("Number of NaN values in test_csv:", test_csv.isnull().sum().sum())

No null values thats good!

### Individual Feature Distribution

We are plotting all `286` features to get the idea of distribution of these features.

In [None]:
# Offset into FEATURES of the next batch to plot; every distribution cell
# below plots FEATURES[counts:counts + rows*cols] and then advances it.
counts = 0

In [None]:
# Plot the kernel density estimate of the next batch of features on a
# rows x cols grid of subplots.
rows = 10
cols = 5
batch = FEATURES[counts:counts + rows * cols]
fig, axes = plt.subplots(rows, cols, figsize=(20, 20))
# The title reports the features actually plotted, even for a short batch.
fig.suptitle(f"Numerical continious features distribution: {counts+1} - {counts+len(batch)}", fontsize=24, y=1.01)
axes = axes.ravel()
for num, feat in enumerate(batch):
    # `fill` is the current name of the deprecated `shade` keyword.
    ax = sns.kdeplot(x=feat, data=train_csv, fill=True, ax=axes[num])
    ax.set_title(feat.upper())
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
fig.tight_layout()
# Drop grid cells that received no feature (none when the batch is full).
for unused in axes[len(batch):]:
    fig.delaxes(unused)
# Advance the shared offset for the next plotting cell.
counts += rows*cols

In [None]:
# Plot the kernel density estimate of the next batch of features on a
# rows x cols grid of subplots.
rows = 10
cols = 5
batch = FEATURES[counts:counts + rows * cols]
fig, axes = plt.subplots(rows, cols, figsize=(20, 20))
# The title reports the features actually plotted, even for a short batch.
fig.suptitle(f"Numerical continious features distribution: {counts+1} - {counts+len(batch)}", fontsize=24, y=1.01)
axes = axes.ravel()
for num, feat in enumerate(batch):
    # `fill` is the current name of the deprecated `shade` keyword.
    ax = sns.kdeplot(x=feat, data=train_csv, fill=True, ax=axes[num])
    ax.set_title(feat.upper())
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
fig.tight_layout()
# Drop grid cells that received no feature (none when the batch is full).
for unused in axes[len(batch):]:
    fig.delaxes(unused)
# Advance the shared offset for the next plotting cell.
counts += rows*cols

In [None]:
# Plot the kernel density estimate of the next batch of features on a
# rows x cols grid of subplots.
rows = 10
cols = 5
batch = FEATURES[counts:counts + rows * cols]
fig, axes = plt.subplots(rows, cols, figsize=(20, 20))
# The title reports the features actually plotted, even for a short batch.
fig.suptitle(f"Numerical continious features distribution: {counts+1} - {counts+len(batch)}", fontsize=24, y=1.01)
axes = axes.ravel()
for num, feat in enumerate(batch):
    # `fill` is the current name of the deprecated `shade` keyword.
    ax = sns.kdeplot(x=feat, data=train_csv, fill=True, ax=axes[num])
    ax.set_title(feat.upper())
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
fig.tight_layout()
# Drop grid cells that received no feature (none when the batch is full).
for unused in axes[len(batch):]:
    fig.delaxes(unused)
# Advance the shared offset for the next plotting cell.
counts += rows*cols

In [None]:
# Plot the kernel density estimate of the next batch of features on a
# rows x cols grid of subplots.
rows = 10
cols = 5
batch = FEATURES[counts:counts + rows * cols]
fig, axes = plt.subplots(rows, cols, figsize=(20, 20))
# The title reports the features actually plotted, even for a short batch.
fig.suptitle(f"Numerical continious features distribution: {counts+1} - {counts+len(batch)}", fontsize=24, y=1.01)
axes = axes.ravel()
for num, feat in enumerate(batch):
    # `fill` is the current name of the deprecated `shade` keyword.
    ax = sns.kdeplot(x=feat, data=train_csv, fill=True, ax=axes[num])
    ax.set_title(feat.upper())
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
fig.tight_layout()
# Drop grid cells that received no feature (none when the batch is full).
for unused in axes[len(batch):]:
    fig.delaxes(unused)
# Advance the shared offset for the next plotting cell.
counts += rows*cols

In [None]:
# Plot the kernel density estimate of the next batch of features on a
# rows x cols grid of subplots.
rows = 10
cols = 5
batch = FEATURES[counts:counts + rows * cols]
fig, axes = plt.subplots(rows, cols, figsize=(20, 20))
# The title reports the features actually plotted, even for a short batch.
fig.suptitle(f"Numerical continious features distribution: {counts+1} - {counts+len(batch)}", fontsize=24, y=1.01)
axes = axes.ravel()
for num, feat in enumerate(batch):
    # `fill` is the current name of the deprecated `shade` keyword.
    ax = sns.kdeplot(x=feat, data=train_csv, fill=True, ax=axes[num])
    ax.set_title(feat.upper())
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
fig.tight_layout()
# Drop grid cells that received no feature (none when the batch is full).
for unused in axes[len(batch):]:
    fig.delaxes(unused)
# Advance the shared offset for the next plotting cell.
counts += rows*cols

In [None]:
# Plot the kernel density estimate of the remaining features. The grid may
# have more cells than there are features left, so the title and the clean-up
# below are driven by the size of the batch rather than by the grid size.
rows = 8
cols = 5
batch = FEATURES[counts:counts + rows * cols]
fig, axes = plt.subplots(rows, cols, figsize=(20, 20))
# Upper bound of the range is the last feature plotted, not the last grid cell.
fig.suptitle(f"Numerical continious features distribution: {counts+1} - {counts+len(batch)}", fontsize=24, y=1.01)
axes = axes.ravel()
for num, feat in enumerate(batch):
    # `fill` is the current name of the deprecated `shade` keyword.
    ax = sns.kdeplot(x=feat, data=train_csv, fill=True, ax=axes[num])
    ax.set_title(feat.upper())
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel(None)
    ax.set_ylabel(None)
fig.tight_layout()
counts += rows*cols
# Remove every grid cell that did not receive a feature, however many there are.
for unused in axes[len(batch):]:
    fig.delaxes(unused)

All these features distribution seems to be right skewed, so we have to transform these features and try to fix these, 

some transforms for fixing right skewed distribution
- log transform
- boxcox transform
- cuberoot transform
- inverse transform

Lets also check skewness using pandas `skew` method for all features

### Features skewness and transformation

In [None]:
# Skewness of every raw feature: a bar chart first, then a plain-text table.
xcol, ycol = "Features", "Skewness"
skewness = [train_csv[name].skew() for name in FEATURES]
df = pd.DataFrame({xcol: FEATURES, ycol: skewness})

plt.figure(figsize=(20, 10))
ax = sns.barplot(x=xcol, y=ycol, data=df)
# There are too many features for readable tick labels, so hide them.
ax.tick_params(axis="x", labelbottom=False)
_ = ax.set_title("Feature Skewness Plot", fontsize=24, y=1.01)
plt.show()

print("============ Feature Skewness ==============")
print("\nFEATURES\tSKEWNESS\n")
for name, value in zip(FEATURES, skewness):
    print(f"{name}\t{value}")

All the features are right skewed (positively skewed) except two, `A2T3G2C3` and `A2T3G3C2`. Most of the features are highly right skewed, so we will focus on transforming the highly skewed ones, i.e. those whose skewness is greater than 1.

In [None]:
# Keep the features whose skewness (computed in the previous cell) exceeds 1.
skewed_features = [feat for feat, skew in zip(FEATURES, skewness) if skew>1]
print("Number of right skewed features: ", len(skewed_features))

The log and Box-Cox transforms need strictly positive input values (the logarithm is only defined for values greater than 0), so keep that in mind before applying them. These two transformations are the ones most commonly used to fix right-skewed distributions.

In [None]:
# Skewness of the highly skewed features after a Box-Cox transform:
# a bar chart first, then a plain-text table.
plt.figure(figsize=(20, 10))
xcol = "Features"
ycol = "Skewness"
df = pd.DataFrame(columns=[xcol, ycol])
df[xcol] = skewed_features
skewness = []
for feat in skewed_features:
    # The data is shifted by +1 before the transform (Box-Cox needs positive
    # input); element [0] of the result is the transformed data.
    skewness.append(pd.Series(stats.boxcox(train_csv[feat] + 1)[0]).skew())
df[ycol] = skewness
ax = sns.barplot(x=xcol, y=ycol, data=df)
ax.tick_params(axis="x", labelbottom=False)
_ = ax.set_title("Feature Skewness Plot after boxcox transform", fontsize=24, y=1.01)
plt.show()

print("\nFEATURES\tSKEWNESS\n")
# The values were computed over skewed_features, so the table must be labelled
# with skewed_features too; zipping with FEATURES pairs them with wrong names.
for feat, skew in zip(skewed_features, skewness):
    print(f"{feat}\t{skew}")

In [None]:
# Skewness of the highly skewed features after a log10(x + 1) transform:
# a bar chart first, then a plain-text table.
plt.figure(figsize=(20, 10))
xcol = "Features"
ycol = "Skewness"
df = pd.DataFrame(columns=[xcol, ycol])
df[xcol] = skewed_features
skewness = []
for feat in skewed_features:
    skewness.append(pd.Series(np.log10(train_csv[feat] + 1)).skew())
df[ycol] = skewness
ax = sns.barplot(x=xcol, y=ycol, data=df)
ax.tick_params(axis="x", labelbottom=False)
_ = ax.set_title("Feature Skewness Plot after Log transform", fontsize=24, y=1.01)
plt.show()

print("\nFEATURES\tSKEWNESS\n")
# The values were computed over skewed_features, so the table must be labelled
# with skewed_features too; zipping with FEATURES pairs them with wrong names.
for feat, skew in zip(skewed_features, skewness):
    print(f"{feat}\t{skew}")

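The Box-Cox transform fixes our issue and reduces the right skewness by a good margin, while the log transform does not show any effect here. Why? A likely explanation is that the feature values are very small compared with the `+1` shift (see the `describe()` output above), so `log10(1 + x) ≈ x / ln(10)` is almost a plain linear rescaling, and a linear rescaling does not change skewness; Box-Cox, in contrast, fits its own exponent for every feature. I still need to investigate this, so if someone knows more, please comment below so we can all learn about it.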

### Target Distribution

In [None]:
# List the distinct target labels, numbered from 1 in order of first appearance.
print("Number of Target Classes: ", train_csv[TARGET].nunique())
print("\nTarget Classes:")
for i, unique in enumerate(train_csv[TARGET].unique()):
    print(f"{i+1}- {unique}")

Lets plot a count plot of targets and check data imbalance

In [None]:
# Bar chart of the number of training rows per target class.
plt.figure(figsize=(15, 10))
ax = sns.countplot(x=TARGET, data=train_csv)
_ = ax.set_title("Target Distribution Counts", fontsize=24, y=1.01)
# The axis labels are removed; the title already says what is shown.
ax.set_xlabel(None)
ax.set_ylabel(None)
# Rotate the class names so they do not overlap.
ax.tick_params(axis="x", labelrotation=45)
plt.show()

Cool! The data is almost balanced, which is why `accuracy` was chosen as the competition metric. If the targets were imbalanced, a metric like `auc` would be a better choice.

In [None]:
# Bar chart of the share of each target class in the training data, in percent.
plt.figure(figsize=(15, 10))
# A dedicated name is used for the class frequencies so that the integer
# `counts` offset used by the feature-distribution cells is not overwritten
# with a Series.
class_counts = train_csv[TARGET].value_counts()
per = class_counts.values / len(train_csv) * 100
CLASSES = class_counts.index
xcol = "Classes"
ycol = "Percentage %"
df = pd.DataFrame(columns=[xcol, ycol])
df[xcol] = CLASSES
df[ycol] = per
ax = sns.barplot(x=xcol, y=ycol, data=df)
_ = ax.set_title("Target Distribution Percentage", fontsize=24, y=1.01)
ax.set_xlabel(None)
ax.set_ylabel(None)
ax.tick_params(axis="x", labelrotation=45)
plt.show()

### Encoding Target Variables

There are various ways to encode target variables, as they are string we cannot directly use it to train model, some common encoding includes:
- LabelEncoder
- OneHotEncoder
- BinaryEncoder
- Mean Encoding
etc.

Here we will use simple label encoding which will assign integer to target categories.

In [None]:
# Replace the string labels in train_csv with integer codes, in place.
encoder = prep.LabelEncoder()
train_csv[TARGET] = encoder.fit_transform(train_csv[TARGET])
# Original label values as stored by the fitted encoder.
CLASSES = encoder.classes_

We now transform the training and test data and save the results; later we will run cross-validation on the data both with and without the transformation.

In [None]:
# Box-Cox transform every feature of the train and test tables and save the
# transformed copies as CSV files.
train_transformed = train_csv.copy(deep=True)
test_transformed = test_csv.copy(deep=True)
for feat in FEATURES:
    # Fit the Box-Cox exponent on the training data only (values shifted by +1
    # as in the skewness analysis above) ...
    train_values, fitted_lambda = stats.boxcox(train_csv[feat] + 1)
    train_transformed[feat] = train_values
    # ... and reuse it for the test data. Fitting a second, independent
    # exponent on the test set would map train and test onto different scales,
    # so a model trained on one would see inconsistent inputs on the other.
    test_transformed[feat] = stats.boxcox(test_csv[feat] + 1, lmbda=fitted_lambda)
train_transformed.to_csv("train_transformed.csv", index=False)
test_transformed.to_csv("test_transformed.csv", index=False)

### Data correlation

In [None]:
# Heatmap of the pairwise correlations between the columns of train_csv.
plt.figure(figsize=(15, 10))
corr = train_csv.corr()
# The lower-triangular part of the matrix is passed as the mask, so that
# (redundant) half of the symmetric matrix is not drawn.
ax = sns.heatmap(corr, mask=np.tril(corr))
_ = ax.set_title("Train Correlation matrix", fontsize=24, y=1.05)

# CrossValidation

We will use an ExtraTrees classifier and run cross-validation on both the non-transformed and the transformed data. We want to find out whether the transformations we performed help to improve the cross-validation score.

In [None]:
# Random seed shared by the splitter and the models below.
seed = 42
# Number of cross-validation folds.
folds = 5

In [None]:
class CrossValidation:
    """Produce hold-out and k-fold train/validation splits of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    shuffle : bool
        When true, the rows are shuffled once at construction time (and the
        index is reset), and the k-fold splitters shuffle as well.
    random_state : int or None, optional
        Seed used for the initial shuffle, the stratified hold-out split and,
        when ``shuffle`` is true, the k-fold splitters.
    """

    def __init__(self, df, shuffle, random_state=None):
        self.df = df
        self.random_state = random_state
        self.shuffle = shuffle
        if shuffle:
            # sample(frac=1) returns all rows in random order.
            self.df = df.sample(
                frac=1, random_state=self.random_state
            ).reset_index(drop=True)

    def hold_out_split(self, percent, stratify=None):
        """Split the data into a training part and a validation part.

        Parameters
        ----------
        percent : float
            Size of the validation part as a percentage of all rows.
        stratify : str or None, optional
            Name of the column to stratify on. When omitted, the validation
            part is simply the tail of the (possibly shuffled) frame.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(train, val)``.
        """
        if stratify is not None:
            y = self.df[stratify]
            train, val = ms.train_test_split(
                self.df,
                test_size=percent / 100,
                stratify=y,
                random_state=self.random_state,
            )
            return train, val
        # The number of validation rows is truncated towards zero; the
        # remaining rows form the training part.
        size = len(self.df) - int(len(self.df) * (percent / 100))
        train = self.df.iloc[:size, :]
        val = self.df.iloc[size:, :]
        return train, val

    def kfold_split(self, splits, stratify=None):
        """Yield ``(train, val)`` DataFrame pairs, one pair per fold.

        Parameters
        ----------
        splits : int
            Number of folds.
        stratify : str or None, optional
            Name of the column to stratify on; plain k-fold when omitted.
        """
        # The scikit-learn splitters reject a random_state when shuffle is
        # False, so the seed is only forwarded when shuffling is enabled.
        fold_seed = self.random_state if self.shuffle else None
        if stratify is not None:
            kf = ms.StratifiedKFold(
                n_splits=splits, shuffle=self.shuffle, random_state=fold_seed
            )
            index_pairs = kf.split(X=self.df, y=self.df[stratify])
        else:
            kf = ms.KFold(
                n_splits=splits, shuffle=self.shuffle, random_state=fold_seed
            )
            index_pairs = kf.split(X=self.df)
        # The splitters return positional indices, hence iloc.
        for train, val in index_pairs:
            yield self.df.iloc[train, :], self.df.iloc[val, :]

### Using non-transformed Data

In [None]:
# Splitter over the non-transformed training data, shuffled with the shared seed.
cv = CrossValidation(train_csv, shuffle=True, random_state=seed)

In [None]:
%%time
# Stratified k-fold evaluation of an ExtraTrees classifier with default
# hyperparameters, using the `cv` splitter built in the previous cell.
fold_accuracy = []
fold_pairs = cv.kfold_split(splits=folds, stratify=TARGET)
for fold_no, (train_, val_) in enumerate(fold_pairs, start=1):
    print("CV fold", fold_no)
    model = esm.ExtraTreesClassifier(n_jobs=-1, random_state=seed)
    model.fit(train_[FEATURES], train_[TARGET])
    truth = val_[TARGET]
    preds = model.predict(val_[FEATURES])
    acc_score = metrics.accuracy_score(truth, preds)
    print("FOLD ACCURACY: ", acc_score)
    # Per-class precision/recall for this fold.
    print(metrics.classification_report(truth, preds))
    fold_accuracy.append(acc_score)
# The CV score is the mean accuracy over the folds.
print("CV SCORE: ", np.mean(fold_accuracy))

### Using transformed data

In [None]:
# Splitter over the Box-Cox transformed training data; this rebinds `cv`, so
# the cells below use the transformed data.
cv = CrossValidation(train_transformed, shuffle=True, random_state=seed)

In [None]:
%%time
# Stratified k-fold evaluation of an ExtraTrees classifier with default
# hyperparameters, using the `cv` splitter built in the previous cell.
fold_accuracy = []
fold_pairs = cv.kfold_split(splits=folds, stratify=TARGET)
for fold_no, (train_, val_) in enumerate(fold_pairs, start=1):
    print("CV fold", fold_no)
    model = esm.ExtraTreesClassifier(n_jobs=-1, random_state=seed)
    model.fit(train_[FEATURES], train_[TARGET])
    truth = val_[TARGET]
    preds = model.predict(val_[FEATURES])
    acc_score = metrics.accuracy_score(truth, preds)
    print("FOLD ACCURACY: ", acc_score)
    # Per-class precision/recall for this fold.
    print(metrics.classification_report(truth, preds))
    fold_accuracy.append(acc_score)
# The CV score is the mean accuracy over the folds.
print("CV SCORE: ", np.mean(fold_accuracy))

We can see a small improvement with the feature transformation:

- CV before feature transformation: `0.9940849999999999`
- CV after feature transformation: `0.9942400000000001`

# Parameter Optimization

We will be using Bayesian Search to search for best hyperparameters for model, `skopt` package provide `BayesSearchCV` that implements bayesian search for hyperparameter optimization, we will use just a subset of train data for faster parameters searches.

In [None]:
%%time
# Bayesian hyperparameter search for ExtraTrees on a small random subset of
# the transformed training data. The subset, the estimator and the search are
# all seeded so the reported best parameters are reproducible between runs.
ext_tree = esm.ExtraTreesClassifier(n_jobs=-1, random_state=seed)

# Each entry is a (low, high) integer range to search over, bounds included,
# not a list of two candidate values.
random_params = {"n_estimators": (50, 300),
                 "min_samples_leaf": (1, 10),
                 "min_samples_split": (2, 10)
                }

opt = BayesSearchCV(
    ext_tree,
    random_params,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=seed
)

# 1000 rows keep the 50 search iterations fast.
data = train_transformed.sample(1000, random_state=seed)
opt.fit(data[FEATURES], data[TARGET])
print("Best Params : ", dict(opt.best_params_))
print("Best Score : ", opt.best_score_)

In [None]:
# Best hyperparameters found by the search, as a plain dict for the final models.
MODEL_PARAMS = dict(opt.best_params_)

# Training and Predicting Test Data

In [None]:
def model_train_and_predict(cv, model_params, test_csv, seed):
    """Fit one ExtraTrees model per CV fold and collect its predictions.

    The number of folds, the feature list and the target column come from the
    notebook-level ``folds``, ``FEATURES`` and ``TARGET``.

    Parameters
    ----------
    cv : object
        Splitter exposing ``kfold_split(splits=..., stratify=...)`` that yields
        ``(train, validation)`` DataFrame pairs.
    model_params : dict
        Keyword arguments forwarded to ``ExtraTreesClassifier``.
    test_csv : pandas.DataFrame
        Test data containing the feature columns.
    seed : int
        ``random_state`` of every model.

    Returns
    -------
    tuple
        ``(val_fold_acc, valid_preds, test_preds)``: the list of per-fold
        validation accuracies, a dict mapping each validation ``row_id`` to
        its predicted label, and the list of per-fold class-probability
        arrays predicted for ``test_csv``.
    """
    fold_scores = []
    held_out_labels = {}
    test_probas = []
    fold_pairs = cv.kfold_split(splits=folds, stratify=TARGET)
    for fold_no, (fit_part, hold_part) in enumerate(fold_pairs, start=1):
        print("Training fold: ", fold_no)
        clf = esm.ExtraTreesClassifier(**model_params, n_jobs=-1, verbose=0, random_state=seed)
        clf.fit(fit_part[FEATURES], fit_part[TARGET])

        # Score the model on the rows it has not seen.
        hold_pred = clf.predict(hold_part[FEATURES])
        score = metrics.accuracy_score(hold_part[TARGET], hold_pred)
        print(f"Fold {fold_no} accuracy", score)
        fold_scores.append(score)

        # Remember the predicted label of every validation row by its row_id.
        for rid, label in zip(hold_part.row_id.values.tolist(), hold_pred):
            held_out_labels[rid] = label

        # Class probabilities (not labels) are kept for the test set.
        test_probas.append(clf.predict_proba(test_csv[FEATURES]))
    return fold_scores, held_out_labels, test_probas

In [None]:
def max_voting(predictions):
    """Return the majority-vote label of every row.

    Parameters
    ----------
    predictions : array-like of shape (n_samples, n_models)
        One column of predicted labels per model.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1)
        The label predicted by the largest number of models for each row.
        When several labels are tied, the smallest one wins.

    Raises
    ------
    ValueError
        If ``predictions`` is not two-dimensional or has no model columns.
    """
    votes = np.asarray(predictions)
    if votes.ndim != 2:
        raise ValueError("predictions must have shape (n_samples, n_models)")
    if votes.shape[1] == 0:
        raise ValueError("predictions must contain at least one model column")
    if votes.shape[0] == 0:
        # No rows to vote on; keep the documented (n_samples, 1) shape.
        return votes[:, :1].copy()
    # Taking the arg-max over the label values themselves would return the
    # largest label in each row, not the most frequent one. Count how many
    # models voted for every distinct label and pick the most frequent.
    labels = np.unique(votes)
    tallies = (votes[:, :, None] == labels[None, None, :]).sum(axis=1)
    # argmax returns the first maximum, i.e. the smallest label on a tie.
    winners = labels[tallies.argmax(axis=1)]
    return winners[:, None]

In [None]:
# 1-based number of the next training run; used in the output column and file names.
model_count = 1
# Mean validation accuracy of every run.
total_val_fold_accuracy = []
# Predicted test labels of every run, combined by voting at the end.
test_predictions = []
#Rebalancing the classes with respect to training set, credit: https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants
# One offset per class, added to the averaged class probabilities before the
# argmax; only the classes at index 2 and 3 receive a non-zero offset.
weights = np.array([0, 0, 0.03, 0.036, 0, 0, 0, 0, 0, 0])

In [None]:
%%time
# Training run with model seed 42: one ExtraTrees model per fold.
val_fold_acc, valid_preds, test_preds = model_train_and_predict(cv, MODEL_PARAMS, test_transformed, seed=42)

# Mean validation accuracy over the folds of this run.
fold_acc = np.mean(val_fold_acc)
print("Fold Accuracy: ", fold_acc)
total_val_fold_accuracy.append(fold_acc)

# Average the per-fold class probabilities, add the per-class offsets and
# take the highest-scoring class of every test row.
test_preds = sum(test_preds) / len(test_preds)
test_preds = np.argmax(test_preds + weights, axis=1)

# Save this run's labels next to their row ids and keep them for the vote.
pred_col = f"pred_{model_count}"
test_df = pd.DataFrame({"row_id": test_csv["row_id"], pred_col: test_preds})
test_df.to_csv(f"test_pred_{model_count}.csv", index=False)
test_predictions.append(test_preds)

model_count += 1

In [None]:
%%time
# Training run with model seed 111: one ExtraTrees model per fold.
val_fold_acc, valid_preds, test_preds = model_train_and_predict(cv, MODEL_PARAMS, test_transformed, seed=111)

# Mean validation accuracy over the folds of this run.
fold_acc = np.mean(val_fold_acc)
print("Fold Accuracy: ", fold_acc)
total_val_fold_accuracy.append(fold_acc)

# Average the per-fold class probabilities, add the per-class offsets and
# take the highest-scoring class of every test row.
test_preds = sum(test_preds) / len(test_preds)
test_preds = np.argmax(test_preds + weights, axis=1)

# Save this run's labels next to their row ids and keep them for the vote.
pred_col = f"pred_{model_count}"
test_df = pd.DataFrame({"row_id": test_csv["row_id"], pred_col: test_preds})
test_df.to_csv(f"test_pred_{model_count}.csv", index=False)
test_predictions.append(test_preds)

model_count += 1

In [None]:
%%time
# Training run with model seed 555: one ExtraTrees model per fold.
val_fold_acc, valid_preds, test_preds = model_train_and_predict(cv, MODEL_PARAMS, test_transformed, seed=555)

# Mean validation accuracy over the folds of this run.
fold_acc = np.mean(val_fold_acc)
print("Fold Accuracy: ", fold_acc)
total_val_fold_accuracy.append(fold_acc)

# Average the per-fold class probabilities, add the per-class offsets and
# take the highest-scoring class of every test row.
test_preds = sum(test_preds) / len(test_preds)
test_preds = np.argmax(test_preds + weights, axis=1)

# Save this run's labels next to their row ids and keep them for the vote.
pred_col = f"pred_{model_count}"
test_df = pd.DataFrame({"row_id": test_csv["row_id"], pred_col: test_preds})
test_df.to_csv(f"test_pred_{model_count}.csv", index=False)
test_predictions.append(test_preds)

model_count += 1

In [None]:
%%time
# Training run with model seed 777: one ExtraTrees model per fold.
val_fold_acc, valid_preds, test_preds = model_train_and_predict(cv, MODEL_PARAMS, test_transformed, seed=777)

# Mean validation accuracy over the folds of this run.
fold_acc = np.mean(val_fold_acc)
print("Fold Accuracy: ", fold_acc)
total_val_fold_accuracy.append(fold_acc)

# Average the per-fold class probabilities, add the per-class offsets and
# take the highest-scoring class of every test row.
test_preds = sum(test_preds) / len(test_preds)
test_preds = np.argmax(test_preds + weights, axis=1)

# Save this run's labels next to their row ids and keep them for the vote.
pred_col = f"pred_{model_count}"
test_df = pd.DataFrame({"row_id": test_csv["row_id"], pred_col: test_preds})
test_df.to_csv(f"test_pred_{model_count}.csv", index=False)
test_predictions.append(test_preds)

model_count += 1

In [None]:
%%time
# Training run with model seed 999: one ExtraTrees model per fold.
val_fold_acc, valid_preds, test_preds = model_train_and_predict(cv, MODEL_PARAMS, test_transformed, seed=999)

# Mean validation accuracy over the folds of this run.
fold_acc = np.mean(val_fold_acc)
print("Fold Accuracy: ", fold_acc)
total_val_fold_accuracy.append(fold_acc)

# Average the per-fold class probabilities, add the per-class offsets and
# take the highest-scoring class of every test row.
test_preds = sum(test_preds) / len(test_preds)
test_preds = np.argmax(test_preds + weights, axis=1)

# Save this run's labels next to their row ids and keep them for the vote.
pred_col = f"pred_{model_count}"
test_df = pd.DataFrame({"row_id": test_csv["row_id"], pred_col: test_preds})
test_df.to_csv(f"test_pred_{model_count}.csv", index=False)
test_predictions.append(test_preds)

model_count += 1

### Creating Submission

In [None]:
def create_submission(sub_name,
                      predictions, 
                      encoder,
                      template_path="../input/tabular-playground-series-feb-2022/sample_submission.csv"):
    """Write a submission CSV with the predictions decoded back to labels.

    Parameters
    ----------
    sub_name : str
        Output file name without the ``.csv`` extension.
    predictions : array-like
        Encoded class predictions, one per row of the template; a 1-D array
        or a column vector of shape ``(n, 1)``.
    encoder : object
        Fitted encoder whose ``inverse_transform`` maps the encoded
        predictions back to the original labels.
    template_path : str, optional
        Path of the sample submission file used as the template.
    """
    template = pd.read_csv(template_path)
    # The voting step hands over a column vector; flatten it so the encoder
    # receives the 1-D label array it expects.
    flat_predictions = np.ravel(predictions)
    template[TARGET] = encoder.inverse_transform(flat_predictions)
    template.to_csv(sub_name+".csv", index=False)

In [None]:
# Put the label vectors of all runs side by side (one column per run) and
# combine them into a single prediction per test row.
predictions = max_voting(np.column_stack(test_predictions))
# Display the shape of the combined predictions.
predictions.shape

In [None]:
# Decode the combined predictions and write them to submission.csv.
create_submission("submission", predictions, encoder=encoder)

In [None]:
# Read the written file back and display its first rows as a sanity check.
pd.read_csv("submission.csv").head()

I might have missed some important things; if that is the case, please let me know, as it will help all the readers of this notebook. Thanks!

