In [None]:
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import f_classif
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
#from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Read the competition files and keep an untouched copy of the training frame.
train = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/test.csv")
train_orig = train.copy()

# Count the rows per target class (any fully populated column works as the counter).
print("We have approximately the same number of observations.. for all bacteria types")
rows_per_target = (
    train[["target", train.columns[10]]]
    .groupby("target")
    .count()
    .reset_index()
)
display(rows_per_target)

In [None]:
# Show the first five training rows.
display(train.head(n=5))

In [None]:
# Show the first five test rows.
display(test.head(n=5))

In [None]:
# Map the string target labels to integer codes; `encoder` is kept so the
# codes can be translated back to label names later.
encoder = LabelEncoder()
encoder.fit(train["target"])
train["target_encoded"] = encoder.transform(train["target"])

# The whole data set is so strange and new to me (I'm an economist) that I have to pause here and simply look at individual variables in the data.
# For now, let's check column: A8T2G0C0

In [None]:
# Name of the single feature column inspected in the plot below.
check_var = 'A8T2G0C0'

In [None]:
# Overlay a violin plot (distribution shape) and a box plot (quartiles) of
# `check_var` for every bacteria type.
plt.figure(figsize=(20,10))
# `fontsize` is not a violinplot option: older seaborn silently ignored it and
# newer releases forward unknown keywords to matplotlib, which rejects it.
# Font sizes are set on the title, labels and ticks below instead.
ax = sns.violinplot(x="target", y=check_var, data=train, palette="Greens")
ax = sns.boxplot(x="target", y=check_var, data=train, color='white', ax=ax)
plt.title(check_var + " value by bacteria type  (with box and violin plots)", fontsize=20);
plt.ylabel(check_var, fontsize=14);
plt.xlabel("Bacteria name", fontsize=14);
plt.yticks(fontsize=12);
plt.xticks(fontsize=12,rotation=90);
# This is inconclusive, unfortunately: we see nothing.
# The differences in values are so big that we might have a measurement error?

# Minimal feature engineering

In [None]:
# Identify the genome feature columns, normalise their distributions and rank
# them by their one-way ANOVA F-statistic against the target.
# Assumes column 0 is `row_id` and the last two columns are `target` and
# `target_encoded`, as produced by the cells above.
genom_cols=train.columns[1:-2]
target_values=np.array(train["target"])

# One dict per feature; the frame is built once after the loop
# (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
metric_rows=[]
for col in genom_cols:
    # set the col type for sure
    train[col] = train[col].astype(np.float64)
    test[col] = test[col].astype(np.float64)

    # Shift columns with negative values so the training values are strictly
    # positive. The offset is derived from the training data and applied
    # unchanged to the test data, so identical raw values map to identical
    # feature values in both frames.
    col_min=train[col].min()
    if col_min<0.0:
        shift=abs(col_min)+0.0000001
        train[col]=train[col]+shift
        test[col]=test[col]+shift

    f_val, p_val=f_classif(np.array(train[[col]]).reshape(-1,1),target_values)
    skew_orig=train[col].skew()
    sqrt_values=np.sqrt(train[col])
    metric_rows.append({
        "variable":col,
        "f_value":f_val[0],
        "p_value":p_val[0],
        "skew_orig": skew_orig,
        "skew_sqrt": sqrt_values.skew(),
        "mean_sqrt": np.nanmean(sqrt_values)
    })

    # Strongly skewed columns are replaced by their square root.
    if abs(skew_orig)>3:
        train[col]=sqrt_values
        # Test values may fall below the training minimum; clip at zero so the
        # square root cannot produce NaN.
        test[col]=np.sqrt(test[col].clip(lower=0.0))

feature_metrics=pd.DataFrame(
    metric_rows,
    columns=["variable","f_value","p_value","skew_orig","skew_sqrt","mean_sqrt"],
).sort_values(by=["f_value"])
display(feature_metrics.tail(10))

# Scale every genome column to [0, 1]; the scaler is fitted on train only.
scaler = MinMaxScaler()
train[genom_cols]=scaler.fit_transform(train[genom_cols])
test[genom_cols]=scaler.transform(test[genom_cols])

In [None]:
# Scatter the eight features with the highest ANOVA F-value against each
# other, two at a time, coloured by bacteria type.
selected_variables = feature_metrics.tail(8)["variable"].tolist()
for pair_idx in range(4):
    x_var = selected_variables[2 * pair_idx]
    y_var = selected_variables[2 * pair_idx + 1]
    plt.figure(figsize=(20, 10))
    sns.scatterplot(data=train, x=x_var, y=y_var, hue="target")
    plt.title(x_var + " comapred to variable " + y_var, fontsize=20)
    plt.show()

In [None]:
# Further column manipulation: try to capture co-movements between features.
# For every pair among the 70 features with the highest ANOVA F-value, build
# the product and a "(1+a)/(1+b)-1" ratio; a candidate is kept only when its
# own F-value beats the better of its two parents by more than 3%.
# The name of every accepted feature is printed.
additional_cols=list()
most_important_vars=list(feature_metrics.tail(70)["variable"])

interaction_target=np.array(train["target"])


def _anova_f(values):
    """Return the one-way ANOVA F-statistic of a single feature vs. the target."""
    f_stat, _ = f_classif(np.array(values).reshape(-1,1), interaction_target)
    return f_stat[0]


# F-value of each parent feature, computed once instead of inside the loops.
parent_f={var: _anova_f(train[var]) for var in most_important_vars}

# Accepted columns are gathered here and attached in one go at the end.
new_train_cols={}
new_test_cols={}

# check multiplication (a feature paired with itself gives its square)
for i, var_i in enumerate(most_important_vars):
    for var_j in most_important_vars[i:]:
        candidate=train[var_i]*train[var_j]
        if _anova_f(candidate)>(1.03*max(parent_f[var_i],parent_f[var_j])):
            name=var_i+"_"+var_j
            new_train_cols[name]=candidate
            new_test_cols[name]=test[var_i]*test[var_j]
            additional_cols.append(name)
            print(name)

# check ratios (distinct pairs only)
for i, var_i in enumerate(most_important_vars):
    for var_j in most_important_vars[i+1:]:
        candidate=(1+train[var_i])/(1+train[var_j])-1
        if _anova_f(candidate)>(1.03*max(parent_f[var_i],parent_f[var_j])):
            name=var_i+"_"+var_j+"_ratio"
            new_train_cols[name]=candidate
            new_test_cols[name]=(1+test[var_i])/(1+test[var_j])-1
            additional_cols.append(name)
            print(name)

train_add=pd.concat([train[["row_id"]], pd.DataFrame(new_train_cols, index=train.index)], axis=1)
test_add=pd.concat([test[["row_id"]], pd.DataFrame(new_test_cols, index=test.index)], axis=1)

# Scale the engineered features to [0, 1]; skipped when nothing was accepted
# because the scaler cannot be fitted on zero columns.
scaler_addition = MinMaxScaler()
if additional_cols:
    train_add[additional_cols]=scaler_addition.fit_transform(train_add[additional_cols])
    test_add[additional_cols]=scaler_addition.transform(test_add[additional_cols])

In [None]:
def _append_new_columns(base, extra):
    """Return `base` with the columns of `extra` that it does not already have.

    Skipping columns that are already present avoids duplicating `row_id`
    and makes re-running this cell a no-op.
    """
    new_cols = [c for c in extra.columns if c not in base.columns]
    return pd.concat([base, extra[new_cols]], axis=1)


train=_append_new_columns(train, train_add)
test=_append_new_columns(test, test_add)

In [None]:
gc.collect();

# Columns fed to the clustering: the `max_vars` best original features plus
# the engineered interaction features.
max_vars = 300
orig_cols = feature_metrics.tail(max_vars)["variable"].tolist()
extended_cols = orig_cols + additional_cols

# Fit KMeans on the training rows and assign every train/test row to a cluster.
n_clusters = 10
kmeans = KMeans(
    n_clusters=n_clusters,
    random_state=42,
    max_iter=1000,
    n_init=100,
)
train_clusters = kmeans.fit_predict(train[extended_cols])
train["cluster_group"] = train_clusters
test["cluster_group"] = kmeans.predict(test[extended_cols])

In [None]:
# One-hot encode the cluster id. Casting to a categorical with the full set of
# cluster ids guarantees that train and test get the same dummy columns even
# if some cluster has no member in one of the two frames.
cluster_dtype=pd.CategoricalDtype(categories=list(range(n_clusters)))
train_cluster_dummy=pd.get_dummies(train["cluster_group"].astype(cluster_dtype),prefix="cluster_")
test_cluster_dummy=pd.get_dummies(test["cluster_group"].astype(cluster_dtype),prefix="cluster_")
cluster_cols=list(train_cluster_dummy.columns)

In [None]:
# Append the one-hot cluster columns to both frames and register them as
# model inputs.
train, test = (
    pd.concat([frame, dummies], axis=1)
    for frame, dummies in (
        (train, train_cluster_dummy),
        (test, test_cluster_dummy),
    )
)
extended_cols = extended_cols + cluster_cols

In [None]:
# Cross-tabulate cluster membership against the true bacteria class.
# confusion_matrix puts its first argument on the rows, so rows are clusters
# and columns are bacteria classes.
train_conf_matrix=confusion_matrix(train["cluster_group"], train["target_encoded"])
train_conf_matrix=pd.DataFrame(train_conf_matrix)
train_conf_matrix.columns=list(encoder.inverse_transform(range(0,10)))
train_conf_matrix.index=cluster_cols

plt.figure(figsize=(13,10))
ax = sns.heatmap(train_conf_matrix,cmap="YlGnBu",annot=True)
# The heatmap draws the frame's columns on the x axis and its index on the
# y axis, so the bacteria classes are on x and the clusters on y.
plt.xlabel("Encoded categories", fontsize=14)
plt.ylabel("Clusters", fontsize=14)
plt.title("Confusion matrix for cluster results", fontsize=20);

In [None]:
# Plot the most recently added engineered features against each other;
# hopefully we observe slightly different patterns.
selected_variables=additional_cols[-4:]
# Only complete pairs are plotted, so the cell also runs when fewer than four
# engineered features were accepted.
for i in range(0,len(selected_variables)//2):
    plt.figure(figsize=(20,10))
    sns.scatterplot(data=train, x=selected_variables[i*2], y=selected_variables[i*2+1], hue="target")
    plt.title(selected_variables[i*2]+" comapred to variable "+selected_variables[i*2+1], fontsize=20)
    plt.show()

In [None]:
# Run a garbage-collection pass; the trailing semicolon suppresses the cell output.
gc.collect();

# First with Multiple Random Forests

In [None]:
# Small stratified sample (0.5% of the rows) for quick hyper-parameter
# experiments. A fixed random_state keeps the sample reproducible.
train_small, remainder =train_test_split(train,train_size=0.005, stratify=train["target"], random_state=42)

# Full training data and the competition test set, used for the final model.
x_tree=train[extended_cols]
x_test=test[extended_cols]
y_tree=train["target_encoded"]

x_small=train_small[extended_cols]
y_small=train_small["target_encoded"]


# Stratified 66/34 hold-out split of the training data for validation.
# Seeded so the reported train/test scores can be reproduced on a re-run.
train_relevant, test_relevant =train_test_split(train,train_size=0.66, stratify=train["target"], random_state=42)
x_rel=train_relevant[extended_cols]
x_test_rel=test_relevant[extended_cols]
y_tree_rel=train_relevant["target_encoded"]
y_tree_test_rel=test_relevant["target_encoded"]

In [None]:
# random_state=42
# repeats=3
# rfs = RandomForestClassifier()
# # I've encountered some error with this gridSearchCV so I do parameter tunning with cross_val_score
# result_map_summary={}

# for n_estimators in [200,400,800,1200]:
#     for sample_split in [2,5,10, 20,50,100]:
#         rfs = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=sample_split,min_samples_leaf=1)
#         scores=list()
#         for i in range(0,repeats):
#             scores=scores+list(cross_val_score(rfs, np.matrix(x_small), np.array(y_small), cv=3))
#         result_map_summary["estimators|"+str(n_estimators)+" and split|"+str(sample_split)]=scores
#         print("estimators|"+str(n_estimators)+" and split|"+str(sample_split) + " -> done")
#         print(np.nanmean(scores))
# # Approximately 800 trees and setting to min_sample_split = 2 gives a decent result...

In [None]:
# Drop the reference to the KMeans model; no later cell in this notebook uses it.
del kmeans

In [None]:
# Run a garbage-collection pass; the trailing semicolon suppresses the cell output.
gc.collect();

In [None]:
# The score kept rising with n_estimators during tuning, so we use more
# trees here than the largest value that was tested.
preferred_params = dict(
    n_estimators=2400,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=42,
)

# Fit on the 66% split and score on both parts of the hold-out split.
rfs = RandomForestClassifier(**preferred_params)
rfs.fit(x_rel, y_tree_rel)

rf_train_score = rfs.score(x_rel, y_tree_rel)
rf_accuracy = rfs.score(x_test_rel, y_tree_test_rel)
overfit_gap = rf_train_score - rf_accuracy

print("Train: {:.2f} %".format(100 * rf_train_score))
print("Test: {:.2f} %".format(100 * rf_accuracy))
print('Overfit: {:.2f} %'.format(100 * overfit_gap))

In [None]:
# Bar chart of the 100 features with the largest forest importance,
# ordered from least to most important.
importances = rfs.feature_importances_
forest_importances = pd.DataFrame(
    {"feature": extended_cols, "importance": importances}
)
forest_importances_selection = (
    forest_importances
    .sort_values(by="importance")
    .iloc[-100:]
)

plt.figure(figsize=(20, 20))
ax = sns.barplot(
    data=forest_importances_selection,
    x="importance",
    y="feature",
)
plt.title(" Feature importance for estimated forest", fontsize=20);

In [None]:
# Confusion matrix of the forest on the hold-out part of the training data.
labels=list(encoder.inverse_transform(range(0,10)))
# confusion_matrix expects (y_true, y_pred): rows are the realized classes
# and columns the predicted ones, which is what the axis labels below state.
# Passing `labels` keeps the matrix 10x10 even if a class is never hit.
test_conf_matrix=confusion_matrix(y_tree_test_rel, rfs.predict(x_test_rel), labels=list(range(0,10)))
test_conf_matrix=pd.DataFrame(test_conf_matrix)
test_conf_matrix.columns=labels
test_conf_matrix.index=labels

plt.figure(figsize=(13,10))
ax = sns.heatmap(test_conf_matrix,cmap="YlGnBu",annot=True)
plt.xlabel("Predict", fontsize=14)
plt.ylabel("Realized", fontsize=14)
plt.title("Confusion matrix for test data (generated using splitting the original train data set)", fontsize=20);

In [None]:
# Confusion matrix of the forest on the rows it was fitted on.
# confusion_matrix expects (y_true, y_pred): rows are the realized classes
# and columns the predicted ones, which is what the axis labels below state.
train_conf_matrix=confusion_matrix(y_tree_rel, rfs.predict(x_rel), labels=list(range(0,10)))
train_conf_matrix=pd.DataFrame(train_conf_matrix)
train_conf_matrix.columns=labels
train_conf_matrix.index=labels

plt.figure(figsize=(13,10))
ax = sns.heatmap(train_conf_matrix,cmap="YlGnBu",annot=True)
plt.xlabel("Predict", fontsize=14)
plt.ylabel("Realized", fontsize=14)
plt.title("Confusion matrix for train data (generated using splitting the original train data set)", fontsize=20);

In [None]:
# submission=pd.DataFrame({
#     "row_id": test["row_id"],
#     "target": encoder.inverse_transform(rfs.predict(x_test))
# }).to_csv("submission_partial.csv",index=False)

# Creating the final model version

In [None]:
# We need to stay within the memory limit: drop the validation forest and run
# a garbage-collection pass before the final model is fitted.
# The trailing semicolon suppresses the cell output.
del rfs
gc.collect();

In [None]:
# Refit the forest with the same hyper-parameters on every training row.
rfs_fin = RandomForestClassifier(**preferred_params).fit(x_tree, y_tree);

In [None]:
# Predict the test set, translate the integer codes back to bacteria names
# and write the submission file.
val=encoder.inverse_transform(rfs_fin.predict(x_test))

# Take the ids from the test data itself instead of assuming they continue
# where the training rows end. If `row_id` is present more than once in the
# frame, the selection is a DataFrame and its first column is used.
submission_row_ids=test["row_id"]
if isinstance(submission_row_ids, pd.DataFrame):
    submission_row_ids=submission_row_ids.iloc[:, 0]

submission=pd.DataFrame({
    "row_id": submission_row_ids.to_numpy(),
    "target": list(val)
})
submission.to_csv("submission.csv",index=False)