In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from IPython.display import clear_output
from tqdm import tqdm
import time

## Importing Data

In [None]:
# Load the training CSV, using its row_id column as the index.
df_train = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/train.csv",
    index_col="row_id",
)

In [None]:
# Load the test CSV, using its row_id column as the index.
df_test = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/test.csv",
    index_col="row_id",
)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Summary statistics, transposed so each column of df_train becomes a row.
df_train.describe().T

In [None]:
# Number of rows per target class.
# groupby().count() yields one non-null count per remaining column; averaging
# those counts across columns gives the same value as the former
# `.sum(axis=1)/286` without hard-coding the number of feature columns.
total_count = df_train.groupby("target").count().mean(axis=1)

In [None]:
# Submission template; its "target" column is overwritten further down.
sample_submission = pd.read_csv(
    "../input/tabular-playground-series-feb-2022/sample_submission.csv"
)

In [None]:
# Display the per-class row counts.
total_count

In [None]:
# Map each target label to an integer code 1..n, in order of first appearance
# in the training data. enumerate() covers however many distinct labels exist;
# the previous zip(..., range(1, 11)) silently dropped any label past the 10th.
categorical_dict = {
    label: code
    for code, label in enumerate(df_train["target"].unique().tolist(), start=1)
}

In [None]:
# Display the label -> integer code mapping.
categorical_dict

In [None]:
# Reverse lookup: integer code -> target label.
inverse_categorical_dict = {code: label for label, code in categorical_dict.items()}

In [None]:
# Display the target column of the training frame.
df_train["target"]

In [None]:
# Display the target labels (the keys of categorical_dict) as a list.
list(categorical_dict.keys())

## EDA

In [None]:
# Per-class mean and standard deviation of every column, grouped by target.
_by_target = df_train.groupby("target")
mean_df = _by_target.mean()
std_df = _by_target.std()
mean_df.head()

In [None]:
# Preview the per-class standard deviations.
std_df.head()

In [None]:
# Average of the per-column means / standard deviations for each class.
# Dividing by the actual column count replaces the former hard-coded 286.
mean_df.sum(axis=1) / mean_df.shape[1], std_df.sum(axis=1) / std_df.shape[1]

In [None]:
# Split the training frame into the label column and everything else.
y = df_train["target"]
X = df_train.drop(columns="target")

In [None]:
def single_bacteria_plot(row=85098):
    """Plot one training row as overlaid bar and line charts.

    Reads the module-level ``X`` (looked up with ``X.loc[row]``), ``y`` and
    ``categorical_dict``, so it must be called while ``X`` is still the
    DataFrame of features and ``y`` still holds the original labels.

    Parameters
    ----------
    row : index label
        Label of the row to plot, as used by ``X.loc`` / ``y.loc``.

    The figure title is the tuple ``(label, integer code)`` for that row.
    """
    # An explicit Axes replaces the former bare `plt.axes` expression, which
    # only referenced the function object and never called it.
    fig, ax = plt.subplots(figsize=(10, 4))

    sample = X.loc[row]
    sample.plot.bar(ax=ax)
    sample.plot(ax=ax)

    # One tick per column would be unreadable, so the x ticks are hidden.
    ax.xaxis.set_ticks(())

    label = y.loc[row]
    ax.set_title((str(label), categorical_dict[label]))
    plt.show()

In [None]:
# Plot the training row whose row_id is 50000.
single_bacteria_plot(row=50000)

In [None]:
# x positions 0.0, 1.0, ..., 285.0 used by the scatter plot of class means.
arr = np.arange(286, dtype=float)

In [None]:

# Overlay the mean profile of the first ten classes in a single figure.
plt.figure(figsize=(12, 6))
for position in range(10):
    class_label = mean_df.index[position]
    class_profile = mean_df.iloc[position]
    sns.scatterplot(x=arr, y=class_profile, label=class_label)

In [None]:
# One figure per class: per-column mean and standard deviation, each drawn as
# bars plus a line on the same Axes.
for bacteria in mean_df.index:
    # An explicit Axes replaces the former bare `plt.axes` expression, which
    # only referenced the function object and never called it.
    fig, ax = plt.subplots(figsize=(10, 4))

    class_mean = mean_df.loc[bacteria]
    class_std = std_df.loc[bacteria]

    class_mean.plot.bar(ax=ax, label="mean bar", alpha=1)
    class_mean.plot(ax=ax, label="mean")

    class_std.plot.bar(ax=ax, label="std bar", color="orange", alpha=0.5)
    class_std.plot(ax=ax, label="std")

    ax.legend()
    # One tick per column would be unreadable, so the x ticks are hidden.
    ax.xaxis.set_ticks(())
    ax.set_title(bacteria)
    plt.show()

## PCA and LDA

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Rebuild the feature frame from df_train (every column except the target).
X = df_train.drop(columns="target")


In [None]:
# Target labels under their own name; pca_lda_plot builds its class masks from yy.
yy = df_train.loc[:, "target"]

In [None]:
# Display the target labels.
yy

In [None]:
# Row-wise normaliser; "l2" is scikit-learn's default norm, spelled out here.
normalizer = Normalizer(norm="l2")

In [None]:
# Normalise every row of X; from here on X is the transformer's output array,
# no longer the DataFrame built from df_train.
X = normalizer.fit(X).transform(X)

In [None]:
# Project the normalised feature matrix onto two components, once with PCA
# (unsupervised) and once with LDA (uses the class labels).
n_components = 2

pca = PCA(n_components=n_components)
X_r = pca.fit(X).transform(X)

lda = LinearDiscriminantAnalysis(n_components=n_components)
# Fit on `yy`, the labels defined for this section and used by pca_lda_plot
# for its class masks, rather than on `y` left over from the EDA section.
X_r2 = lda.fit(X, yy).transform(X)

In [None]:
def _scatter_by_class(projected, component_row_n, s, title, xlabel, ylabel, **scatter_kwargs):
    """Scatter the first two columns of `projected`, one colour/marker per class."""
    colors = ["cyan", "r","blue", "lightblue","peru","gray","purple","yellow","green","lightpink"]
    markers = ["o","v","^","<",">","p","*","h","H","+"]

    plt.figure(figsize=(20, 8))
    # zip stops at the shortest input, so at most ten classes are drawn.
    for color, marker, class_label in zip(colors, markers, categorical_dict):
        # Select this class's rows, then keep at most component_row_n of them.
        class_mask = np.asarray(yy == class_label)
        class_points = projected[class_mask][:component_row_n]
        plt.scatter(
            class_points[:, 0],
            class_points[:, 1],
            color=color,
            marker=marker,
            alpha=0.75,
            s=s,
            label=class_label,
            **scatter_kwargs,
        )
    plt.legend(loc="best", shadow=False, scatterpoints=1, prop={'size': 13}, markerscale=3.)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def pca_lda_plot(component_row_n=200000,s=10):
    """Draw the 2-D PCA and LDA projections, coloured by target class.

    Reads the module-level ``pca``, ``n_components``, ``X_r`` (PCA projection),
    ``X_r2`` (LDA projection), ``yy`` (labels) and ``categorical_dict``.

    Parameters
    ----------
    component_row_n : int
        Maximum number of points drawn per class in each figure.
    s : number
        Marker size passed to ``plt.scatter``.

    Changes from the earlier version:
    * ``s`` is honoured; it used to be overwritten by a local ``s=10``.
    * The first component is drawn on the x axis and the second on the y axis,
      matching the axis labels; the columns used to be swapped.
    * ``cmap`` is no longer passed, since it has no effect next to an explicit
      ``color``.
    """
    print(
        "explained variance ratio (first",n_components,"components): %s"
        % str(pca.explained_variance_ratio_)
    )

    _scatter_by_class(
        X_r,
        component_row_n,
        s,
        title="PCA of Bacterial Species ",
        xlabel='First principal component',
        ylabel='Second Principal Component',
        lw=1,
    )
    _scatter_by_class(
        X_r2,
        component_row_n,
        s,
        title="LDA of Bacterial Species",
        xlabel='First LDA component',
        ylabel='Second LDA Component',
    )

    plt.show()
    

In [None]:
# Draw both projections, capped at 100000 points per class.
pca_lda_plot(component_row_n=100000,s=10)

In [None]:
# Average log-likelihood of the samples in X under the fitted PCA model
# (as documented for scikit-learn's PCA.score).
pca.score(X)

In [None]:
# Per-sample log-likelihood under the fitted PCA model
# (as documented for scikit-learn's PCA.score_samples).
pca.score_samples(X)

In [None]:
# Variance explained by each of the fitted principal components.
pca.explained_variance_

In [None]:
from IPython.display import display
from sklearn.feature_selection import mutual_info_regression


# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so use whichever name this
# installation provides and fall back to the default style otherwise.
_whitegrid_style = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "default",
)
plt.style.use(_whitegrid_style)

# Global figure/axes defaults for the plots that follow.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)


def plot_variance(pca, width=8, dpi=100):
    """Plot explained and cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    pca : fitted estimator
        Must expose ``n_components_`` and ``explained_variance_ratio_``.
    width : number, default 8
        Figure width, applied through ``fig.set(figwidth=...)``.
    dpi : number, default 100
        Figure resolution.

    Returns
    -------
    The array of two Axes returned by ``plt.subplots(1, 2)``: a bar chart of
    the per-component ratio on the left, the cumulative curve on the right.
    """
    fig, axs = plt.subplots(1, 2)
    component_ids = np.arange(1, pca.n_components_ + 1)

    # Left panel: variance ratio of each component.
    evr = pca.explained_variance_ratio_
    axs[0].bar(component_ids, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )

    # Right panel: running total, with a (0, 0) point prepended to the curve.
    cumulative = np.cumsum(evr)
    axs[1].plot(np.r_[0, component_ids], np.r_[0, cumulative], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )

    # Apply the caller's width/dpi; these used to be hard-coded to 8 and 100,
    # which made both parameters dead.
    fig.set(figwidth=width, dpi=dpi)
    return axs


In [None]:
# Explained-variance plots for the two-component PCA fitted above.
plot_variance(pca)

## Model Training

In [None]:
# Replace the target labels in df_train with their integer codes from
# categorical_dict. This overwrites the column in place.
_labels = list(categorical_dict.keys())
_codes = list(categorical_dict.values())
df_train["target"] = df_train["target"].replace(_labels, _codes)

In [None]:
# Features and (now integer-coded) target for model training.
y = df_train["target"]
X = df_train.drop(columns="target")

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler ,Normalizer,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix


In [None]:
# Train XGBoost and LightGBM on three different train/hold-out splits and keep,
# per run: the fitted model, its hold-out micro-F1, its hold-out confusion
# matrix and its class probabilities for the competition test set.
model = {}
scores = {}
pred_proba = {}
conf_mats = {}

X = df_train.drop("target",axis=1)
y = df_train["target"]

scaler = StandardScaler()
X = scaler.fit_transform(X)
# Keep df_test untouched and store the scaled copy under its own name, so
# re-running this cell does not scale already-scaled data a second time.
X_submission = scaler.transform(df_test)

for rs in range(3):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=rs)
    for n_est in [1000]:

        print("__"*10+f"n_est: {n_est}, random state: {rs}"+"__"*15)

        # NOTE(review): `max_features` and `silent` are not documented
        # parameters of these estimators in recent releases - confirm they are
        # still accepted by the installed versions.
        # NOTE(review): recent XGBoost releases expect class labels 0..n-1 and
        # deprecate tree_method="gpu_hist"; the targets here are coded from 1 -
        # confirm against the installed XGBoost version.
        xgb = XGBClassifier(n_estimators = n_est,max_features=None,
                            random_state=rs,
                            tree_method="gpu_hist" ,verbosity = 0 )
        lgbm = LGBMClassifier(n_estimators = n_est,max_features=None,silent=True,
                              random_state=rs,
                              verbose =-100,
                              device='gpu' )
        # CatBoost was disabled in this notebook and is not trained here.

        for name, estimator in (("xgb", xgb), ("lgbm", lgbm)):
            run_key = f"{name}_random_state_{rs}_n_est_{n_est}"

            estimator.fit(X_train, y_train)
            # Predict the hold-out set once and reuse it for both metrics.
            holdout_pred = estimator.predict(X_test)
            holdout_f1 = f1_score(y_test, holdout_pred, average="micro")
            print(name + " " + " for n_estimator : %d ,accuracy on test data: %1.4f" % (n_est, holdout_f1))

            # Every fitted model is stored. Previously the XGBoost model was
            # never kept, and the undefined name `catboost` was stored instead,
            # which raised NameError.
            model[run_key] = estimator
            scores[run_key] = holdout_f1
            pred_proba[run_key] = estimator.predict_proba(X_submission)
            conf_mats[run_key] = confusion_matrix(y_test, holdout_pred)

## Confusion Matrices

In [None]:
# One annotated heatmap per stored confusion matrix, titled with its run key.
# Iterating over items() replaces the former `model = 0` counter, which
# overwrote the `model` dict holding the trained estimators.
class_names = list(categorical_dict.keys())
for run_key, conf_mat in conf_mats.items():
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(conf_mat, annot=True, cmap="YlGnBu", fmt="2g",
                xticklabels=class_names,
                yticklabels=class_names, ax=ax)

    ax.set_xticklabels(class_names, fontdict={'fontsize': 11})

    ax.set_title(run_key)
    # Slant the x labels so long class names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=-45, ha="left", rotation_mode="anchor")
    plt.show()
    print("")

## Submission

In [None]:
# Average the class probabilities of all stored runs, then take the
# most probable class column for every test row.
_stacked_proba = np.array(list(pred_proba.values()))
_mean_proba = _stacked_proba.mean(axis=0)
predictions = _mean_proba.argmax(axis=1)

In [None]:
# Display the predictions shifted by one (argmax column indices start at 0,
# the codes in categorical_dict start at 1).
predictions+1

In [None]:
# Store the 1-based class codes in the submission frame.
sample_submission["target"] = 1 + predictions

In [None]:
# Translate the integer codes back to the original target labels.
_codes = list(categorical_dict.values())
_labels = list(categorical_dict.keys())
sample_submission["target"] = sample_submission["target"].replace(_codes, _labels)

In [None]:
# Number of submission rows assigned to each predicted label.
sample_submission.groupby("target").count()

In [None]:
# Write the submission file without the DataFrame index.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
sample_submission.head()