# Summary 

* v1 base model with CV 
* v2 Pseudo labels
* v3 Z-score of each sample for each bacteria 
* V4 - add pseudo labels
* v5 feature elimination

# Additional steps to improve 

* Extra Trees seems to come out on top, but it is overfitting (score on PB LB = 0.94) - we should regularize it  
* Noise was added in the paper, and this could help regularize the ET 
* Our data is not imbalanced; however, our accuracy is already in the 90% range, so any small improvement is good -- potentially apply imbalanced scoring metrics? 
* Clustering could also create additional features

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import metrics
from collections import Counter

from scipy.stats import mode
from math import factorial

import gc

In [None]:
# Experimental parameters: sizes and on/off switches read by the cells below.

ITERATIONS = 1030  # n_estimators of the ExtraTreesClassifier
SEED = 42  # random_state of the ExtraTreesClassifier
FOLDS = 10  # n_splits of the StratifiedKFold

PSEUDO = True  # when True, agreed-upon test predictions are appended to train
CLUSTER = True  # when True, DBSCAN cluster labels are added as features

K_BEST_COLS = 250  # k handed to SelectKBest

DROP_DUPS = True  # when True, duplicated training rows are dropped

# Import Data

In [None]:
# Load the competition files; the first CSV column becomes each frame's index.
train = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv", index_col = 0) 
test = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv", index_col = 0) 
sub = pd.read_csv("../input/tabular-playground-series-feb-2022/sample_submission.csv", index_col = 0)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# One column is excluded from the count — presumably the `target` column; confirm against train.csv.
print( "Total segments" , len( train.columns ) -1 ) 

# Feature engineering 
#### From the master Luca Massaron 
https://www.kaggle.com/lucamassaron/basic-eda-and-model-to-start/notebook#Feature-engineering

In [None]:
# Every column except the last one (presumably `target` — confirm against train.csv).
features = train.columns[:-1]

In [None]:
def feature_engineering(df, feature_cols=None):
    """Add row-wise summary statistics of the segment columns to ``df`` in place.

    Parameters:
        df: frame holding the segment columns; the statistic columns are
            written straight into it.
        feature_cols: optional list of columns to summarise. When omitted the
            module-level ``features`` is used.

    Returns:
        The same ``df`` object, now carrying the extra columns.
    """
    cols = features if feature_cols is None else list(feature_cols)
    # Snapshot the inputs once so none of the columns added below is re-read.
    values = df[cols]

    df['mean'] = values.mean(axis=1)
    df['median'] = values.median(axis=1)

    # Column name -> quantile level; 'q60' really is the 0.60 quantile.
    quantile_levels = (
        ('q01', 0.01),
        ('q05', 0.05),
        ('q10', 0.10),
        ('q25', 0.25),
        ('q40', 0.40),
        ('q60', 0.60),
        ('q75', 0.75),
        ('q90', 0.90),
        ('q95', 0.95),
        ('q99', 0.99),
    )
    for name, level in quantile_levels:
        df[name] = values.quantile(q=level, axis=1)

    df['max'] = values.max(axis=1)
    df['min'] = values.min(axis=1)

    # Spread and shape of each row.
    df['std'] = values.std(axis=1)
    df['range'] = df['max'] - df['min']
    df['iqr'] = df['q75'] - df['q25']
    df['tails'] = df['range'] / df['iqr']
    df['dispersion'] = df['std'] / df['mean']
    df['dispersion_2'] = df['iqr'] / df['median']
    df['skew'] = values.skew(axis=1)
    df['kurt'] = values.kurt(axis=1)

    # Distances between selected order statistics.
    df['median-max'] = df['median'] - df['max']
    df['median-min'] = df['median'] - df['min']
    df['q99-q95'] = df['q99'] - df['q95']
    df['q99-q90'] = df['q99'] - df['q90']
    df['q01-q05'] = df['q01'] - df['q05']
    df['q01-q10'] = df['q01'] - df['q10']

    return df

# Add the row statistics to both frames in place (the returned frame is ignored).
feature_engineering(test)
feature_engineering(train)

# Keep a handle on the raw segment columns; `features` is reassigned further down.
original_features = features
features

#### From the amazing Ambrosm 
https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense

In [None]:
def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10) for the four counts."""
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)


def bias_of(s):
    """Return ``bias`` for a column name of the form ``A<w>T<x>G<y>C<z>``.

    Parameters:
        s: column name; the first character is skipped and the integers in
           front of 'T', 'G', 'C' and after 'C' are read as w, x, y, z.

    Raises:
        ValueError: if one of the letters T, G, C is missing or a count is
            not an integer.
    """
    remainder = s[1:]  # the leading letter carries no count of its own
    counts = []
    for letter in "TGC":
        digits, found, remainder = remainder.partition(letter)
        if not found:
            raise ValueError(f"{s!r} is not of the form A<w>T<x>G<y>C<z>")
        counts.append(int(digits))
    counts.append(int(remainder))
    # Single source of truth for the formula.
    return bias(*counts)

# Shift every raw segment column by its bias, scale by 1e6 and round to integers,
# so that an integer gcd can be taken per row afterwards.
train_i = pd.DataFrame({col: ((train[col] + bias_of(col)) * 1000000).round().astype(int) for col in original_features})
test_i = pd.DataFrame({col: ((test[col] + bias_of(col)) * 1000000).round().astype(int) for col in original_features})

def gcd_of_all(df_i, elements=None):
    """Return the row-wise greatest common divisor over the given columns.

    Parameters:
        df_i: frame of integer columns.
        elements: columns to fold into the gcd. When omitted the module-level
            ``original_features`` is used.

    Returns:
        A Series indexed like ``df_i`` holding one gcd per row.
    """
    if elements is None:
        elements = original_features
    # Start from 0, the identity of gcd (gcd(0, v) == |v|), so that only the
    # column values -- never the row labels -- contribute to the result.
    gcd = np.zeros(len(df_i), dtype=np.int64)
    for col in elements:
        gcd = np.gcd(gcd, df_i[col].to_numpy())
    return pd.Series(gcd, index=df_i.index)

# Store the per-row gcd of the integer frames as a new feature.
train['gcd'] = gcd_of_all(train_i)
test['gcd'] = gcd_of_all(test_i)

# The integer frames are only needed for the gcd; drop the names.
del train_i
del test_i


# Pseudolabels

In [None]:
# Three earlier submissions, placed side by side so their predictions can be compared row by row.
et_1 = pd.read_csv("../input/extra-trees-cv-voting/submission.csv",index_col=0)
et_2 = pd.read_csv("../input/early-ensemble/submission.csv",index_col=0)
et_3 = pd.read_csv("../input/tps-feb-2022/submission.csv",index_col=0)
ensemble = pd.DataFrame({"target": et_2["target"],"target2": et_1["target"],"target3": et_3["target"], })

In [None]:
if PSEUDO:
    print("Adding Pseudolabels")
    # Keep only the test rows on which all three submissions predict the same label.
    index_e = ensemble[ (ensemble["target"]==ensemble["target2"]) & (ensemble["target2"]==ensemble["target3"])].index
    # Collapse to the single agreed "target" column (note: this overwrites `ensemble`).
    ensemble =ensemble.loc[index_e].drop(["target2","target3"],axis =1)

    # Attach the agreed label to the matching test rows.
    pseudo = pd.concat([test.loc[index_e],ensemble],axis=1 )
    
    # Append the pseudo-labelled rows to train and renumber the index from 0.
    train = pd.concat([train,pseudo],axis=0).reset_index(drop=True)

# Duplicates

In [None]:
if DROP_DUPS:
    print("Dropping Dups")
    # Keep the first of each set of identical rows and renumber the index from 0.
    train.drop_duplicates(keep="first", inplace=True,ignore_index=True)

# Clustering - DBSCAN
The assumption comes from the paper, which shows that the classes form clusters. \
The paper also noted that, with the addition of noise, the classes were still distinct when clustered. 

* We will try clustering with multiple distance/min_samples settings and see whether this extra feature helps our final model distinguish the classes

In [None]:
if CLUSTER:
    # Stack train and test so both are clustered together; "ds" remembers the
    # origin of every row ("a" = train, "b" = test).
    all_df = pd.concat([train.assign(ds="a"),test.assign(ds="b")],axis =0).drop("target",axis=1)
    is_train = (all_df["ds"] == "a").to_numpy()

    # Freeze the clustering input once: labels produced by one run must not
    # become input features of the next run.
    cluster_input = all_df.drop("ds", axis=1)

    # One entry per eps value with the min_samples settings to try for it.
    # The position of the entry is part of the column name, so the two eps
    # groups can no longer overwrite each other's columns.
    dbscan_grid = [
        (1, [6, 7, 8, 9, 10]),
        (0.05, [5, 6, 7]),
    ]

    for idex_e, (e, min_samples_options) in enumerate(dbscan_grid):
        for idex_s, min_s in enumerate(min_samples_options):
            print(f"eps: {e} | min_samples: {min_s}")
            labels = DBSCAN(eps=e, min_samples=min_s).fit(cluster_input).labels_
            # Number of clusters found; the label -1 marks noise and is not a cluster.
            print (len(set(labels)) - (1 if -1 in labels else 0))

            col = f"cluster_{idex_e}{idex_s}"

            # Rows are ordered train first, then test, so a positional split is exact.
            train[col] = labels[is_train]
            test[col] = labels[~is_train]

# Z-scoring 
### Difference to zscore for each bacteria & for each column  
1. We will look at each column (that is a DNA segment) and calculate the mean and standard deviation for each bacteria
1. We will then take each sample and find the zscore against each bacteria
1. The minimum zscore is identified - and saved in a new column 

In [None]:
# Replace the target labels by integer codes; `encoder` is kept to map predictions back later.
encoder = LabelEncoder()
train["target"] = encoder.fit_transform(train["target"])

In [None]:
def check_min_zscore(x, bac_mean, bac_std):
    """Return the position of the class whose mean is closest to ``x`` in z-score terms.

    Parameters:
        x: a single observed value.
        bac_mean: per-class means (array-like).
        bac_std: per-class standard deviations, same order as ``bac_mean``.

    Returns:
        Index into ``bac_mean`` / ``bac_std`` of the smallest absolute z-score.
    """
    means = np.asarray(bac_mean, dtype="float64")
    stds = np.asarray(bac_std, dtype="float64")

    # Closeness is about magnitude: a signed z-score would favour whichever
    # class mean lies furthest above x.
    deviation = np.abs(x - means)
    with np.errstate(divide="ignore", invalid="ignore"):
        z_score = deviation / stds

    # A class with zero spread matches only its exact mean; anything else is
    # treated as infinitely far away instead of producing NaN.
    z_score = np.where(stds == 0, np.where(deviation == 0, 0.0, np.inf), z_score)

    return np.argmin(z_score)

In [None]:
segments = [col for col in train.columns if "A" in col]

# The class order is fixed once; the value stored in every "<segment>_z"
# column is a position in this array.
bacteria_classes = train["target"].unique()
# One boolean row mask per class, built once instead of once per column.
class_masks = [train["target"] == bacteria for bacteria in bacteria_classes]

# Per-class mean of every segment (flat column index, one row per class).
variance_df = pd.DataFrame(index=bacteria_classes, columns=segments)

#for each column 
for col in segments :

    #Create arrays to hold mean/std for each bacteria
    bacteria_mean = np.empty(len(bacteria_classes), dtype="float64")
    bacteria_std = np.empty(len(bacteria_classes), dtype="float64")

    #for each bacteria type
    for idx, mask in enumerate(class_masks):

        #get a slice of train for bacteria and column - then get mean and std dev
        splice = train.loc[mask, col]

        bacteria_mean[idx] = np.mean(splice)
        bacteria_std[idx] = np.std(splice)

    variance_df[col] = bacteria_mean
    print(col)
    #create new column - for each value in column - check which class mean it is closest to
    train[col+"_"+"z"] = train[col].apply(lambda x: check_min_zscore(x, bacteria_mean,bacteria_std) )
    test[col+"_"+"z"] = test[col].apply(lambda x: check_min_zscore(x, bacteria_mean,bacteria_std) )

##### We have created many new features; let's see if they help 
A second option is to create one column per bacteria for each feature, 
e.g. Column1_zscore_bacteria1

# Feature Elimination 

Amazing notebook by [ ARJUN PRASAD SARKHEL](https://www.kaggle.com/arjunprasadsarkhel/tps-feb-k-best-features/notebook)



In [None]:
def finding_correlation(data, threshold):
    """Collect the columns of ``features`` that correlate strongly with an earlier column.

    A column is reported when the absolute correlation between it and any
    column positioned before it in ``features`` is above ``threshold``.
    Returns the reported column names as a set.
    """
    abs_corr = data[features].corr().abs()
    flagged = set()
    for position, name in enumerate(abs_corr.columns):
        # Only the part of the row left of the diagonal: the earlier columns.
        earlier = abs_corr.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(name)
    return flagged

In [None]:
# Columns of train whose absolute correlation with an earlier column exceeds 0.8.
tr_correlated_cols = finding_correlation(train, .8)
print("Train correlated cols=", len(tr_correlated_cols))

In [None]:
# Same check on the test frame.
te_correlated_cols = finding_correlation(test, .8)
print("Test correlated cols=", len(te_correlated_cols))

In [None]:
# Only columns flagged in both train and test are treated as redundant.
cor_cols = list(set(tr_correlated_cols).intersection(te_correlated_cols))
cor_cols

In [None]:
# Remove the redundant columns from both frames in place.
train.drop(cor_cols, inplace=True, axis=1)
test.drop(cor_cols, inplace=True, axis=1)

## Remove mutual information columns  

In [None]:
# Compare by equality: `in ('target')` is a substring test against the string
# 'target' (the parentheses do not make a tuple) and would also drop any
# column whose name happens to be a piece of that word.
features = [column for column in train.columns if column != 'target']
X_feat = train[features]
Y_feat = train.target
print(X_feat.shape, Y_feat.shape)

In [None]:
%time
mutual_info = mutual_info_classif(X_feat, Y_feat)

mutual_info = pd.Series(mutual_info)
mutual_info.index = X_feat.columns
mutual_info.sort_values(ascending=False)

## Select K best columns 
use information obtained from Mutual info and select k best columns 

In [None]:
def _seeded_mutual_info(X, y):
    """Score features by mutual information using the notebook-wide SEED."""
    return mutual_info_classif(X, y, random_state=SEED)


# Seeded scorer => the same columns are selected on every run. k is capped at
# the number of available columns so a large K_BEST_COLS cannot make fit() fail.
select_k_features = SelectKBest(_seeded_mutual_info, k=min(K_BEST_COLS, X_feat.shape[1]))
select_k_features.fit(X_feat, Y_feat)
select_k_features

In [None]:
# Names of the columns kept by the selector.
cols = X_feat.columns[select_k_features.get_support()]
print(list(cols))

In [None]:
# Keep the selected columns plus the target in one step; taking explicit copies
# avoids writing into a slice of the old frame (chained assignment).
selected = list(cols)
train = train[selected + ['target']].copy()
gc.collect()
test = test[selected].copy()

## Downcasting 

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Return a copy of ``df`` whose columns use the smallest dtype that fits.

    Integer columns (signed or unsigned) move to the narrowest signed integer
    type whose range strictly contains the column's min and max. Float columns
    move to float16, then float32, else float64. Object/string columns become
    ``category``. Every other dtype (bool, datetime, category, nullable
    extension types, ...) is left untouched.

    Parameters:
        df: frame to shrink; it is not modified.
        allow_float16: when False, floats are never narrowed below float32.
            float16 keeps only about three significant decimal digits, so
            disable it when small differences between values matter.

    Returns:
        The down-cast copy. Memory before/after is printed.
    """
    df = df.copy()
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are down-cast; bools must not
        # be turned into floats and min()/max() is undefined for some dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # Exclusive bounds: a column touching a type's limit moves to the
            # next wider type. A column that fits nowhere keeps its dtype.
            for candidate in int_ladder:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback (also reached for all-NaN or infinite columns).
            chosen = np.float64
            for candidate in float_ladder:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Replace both frames by their down-cast copies.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test) 

# Modelling

In [None]:
# Engineered columns = surviving columns that are neither a raw segment column
# nor the target. (Comparing against `features` would always give an empty
# list, because at this point `features` already contains every engineered column.)
raw_columns = set(original_features)
features_added = [col for col in train.columns if col not in raw_columns and col != "target"]
features = [col for col in train.columns if col not in ['target']]
print(list(features_added))

In [None]:
X = train.drop("target",axis =1)
y = train["target"]
# 70/30 hold-out split; seeded with the notebook-wide SEED so one constant controls every random step.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.30, random_state=SEED)

# Base model 

In [None]:
# No extra hyper-parameters; the model is configured directly through its constructor.
params ={} 
model = ExtraTreesClassifier( n_estimators = ITERATIONS ,random_state=SEED, n_jobs=-1 )

def build_run_model(estimator, params, X_train, y_train,X_test,y_test): 
    """Fit ``estimator`` on the training split and score it on the test split.

    Parameters:
        estimator: scikit-learn style estimator (fit / predict).
        params: dict of hyper-parameters applied through ``set_params`` before
            fitting; an empty dict or None leaves the estimator as it is.
        X_train, y_train: data to fit on.
        X_test, y_test: data to evaluate on.

    Returns:
        Tuple ``(accuracy, fitted estimator, predictions for X_test)``.
        The accuracy is also printed.
    """
    if params:
        estimator.set_params(**params)

    estimator.fit(X_train,y_train)
    y_pred = estimator.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred) 
    print("Accuracy",accuracy )

    return accuracy, estimator, y_pred

In [None]:
# Fit on the 70% split, then report accuracy and per-class metrics on the 30% hold-out.
accuracy, model, preds = build_run_model(model, params, X_train, y_train,X_test,y_test)
print("Accuracy",accuracy)
print(classification_report(y_test,preds))

In [None]:
# Importances of the fitted model, one row per column of X, largest first.
feature_importance = pd.DataFrame(data= model.feature_importances_, index = X.columns, columns=["importance"],dtype= "float64").sort_values(by = "importance", ascending = False)
feature_importance

In [None]:
# Importances restricted to the columns listed in `features_added`, shown as a single wide row.
feature_importance.loc[features_added].sort_values(by = "importance", ascending= False).transpose()

In [None]:
# plt.figure(figsize=(25,10))
# sns.barplot(data= feature_importance.loc[features_added[:150]], x = feature_importance.loc[features_added[:150]].index,y="importance" )
# plt.xticks(rotation = 90)
# plt.title("Added features part 1")
# plt.show()

In [None]:
# plt.figure(figsize=(25,10))
# sns.barplot(data= feature_importance.loc[features_added[150:]], x = feature_importance.loc[features_added[150:]].index,y="importance" )
# plt.xticks(rotation = 90)
# plt.title("Added features part 2")
# plt.show()

In [None]:
# Columns the fitted model assigned an importance of exactly 0.
print("Zero important features\n", list(feature_importance[feature_importance.importance==0].index))

0. 'Bacteroides_fragilis', 
1. 'Campylobacter_jejuni',
1. 'Enterococcus_hirae',
1. 'Escherichia_coli',
1. 'Escherichia_fergusonii',
1. 'Klebsiella_pneumoniae', 
1. 'Salmonella_enterica',
1. 'Staphylococcus_aureus', 
1. 'Streptococcus_pneumoniae',
1. 'Streptococcus_pyogenes'

# Cross Validation - Train

In [None]:
# Drop the names from the hold-out experiment before cross-validation starts.
del X_train
del X_test
del y_train
del y_test
del feature_importance
del preds
del accuracy
# NOTE(review): this removes the imported class name, so the clustering cell
# cannot be re-run afterwards without importing DBSCAN again.
del DBSCAN

In [None]:
# Shuffled, stratified folds; seeded with the notebook-wide SEED rather than a separate literal.
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

In [None]:
def cross_val(X,y):
    """Cross-validate the global ``model`` and predict the global ``test`` frame in every fold.

    The folds come from the global ``cv`` splitter. In each fold the model is
    fitted on the training part, scored on the validation part and then used
    to predict ``test``.

    Parameters:
        X: feature frame (pandas DataFrame).
        y: target Series aligned row by row with ``X``.

    Returns:
        Tuple ``(test_preds, test_proba, scores, actuals, preds)``:
        per-fold class predictions for ``test``, per-fold class probabilities
        for ``test``, per-fold validation accuracy, and the validation targets
        and predictions concatenated over all folds.
    """
    scores = []
    test_preds = []
    test_proba = []

    #for viz
    actuals =[]
    preds= []

    for idx, (train_idx, val_idx) in enumerate( cv.split(X,y)):
        print (f"Running fold {idx}")

        # The splitter yields positions, so both X and y are sliced with
        # .iloc; y[...] would look the numbers up as index labels instead.
        X_train , X_test = X.iloc[train_idx] , X.iloc[val_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred) 
        print("Accuracy",accuracy )

        scores.append(accuracy)
        test_preds.append(model.predict(test) ) 
        test_proba.append( model.predict_proba(test))

        #save actuals and preds for visualisation of misclassified rows
        actuals.extend( list(y_test))
        preds.extend(list(y_pred))

    print( "\nFinal Accuracy" , np.mean(scores)  ) 

    return test_preds, test_proba, scores, actuals, preds

In [None]:
# Run the cross-validation on the full training data and keep every per-fold output.
test_preds, test_proba, scores, actuals_viz, preds_viz = cross_val(X,y)

In [None]:
miss_class = pd.DataFrame(data = {"actuals":actuals_viz, "preds":preds_viz})
# Label every row as "<actual>_<predicted>".
miss_class["mismatch"] = miss_class["actuals"].astype(str)+"_"+miss_class["preds"].astype(str)

# Only the wrong predictions are plotted; filter once and reuse.
mismatches = miss_class.loc[miss_class["actuals"] != miss_class["preds"], "mismatch"]

plt.figure(figsize=(25,8))
# The Series is passed as x= explicitly: a positional first argument is
# deprecated in seaborn 0.11 and is read as `data` from 0.12 on.
# Bars are ordered from the most to the least frequent confusion.
sns.countplot(x=mismatches, order=mismatches.value_counts().index)
plt.title("Miss-Classification: Actual_Predicted ")
plt.show()

# Post Processing 
Probabilities check and shift 

# Submission 

In [None]:
# Average the per-fold class probabilities and pick the most probable class per row.
# Note: this replaces the per-fold prediction list returned by cross_val.
test_preds = np.argmax(sum(test_proba) / len(test_proba), axis=1)
test_preds

In [None]:
# Map the integer codes back to the original target labels and write the submission file.
final_preds = encoder.inverse_transform(test_preds)
sub["target"] = final_preds
sub.to_csv("submission.csv")
sub.head()