# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">EXPLORATORY DATA     ANALYSIS</p>

<p style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;">This notebook gives an extensive overview of the data features, with interactive plots that describe the patterns among them. I will be updating this notebook during the competition.<br>
Let's describe the problem first:<br>
For this assignment we will estimate bacterium species from repeated lossy measurements of DNA snippets. Raman spectroscopy is used to examine snippets of length 10 and compute the histogram of bases in each snippet.
Each row of data provides a spectrum of histograms formed by repeated measurements of a sample; each row contains the output of all 286 histogram possibilities (e.g., A0T0G0C10 to A10T0G0C0), from which a bias spectrum (of entirely random ATGC) is then subtracted.
To make the task more realistic, the data (both train and test) also include simulated measurement errors (at various rates) for many of the samples, which complicates the situation.
</p>

<p style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;">To see more fascinating images of the bacteria to be predicted, refer to the discussion topic by <b>Remek Kinas:</b> 
<a href="https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/304472">BACTERIAS in competition</a></p>



In [None]:
# Quietly install the scikit-learn-intelex package (pip progress bar disabled).
# NOTE(review): no cell in this notebook visibly imports sklearnex or calls
# patch_sklearn(), so confirm this install is still needed.
!pip install scikit-learn-intelex -q --progress-bar off

In [None]:
import shap
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from colorama import Fore
from math import factorial

import seaborn as sns
from sklearn import metrics
from scipy import stats
import matplotlib as mpl
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB,CategoricalNB

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.max_columns = 999

import warnings
warnings.filterwarnings("ignore")

import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier,AdaBoostClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# CSS rules passed to Styler.set_table_styles() for every styled table below.
# Each rule is a mapping with a 'selector' and the 'props' to apply to it.

# Highlight the cell under the cursor (for row hover use <tr> instead of <td>).
cell_hover = dict(
    selector='td:hover',
    props=[('background-color', '#ffffb3')],
)

# De-emphasise the index-name cells.
index_names = dict(
    selector='.index_name',
    props='font-style: italic; color: darkgrey; font-weight:normal;',
)

# White-on-black header cells (everything that is not an index name).
headers = dict(
    selector='th:not(.index_name)',
    props='background-color: #000000; color: white;',
)
from IPython.display import HTML

In [None]:
# Run configuration shared by the cells below.
n_folds = 10          # number of stratified cross-validation folds
seed = 42             # random seed used when shuffling the folds
n_estimators = 500    # trees per ensemble model
pseudolabel = False   # to do pseudo labelling or not

In [None]:
# Integer code -> species name, used to decode the pseudo-label file below.
target_map = dict(enumerate([
    'Streptococcus_pyogenes',
    'Salmonella_enterica',
    'Enterococcus_hirae',
    'Escherichia_coli',
    'Campylobacter_jejuni',
    'Streptococcus_pneumoniae',
    'Staphylococcus_aureus',
    'Escherichia_fergusonii',
    'Bacteroides_fragilis',
    'Klebsiella_pneumoniae',
]))

# Competition data, read from parquet copies.
train = pd.read_parquet('../input/tabular-february/train.parquet')
test = pd.read_parquet('../input/tabular-february/test.parquet')

if pseudolabel:
    # Column '0' of the pseudo-label file holds codes that are translated to
    # species names through target_map; an unknown code raises KeyError.
    ptrain = pd.read_csv('../input/febpseudolabelling/pseudo_labels.csv')['0'].map(target_map.__getitem__)
    ptest = pd.read_csv('../input/febpseudolabelling/submission(16).csv')

    # Attach the pseudo labels as an extra column on both frames.
    train['pseudo'] = ptrain
    test['pseudo'] = ptest['target']

In [None]:
# Preview the first rows of train using the shared table styling.
s = train.head()
s.style.set_table_styles(table_styles=[cell_hover, index_names, headers])

In [None]:
# Remove row_id first so it does not take part in the duplicate check.
train = train.drop(columns='row_id')
duplicates_train = train.duplicated().sum()
print('Duplicates in train data: {0}'.format(duplicates_train))

# Keep the first occurrence of every duplicated row, then count again as a check.
train = train.drop_duplicates(keep='first')
duplicates_train = train.duplicated().sum()

print('Train data shape:', train.shape)
print('Duplicates in train data: {0}'.format(duplicates_train))

# Renumber rows 0..n-1 so later positional fold indices line up with the index.
train = train.reset_index(drop=True)


# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">CHECK MISSING VALUES</p>

<p style="background-color:#FFFFFF;font-family:calibri; color:#000000;font-size:120%;">There are no missing data in train and test datasets.</p>

In [None]:
def missing_data(data):
    """Summarise missing values for every column of ``data``.

    Returns the transposed summary table: one column per column of ``data``
    and three rows -- 'Total' (number of nulls), 'Percent' (nulls as a
    percentage of the number of rows) and 'Types' (the dtype as a string).
    """
    null_mask = data.isnull()
    null_total = null_mask.sum()
    null_percent = null_total / null_mask.count() * 100
    summary = pd.concat([null_total, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.transpose()

# Show the missing-value summary for train and then test, each under a banner.
for banner, frame in (('TRAIN', train), ('TEST', test)):
    print("=="*30)
    print(banner)
    print("=="*30)
    display(missing_data(frame).style.set_table_styles([cell_hover, index_names, headers]))

# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">DESCRIBE TRAIN AND TEST</p>

<p style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;"><b>We can make few observations here:</b></p>

<ul style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;">
  <li>The mean and standard deviation are relatively small for the features in both train and test</li>
  <li>The mean and std are nearly the same in train and test</li>
  <li>Since all the values lie between -1 and 1, we won't need to scale the features</li>
  <li>Train and test data look very similar.</li>
</ul>

In [None]:
# Show describe() statistics for train and then test, each under a banner.
for banner, frame in (('TRAIN', train), ('TEST', test)):
    print("=="*30)
    print(banner)
    print("=="*30)
    s = frame.describe()
    display(s.style.set_table_styles([cell_hover, index_names, headers]))


# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">TARGET DISTRIBUTION</p>

<ul style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;">
  <li>The classes appear to be balanced. (Good thing)</li>
  <li>Each target value occurs about 20000 times</li>
</ul>

In [None]:
# Class balance: one bar per target value, coloured by target.
# plotly.express is already imported as px in the imports cell; the unrelated
# px.data.tips() sample frame that used to be loaded here was never used.
fig = px.histogram(train, x="target", color='target', template='plotly_white', opacity=0.7)

fig.show()

# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">FEATURES DISTRIBUTION</p>

<ul style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;">
  <li>All the values are highly concentrated around zero, with skewed distributions</li>
  <li>Some columns show two local peaks</li>
</ul>

In [None]:
# KDE of twelve feature columns on a 4x3 grid.
# NOTE(review): the slice starts at 1, presumably to skip row_id in test -- confirm.
numerical_columns = test.columns[1:13]
num_rows, num_cols = 4, 3
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20, 20))
f.suptitle('Distribution of Features', fontsize=16)

for index, column in enumerate(numerical_columns):
    i, j = divmod(index, num_cols)
    # The legend entry of each panel shows the skewness of that feature.
    g = sns.kdeplot(train[column], color="m", shade=True, label="%.2f" % (train[column].skew()), ax=axes[i, j])
    g = g.legend(loc="best")

# Remove only the grid cells that received no plot. The previous unconditional
# f.delaxes(axes[3, 2]) deleted the panel of the twelfth feature, because the
# twelve selected columns fill the 4x3 grid completely.
for unused_ax in axes.flat[len(numerical_columns):]:
    f.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# violin plot

In [None]:
# Violin plots of four features against the target, one panel per feature.
numerical_columns = test.columns[3:7]
num_rows, num_cols = 4, 1
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(20, 20))
f.suptitle('Distribution of Targets with features', fontsize=16)

for index, column in enumerate(numerical_columns):
    # With a single grid column, axes is one-dimensional and is indexed by row.
    sns.violinplot(x=column, y='target', palette=['r','g','b','m','y'], data=train,
                   ax=axes[index], scale='width', linewidth=0.1)

# All four panels are populated, so none is removed. The previous
# f.delaxes(axes[0]) deleted the violin plot of the first selected feature.
plt.tight_layout()
plt.show()

# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">CORRELATION</p>

<ul style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;">
  <li>The first features are largely uncorrelated with each other</li>
  <li>You can change the slice or add column names to check the correlation between other features</li>
</ul>

In [None]:
# Pairwise correlation heatmap for a slice of 24 columns.
# plotly.express is already imported as px in the imports cell.
# NOTE(review): row_id was dropped from train earlier, so [1:25] starts at the
# second feature column and skips the first one -- confirm this is intended.
cols = train.columns[1:25]
z = train[cols].corr()

fig = px.imshow(z, text_auto=True, aspect="auto")
fig.show()

In [None]:
def unique_data(data):
    """Summarise the number of distinct values for every column of ``data``.

    Returns the transposed summary table: one column per column of ``data``
    and two rows -- 'Total' (number of distinct non-null values) and
    'Percent' (distinct values as a percentage of the number of rows).

    'Percent' used to be the null percentage copied from ``missing_data``,
    which said nothing about uniqueness; it now reports the unique share.
    """
    total = data.nunique()
    percent = total / len(data) * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return tt.transpose()
# Show the unique-value summary for train and then test, each under a banner.
for banner, frame in (('TRAIN', train), ('TEST', test)):
    print("=="*30)
    print(banner)
    print("=="*30)
    display(unique_data(frame).style.set_table_styles([cell_hover, index_names, headers]))

# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">CONVERTING BACK TO INTEGERS</p>

<ul style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;">
  <li>FROM THIS PART ON I HAVE TAKEN REFERENCE FROM AMBROSM'S NOTEBOOK; THE LINK IS PROVIDED IN THE REFERENCES</li>
</ul>

# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">KMEANS FOR VALIDATION SPLIT AND EXTRA TREES+NAIVE BAYES FOR TRAINING</p>

<ul style="background-color:#FFFFFF;font-family:calibri; color:#0077b6;font-size:120%;">
  <li>I have clustered the data into 8 clusters; I haven't tested how changing the number of clusters affects the score</li>
  <li>I have used Extra Trees for training, as so far it is the breakthrough model for this competition</li>
</ul>

In [None]:
from sklearn.preprocessing import LabelEncoder

# Every column except the identifier and the label is used as a model feature.
cont_features = [name for name in train.columns if name != "row_id" and name != 'target']
cat_features = list()

# Species name -> integer code, applied to the pseudo-label columns below.
target_map = {
    species: code
    for code, species in enumerate([
        'Streptococcus_pyogenes',
        'Salmonella_enterica',
        'Enterococcus_hirae',
        'Escherichia_coli',
        'Campylobacter_jejuni',
        'Streptococcus_pneumoniae',
        'Staphylococcus_aureus',
        'Escherichia_fergusonii',
        'Bacteroides_fragilis',
        'Klebsiella_pneumoniae',
    ])
}

# Overwrite the target column with LabelEncoder codes; the fitted encoder is
# kept so predictions can be decoded back to the original labels later.
target_encoder = LabelEncoder()
train["target"] = target_encoder.fit_transform(train["target"])
target = train['target']

if pseudolabel:
    # Pseudo labels are encoded through target_map (not through target_encoder);
    # a label missing from target_map raises KeyError.
    train['pseudo'] = train['pseudo'].map(target_map.__getitem__)
    test['pseudo'] = test['pseudo'].map(target_map.__getitem__)

# Shuffled, seeded stratified folds used by the training loop.
k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

In [None]:
# Sample weights taken from how often each distinct row occurs in train.
# NOTE(review): exact duplicates were already removed by drop_duplicates()
# earlier, so every count here should be 1, and value_counts() orders its
# result by count rather than by row position -- confirm these weights carry
# the intended duplicate multiplicities.
sample_weight = train.value_counts().to_numpy()

In [None]:
# Per-fold class probabilities on the test set, averaged later for submission.
et_pred = []
rf_pred = []

# Per-fold validation accuracies.
et_scores = []
rf_scores = []

# Allocated for out-of-fold predictions; nothing in this loop fills them.
et_train_preds = np.zeros(len(train))
rf_train_preds = np.zeros(len(train))

# The test feature matrix is the same for every fold, so select it once.
X_test = test[cont_features]

# tqdm wraps the fold generator (not enumerate) and is given the fold count,
# so the progress bar can show a total.
fold_splits = tqdm(k_fold.split(train, train['target']), total=k_fold.get_n_splits())
for fold, (train_id, test_id) in enumerate(fold_splits):
    print('####### Fold: ', fold)

    # Splitting -- the fold indices are positional, so index with iloc.
    X_train, y_train = train.iloc[train_id][cont_features], target.iloc[train_id]
    X_valid, y_valid = train.iloc[test_id][cont_features], target.iloc[test_id]

    # Models -- seeded so that a re-run reproduces the same forests.
    et_model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        n_jobs=-1,
        verbose=1,
        random_state=seed,
    )

    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        n_jobs=-1,
        random_state=seed,
    )

    # Training
    sample_weight_train = sample_weight[train_id]
    sample_weight_valid = sample_weight[test_id]

    et_model.fit(X_train, y_train, sample_weight_train)
    rf_model.fit(X_train, y_train, sample_weight_train)

    # Evaluation
    et_valid_pred = et_model.predict(X_valid)
    rf_valid_pred = rf_model.predict(X_valid)

    et_valid_score = accuracy_score(y_valid, et_valid_pred, sample_weight=sample_weight_valid)
    rf_valid_score = accuracy_score(y_valid, rf_valid_pred, sample_weight=sample_weight_valid)

    print(f'Extra Trees Accuracy score: {et_valid_score:6f}\n')
    print(f'Random Forest_SCORE Accuracy score: {rf_valid_score:6f}\n')
    et_scores.append(et_valid_score)
    rf_scores.append(rf_valid_score)

    # Prediction for submission
    et_pred.append(et_model.predict_proba(X_test))
    rf_pred.append(rf_model.predict_proba(X_test))

In [None]:
def post_processing(y_probs, train, tune=None, encoder=None, max_steps=10_000):
    """Average fold probabilities and bias them towards the train class shares.

    Parameters
    ----------
    y_probs : sequence of 2-D arrays
        One (n_samples, n_classes) probability matrix per fold/model.
    train : DataFrame
        Must contain a 'target' column; its class shares are the reference.
    tune : sequence of float, optional
        Per-class offsets added to the averaged probabilities. When None or
        empty, offsets are searched in steps of 0.001 until every class share
        of the predictions is within 0.1 percentage points of the train share.
    encoder : object with ``inverse_transform``, optional
        Decodes class codes to labels. Defaults to the notebook-level
        ``target_encoder``.
    max_steps : int, optional
        Upper bound on search iterations, so an oscillating search terminates.

    Returns
    -------
    Array of decoded labels, one per row of the averaged probabilities.

    Raises
    ------
    ValueError
        If ``y_probs`` is empty, or if the number of distinct train targets
        differs from the number of probability columns.
    """
    if encoder is None:
        encoder = target_encoder  # notebook-level fitted encoder
    if len(y_probs) == 0:
        raise ValueError('y_probs must contain at least one probability matrix')

    y_prob = sum(y_probs) / len(y_probs)
    n_samples, n_classes = y_prob.shape

    # Class shares (in percent) in sorted class order.
    # NOTE(review): this assumes sorted train targets line up with the
    # probability columns, as they do for LabelEncoder codes -- confirm.
    target_distribution = train['target'].value_counts().sort_index().to_numpy() / len(train) * 100
    if len(target_distribution) != n_classes:
        raise ValueError('train has %d distinct targets but the probabilities have %d columns'
                         % (len(target_distribution), n_classes))

    def get_diff(offsets):
        # Compare class codes position by position. The earlier version
        # subtracted a distribution indexed by decoded labels from one indexed
        # by the encoded train targets; the indexes never matched, the
        # difference was all NaN and the search loop below never ran.
        codes = np.argmax(y_prob + np.asarray(offsets, dtype=float), axis=1)
        predicted = np.bincount(codes, minlength=n_classes) / n_samples * 100
        return target_distribution - predicted

    if tune is None or len(tune) == 0:
        tune = [0.0] * n_classes
        diff = get_diff(tune)
        steps = 0
        while np.abs(diff).max() > 0.1 and steps < max_steps:
            # Nudge the first class that is off by more than 0.1 points.
            for i in range(n_classes):
                if diff[i] > 0.1:
                    tune[i] += 0.001
                    break
                if diff[i] < -0.1:
                    tune[i] -= 0.001
                    break
            diff = get_diff(tune)
            steps += 1
        if np.abs(diff).max() > 0.1:
            print('post_processing: stopped after %d steps without matching the target distribution' % steps)

    # Credits to https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants
    print(tune)
    y_pred_tuned = encoder.inverse_transform(np.argmax(y_prob + np.asarray(tune, dtype=float), axis=1))
    print(pd.Series(y_pred_tuned).value_counts().sort_index() / len(y_pred_tuned) * 100)
    return y_pred_tuned

In [None]:
# Hand-picked per-class offsets that post_processing() adds to the averaged probabilities.
# NOTE(review): positions presumably follow target_encoder's class order -- confirm
# which species indices 2 and 3 refer to.
tune = [0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0]

In [None]:
# Blend both models by pooling their per-fold probability matrices.
# A new list is built so et_pred itself is not modified: previously y_preds
# was just another name for et_pred, so extend() appended the Random Forest
# folds to et_pred in place (and again on every re-run of this cell).
y_preds = et_pred + rf_pred

# Once with automatically searched offsets, once with the hand-set `tune`.
y_pred_tuned = post_processing(y_preds, train)
y_pred_tuned_with_tune = post_processing(y_preds, train, tune)

In [None]:
# Start from the sample submission and fill in the manually tuned predictions.
submission = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/sample_submission.csv'
).assign(target=y_pred_tuned_with_tune)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv',index=False)

In [None]:
# pseudolabels=pd.DataFrame(np.array(tpreds)).astype(int)
# pseudolabels.to_csv('pseudo_labels.csv',index=False)

# <p style="background-color:#000000;font-family:newtimeroman;color:#FFFFFF;font-size:150%;text-align:center;border-radius:2px 2px;">REFERENCES</p>
<ul>
    <li>Analysis of Identification Method for Bacterial Species and Antibiotic Resistance Genes Using Optical Data From DNA Oligomers</li>
    <li><a href='https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense'>Notebook by ambrosM</a></li>