<h1>
    <p style="text-align:center; font-size:180%"> Tabular Playground Series - Feb 2022</p> 
</h1>

<center>
    <img src="https://media.snl.no/media/58661/article_topimage_28881401596_d2c61ee954_o.jpg" width="600" height="1800"> 
</center>

<h2>
    <p style="text-align:center; font-family:Verdana; letter-spacing:0.5px; font-size:120%"> Predicting bacteria species based on repeated lossy measurements of DNA snippets
    </p>
</h2> 

# 1. Importing Libraries

In [None]:
# Data manipulation
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Maths
import numpy as np

# Patching sklearn
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

# Model Building 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
import shap
from sklearn.metrics import accuracy_score



# Lgbm
from lightgbm import LGBMClassifier
import lightgbm as lgbm
import optuna
from optuna.integration import LightGBMPruningCallback

# Visualization
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import pprint

# Settings
# One theme call: "ticks" style, wide default figure, top/right spines hidden.
FIGURE_SIZE = (26, 8)
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc={"figure.figsize": FIGURE_SIZE, **custom_params})

# 2. Data Loading and EDA

In [None]:
# Data loading
# All three competition files are read from the same input folder.
INPUT_DIR = "../input/tabular-playground-series-feb-2022"

# train/test use their 'row_id' column as index; the sample submission is read as-is.
train_data = pd.read_csv(f"{INPUT_DIR}/train.csv", index_col='row_id')
test_data = pd.read_csv(f"{INPUT_DIR}/test.csv", index_col='row_id')
submission = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv")

In [None]:
# Drop columns that are mostly missing (more than 50% NaN)
def drop_nrv(df):
    """Drop the columns of ``df`` that are mostly missing and report NaN counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified; a new frame is returned.

    Returns
    -------
    na_df : pandas.DataFrame
        The (at most) 10 columns with the most missing values, with their NaN
        count ('Number of NaN') and the share of rows that are missing,
        formatted as a string ('Perc').
    cleaned : pandas.DataFrame
        ``df`` without the columns that have more than 50% NaN, cast to
        float64 wherever the cast succeeds (``errors='ignore'``).
    """
    n_rows = len(df.index)

    # Percentages are relative to the frame being inspected, not to any
    # other frame that happens to exist in the notebook.
    na_counts = df.isna().sum().sort_values(ascending=False)
    if n_rows:
        na_perc = (na_counts / n_rows * 100).round(2)
    else:
        na_perc = na_counts * 0.0

    # Decide what to drop on ALL columns; only the report is limited to 10 rows.
    to_drop = list(na_perc[na_perc > 50.00].index)

    na_df = na_counts.head(10).to_frame(name='Number of NaN')
    na_df['Perc'] = na_perc.head(10).astype(str) + '%'

    cleaned = df.drop(columns=to_drop)

    if not to_drop:
        n_cells = df.shape[0] * df.shape[1]
        # Expressed as a real percentage, since it is printed with a '%' sign.
        overall = na_counts.sum() / n_cells * 100 if n_cells else 0.0
        print('No column has been removed (< 50% NaN)\n',
              f'Overall missing values: {overall:.2f}%\n', sep='')
    else:
        print(to_drop, ' columns have been removed (> 50% NaN)\n', sep='')

    return na_df, cleaned.astype('float64', errors='ignore')

# Run the NaN filter on both frames, print each report, then the resulting shapes.
separator = '-' * 80

print('TRAIN DATA\n')
missing_val_train, train_data = drop_nrv(train_data)
print(missing_val_train, '\n\n', separator, '\n', sep='')

print('TEST DATA\n')
missing_val_test, test_data = drop_nrv(test_data)
print(missing_val_test, '\n\n', separator, '\n', sep='')

train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape
print(f'Train Data rows: {train_rows} \nTrain Data Columns: {train_cols}\n')
print(f'Test Data rows: {test_rows} \nTest Data Columns: {test_cols}')

In [None]:
# Removing Target Column
# pop() returns the 'target' column and removes it from train_data in one step.
target_bacteria = train_data.pop('target')
target_bacteria_list = target_bacteria.unique()

In [None]:
def obj_int_identifier(train_data, test_data):
    """Split feature names into continuous and discrete by train-set cardinality.

    A column counts as discrete when it has at most 10 distinct values in
    ``train_data`` and as continuous otherwise. Both name lists are then
    restricted to the columns each frame actually contains, and a summary of
    the counts is printed for the train and the test frame.

    Returns
    -------
    tuple of four pandas.Index
        (train continuous, train discrete, test continuous, test discrete),
        each ordered by ascending number of unique values in ``train_data``.
    """
    cardinality = (
        train_data.nunique()
        .to_frame(name='Unique Values')
        .sort_values(by='Unique Values')
    )
    is_discrete = cardinality['Unique Values'] <= 10
    cat = cardinality[is_discrete].index
    cont = cardinality[~is_discrete].index

    # Train side
    train_data_cont_var = train_data.filter(items=cont).columns
    train_data_disc_var = train_data.filter(items=cat).columns
    n_cont, n_disc = len(train_data_cont_var), len(train_data_disc_var)
    print(f'Train data total columns: {n_cont + n_disc}'
          f'\nContinuous Features: {n_cont}'
          f'\nDiscrete Features: {n_disc}\n')

    # Test side: same name lists, limited to columns present in test_data
    test_data_cont_var = test_data.filter(items=cont).columns
    test_data_disc_var = test_data.filter(items=cat).columns
    n_cont, n_disc = len(test_data_cont_var), len(test_data_disc_var)
    print(f'Test data total columns: {n_cont + n_disc}'
          f'\nContinuous Features: {n_cont}'
          f'\nDiscrete Features: {n_disc}')

    return train_data_cont_var, train_data_disc_var, test_data_cont_var, test_data_disc_var

# Classify the feature names once and keep the four name lists as notebook-level
# variables; discrepancies_check below reads the two *_disc_var lists.
train_data_cont_var, train_data_disc_var, test_data_cont_var, test_data_disc_var = obj_int_identifier(train_data, test_data)

In [None]:
# Check discrete features for values that appear in only one of train / test
def discrepancies_check(train, test, train_disc_var=None, test_disc_var=None):
    """Report discrete features whose value sets differ between train and test.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to compare.
    train_disc_var, test_disc_var : iterable of column names, optional
        Discrete features to inspect in each frame. When omitted, the
        notebook-level ``train_data_disc_var`` / ``test_data_disc_var`` are
        used, which is what the function always did before.

    Returns
    -------
    pandas.DataFrame or None
        One row per shared feature whose values differ, with the values found
        on only one side ('Discrepancies') and which side has extra values
        ('Where': 'Train', 'Test' or 'Both'). ``None`` is returned, after
        printing a message, when nothing differs.
    """
    if train_disc_var is None:
        train_disc_var = train_data_disc_var
    if test_disc_var is None:
        test_disc_var = test_data_disc_var

    values_train = {col: set(train[col].unique().tolist()) for col in train_disc_var}
    values_test = {col: set(test[col].unique().tolist()) for col in test_disc_var}

    if values_train.keys() == values_test.keys():
        print('Train and Test set have the same discrete features.\nResults:')
    else:
        print('Pay attention, different discrete features in Train and Test!\n')

    discrepancies = []
    where = []
    features = []
    for col, seen_train in values_train.items():
        # Only features present on both sides can be compared value by value;
        # the mismatch in feature names was already reported above.
        if col not in values_test:
            continue
        seen_test = values_test[col]
        only_train = seen_train - seen_test
        only_test = seen_test - seen_train
        if not only_train and not only_test:
            continue

        if only_train and only_test:
            side = 'Both'
        elif only_train:
            side = 'Train'
        else:
            side = 'Test'

        features.append(col)
        discrepancies.append(list(seen_train ^ seen_test))
        where.append(side)

    if not features:
        print('\nNo discrepancies!')
        return None

    final_df = pd.DataFrame({
        # dtype=object keeps each list as a single cell value.
        'Discrepancies': pd.Series(discrepancies, index=features, dtype=object),
        'Where': pd.Series(where, index=features),
    })
    return final_df
    
# Last expression of the cell: the returned frame (if any) is displayed as the cell output.
discrepancies_check(train_data, test_data)

In [None]:
# Number of training rows per bacteria species.
# The column is named explicitly: on pandas >= 2.0 value_counts() returns a
# Series named 'count', so relying on the implicit 'target' name breaks.
target_count = target_bacteria.value_counts().to_frame(name='target')
species_counts = target_count['target']

fig = go.Figure(data=[go.Bar(
    x=target_count.index,
    y=species_counts,
    marker={'color': species_counts,
            'colorscale': 'RdBu'}
)])

fig.update_traces(marker_line_width=1.5,
                  opacity=0.8)

# Kept as the last expression so the figure is rendered as the cell output.
fig.update_layout(title_text='Bacteria Species Overview',
                  xaxis_title='Species',
                  yaxis_title='Number of rows',
                  template='plotly_white')

In [None]:
# Share of training rows per bacteria species, in percent.
# The column is named explicitly: on pandas >= 2.0 value_counts() returns a
# Series named 'count', so relying on the implicit 'target' name breaks.
target_count = target_bacteria.value_counts().to_frame(name='target')
species_counts = target_count['target']
species_share = np.round(species_counts / species_counts.sum(), 4) * 100

fig = go.Figure(data=[go.Bar(
    x=target_count.index,
    y=species_share,
    marker={'color': species_counts,
            'colorscale': 'RdBu'}
)])

fig.update_traces(marker_line_width=1.5,
                  opacity=0.8)

# Kept as the last expression so the figure is rendered as the cell output.
fig.update_layout(title_text='Bacteria Species Overview (percentage)',
                  xaxis_title='Species',
                  yaxis_title='Share of rows (%)',
                  template='plotly_white')

In [None]:
# Pairwise correlation matrix of the training features, drawn as a heatmap.
feat_corr = train_data.corr()

corr_heatmap = go.Heatmap(
    z=feat_corr.values,
    x=feat_corr.columns,
    y=feat_corr.columns,
    colorscale='RdBu',
)

fig = go.Figure(data=corr_heatmap)
fig.update_layout(title='Correlation between all features')
fig.show()

In [None]:
# List every pair of distinct features whose absolute correlation is >= 0.5.
# Only the strict upper triangle of the matrix is kept, so that:
#   * the diagonal (a feature with itself) is excluded by position rather than
#     by comparing floats with 1, and
#   * each unordered pair appears once instead of as both (A, B) and (B, A).
abs_corr = feat_corr.abs()
upper_triangle = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)

high_corr = abs_corr.where(upper_triangle).stack().reset_index()
high_corr.columns = ['Feature 1', 'Feature 2', 'Abs Correlation']

# Masked cells are NaN (if stack() kept them) and fail this comparison too.
high_corr = high_corr[high_corr['Abs Correlation'] >= .5]
high_corr = high_corr.sort_values(by='Abs Correlation', ascending=False)
high_corr

In [None]:
def feature_engineering(df):
    """Add row-wise summary statistics of the features to ``df`` in place.

    The columns 'MEAN', 'MEDIAN', 'STD' and 'SKEW' are computed across the
    feature columns only. Statistic columns already present in ``df`` are
    excluded from the computation, so no statistic is derived from another
    one and calling the function again leaves the values unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric features; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four statistic columns set.
    """
    stat_columns = ['MEAN', 'MEDIAN', 'STD', 'SKEW']
    existing = [col for col in stat_columns if col in df.columns]
    features = df.drop(columns=existing) if existing else df

    # Compute everything before writing anything, so every statistic sees
    # exactly the same set of input columns.
    row_mean = features.mean(axis=1)
    row_median = features.median(axis=1)
    row_std = features.std(axis=1)
    row_skew = features.skew(axis=1)

    df['MEAN'] = row_mean
    df['MEDIAN'] = row_median
    df['STD'] = row_std
    df['SKEW'] = row_skew

    return df

# Both frames are extended in place; the returned frame is not needed here.
for frame in (train_data, test_data):
    feature_engineering(frame)

In [None]:
def scaling_feat(train_set, test_set):
    """Standardise both frames with a scaler fitted on ``train_set`` only.

    The scaled values are wrapped back into DataFrames that keep the index
    and column labels of their respective inputs. The shapes before and after
    scaling are printed.

    Returns
    -------
    tuple of pandas.DataFrame
        (scaled train_set, scaled test_set)
    """
    print(f'Dimensions before scaling: \ntrain_set: {train_set.shape} \ntest_set: {test_set.shape}')

    # Fit on the training frame; the test frame only gets transformed.
    scaler = StandardScaler().fit(train_set)

    def _as_frame(frame):
        return pd.DataFrame(scaler.transform(frame),
                            index=frame.index,
                            columns=frame.columns)

    train_set, test_set = _as_frame(train_set), _as_frame(test_set)

    print(f'\nDimensions after scaling: \ntrain_set: {train_set.shape} \ntest_set: {test_set.shape}')

    return train_set, test_set

# Scaled copies of both frames; train_data / test_data themselves are not replaced.
# NOTE(review): the scaler is fitted on the whole training frame, i.e. before the
# cross-validation folds are drawn from train_set further down.
train_set, test_set = scaling_feat(train_data, test_data)

# 3. Model

In [None]:
# Hold out 20% of the scaled training rows (and their labels) as a validation set.
# NOTE(review): the cross-validation cell further down trains on train_set
# directly, so these partitions do not appear to feed the final model there.
train_x, validation_x, train_y, validation_y = train_test_split(
    train_set,
    target_bacteria,
    test_size=0.2,
    random_state=1505,
)

# Re-apply the feature names to both partitions.
for part in (train_x, validation_x):
    part.columns = train_set.columns

In [None]:
# Split the remaining training rows again: 20% of them become a local test set.
# NOTE(review): train_x / train_y are overwritten with their own subset, so
# re-running this cell alone shrinks them further each time.
train_x, test_x, train_y, test_y = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=1505,
)

# Re-apply the feature names to both partitions.
for part in (train_x, test_x):
    part.columns = train_set.columns

In [None]:
# Shapes of features and labels for the train / test / validation partitions.
partitions = ((train_x, train_y), (test_x, test_y), (validation_x, validation_y))
for features, labels in partitions:
    print(features.shape, labels.shape)

In [None]:
from sklearn import preprocessing

# Encode the species names as integer class labels; `le` is kept so the
# predictions can be mapped back to names later with inverse_transform.
le = preprocessing.LabelEncoder()
target_bacteria_enc = le.fit_transform(target_bacteria)

In [None]:
N_SPLITS = 10
ESTIMATORS = 500
SEED = 1505  # same value the train_test_split cells use as random_state

# Stratified K-fold training of ExtraTrees on the scaled training frame.
# Both the fold shuffling and the trees are seeded so that the fold scores and
# the averaged test probabilities are the same on every Restart & Run All.
scores = []    # validation accuracy of each fold
y_probs = []   # test-set class probabilities predicted by each fold's model
folds = StratifiedKFold(n_splits=N_SPLITS,
                        shuffle=True,
                        random_state=SEED)

for fold, (train_id, test_id) in enumerate(folds.split(train_set, target_bacteria_enc)):

    # test_id holds the rows held out for validation in this fold.
    X_train = train_set.iloc[train_id]
    y_train = target_bacteria_enc[train_id]
    X_valid = train_set.iloc[test_id]
    y_valid = target_bacteria_enc[test_id]

    model = ExtraTreesClassifier(
        n_estimators=ESTIMATORS,
        n_jobs=-1,
        random_state=SEED
    )

    model.fit(X_train, y_train)

    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)

    print("Fold:", fold + 1, "Accuracy:", valid_score)
    scores.append(valid_score)
    y_probs.append(model.predict_proba(test_set))


In [None]:
# Average validation accuracy over all folds.
print("Mean accuracy score:", np.mean(scores))

In [None]:
# Average the test-set class probabilities predicted by the fold models.
mean_fold_prob = sum(y_probs) / len(y_probs)

# Fixed offsets added to each probability column, one per encoded class.
# NOTE(review): presumably hand-tuned to shift the predicted class mix - confirm.
class_offsets = np.array([0, 0, 0.06, 0.06, 0.01, 0, 0, 0.02, 0, 0.01])
y_prob = mean_fold_prob + class_offsets

# Most probable class per row, mapped back to the species name.
y_pred_tuned = le.inverse_transform(y_prob.argmax(axis=1))

# Share (in %) of each predicted species over the test rows, shown as cell output.
predicted_counts = pd.Series(y_pred_tuned, index=test_set.index).value_counts().sort_index()
predicted_counts / len(test_set) * 100

# 4. Predictions

In [None]:
# Test CSV Submission
# One row per test row_id with its predicted species, written without the frame index.
submission = pd.DataFrame({
    'row_id': test_set.index,
    'target': y_pred_tuned,
})
submission.to_csv('submission.csv', index=False)

submission.head()