In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pylab as pl
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
# Read all three competition files from one base directory so the paths stay
# consistent and do not depend on the notebook's current working directory.
DATA_DIR = '/kaggle/input/tabular-playground-series-may-2021'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
train['target'].value_counts()

## Exploring a bit of a new thing : 

### Coloring a dataframe based on Frequencies

In [None]:
train.head().style.background_gradient(cmap = "Blues")

In [None]:
train.head().style.background_gradient(cmap = "Spectral")

# Label Encoding the 'target' column

In [None]:
from sklearn import preprocessing 

le = preprocessing.LabelEncoder()

train['target'] = le.fit_transform(train['target'])

# Model-based and Sequential Feature Selection

# Importance of Features

### Selecting features based on Feature Importance from Co-efficients

### LASSOCV

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LassoCV

X = train.drop(['target', 'id'], axis = 1)
y = train['target']



In [None]:
# Fit a cross-validated Lasso on the features and use the absolute value of
# each coefficient as that feature's importance.
lasso = LassoCV().fit(X, y)
importance = np.abs(lasso.coef_)
feature_names = np.array(X.columns)

# One figure only: opening a second figure right before plt.show() would
# render an additional, empty canvas underneath the chart.
plt.figure(figsize=(10, 12))
plt.barh(feature_names, importance)
plt.title("Feature Importances via Coefficients [ Lasso CV ]")
plt.show()

### Selecting Features based on Importance

In [None]:
from sklearn.feature_selection import SelectFromModel

# Import the module (not the function) so the name `time` means the same thing
# here as in the later cells that call `time.time()`; mixing
# `from time import time` with `import time` breaks whichever cell is re-run second.
import time

# Threshold = third-largest |coefficient|, so features at or above it are kept.
threshold = np.sort(importance)[-3]

tic = time.time()
sfm = SelectFromModel(lasso, threshold = threshold).fit(X, y)
toc = time.time()
print("Features Selcted by SelectFromModel : "f"{feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic : .3f}s")

In [None]:
# Keep only the names SelectFromModel actually selected (boolean support mask),
# not the full list of feature names.
feature_names_lassocv = feature_names[sfm.get_support()]

# OBSERVATION : 

**'feature_13' 'feature_29' 'feature_36' are useful features as per LassoCV** 

# Selecting Features with Sequential Feature Selection

A greedy procedure where, at each iteration, we choose the best new feature to add to our selected features based on a cross-validation score. 
The procedure is repeated until we reach the desired number of selected features. 

We can also go in the reverse direction **(backward SFS)**, i.e. start with all features and greedily choose features to remove one by one. 

**BTW this has been running Endlessly. 
Not a good option !**

In [None]:
# from sklearn.feature_selection import SequentialFeatureSelector

# tic_fwd = time()
# sfs_forward = SequentialFeatureSelector(lasso, n_features_to_select = 4, direction = 'forward').fit(X, y)
# toc_fwd = time()

# tic_bwd = time()
# sfs_backward = SequentialFeatureSelector(lasso, n_features_to_select = 4, direction = 'backward').fit(X, y)
# toc_bwd = time()

# print("Features Selected by Forward Sequential Selection : "f"{feature_names[sfs_forward.get_support()]}")
# print(f"Done in {toc_fwd - tic_fwd:.3f}s")

# print("Features Selected by Backward Sequential Selection: "f"{feature_names[sfs_backward.get_support()]}")
# print(f"Done in {toc_bwd - tic_bwd: .3f}s")


## Tree Based Feature Selection

Used to compute **Impurity-Based Feature Importances** , which in turn can be used to discard irrelevant features

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Fixed seed: extra-trees are randomized, so without it the selected feature
# set changes from run to run and cannot be compared with the notes below.
clf = ExtraTreesClassifier(n_estimators = 50, random_state = 42)
clf = clf.fit(X, y)

# prefit = True reuses the already fitted classifier instead of refitting it.
model = SelectFromModel(clf, prefit = True)
feature_names_extratreesclf = feature_names[model.get_support()]

print("Features Selcted by Extra Tree Classifier and SelectFromModel : "f"{feature_names_extratreesclf}")


In [None]:
feature_names_extratreesclf.shape

### Looks like there has been a reduction. 49 features -> 23 features

### Feature Importance based on Mean Decrease in Impurity and Feature Permutation

## Feature Selection with respect to the Mean Decrease in Impurity

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

feature_names = np.array(X.columns)

forest = RandomForestClassifier(random_state = 0)
forest.fit(X_train, y_train)

In [None]:
import time

start_time = time.time()
importances = forest.feature_importances_
std = np.std([
    tree.feature_importances_ for tree in forest.estimators_], axis = 0)

elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances:" f"{elapsed_time:.3f} seconds" )


In [None]:
import pandas as pd


# Impurity-based importances of the fitted forest, labelled by feature name.
forest_importances = pd.Series(importances, index = feature_names)

fig, ax = plt.subplots(figsize = (10, 12))
# Error bars and target axes must be passed by keyword: the first two
# positional parameters of Series.plot.barh are `x` and `y`, so
# `barh(std, ax)` never uses them as error bars / target axes.
forest_importances.plot.barh(xerr = std, ax = ax)
ax.set_title("Feature importances using MDI (Mean Decrease in Impurity)")
# Horizontal bars: the importance values run along the x axis.
ax.set_xlabel(" Mean Decrease in Impurity")
fig.tight_layout()


**OBSERVATION : Feature Importance is different from what we found with LassoCV**

In [None]:
feature_importance_mdi = forest_importances

In [None]:
type(feature_importance_mdi)

In [None]:
forest_importances.loc[forest_importances > np.mean(forest_importances)].index

## Feature Permutation 

Permutation feature importance overcomes limitations of impurity-based feature importance: it has no bias toward high-cardinality features and can be computed on a left-out test set.

An Interesting Package that I came across. 
**sklearn.inspection is a base version of ExplainableAI concepts**

In [None]:
from sklearn.inspection import permutation_importance

start_time = time.time()
result = permutation_importance(
                               forest, X_test, y_test, n_repeats = 10, random_state = 42)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances :" f"{elapsed_time:.3f} seconds")

forest_importances = pd.Series(result.importances_mean, index = feature_names)

In [None]:
fig, ax = plt.subplots(figsize = (10, 12))
# Pass the per-feature standard deviation as horizontal error bars and draw on
# the axes created above; both have to be keyword arguments because the
# positional parameters of Series.plot.barh are `x` and `y`.
forest_importances.plot.barh(xerr = result.importances_std, ax = ax)
ax.set_title("Feature Importances using permutation on Full Model")
ax.set_xlabel("Mean Accuracy decrease")
fig.tight_layout()
plt.show()

## Recursive Feature Elimination

**This also seems to run for quite a lot of time. Not a good option**

In [None]:
# import matplotlib.pyplot as plt
# from sklearn.svm import SVC
# from sklearn.model_selection import StratifiedKFold
# from sklearn.feature_selection import RFECV

# svc = SVC(kernel = "linear")

# min_features_to_select = 4 # Min number of features to consider
# rfecv = RFECV(estimator = svc, step = 1, cv = StratifiedKFold(2), scoring = 'accuracy',
#              min_features_to_select = min_features_to_select)

# rfecv.fit(X_train, y_train)

# print("Optimal Number of Features : %d" % rfecv.n_features_)

# plt.figure()
# plt.xlabel("Number of features selected")
# plt.ylabel("Cross Validation Score (nb of Correct Classifications)")
# plt.plot(range(min_features_to_select, 
#               len(rfecv.grid_scores_) + min_features_to_select),
#         rfecv.grid_scores_)
# plt.show()

## Chi Square Test

In [None]:
from sklearn.feature_selection import chi2

X1 = X.abs()
chi_scores = chi2(X1, y)


In [None]:
chi_scores

**Here the first array represents the chi-square values and the second array represents the p-values**

In [None]:
p_values = pd.Series(chi_scores[1], index = X.columns)
p_values.sort_values(ascending = False, inplace = True)

In [None]:
p_values.plot.bar(figsize = (10, 12))

# OBSERVATION : 

**Feature 6 has the highest p-value, hence it is Independent of the values in the 'target' column.
It cannot be considered for Model Training**

* 'feature_13' 'feature_29' 'feature_36' are useful as per LassoCV
* Below features are selected by ExtraTreeClassifier : 

'feature_3' 'feature_7' 'feature_8' 'feature_9' 'feature_14' 'feature_15'
 'feature_17' 'feature_18' 'feature_19' 'feature_21' 'feature_23'
 'feature_24' 'feature_28' 'feature_31' 'feature_34' 'feature_35'
 'feature_38' 'feature_40' 'feature_41' 'feature_48' 'feature_49' 
 
* Mean Decrease in Impurity : 

'feature_3', 'feature_7', 'feature_8', 'feature_9', 'feature_12',
'feature_14', 'feature_15', 'feature_17', 'feature_18', 'feature_19',
'feature_21', 'feature_24', 'feature_28', 'feature_31', 'feature_34',
'feature_35', 'feature_38', 'feature_40', 'feature_41', 'feature_48',
'feature_49'

# PIPELINES

In [None]:
from functools import wraps
import datetime as dt

def log_step(func):
    """
    Decorator that logs a pipeline step after it has run.

    After the wrapped function returns, one line is printed with the function
    name, the ``shape`` of its result (``None`` when the result has no
    ``shape`` attribute) and the elapsed wall-clock time in seconds.

    :param func: the pipeline step to wrap
    :return: a wrapper with the same call signature that returns ``func``'s result unchanged
    """

    @wraps(func)
    def wrapper(*args, **kwargs):

        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        # Report plain seconds; str(timedelta) is "H:MM:SS.ffffff", which reads
        # wrongly when followed by an "s" suffix.
        time_taken = (dt.datetime.now() - tic).total_seconds()
        # Not every step has to return a DataFrame/array, so do not assume `.shape` exists.
        shape = getattr(result, "shape", None)
        print(f"just ran step {func.__name__} shape = {shape} took {time_taken:.3f}s")
        return result
    return wrapper

In [None]:
@log_step
def start_pipeline(dataf):
    """
    First pipeline step: hand back a copy of ``dataf``.

    :param dataf: object exposing ``.copy()`` (a DataFrame in this notebook)
    :return: the result of ``dataf.copy()``
    """
    working_copy = dataf.copy()
    return working_copy

## LABEL ENCODING

In [None]:
from sklearn import preprocessing

@log_step
def label_encode(data):
    """
    Label-encode every column of ``data`` whose dtype is ``object``.

    The columns are overwritten in place on ``data``; one ``LabelEncoder`` is
    re-fitted for each column in turn.

    :param data: DataFrame to encode
    :return: the same ``data`` object, with object columns replaced by integer codes
    """
    encoder = preprocessing.LabelEncoder()

    # Decide up front which columns need encoding, then rewrite them one by one.
    object_columns = [name for name in data.columns if data[name].dtype == 'object']
    for name in object_columns:
        data[name] = encoder.fit_transform(data[name])

    return data
        

# CORRELATIONS of different features with the "target" column in Descending Order

In [None]:
@log_step
def corelation_target(data, target):
    
    """
    Draw a one-column heatmap of each column's correlation with ``target``,
    sorted from the highest correlation to the lowest.

    :param data: DataFrame holding the features and the ``target`` column
    :param target: name of the column to correlate against
    :return: ``data`` itself, so the function can be chained with ``.pipe``
    """
    plt.figure(figsize = (8, 12))

    # Correlation of every column with `target`, without the target's own row.
    target_corr = data.corr()[[target]]
    target_corr = target_corr.drop(index = target, axis = 0)
    target_corr = target_corr.sort_values(by = target, ascending = False)

    heatmap = sns.heatmap(target_corr, vmin = -1, vmax = 1, annot = True, cmap = 'BrBG')
    heatmap.set_title(
        f"Features Correlating with {target} column",
        fontdict = {'fontsize':18},
        pad = 16,
    )

    return data

This information sometimes gets lost in the Heatmap

In [None]:
@log_step
def corelation_horizontal_target(data, target):
    
    """
    Horizontal bar plot of the correlation of each column with ``target``.

    :param data: DataFrame holding the features and the ``target`` column
    :param target: name of the column to correlate against
    :return: ``data`` itself, so the function can be chained with ``.pipe``
    """
    fig, ax = plt.subplots(figsize=(10, 12))

    # Correlation with `target`, minus the target's correlation with itself.
    corr = data.corr()[target].drop(index = target)
    ax.barh(corr.index, corr.to_numpy())
    ax.set_title("Corelation with target")
    # No second plt.figure() here: opening one before show() only adds an
    # empty canvas below the chart.
    plt.show()
    
    return data

In [None]:
@log_step
def zero_percent(data):
    
    """
    Horizontal bar plot of the percentage of rows equal to 0 in each feature.

    Feature columns are selected by name (prefix ``feature_``) rather than by
    position, so an identifier column such as ``id`` is not plotted and the
    last feature is not cut off by a positional slice.

    :param data: DataFrame containing the ``feature_*`` columns
    :return: ``data`` itself, so the function can be chained with ``.pipe``
    """
    
    bar_color = np.array((255, 149, 0)) / 255  # Orange

    feature_cols = [col for col in data.columns if str(col).startswith('feature_')]

    # Percentage of zeros per feature; reversed so the first feature is drawn
    # at the top of the horizontal chart.
    zero_data = ((data[feature_cols] == 0).sum() / len(data) * 100)[::-1]
    fig, ax = plt.subplots(1, 1, figsize = (10, 19))

    # Grey 100 % background bar behind each coloured percentage bar.
    ax.barh(zero_data.index, 100, color = '#dadada', height = 0.6)
    barh = ax.barh(zero_data.index, zero_data, color = bar_color, height = 0.6)
    ax.bar_label(barh, fmt = '%.01f %%', color = 'black')

    # Hide the left and bottom frame lines
    ax.spines[['left', 'bottom']].set_visible(False)

    # The values are printed on the bars, so no x ticks are needed
    ax.set_xticks([])

    # The bars show percentages, not counts.
    ax.set_title('% of Zeros (by feature)', loc = 'center', fontweight = 'bold', fontsize = 15)
    plt.show()
    
    return data

In [None]:
@log_step
def bargraph_average_by_class_by_feature(data, target):
    
    """
    Grid of small bar charts, one panel per feature: each bar is the mean of
    that feature within one value (class) of ``target``.

    Features are the columns whose name starts with ``feature_``; the grid has
    four panels per row and as many rows as needed, with unused panels hidden
    (13 x 4 with the last two hidden for 50 features).

    :param data: DataFrame holding the ``feature_*`` columns and ``target``
    :param target: name of the class column to group by
    :return: ``data`` itself, so the function can be chained with ``.pipe``
    """
    
    raw_dark_palette = [
    (10, 132, 255), # Blue
    (255, 159, 10), # Orange
    (48, 209, 88),  # Green
    (255, 69, 58),  # Red
    (191, 90, 242), # Purple
    (94, 92, 230),  # Indigo
    (255, 55, 95),  # Pink
    (100, 210, 255),# Teal
    (255, 214, 10)  # Yellow
    ]

    dark_palette = np.array(raw_dark_palette)/255

    feature_cols = [col for col in data.columns
                    if str(col).startswith('feature_') and col != target]

    n_cols = 4
    n_rows = max(1, (len(feature_cols) + n_cols - 1) // n_cols)  # ceiling division

    # squeeze = False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize = (10, 16), squeeze = False)
    panels = axes.flatten()

    # Per-class mean of every feature (groupby sorts the class labels).
    mean = data.groupby(target)[feature_cols].mean()

    for col, ax in zip(feature_cols, panels):
        ax.bar(mean[col].index, mean[col],
              color = dark_palette[:4], width = 0.6)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.margins(0.1)
        ax.spines['left'].set_visible(False)
        ax.set_title(str(col).replace('feature_', 'Feature_', 1),
                     loc = 'right', weight = 'bold', fontsize = 11)

    # Hide the panels left over when the feature count does not fill the grid.
    for ax in panels[len(feature_cols):]:
        ax.axis('off')

    fig.supxlabel('AVERAGE by class (by feature)', ha = 'center', fontweight = 'bold')

    fig.tight_layout()
    plt.show()

    return data

In [None]:
train_df = (train
           .pipe(start_pipeline)
           .pipe(label_encode)
           .pipe(corelation_target, target = 'target')
           .pipe(corelation_horizontal_target, target = 'target')
           .pipe(zero_percent)
           .pipe(bargraph_average_by_class_by_feature, target = 'target'))

We cannot validate with the test dataset, as there is no 'target' column . 

Have to split the train dataset.


In [None]:
     

# `mpl` is used below but was only imported in a later cell, so a fresh
# "Restart & Run All" failed here with NameError.
import matplotlib as mpl
from cycler import cycler


# RGB triplets (0-255) for the plot palettes.
raw_light_palette = [
    (0, 122, 255), # Blue
    (255, 149, 0), # Orange
    (52, 199, 89), # Green
    (255, 59, 48), # Red
    (175, 82, 222),# Purple
    (255, 45, 85), # Pink
    (88, 86, 214), # Indigo
    (90, 200, 250),# Teal
    (255, 204, 0)  # Yellow
]

raw_dark_palette = [
    (10, 132, 255), # Blue
    (255, 159, 10), # Orange
    (48, 209, 88),  # Green
    (255, 69, 58),  # Red
    (191, 90, 242), # Purple
    (94, 92, 230),  # Indigo
    (255, 55, 95),  # Pink
    (100, 210, 255),# Teal
    (255, 214, 10)  # Yellow
]

raw_gray_light_palette = [
    (142, 142, 147),# Gray
    (174, 174, 178),# Gray (2)
    (199, 199, 204),# Gray (3)
    (209, 209, 214),# Gray (4)
    (229, 229, 234),# Gray (5)
    (242, 242, 247),# Gray (6)
]

raw_gray_dark_palette = [
    (142, 142, 147),# Gray
    (99, 99, 102),  # Gray (2)
    (72, 72, 74),   # Gray (3)
    (58, 58, 60),   # Gray (4)
    (44, 44, 46),   # Gray (5)
    (28, 28, 39),   # Gray (6)
]


# Scale to the 0-1 range used for the rcParams colours below.
light_palette = np.array(raw_light_palette)/255
dark_palette = np.array(raw_dark_palette)/255
gray_light_palette = np.array(raw_gray_light_palette)/255
gray_dark_palette = np.array(raw_gray_dark_palette)/255

# Dark theme: dark-gray canvas with the dark palette as the default colour cycle.
mpl.rcParams['axes.prop_cycle'] = cycler('color',dark_palette)
mpl.rcParams['figure.facecolor']  = gray_dark_palette[-2]
mpl.rcParams['figure.edgecolor']  = gray_dark_palette[-2]
mpl.rcParams['axes.facecolor'] =  gray_dark_palette[-2]

# Near-white colour for text, labels, axes edges and ticks.
white_color = gray_light_palette[-2]
mpl.rcParams['text.color'] = white_color
mpl.rcParams['axes.labelcolor'] = white_color
mpl.rcParams['axes.edgecolor'] = white_color
mpl.rcParams['xtick.color'] = white_color
mpl.rcParams['ytick.color'] = white_color

mpl.rcParams['figure.dpi'] = 200

mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.right'] = False



 **Soft Voting Ensemble Starter :** https://www.kaggle.com/manabendrarout/soft-voting-ensemble-starter-tps-may21

In [None]:
import warnings
import sklearn.exceptions
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category = FutureWarning)
warnings.filterwarnings('ignore', category = RuntimeWarning)
warnings.filterwarnings('ignore', category = UserWarning)
warnings.filterwarnings('ignore', category = sklearn.exceptions.UndefinedMetricWarning)

import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import os
#fmin : Minimize function using simplex downhill algorithm
from scipy.optimize import fmin as scip_fmin

# visualization 
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style = "whitegrid")

# Machine Learning

# Utils
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_validate
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn import preprocessing
import category_encoders as ce

# Feature Selection
from sklearn.feature_selection import chi2, f_classif, f_regression
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile, VarianceThreshold

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier



In [None]:
# Default seed for the notebook. It has to exist before the `def` below,
# because default argument values are evaluated when the function is defined.
RANDOM_SEED = 42

def seed_everything(seed = RANDOM_SEED):
    """
    Seed the random number generators used in this notebook.

    Sets ``PYTHONHASHSEED`` in the environment and seeds both the standard
    library ``random`` module and NumPy's global generator.

    :param seed: integer seed; defaults to ``RANDOM_SEED``
    """
    import random

    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only
    # affects processes launched after this call, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

In [None]:
train_df.describe().T

# Feature Scaling

To bring all features into a similar scale let's use simple scaler to scale all the features

In [None]:
# Columns that are identifiers / labels rather than model inputs.
not_features = ['id', 'target']

# Every remaining column of train_df is treated as a feature.
features = [feat for feat in train_df.columns if feat not in not_features]

print(features)

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both train and test.
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[features])
train_df[features] = scaler.transform(train_df[features])

# `test_df` was never created (only `test` is loaded above); work on a copy so
# the raw test frame stays untouched.
test_df = test.copy()
test_df[features] = scaler.transform(test_df[features])

# KFOLD SPLITS

Before moving to feature engineering, it's better to perform cross validation splits. 

In that way, we will not risk any data leakage and would be more certain of the validation set being aptly representative of the real world unknown data. 

In [None]:
NUM_SPLITS = 5
SHUFFLE_SEED = 42  # fixed so fold membership is identical on every run

# Shuffle the rows once, then give every row the index of the stratified fold
# in which it serves as validation data.
train_df["kfold"] = -1
train_df = train_df.sample(frac = 1, random_state = SHUFFLE_SEED).reset_index(drop = True)
y = train_df.target.values
kf = StratifiedKFold(n_splits = NUM_SPLITS)

for f, (t_, v_) in enumerate(kf.split(X = train_df, y = y)):
    # v_ holds the positional indices of the validation rows of fold f; they
    # match the labels because the index was reset above.
    train_df.loc[v_, 'kfold'] = f
    
train_df.head()

# FEATURE SELECTION

We need to select only the important features for better performance of the model. 
An unnecessary feature will, in the best case, add nothing productive to the algorithm's calculation or, in the worst case, 'confuse' the model.

To do this, let's create a wrapper class that bundles the built-in statistical tests required for feature selection, takes some basic inputs from the user, and returns the selected features.


In [None]:
# FROM abhishek thakur's book 

class UnivariateFeatureSelection:
    
    def __init__(self, n_features, problem_type, scoring, return_cols = True):
        
        """
        Custom Univariate Feature Selction wrapper on different Univariate Feature selection 
        models from Scikit-Learn. 
        : param n_features: SelectPercentile if Float else SelectKBest
        : param problem_type : classification or regression
        : param scoring : scoring function, string
        """
        
        self.n_features = n_features
        
        if problem_type = "classification":
            
            valid_scoring = {
                "f_classif": f_classif, 
                "chi2" : chi2,
                "mutual_info_classif": mutual_info_classif
            }
            
        else : 
            valid_scoring = {
                "f_regression" : f_regression,
                "mutual_info_regression" : mutual_info_regression
            }
            
        if scoring not in valid_scoring:
            raise Exception("Invalid scoring function")
            
        if isinstance(n_features, int):
            
            self.selection = SelectKBest(
                                            valid_scoring[scoring])