## Feature Engineering Notebook 

### Import Libraries


In [38]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
from scipy.stats import normaltest

import math
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model  import LogisticRegression

from sklearn.neighbors import KNeighborsRegressor

# imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import (
    RandomUnderSampler,
    CondensedNearestNeighbour,
    TomekLinks,
    OneSidedSelection,
    EditedNearestNeighbours,
    RepeatedEditedNearestNeighbours,
    AllKNN,
    NeighbourhoodCleaningRule,
    NearMiss,
    InstanceHardnessThreshold
)

from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE,
)

from imblearn.combine import SMOTEENN, SMOTETomek


from imblearn.ensemble import (
    BalancedBaggingClassifier,
    BalancedRandomForestClassifier,
    RUSBoostClassifier,
    EasyEnsembleClassifier,
)

from sklearn.ensemble import (
    RandomForestClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
)


# adding common folder location to sys.path
import sys
sys.path.append('../common')

from helper import get_config

### Loading Config

In [5]:
#config = get_config()

### Load Dataset

In [6]:
# Path to the dataset; it can point to a CSV or an Excel (.xlsx) file.
dataset_path = "../Bank_Personal_Loan_Modelling.xlsx"

# Pick the reader that matches the type of the data source.

# For a CSV source, use the line below instead:
## df = pd.read_csv(dataset_path)
# Excel source: sheet_name = 1 selects the second sheet (positions are 0-based)
# and index_col=0 uses the first column of that sheet as the row index.
df = pd.read_excel(dataset_path, sheet_name = 1, index_col=0)

In [None]:
# Preview the first rows of the loaded dataframe.
df.head()

In [None]:
# Name of the label column. The scaling and resampling helpers in this
# notebook read this module-level name to split features from the label.
target = 'Personal Loan'
# df_x = df.drop(columns=[target])
# df_y = df[target]

Let's separate the attributes into numerical and categorical ones, and further split the numerical ones into normally and non-normally distributed attributes.<br/>
These lists will be used for outlier treatment and other transformations later.

### Separating Numerical and Categorical attributes along with normal and non normal numerical attributes

In [None]:
# The number of distinct values per column hints at whether an attribute holds
# categorical data (few distinct values) or continuous data (many of them).
unique_val_in_cols = df.nunique().sort_values()
print(unique_val_in_cols)

In [None]:
# Choose a reasonable threshold for separating categorical from numerical
# attributes, based on the unique-value counts printed above.
threshold = 10

In [None]:
# Attributes with fewer distinct values than the threshold are treated as
# categorical; every other attribute is numerical. The comparison is '>=' so
# that an attribute with exactly `threshold` distinct values is not dropped
# from both lists.
categorical_attributes = list(unique_val_in_cols[unique_val_in_cols < threshold].keys())
numerical_attributes = list(unique_val_in_cols[unique_val_in_cols >= threshold].keys())

In [None]:
def normal_test(df, significance = .01):
    """
    Split the columns of a dataframe into normal and non-normal ones.

    Every column is tested with scipy.stats.normaltest (D'Agostino and
    Pearson's omnibus test; H0: the sample comes from a normal distribution).
    Missing values are dropped before testing, because a NaN would otherwise
    yield a NaN p-value and the column would silently be reported as normal.

    df: a pandas dataframe holding only numerical columns
    significance: float. Alpha level; H0 is rejected when p <= significance

    returns a tuple (normal_columns, non_normal_columns) of column-name lists,
    each in the original column order.
    """
    columns = df.columns.tolist()
    non_normal_columns = []

    for col in columns:
        # test the observed values only; NaNs would propagate into the p-value
        _, p = normaltest(df[col].dropna())

        if p <= significance:
            # H0 rejected: col is not normally distributed
            non_normal_columns.append(col)

    normal_columns = [col for col in columns if col not in non_normal_columns]
    return normal_columns, non_normal_columns

In [None]:
# Split the numerical attributes into those that pass the normality test and
# those that do not.
normal_attributes, non_normal_attributes = normal_test(df[numerical_attributes])

In [None]:
# Report how many attributes fall into each group.
print("Number of attributes:")
print(f"categorical:{len(categorical_attributes)}")
print(f"numerical:{len(numerical_attributes)}")
print(f"normal attributes:{len(normal_attributes)}")
print(f"non normal attributes:{len(non_normal_attributes)}")

#### Removing Target column from the categorical list of variables so that it does not get transformed

In [None]:
# Take the target out of the categorical list so the label is not transformed
# like a feature. list.remove raises ValueError if the target is not in the list.
categorical_attributes.remove(target)

### Perform basic data cleaning as per observations from EDA
* For this dataset, we know that the 'Experience' column contains a small percentage of negative values (which are invalid), so let's treat them before applying any other transformation.

In [None]:
# Negative experience is invalid: count the offending rows, replace them with
# NaN so that the imputation step can fill them later, then verify the result.
negative_experience = df['Experience'] < 0
print("Number of negative values: {0}".format(len(df[negative_experience])))

df['Experience'] = df['Experience'].mask(negative_experience, np.nan)

print("Number of negative values after imputing with NaN: {0}".format(len(df[df['Experience'] < 0])))
print("Number of NANs: {0}".format(df['Experience'].isna().sum()))

### 1. Outlier Treatment

#### 1.1 Outlier treatment for numerical attributes

Outlier treatment for Non-Normal distribution

In [None]:
# Zero coding: every negative value of the variable is replaced by zero.
def outliers_ZeroCoding(X,variable):
    """Set all values of X[variable] that are below zero to 0, in place."""
    negative_rows = X[variable] < 0
    X.loc[negative_rows, variable] = 0

In [None]:
# Top coding: any value above the upper fence is capped at that fence.
def outliers_TopCoding_quantile(df,variable):
    """Cap df[variable] in place at Q3 + 3 * IQR (inter-quantile proximity rule)."""
    first_quartile = df[variable].quantile(0.25)
    third_quartile = df[variable].quantile(0.75)
    upper_limit = third_quartile + 3 * (third_quartile - first_quartile)
    above_limit = df[variable] > upper_limit
    df.loc[above_limit, variable] = upper_limit

In [None]:
# Bottom coding: any value below the lower fence is raised to that fence.
def outliers_BottomCoding_quantile(df,variable):
    """Floor df[variable] in place at Q1 - 3 * IQR (inter-quantile proximity rule)."""
    first_quartile = df[variable].quantile(0.25)
    third_quartile = df[variable].quantile(0.75)
    lower_limit = first_quartile - 3 * (third_quartile - first_quartile)
    below_limit = df[variable] < lower_limit
    df.loc[below_limit, variable] = lower_limit

In [None]:
# Cap both tails of every non-normal attribute at its quantile-based fences
# (Q3 + 3 * IQR at the top, Q1 - 3 * IQR at the bottom).
for col in non_normal_attributes:
    outliers_TopCoding_quantile(df,col)
    outliers_BottomCoding_quantile(df,col)

Outlier treatment for Normal distribution

In [None]:
# Top coding: any value above the upper fence is capped at that fence.
def outliers_TopCoding_gaussian(df,variable):
    """Cap df[variable] in place at mean + 3 * standard deviation (gaussian rule)."""
    centre = df[variable].mean()
    spread = df[variable].std()
    upper_limit = centre + 3 * spread
    above_limit = df[variable] > upper_limit
    df.loc[above_limit, variable] = upper_limit

In [None]:
# Bottom coding: any value below the lower fence is raised to that fence.
def outliers_BottomCoding_gaussian(df,variable):
    """Floor df[variable] in place at mean - 3 * standard deviation (gaussian rule)."""
    centre = df[variable].mean()
    spread = df[variable].std()
    lower_limit = centre - 3 * spread
    below_limit = df[variable] < lower_limit
    df.loc[below_limit, variable] = lower_limit

In [None]:
# Cap both tails of every normal attribute at mean +/- 3 standard deviations.
for col in normal_attributes:
    outliers_TopCoding_gaussian(df,col)
    outliers_BottomCoding_gaussian(df,col)

Convert the non-normal distribution to normal

In [None]:
def transform_BoxCox(df,variable):
    """
    Add a Box-Cox transformed copy of a column to df and print the fitted lambda.

    The transformed values are stored in a new column named
    variable + '_boxcox'; the original column is left untouched.
    """
    transformed, fitted_lambda = stats.boxcox(df[variable])
    df[variable+'_boxcox'] = transformed
    print('Optimal lambda: ', fitted_lambda)

#### 1.2 Outlier treatment for categorical attributes

In [None]:
def rare_new_imputation(df,variable,rare_cat,threshold=0.05):
    """
    Group the rare categories of a column under a new 'Others' label.

    A category is rare when its share of all rows of df is below `threshold`.
    The result is written to a new column named variable + '_rare_imp'.

    df: dataframe, modified in place
    variable: name of the categorical column
    rare_cat: ignored - the rare categories are always derived from the data;
        the parameter is kept only so that existing calls keep working
    threshold: relative frequency below which a category counts as rare
    """
    # builtin float replaces np.float, which was removed in NumPy 1.24
    frequencies = df.groupby([variable])[variable].count() / float(len(df))
    rare_labels = list(frequencies.loc[frequencies < threshold].index.values)
    df[variable+'_rare_imp'] = np.where(df[variable].isin(rare_labels), 'Others', df[variable])

In [None]:
def rare_freq_imputation(df,variable,rare_cat,frequent_cat):
    """
    Replace the given rare categories of a column by a frequent category.

    The imputed labels are stored in a new column named
    variable + '_freq_imp'; the original column is left untouched.
    """
    is_rare = df[variable].isin(rare_cat)
    imputed_labels = np.where(is_rare, frequent_cat, df[variable])
    df[variable+'_freq_imp'] = imputed_labels

### 2. Missing Values Imputation

#### 2.1 Imputation for numerical attributes

In [None]:
# KNN model-based imputation of missing values; the columns that contain no
# NaN at all are used as predictors.
def impute_model_basic(df):
    """
    Fill every column that has NaNs with the predictions of a 5-nearest-neighbour regressor.

    The predictor set is fixed up front: the columns that are free of NaNs when
    the function is called. Each model is trained on the rows that are
    currently complete in all columns. df is modified in place and returned.
    """
    incomplete_columns = df.columns[df.isna().any()].tolist()
    predictors = df.columns.difference(incomplete_columns).values
    for column in incomplete_columns:
        complete_rows = df.dropna()
        rows_to_fill = df[df[column].isna()]
        model = KNeighborsRegressor(n_neighbors=5)
        model.fit(complete_rows[predictors], complete_rows[column])
        df.loc[df[column].isna(), column] = model.predict(rows_to_fill[predictors])
    return df

In [None]:
# KNN model-based imputation of missing values; the NaN-free columns are used as
# predictors, and each column becomes a predictor itself once it has been imputed.
def impute_model_progressive(df):
    """
    Fill the columns that have NaNs one at a time with a 5-nearest-neighbour regressor.

    Before each column is imputed, the predictor set is recomputed as all
    columns that currently contain no NaN, so previously imputed columns are
    included. df is modified in place and returned.
    """
    incomplete_columns = df.columns[df.isna().any()].tolist()
    while incomplete_columns:
        predictors = df.columns.difference(incomplete_columns).values
        column = incomplete_columns[0]
        complete_rows = df.dropna()
        rows_to_fill = df[df[column].isna()]
        model = KNeighborsRegressor(n_neighbors=5)
        model.fit(complete_rows[predictors], complete_rows[column])
        df.loc[df[column].isna(), column] = model.predict(rows_to_fill[predictors])
        # the column just filled no longer shows up here
        incomplete_columns = df.columns[df.isna().any()].tolist()
    return df

In [None]:
# Impute missing data according to a given impute_strategy:
#  drop_rows: drop all rows with one or more missing values
#  drop_cols: drop columns with one or more missing values
#  model_basic: KNN-model-based imputation with fixed predictors
#  model_progressive: KNN-model-based imputation with progressively added predictors
#  mean, median, most_frequent: imputation with mean, median or most frequent values
#   (any value not listed above is handed to SimpleImputer as its strategy)
#
#  cols_to_standardize: if provided, the specified columns are scaled between 0 and 1, after imputation
def impute_data(df_cleaned, impute_strategy=None, cols_to_standardize=None):
    """
    Return an imputed copy of df_cleaned; the input frame is not modified.

    df_cleaned: dataframe that may contain NaNs
    impute_strategy: one of the strategies listed above
    cols_to_standardize: optional iterable of column names to min-max scale
        after imputation; names that are absent from the imputed frame
        (e.g. removed by 'drop_cols') are ignored
    """
    df = df_cleaned.copy()
    if impute_strategy == 'drop_rows':
        df = df.dropna(axis=0)
    elif impute_strategy == 'drop_cols':
        df = df.dropna(axis=1)
    elif impute_strategy == 'model_basic':
        df = impute_model_basic(df)
    elif impute_strategy == 'model_progressive':
        df = impute_model_progressive(df)
    else:
        arr = SimpleImputer(missing_values=np.nan,strategy=impute_strategy).fit(
          df.values).transform(df.values)
        df = pd.DataFrame(data=arr, index=df.index.values, columns=df.columns.values)
    if cols_to_standardize is not None:
        # imported here because MinMaxScaler is not among the notebook-level imports
        from sklearn.preprocessing import MinMaxScaler

        requested = set(cols_to_standardize)
        present = [col for col in df.columns if col in requested]
        # nothing to scale when none of the requested columns survived
        if present:
            df[present] = df[present].astype('float')
            df[present] = MinMaxScaler().fit_transform(df[present])
    return df

In [None]:
# Fill the missing values of the numerical attributes with the progressive
# KNN strategy.
df[numerical_attributes] = impute_data(df[numerical_attributes], 'model_progressive')

#### 2.2 Imputation for categorical attributes

In [None]:
def impute_na_freq(df, variable):
    """
    Fill the missing values of df[variable] with its most frequent category.

    df is modified in place. A column without any observed value is left
    unchanged, because there is no category to fill with.
    """
    # value_counts ignores NaN and sorts by descending frequency
    category_counts = df[variable].value_counts()
    if category_counts.empty:
        return
    most_frequent_category = category_counts.index[0]

    # assign the result back instead of calling fillna(inplace=True) on the
    # selected column, which does not reach df under pandas copy-on-write
    df[variable] = df[variable].fillna(most_frequent_category)

In [None]:
def impute_na_addCat(df, variable):
    """
    Add a variable + '_NA' column in which missing values are labelled 'Missing'.

    The column is only created when df[variable] contains at least one
    missing value; the original column is left untouched.
    """
    is_missing = df[variable].isnull()
    if is_missing.sum() > 0:
        df[variable+'_NA'] = np.where(is_missing, 'Missing', df[variable])

In [None]:
# For every categorical attribute: first keep a '<name>_NA' copy in which missing
# values are labelled 'Missing' (only created when the attribute has missing
# values), then fill the attribute itself with its most frequent category.
for cols in categorical_attributes:
    impute_na_addCat(df,cols)
    impute_na_freq(df,cols)

### 3. Encoding of categorical attributes

In [None]:
def CategoricalEncoding_OneHot(df,variable):
    """Return a new dataframe in which `variable` is replaced by its one-hot (dummy) columns."""
    encoded = pd.get_dummies(df, columns=[variable])
    return encoded

In [None]:
# Weight of evidence encoding technique
def CategoricalEncoding_WOE(df,variable,target_variable):
    """
    Encode a categorical column by its weight of evidence, ln(p(target=1) / p(target=0)).

    The encoded values are written to a new column named
    variable + '_ordered'. A probability of exactly 0 on either side is
    replaced by 0.001 so that neither log(0) nor a division by zero occurs.

    df: dataframe, modified in place
    variable: name of the categorical column
    target_variable: name of the binary (0/1) target column
    """
    # probability of target = 1 per category
    prob_df = pd.DataFrame(df.groupby([variable])[target_variable].mean())

    # probability of target = 0 per category
    prob_df['target_0'] = 1-prob_df[target_variable]

    # guard the numerator (log of zero) and the denominator (division by zero)
    prob_df.loc[prob_df[target_variable] == 0, target_variable] = 0.001
    prob_df.loc[prob_df['target_0'] == 0, 'target_0'] = 0.001

    prob_df['WoE'] = np.log(prob_df[target_variable]/prob_df['target_0'])
    ordered_labels = prob_df['WoE'].to_dict()
    df[variable+'_ordered'] = df[variable].map(ordered_labels)

In [None]:
# Risk factor encoding: each label is replaced by the mean of the target for that label.
def CategoricalEncoding_RiskFactor(df,variable,target_variable):
    """Write the per-category target mean into a new variable + '_ordered' column."""
    target_mean_by_label = df.groupby([variable])[target_variable].mean()
    df[variable+'_ordered'] = df[variable].map(target_mean_by_label.to_dict())

In [None]:
def CategoricalEncoding_Monotonicity(df,variable,target_variable):
    """
    Encode a categorical column by the rank of its per-category target mean.

    The category with the lowest target mean gets 1, the next one 2, and so
    on. The ranks are written to a new variable + '_ordered' column.
    """
    ranked_labels = df.groupby([variable])[target_variable].mean().sort_values().index
    rank_of_label = dict(zip(ranked_labels, range(1, len(ranked_labels) + 1)))
    df[variable+'_ordered'] = df[variable].map(rank_of_label)

In [None]:
# Probability ratio encoding: each label is replaced by p(target=1) / p(target=0).
def CategoricalEncoding_PRE(df,variable,target_variable):
    """
    Encode a categorical column by the ratio p(target=1) / p(target=0) of each category.

    A p(target=0) of exactly 0 is replaced by 0.001 before dividing. The
    ratios are written to a new variable + '_ordered' column.
    """
    # share of target = 1 and of target = 0 per category
    positive_rate = df.groupby([variable])[target_variable].mean()
    negative_rate = 1 - positive_rate

    # avoid dividing by zero for categories that never show target = 0
    negative_rate = negative_rate.mask(negative_rate == 0, 0.001)

    ratio_by_label = (positive_rate / negative_rate).to_dict()
    df[variable+'_ordered'] = df[variable].map(ratio_by_label)

The categorical attributes of the sample dataset are already encoded, hence there is no need for this step.

In [None]:
# for col in ['Family', 'Education']:
#      df = CategoricalEncoding_OneHot(df,col)

### 4. Scaling of Attributes

In [None]:
# 1. Standard scaler: z = (x - x_mean) / std
def scaler_Standard(df):
    """
    Standardise every column of df except the target column.

    Reads the module-level name `target`. Returns a new dataframe with the
    scaled features (same index and column names) and the unscaled target
    appended as the last column.
    """
    from sklearn.preprocessing import StandardScaler

    # features only - the target must not be scaled
    features = df.drop(columns=[target])

    # learn the parameters on the features and apply them to the same data
    standardised = StandardScaler().fit(features).transform(features)

    # the scaler returns a NumPy array; restore the labels
    result = pd.DataFrame(standardised, columns=features.columns, index=features.index)

    # re-attach the target
    result[target] = df[target]
    return result

In [None]:
# 2. Mean normalisation: z = (x - x_mean) / (x_max - x_min)
def scaler_MeanNormalisation(df):
    """
    Mean-normalise every column of df except the target column.

    Reads the module-level name `target`. Returns a new dataframe with the
    scaled features and the unscaled target appended as the last column.
    """
    features = df.drop(columns=[target])

    column_means = features.mean(axis=0)
    column_ranges = features.max(axis=0) - features.min(axis=0)
    centred = features - column_means
    result = centred / column_ranges

    # re-attach the target
    result[target] = df[target]
    return result

In [None]:
# 3. Min-max scaling: x_scaled = (x - x_min) / (x_max - x_min)
def scaler_MinMax(df):
    """
    Min-max scale every column of df except the target column.

    Reads the module-level name `target`. Returns a new dataframe with the
    scaled features (same index and column names) and the unscaled target
    appended as the last column.
    """
    from sklearn.preprocessing import MinMaxScaler

    # features only - the target must not be scaled
    features = df.drop(columns=[target])

    # learn the parameters on the features and apply them to the same data
    rescaled = MinMaxScaler().fit(features).transform(features)

    # the scaler returns a NumPy array; restore the labels
    result = pd.DataFrame(rescaled, columns=features.columns, index=features.index)

    # re-attach the target
    result[target] = df[target]
    return result

In [None]:
# 4. Max-abs scaling: x_scaled = x / max(|x|)
def scaler_MaxAbs(df):
    """
    Max-abs scale every column of df except the target column.

    Reads the module-level name `target`. Returns a new dataframe with the
    scaled features (same index and column names) and the unscaled target
    appended as the last column.
    """
    from sklearn.preprocessing import MaxAbsScaler

    # features only - the target must not be scaled
    features = df.drop(columns=[target])

    # learn the parameters on the features and apply them to the same data
    rescaled = MaxAbsScaler().fit(features).transform(features)

    # the scaler returns a NumPy array; restore the labels
    result = pd.DataFrame(rescaled, columns=features.columns, index=features.index)

    # re-attach the target
    result[target] = df[target]
    return result

In [None]:
# 5. Robust scaling: x_scaled = (x - x_median) / (x.quantile(0.75) - x.quantile(0.25))
def scaler_Robust(df):
    """
    Robust-scale every column of df except the target column.

    Reads the module-level name `target`. Returns a new dataframe with the
    scaled features (same index and column names) and the unscaled target
    appended as the last column.
    """
    from sklearn.preprocessing import RobustScaler

    # features only - the target must not be scaled
    features = df.drop(columns=[target])

    # learn the parameters on the features and apply them to the same data
    rescaled = RobustScaler().fit(features).transform(features)

    # the scaler returns a NumPy array; restore the labels
    result = pd.DataFrame(rescaled, columns=features.columns, index=features.index)

    # re-attach the target
    result[target] = df[target]
    return result

In [None]:
# Scale all feature columns with the robust scaler; the target column is
# carried over unscaled.
df_scaled = scaler_Robust(df)

### 5. Handling Class Imbalance

1. Under Sampling
* refer notebook example [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-01-Random-Undersampling.ipynb)
* documentation [here]

In [2]:
# imblearn random under sampling
# params: strategy: 'majority'(it specifies to undersample majority class to have 1:1 ratio)
#         strategy: 0.5 (resultant ratio will be 1:0.5)

def apply_random_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's RandomUnderSampler.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = RandomUnderSampler(
        sampling_strategy=strategy,
        random_state=0,    # for reproducibility
        replacement=True,  # resample with replacement
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

2. Condensed Nearest Neighbour
* refer example notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-02-Condensed-Nearest-Neighbours.ipynb)
* refer doc here

In [None]:
def apply_cnn_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's CondensedNearestNeighbour.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = CondensedNearestNeighbour(
        sampling_strategy=strategy,
        random_state=0,  # for reproducibility
        n_neighbors=1,
        n_jobs=4,        # number of parallel jobs
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

3. Tomek Links
* refer notebook example [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-03-Tomek-Links.ipynb)
* refer doc here

In [None]:
def apply_tometlink_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's TomekLinks.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = TomekLinks(
        sampling_strategy=strategy,
        n_jobs=4,  # number of parallel jobs
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

4. One Sided Selection
* refer notebook example [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-04-One-Sised-Selection.ipynb)
* refer doc here.

In [7]:
def apply_onesidedselection_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's OneSidedSelection.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = OneSidedSelection(
        sampling_strategy=strategy,
        random_state=0,  # for reproducibility
        n_neighbors=1,
        n_jobs=4,        # number of parallel jobs
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

5. EditedNearestNeighbours
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-05-Edited-Nearest-Neighbours.ipynb)
* refer doc [here]

In [8]:
def apply_editednearestneighbour_undersampling(df, strategy='majority'):
    """
    Under-sample df with imblearn's EditedNearestNeighbours.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`. It used to be
        ignored in favour of a hard-coded 'auto'.
        NOTE(review): 'majority' and 'auto' should select the same class for a
        binary target but can differ for multi-class targets - confirm against
        the imbalanced-learn documentation.

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # configure the under-sampler
    enn = EditedNearestNeighbours(
        sampling_strategy=strategy,  # honour the caller's choice
        n_neighbors=3,
        kind_sel='all',            # all neighbours need to have the same label as the observation examined
        n_jobs=4                   # number of parallel jobs
    )

    # split features and label, then fit and apply the transform
    x_train = df.drop(columns=[target])
    y_train = df[target]
    x_train_resampled, y_train_resampled = enn.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

6. Repeated Edited Nearest Neighbours
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-05-Edited-Nearest-Neighbours.ipynb)
* refer doc [here]

In [10]:
def apply_repeated_enn_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's RepeatedEditedNearestNeighbours.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = RepeatedEditedNearestNeighbours(
        sampling_strategy=strategy,
        n_neighbors=3,   # 3 nearest neighbours
        kind_sel='all',  # all neighbouring observations should show the same class
        n_jobs=4,        # number of parallel jobs
        max_iter=100,    # maximum number of iterations
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

7. All K Nearest Neighbours
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-07-All-KNN.ipynb)
* refer doc [here]

In [12]:
def apply_allknn_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's AllKNN.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = AllKNN(
        sampling_strategy=strategy,
        n_neighbors=3,
        kind_sel='all',  # all neighbours need to have the same label as the observation examined
        n_jobs=4,        # number of parallel jobs
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

8. Neighbourhood Cleaning Rule
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-08-Neighbourhood-Cleaning-Rule.ipynb)
* refer doc [here]

In [14]:
def apply_neighbourhood_cleaning_rule_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's NeighbourhoodCleaningRule.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = NeighbourhoodCleaningRule(
        sampling_strategy=strategy,
        n_neighbors=3,           # 3 nearest neighbours
        kind_sel='all',          # all neighbouring observations should show the same class
        n_jobs=4,                # number of parallel jobs
        threshold_cleaning=0.5,  # threshold used to decide whether observations are excluded
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

9. NearMiss [v1|v2|v3]
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-09-NearMiss.ipynb)
* refer doc [here]

In [15]:
def apply_nearmiss_v1_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's NearMiss, version 1.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = NearMiss(
        sampling_strategy=strategy,
        version=1,
        n_neighbors=3,
        n_jobs=4,  # number of parallel jobs
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

In [18]:
def apply_nearmiss_v2_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's NearMiss, version 2.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = NearMiss(
        sampling_strategy=strategy,
        version=2,
        n_neighbors=3,
        n_jobs=4,  # number of parallel jobs
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

In [19]:
def apply_nearmiss_v3_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's NearMiss, version 3.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the under-sampler
    sampler = NearMiss(
        sampling_strategy=strategy,
        version=3,
        n_neighbors=3,
        n_jobs=4,  # number of parallel jobs
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

10. InstanceHardnessThreshold
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-10-Instance-Hardness-Class.ipynb)
* refer doc [here]

In [20]:
def apply_instance_hardness_sampling_undersampling(df, strategy='auto'):
    """
    Under-sample df with imblearn's InstanceHardnessThreshold.

    A random forest (100 trees, random_state=0) is used as the estimator.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`. It used to be
        ignored in favour of a hard-coded 'auto'; the default is still 'auto',
        so calls without the argument behave as before.

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # configure the under-sampler
    iht = InstanceHardnessThreshold(
        # TODO - review if we need to pass classifier as a parameter
        # select a classifier, in this case Random Forests
        estimator=RandomForestClassifier(n_estimators=100, random_state=0),
        sampling_strategy=strategy,  # honour the caller's choice
        random_state=0,
        n_jobs=4,                  # number of parallel jobs
        cv=3                       # cross validation fold
    )

    # split features and label, then fit and apply the transform
    x_train = df.drop(columns=[target])
    y_train = df[target]
    x_train_resampled, y_train_resampled = iht.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

<!-- #### COMPARISON
![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png) -->

In [None]:
# COMPARISON
# ![image.png](attachment:image.png)
# ![image-2.png](attachment:image-2.png)

#### OVER SAMPLING

1. Random Over Sampler [Notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-01-Random-Oversampling.ipynb), [Document]()


In [24]:
def apply_random_oversampling(df, strategy='auto'):
    """
    Over-sample df with imblearn's RandomOverSampler.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the over-sampler
    sampler = RandomOverSampler(
        sampling_strategy=strategy,
        random_state=0,  # for reproducibility
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

2. SMOTE [Notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-02-SMOTE.ipynb) [Document]()

In [26]:
def apply_smote_oversampling(df, strategy='auto'):
    """
    Over-sample df with imblearn's SMOTE.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    sm = SMOTE(
        random_state=42,
        sampling_strategy=strategy,
        k_neighbors=5,  # was misspelled 'k_neighbours', which SMOTE rejects
        # NOTE(review): recent imbalanced-learn releases deprecate n_jobs on
        # SMOTE - confirm against the installed version
        n_jobs=4
    )

    # split features and label
    x_train = df.drop(columns=[target])
    y_train = df[target]

    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)

    # merging y to x
    x_train_sm[target] = y_train_sm
    return x_train_sm

3. SMOTE Nominal Continuous (SMOTE-NC) for categorical data [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-03-SMOTE-NC.ipynb), [Document]()

In [24]:
def apply_smotenc_oversampling(df, strategy='auto', categorical_features=None):
    """
    Over-sample df with imblearn's SMOTENC (SMOTE for nominal and continuous features).

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`
    categorical_features: indices of the categorical columns, counted in the
        feature frame (df without the target); defaults to [2, 3], the value
        that used to be hard-coded

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # imported here because SMOTENC is not among the notebook-level imports
    from imblearn.over_sampling import SMOTENC

    if categorical_features is None:
        categorical_features = [2, 3]

    # configure the over-sampler
    smnc = SMOTENC(
        sampling_strategy=strategy,
        random_state=0,            # for reproducibility
        k_neighbors=5,
        n_jobs=4,
        categorical_features=categorical_features
    )

    # split features and label
    x_train = df.drop(columns=[target])
    y_train = df[target]

    # fit and apply the transform
    x_train_resampled, y_train_resampled = smnc.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

4. ADASYN [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-04-ADASYN.ipynb), [Document]()

In [27]:
def apply_adasyn_oversampling(df, strategy='auto'):
    """
    Over-sample df with imblearn's ADASYN.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the over-sampler
    sampler = ADASYN(
        sampling_strategy=strategy,
        random_state=0,  # for reproducibility
        n_neighbors=5,
        n_jobs=4,
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

5. BORDERLINE SMOTE [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-05-Borderline-SMOTE.ipynb), [Document]()

In [27]:
def apply_borderline1_oversampling(df, strategy='auto'):
    """
    Over-sample df with imblearn's BorderlineSMOTE, kind 'borderline-1'.

    df: dataframe that contains the features and the module-level `target` column
    strategy: forwarded to the sampler as `sampling_strategy`

    returns the resampled features with the resampled target appended as the
    last column.
    """
    # split features and label
    features = df.drop(columns=[target])
    labels = df[target]

    # configure the over-sampler
    sampler = BorderlineSMOTE(
        sampling_strategy=strategy,
        random_state=0,  # for reproducibility
        k_neighbors=5,
        m_neighbors=10,
        kind='borderline-1',
        n_jobs=4,
    )
    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # merge the label back into the features
    features_resampled[target] = labels_resampled
    return features_resampled

In [28]:
def apply_borderline2_oversampling(df, strategy='auto'):
    """Oversample ``df`` with Borderline-SMOTE (``kind='borderline-2'``).

    The notebook-level ``target`` variable names the label column: it is
    split off before resampling and re-attached to the resampled features.

    Parameters
    ----------
    df : frame holding the feature columns plus the ``target`` column.
    strategy : forwarded to BorderlineSMOTE as ``sampling_strategy``.

    Returns
    -------
    The resampled features with the resampled ``target`` column appended.
    """
    # split predictors from the label column
    features = df.drop(columns=[target])
    labels = df[target]

    # fixed seed so repeated runs generate the same synthetic rows
    sampler = BorderlineSMOTE(
        kind='borderline-2',
        sampling_strategy=strategy,
        random_state=0,
        k_neighbors=5,
        m_neighbors=10,
        n_jobs=4,
    )
    features_res, labels_res = sampler.fit_resample(features, labels)

    # hand back a single frame: features plus labels
    features_res[target] = labels_res
    return features_res

6. SVM SMOTE [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-06-SVM-SMOTE.ipynb), [Document]()

In [29]:
def apply_svmsmote_oversampling(df, strategy='auto'):
    """Oversample ``df`` with SVM-SMOTE using a linear-kernel SVC.

    The notebook-level ``target`` variable names the label column: it is
    split off before resampling and re-attached to the resampled features.

    Parameters
    ----------
    df : frame holding the feature columns plus the ``target`` column.
    strategy : forwarded to SVMSMOTE as ``sampling_strategy``.

    Returns
    -------
    The resampled features with the resampled ``target`` column appended.
    """
    # The notebook's import cell never binds a bare ``svm`` name, so the
    # estimator class is imported here to avoid a NameError.
    from sklearn.svm import SVC

    # define oversampling strategy
    sm = SVMSMOTE(
        sampling_strategy=strategy,
        random_state=0,              # for reproducibility
        k_neighbors=5,
        m_neighbors=10,
        n_jobs=4,
        svm_estimator=SVC(kernel='linear'),
    )

    # separate features and label
    x_train = df.drop(columns=[target])
    y_train = df[target]

    # fit and apply the transform
    x_train_resampled, y_train_resampled = sm.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

7. K-Means SMOTE [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-07-K-Means-SMOTE.ipynb), [Document]()

In [30]:
def apply_kmeanssmote_oversampling(df, strategy='auto'):
    """Oversample ``df`` with K-Means SMOTE (3 clusters, 2 neighbours).

    The notebook-level ``target`` variable names the label column: it is
    split off before resampling and re-attached to the resampled features.

    Parameters
    ----------
    df : frame holding the feature columns plus the ``target`` column.
    strategy : forwarded to KMeansSMOTE as ``sampling_strategy``.

    Returns
    -------
    The resampled features with the resampled ``target`` column appended.
    """
    # Neither name appears in the notebook's import cell, so both are
    # imported here to avoid a NameError.
    from imblearn.over_sampling import KMeansSMOTE
    from sklearn.cluster import KMeans

    # define oversampling strategy
    sm = KMeansSMOTE(
        sampling_strategy=strategy,
        random_state=0,              # for reproducibility
        k_neighbors=2,
        n_jobs=None,
        kmeans_estimator=KMeans(n_clusters=3, random_state=0),
        cluster_balance_threshold=0.1,
        density_exponent='auto',
    )

    # separate features and label
    x_train = df.drop(columns=[target])
    y_train = df[target]

    # fit and apply the transform
    x_train_resampled, y_train_resampled = sm.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

#### COMBINATION OF UNDER AND OVER SAMPLING

1. SMOTE + ENN [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-06-Over-and-Undersampling/06-01-SMOTEENN-and-SMOTETomek.ipynb), [Document]()

In [30]:
def apply_sm_enn_sampling(df, strategy='auto'):
    """Resample ``df`` with SMOTE followed by Edited Nearest Neighbours.

    The notebook-level ``target`` variable names the label column: it is
    split off before resampling and re-attached to the resampled features.

    Parameters
    ----------
    df : frame holding the feature columns plus the ``target`` column.
    strategy : forwarded to SMOTE as ``sampling_strategy``. It is also
        forwarded to the ENN cleaner when it is a value a cleaning sampler
        accepts; otherwise ENN uses ``'auto'``.

    Returns
    -------
    The resampled features with the resampled ``target`` column appended.
    """
    import numbers

    # define oversampling strategy
    sm = SMOTE(
        sampling_strategy=strategy,
        random_state=0,  # for reproducibility
        k_neighbors=5,
        n_jobs=4,
    )

    # ENN is a cleaning sampler: it rejects ratios (floats), dicts and
    # 'minority', all of which are valid for SMOTE. Fall back to ENN's
    # default for those so one `strategy` can drive both steps.
    unsupported_by_enn = (
        isinstance(strategy, (numbers.Real, dict))
        or (isinstance(strategy, str) and strategy == 'minority')
    )
    enn_strategy = 'auto' if unsupported_by_enn else strategy

    # define under sampling strategy
    # need ENN as argument of SMOTEENN
    enn = EditedNearestNeighbours(
        sampling_strategy=enn_strategy,
        n_neighbors=3,
        kind_sel='all',
        n_jobs=4,
    )

    smenn = SMOTEENN(
        sampling_strategy='auto',
        random_state=0,  # for reproducibility
        smote=sm,
        enn=enn,
        n_jobs=4,
    )

    # separate features and label
    x_train = df.drop(columns=[target])
    y_train = df[target]

    # fit and apply the transform
    x_train_resampled, y_train_resampled = smenn.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

2. SMOTE + Tomek [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-06-Over-and-Undersampling/06-01-SMOTEENN-and-SMOTETomek.ipynb), [Document]()

In [32]:
def apply_sm_tomek_sampling(df, strategy='auto'):
    """Resample ``df`` with SMOTE followed by Tomek-link removal.

    The notebook-level ``target`` variable names the label column: it is
    split off before resampling and re-attached to the resampled features.

    Parameters
    ----------
    df : frame holding the feature columns plus the ``target`` column.
    strategy : forwarded to the SMOTE step as ``sampling_strategy``.

    Returns
    -------
    The resampled features with the resampled ``target`` column appended.
    """
    # split predictors from the label column
    features = df.drop(columns=[target])
    labels = df[target]

    # over-sampling step
    smote_step = SMOTE(
        sampling_strategy=strategy,
        random_state=0,
        k_neighbors=5,
        n_jobs=4,
    )

    # cleaning step handed to SMOTETomek
    tomek_step = TomekLinks(sampling_strategy='all', n_jobs=4)

    combined = SMOTETomek(
        smote=smote_step,
        tomek=tomek_step,
        sampling_strategy='auto',
        random_state=0,
        n_jobs=4,
    )
    features_res, labels_res = combined.fit_resample(features, labels)

    # hand back a single frame: features plus labels
    features_res[target] = labels_res
    return features_res

#### ENSEMBLE IMBALANCED LEARNING TECHNIQUE
* *TODO: imblearn comes with model + sampling techniques; review whether to move this to a more appropriate place*, as these techniques are different from plain data sampling methods.

In [34]:
# just re-sampling methods (no classifier)

# Lookup of plain samplers by short name: one under-sampler, one over-sampler.
resampling_dict = dict(
    random=RandomUnderSampler(
        sampling_strategy='auto',
        replacement=False,
        random_state=0,
    ),
    smote=SMOTE(
        sampling_strategy='auto',
        k_neighbors=5,
        random_state=0,
        n_jobs=4,
    ),
)

In [39]:
# ensemble methods (with or without resampling)

# Lookup of ensemble classifiers by short name; all share seed 2909 and
# use 20 estimators.
ensemble_dict = dict(
    # random forest with per-tree balancing
    balancedRF=BalancedRandomForestClassifier(
        n_estimators=20,
        criterion='gini',
        max_depth=3,
        sampling_strategy='auto',
        random_state=2909,
        n_jobs=4,
    ),
    # plain bagging of logistic regressions (no resampling)
    bagging=BaggingClassifier(
        base_estimator=LogisticRegression(random_state=2909),
        n_estimators=20,
        random_state=2909,
        n_jobs=4,
    ),
    # bagging of logistic regressions with resampling
    balancedbagging=BalancedBaggingClassifier(
        base_estimator=LogisticRegression(random_state=2909),
        n_estimators=20,
        max_samples=1.0,   # samples drawn from X per base estimator
        max_features=1.0,  # features drawn from X per base estimator
        bootstrap=True,
        bootstrap_features=False,
        sampling_strategy='auto',
        random_state=2909,
        n_jobs=4,
    ),
    # boosting combined with under-sampling
    rusboost=RUSBoostClassifier(
        base_estimator=None,
        n_estimators=20,
        learning_rate=1.0,
        sampling_strategy='auto',
        random_state=2909,
    ),
    # bagging + boosting + under-sampling
    easyEnsemble=EasyEnsembleClassifier(
        n_estimators=20,
        sampling_strategy='auto',
        random_state=2909,
        n_jobs=4,
    ),
)

In [41]:
## function to train an ensemble classifier and evaluate the performance

# ensemble = ensemble_dict['choose_ensemble_technique']

def run_ensemble(ensemble, X_train, X_test, y_train, y_test):
    """Fit ``ensemble`` and report ROC-AUC on the train and test sets.

    Parameters
    ----------
    ensemble : estimator exposing ``fit`` and ``predict_proba``.
    X_train, y_train : data the estimator is fitted on.
    X_test, y_test : held-out data used for the returned score.

    Returns
    -------
    The ROC-AUC on the test set, computed from the second column of
    ``predict_proba``.
    """
    # roc_auc_score is not among the notebook's top-level imports, so it is
    # imported here to avoid a NameError.
    from sklearn.metrics import roc_auc_score

    ensemble.fit(X_train, y_train)

    print('Train set')
    train_proba = ensemble.predict_proba(X_train)
    train_auc = roc_auc_score(y_train, train_proba[:, 1])
    print('ensembleBoost roc-auc: {}'.format(train_auc))

    print('Test set')
    test_proba = ensemble.predict_proba(X_test)
    # computed once and reused for both the report and the return value
    test_auc = roc_auc_score(y_test, test_proba[:, 1])
    print('ensembleBoost roc-auc: {}'.format(test_auc))

    return test_auc

#### Cost Sensitive learning approaches
* **Misclassification cost as part of learning**
    1. Defining `class_weight` when we instantiate the estimator, for those estimators that allow it. It can take the values `None`, `'balanced'`, or a dict such as `{0: 1, 1: 10}` (misclassification of class 1 will be penalized 10 times more than class 0).
    2. Passing the sample_weight vector with the weights for every single observation, when we *fit the estimator*. Sample weight is the vector of the same length as y, containing the weight or penalty for each individual observation. It's more flexible as it allows us to set weight to the observation and not the classes.
    NOTE: the costs such as 'class_weight' can be optimized using the hyperparameter optimization techniques.
    
* **MetaCost learning** - This is a recent method and most likely has not been introduced in the popular libraries. The idea is to use the conditional risk of misclassifying the observations using Bayes conditional probabilities. For an example, refer to the [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-08-Cost-Sensitive-Learning/08-03-MetaCost.ipynb) and the video in the Udemy course.

#### Probability Calibration
* Refer to the slides [here](https://amueller.github.io/COMS4995-s20/slides/aml-10-calibration-imbalanced-data/#53) for understanding the topic
* Refer [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/tree/master/Section-09-Probability-Calibration) here for example for calibrated classifiers.

### 6. Dimensionality Reduction

In [None]:
def fit_PCA(x_train):
    """Plot total explained variance against the number of PCA components.

    A separate PCA is fitted for every component count from 2 up to (but
    not including) the number of columns in ``x_train``; the summed
    explained-variance ratio of each fit is then drawn as a line plot.

    Parameters
    ----------
    x_train : frame of features; must expose ``.columns``.
    """
    component_counts = range(2, len(x_train.columns))

    # one fit per candidate component count
    total_variance = [
        np.sum(PCA(n_components=k).fit(x_train).explained_variance_ratio_)
        for k in component_counts
    ]

    plt.figure()
    plt.plot(component_counts, total_variance)
    plt.title('PCA components vs explained variance')
    plt.xlabel('num of components')
    plt.ylabel('explained variance total')
    plt.show()

In [None]:
def fit_transform_PCA(x_train, n_components):
    """Fit a PCA on ``x_train`` and return the projection as a DataFrame.

    Parameters
    ----------
    x_train : data to fit the PCA on and to project.
    n_components : forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    A new ``pd.DataFrame`` built from the output of ``fit_transform``.
    """
    projected = PCA(n_components=n_components).fit_transform(x_train)
    return pd.DataFrame(projected)

### Save transformed dataset

In [None]:
# Preview the first rows of df_scaled.
# NOTE(review): df_scaled is not defined in this section — presumably the
# scaled frame built earlier in the notebook; confirm.
df_scaled.head()

In [None]:
# Write df_scaled to an Excel file in the parent directory.
df_scaled.to_excel('../Bank_Personal_Loan_Modelling_transformed.xlsx')

In [None]:
# imblearn random under sampling
# params: strategy: 'majority'(it specifies to undersample majority class to have 1:1 ratio)
#         strategy: 0.5 (resultant ratio will be 1:0.5)

def apply_undersampling(df, strategy='majority'):
    """Randomly under-sample ``df`` and return the resampled frame.

    The notebook-level ``target`` variable names the label column: it is
    split off before resampling and re-attached to the resampled features.

    Parameters
    ----------
    df : frame holding the feature columns plus the ``target`` column.
    strategy : forwarded to RandomUnderSampler as ``sampling_strategy``.

    Returns
    -------
    The resampled features with the resampled ``target`` column appended.
    """
    # split predictors from the label column
    features = df.drop(columns=[target])
    labels = df[target]

    # no seed is set, so the rows kept can differ between runs
    sampler = RandomUnderSampler(sampling_strategy=strategy)
    features_res, labels_res = sampler.fit_resample(features, labels)

    # hand back a single frame: features plus labels
    features_res[target] = labels_res
    return features_res