In [1]:
import pandas as pd
import numpy as np
import imblearn
from sklearn.model_selection import StratifiedKFold

### Helper functions that will be used in the main file

In [2]:
# A function that will handle class imbalance

def handle_classimbalance(data, random_state=None):
    """Oversample the minority class with SMOTE and return the rebalanced frame.

    The last column of ``data`` is treated as the target and every other
    column as a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data with the target in the last column.
        NOTE(review): SMOTE interpolates between samples, so the feature
        columns are presumably expected to be numeric -- confirm with callers.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to SMOTE. ``None`` keeps the previous (unseeded)
        behaviour; pass an int to get the same synthetic samples on every run.

    Returns
    -------
    pandas.DataFrame
        Resampled features and target concatenated column-wise, target last.
    """
    # Split into features (all but the last column) and target (last column)
    features = data.iloc[:, :-1]
    target = data.iloc[:, -1]

    # sampling_strategy=1.0 requests a minority/majority ratio of 1 after resampling
    smote = imblearn.over_sampling.SMOTE(
        sampling_strategy=1.0,
        k_neighbors=5,
        random_state=random_state,
    )
    features_resampled, target_resampled = smote.fit_resample(features, target)

    return pd.concat([features_resampled, target_resampled], axis=1)

In [3]:
# A custom function to remove outliers from the dataset
def removeOutliers(data, col):
    """Return the rows of ``data`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Both comparisons
    are strict, so a value exactly on a fence is removed; as a consequence a
    column with zero IQR yields an empty result.

    Missing values in ``col`` are ignored when the quartiles are computed.
    Rows whose ``col`` value is missing are dropped, because they compare
    False against both fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column to screen for outliers.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (original index labels kept) inside the fences.

    Side effects
    ------------
    Sets the module-level globals ``outlier_free_list`` (the retained values
    of ``col`` as a list) and ``filtered_data`` (the returned frame). They are
    kept only for backward compatibility; prefer the return value.
    """
    global outlier_free_list
    global filtered_data

    values = data[col]

    # Series.quantile skips NaN. np.quantile would return NaN as soon as one
    # value is missing, turning both fences into NaN and dropping every row.
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_range = Q1 - 1.5 * IQR
    upper_range = Q3 + 1.5 * IQR

    # Vectorized mask instead of a Python-level list comprehension + isin
    inside_fences = (values > lower_range) & (values < upper_range)
    filtered_data = data.loc[inside_fences]
    outlier_free_list = filtered_data[col].tolist()
    return filtered_data

In [4]:
# A function to split the original data into training and test sets

def TrainTestSet(data, random_state=None):
    """Split ``data`` into a stratified train/test pair (4 folds / 1 fold).

    The rows are shuffled, assigned to 5 folds stratified on the
    ``Customer_Attrition`` column, written to ``folds.csv`` in the current
    working directory and read back. Folds 0-3 form the training set and
    fold 4 the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset; must contain a ``Customer_Attrition`` column.
        It is not modified.
    random_state : int or None, default None
        Seed for the row shuffle. ``None`` keeps the previous (unseeded)
        behaviour; pass an int to get the same split on every run.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)`` without the helper ``kfold`` column.
        Both come from the CSV read-back, so column dtypes are the ones
        ``pd.read_csv`` infers.

    Side effects
    ------------
    Writes ``folds.csv`` (including the ``kfold`` column) and prints the
    shapes of both resulting frames.
    """
    # Shuffle into a new frame first so the helper column below is added to
    # the copy and the caller's DataFrame is left untouched.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled['kfold'] = -1
    y = shuffled['Customer_Attrition'].values

    # Dividing data into 5 splits using stratified kfold
    kf = StratifiedKFold(n_splits=5)

    # After reset_index the labels equal the positions, so the positional
    # validation indices returned by split() can be used with .loc.
    for f, (t_, v_) in enumerate(kf.split(X=shuffled, y=y)):
        shuffled.loc[v_, 'kfold'] = f

    shuffled.to_csv('folds.csv', index=False)

    # First four folds will be used for training and last fold will be used
    # for making the test set
    df = pd.read_csv('folds.csv')
    is_test_fold = df['kfold'] == 4

    # drop(columns=...) returns a new frame; an in-place drop on a .loc slice
    # triggers SettingWithCopyWarning.
    train_data = df.loc[~is_test_fold].drop(columns='kfold')
    print("Shape of training data is: ",train_data.shape)
    test_data = df.loc[is_test_fold].drop(columns='kfold')
    print("Shape of testing data is: ",test_data.shape)
    return train_data,test_data