In [2]:
# Importing libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, classification_report, recall_score, confusion_matrix, make_scorer, fbeta_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
%matplotlib inline


# Notebook metadata: author name and contact e-mail.
__author__ = "sreetam dev"
__email__  = "sreetamkumardev@gmail.com"

In [103]:
def loading_files_df(file):
    '''Read a semicolon-delimited CSV into a DataFrame.

    Parameters
    ----------
    file : path or file-like object
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, split on ``";"``.
    '''
    return pd.read_csv(file, sep=";")

def removing_instances(feature, df):
    '''Drop duplicate rows, then drop the given column(s).

    Duplicates are detected on the full row, i.e. while ``feature`` is
    still present. The input frame is not modified.

    Parameters
    ----------
    feature : label or list of labels
        Column(s) to remove after de-duplication.
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame without duplicate rows and without ``feature``.
    '''
    deduplicated = df.drop_duplicates()
    return deduplicated.drop(columns=feature)

def grouping_basic_education(feature,df):
    '''Collapse the three "basic.*" education levels into a single "Basic" label.

    The values 'basic.9y', 'basic.6y' and 'basic.4y' in column ``feature``
    are replaced by 'Basic'; every other value is left as is. The column is
    overwritten on the frame that was passed in, and that same frame is
    returned.
    '''
    basic_levels = ['basic.9y', 'basic.6y', 'basic.4y']
    is_basic     = df[feature].isin(basic_levels)
    df[feature]  = np.where(is_basic, 'Basic', df[feature])
    return df

def mapping_target(feature,df):
    '''Encode the target as 0/1 and one-hot encode the remaining object columns.

    Parameters
    ----------
    feature : str
        Name of the target column holding 'yes' / 'no' labels.
    df : pandas.DataFrame
        Input frame. It is NOT modified: all work happens on a copy, so the
        function can be called again on the same frame.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``feature`` is mapped {'yes': 1, 'no': 0}
        (any other label becomes NaN) and every object-dtype column is
        replaced by its ``pd.get_dummies`` indicator columns.
    '''
    # Copy first: mapping the caller's column in place would turn a second
    # call on the same frame into an all-NaN target.
    df_bank_processed = df.copy()
    df_bank_processed[feature] = df_bank_processed[feature].map({'yes':1, 'no':0})

    # The object columns are selected after the target is mapped, so the
    # (now numeric) target is not one-hot encoded. select_dtypes picks the
    # same columns describe(include=['O']) reports, without computing any
    # summary statistics.
    df_cat_feat = df_bank_processed.select_dtypes(include = ['O']).columns
    return pd.get_dummies(data = df_bank_processed, columns = df_cat_feat)

def fixing_skewness(feature,df):
    '''Replace a column by log(value + 1).

    Call once per column that needs the transform. The column is
    overwritten on the frame that was passed in, and that same frame is
    returned.
    '''
    shifted     = df[feature] + 1
    df[feature] = np.log(shifted)
    return df

def scaling_features(df):
    '''Min-max scale every numeric column of the frame.

    A fresh ``MinMaxScaler`` is fit on all numeric columns and the scaled
    values are written back onto the frame that was passed in; that same
    frame is returned.

    NOTE(review): the scaler is fit on every numeric column it finds, which
    includes a numeric target column if the frame has one, and on all rows
    of the frame. If the frame is split into train and test sets afterwards,
    the test rows have contributed to the fitted minima and maxima.
    '''
    numeric_columns     = df.select_dtypes(include = [np.number]).columns
    df[numeric_columns] = MinMaxScaler().fit_transform(df[numeric_columns])
    return df

def extracting_feature(target,df):
    '''Return the feature matrix X: every column except the target.

    Parameters
    ----------
    target : str
        Name of the target column to leave out.
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    numpy.ndarray
        ``.values`` of the frame without the ``target`` column.
    '''
    # `columns=` instead of a positional axis: pandas 2.x made every drop()
    # argument except `labels` keyword-only, so drop([target], 1) raises
    # TypeError there.
    X = df.drop(columns = [target]).values
    return X

def extracting_target(target,df):
    '''Return the target vector y: the values of column ``target``.'''
    target_column = df[target]
    return target_column.values

def split_features_target(X,y):
    '''Split X and y into train and test parts.

    Uses ``train_test_split`` with 20% of the rows held out, a fixed seed
    (42) and stratification on ``y``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` in that order.
    '''
    split_options = dict(test_size=0.2, random_state=42, stratify=y)
    return tuple(train_test_split(X, y, **split_options))
    
def sampling_feature_target(target,X_train, y_train,df, random_state=42):
    '''Resample the training set with SMOTETomek and wrap the result in DataFrames.

    Parameters
    ----------
    target : str
        Name of the target column in ``df``.
    X_train, y_train : array-like
        Training features and labels to resample.
    df : pandas.DataFrame
        Frame the features were taken from; only its column labels are
        used, to name the columns of the resampled feature frame.
    random_state : int or None, default 42
        Seed passed to ``SMOTETomek`` so the resampling is reproducible.
        Pass ``None`` for an unseeded run.

    Returns
    -------
    tuple
        ``(X_smt, y_smt, df_X_smt, df_y_smt)``: the resampled arrays and the
        same data as DataFrames (features named after ``df``'s non-target
        columns, labels in a single column named ``target``).
    '''
    # Only the labels are needed, so drop from the column Index instead of
    # copying the frame. This also avoids the positional-axis form
    # drop([target], 1), which pandas 2.x rejects.
    X_columns    = df.columns.drop([target])
    smote_tomek  = SMOTETomek(sampling_strategy = 'auto', random_state = random_state)
    # fit_resample is the supported name; fit_sample has been removed from
    # imbalanced-learn.
    X_smt, y_smt = smote_tomek.fit_resample(X_train, y_train)
    df_X_smt     = pd.DataFrame(data = X_smt, columns = X_columns)
    df_y_smt     = pd.DataFrame(data = y_smt, columns = [target])
    return X_smt, y_smt, df_X_smt, df_y_smt

def inspecting_model(X_smt,y_smt,X_t,model):
    '''Cross-validate a model on the 20 features chosen by RFE.

    RFE (driven by ``model``) is fit on the resampled training data to keep
    20 features; ``model`` is then scored with 5-fold stratified
    cross-validation (accuracy) on that reduced matrix. The model's repr is
    printed as a side effect.

    Parameters
    ----------
    X_smt, y_smt : array-like
        Resampled training features and labels.
    X_t : array-like
        Test features. Kept for interface compatibility; not used here.
    model : estimator
        Scikit-learn style classifier usable by ``RFE``.

    Returns
    -------
    tuple
        ``(mean_accuracy, std_accuracy)`` over the 5 folds.

    NOTE(review): the feature selection is fit on all of ``X_smt`` before
    the folds are formed, so each validation fold has already influenced
    which features are kept; the scores are likely optimistic.
    '''
    rfe       = RFE(model,n_features_to_select = 20)
    X_smt_rfe = rfe.fit_transform(X_smt, y_smt)

    # random_state only has an effect when shuffle=True; current scikit-learn
    # raises ValueError for random_state together with shuffle=False.
    no_stratified_folds  = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 1)
    crossval_score_model = cross_val_score(model, X_smt_rfe, y_smt, scoring = 'accuracy',
                                           cv = no_stratified_folds, n_jobs = 1, error_score = 'raise')
    print(model)

    Accuracy_Model           = np.mean(crossval_score_model)
    Standard_Deviation_Model = np.std(crossval_score_model)
    return  Accuracy_Model, Standard_Deviation_Model

def initiating_final_model(X_smt,y_smt,X_t,y_t):
    '''Fit the final random forest on the 5 RFE-selected features and evaluate it.

    Parameters
    ----------
    X_smt, y_smt : array-like
        Resampled training features and labels.
    X_t, y_t : array-like
        Test features and labels.

    Returns
    -------
    tuple
        ``(model_rfc, rfe, X_test_rfe, y_pred, conf_matrix_rfe, classi_repo)``:
        the fitted forest, the fitted RFE selector, the test features
        reduced to the selected columns, the test predictions, their
        confusion matrix and the text classification report.
    '''
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is no longer
    # accepted by current scikit-learn. The seed makes the selected features
    # and the fitted forest reproducible between runs.
    model_rfc       = RandomForestClassifier(n_estimators= 200, max_features= 'sqrt', max_depth= 20,
                                             criterion= 'gini', random_state= 42)
    rfe             = RFE(model_rfc,n_features_to_select = 5)
    X_smt_rfe       = rfe.fit_transform(X_smt, y_smt)
    X_test_rfe      = rfe.transform(X_t)
    # RFE works on clones of the estimator, so model_rfc itself still has to
    # be fit on the reduced matrix before it can predict.
    model_rfc.fit(X_smt_rfe,y_smt)
    y_pred          = model_rfc.predict(X_test_rfe)
    conf_matrix_rfe = confusion_matrix(y_t,y_pred)
    classi_repo     = classification_report(y_t, y_pred)
    return model_rfc, rfe, X_test_rfe, y_pred, conf_matrix_rfe,classi_repo

def metrics_model(y_t, y_pred):
    '''Confusion-matrix cells plus accuracy, recall and precision for a set of predictions.

    Returns
    -------
    tuple
        ``(TP, FN, FP, TN, confusion_matrix, accuracy, recall, precision)``.
        The four counts are read from the confusion matrix at positions
        [1,1], [1,0], [0,1] and [0,0]; the three scores are rounded to two
        decimals.
    '''
    matrix = confusion_matrix(y_t, y_pred)
    true_pos, false_neg = matrix[1, 1], matrix[1, 0]
    false_pos, true_neg = matrix[0, 1], matrix[0, 0]

    accuracy  = round(metrics.accuracy_score(y_t, y_pred), 2)
    recall    = round(metrics.recall_score(y_t, y_pred), 2)
    precision = round(metrics.precision_score(y_t, y_pred), 2)

    return true_pos, false_neg, false_pos, true_neg, matrix, accuracy, recall, precision

def model_threshold(model, X_test_rfe,y_t,y_pred):
    '''altering threshold of prediction probabilities'''
    y_pred_prob           = model.predict_proba(X_test_rfe)[:,1]
    y_pred_threshold      = np.where(y_pred_prob< 0.45, 0 , 1)
    thres_conf_matrix_rfe = confusion_matrix(y_t,y_pred)
    
    TP_thres = thres_conf_matrix_rfe[1,1]
    FN_thres = thres_conf_matrix_rfe[1,0]
    FP_thres = thres_conf_matrix_rfe[0,1]
    TN_thres = thres_conf_matrix_rfe[0,0]
    

    thres_accuracy_score  = round(metrics.accuracy_score(y_t, y_pred),2)
    thres_recall_score    = round(metrics.recall_score(y_t, y_pred),2)
    thres_precision_score = round(metrics.precision_score(y_t, y_pred),2)
    
    
    return TP_thres, FN_thres, FP_thres, TN_thres, thres_conf_matrix_rfe,thres_accuracy_score, thres_recall_score, thres_precision_score

def selected_features(rfe, df_X_smt):
    '''Return the column labels that the fitted RFE selector kept.

    Parameters
    ----------
    rfe : fitted selector exposing a boolean ``support_`` mask.
    df_X_smt : pandas.DataFrame
        Frame whose columns line up, in order, with ``rfe.support_``.

    Returns
    -------
    pandas.Index
        Labels of the columns whose mask entry is True.
    '''
    support_by_column = pd.Series(rfe.support_, index = df_X_smt.columns)
    kept              = support_by_column.eq(True)
    return support_by_column.loc[kept].index


    

In [None]:


# 1. Load the dataset into a DataFrame.
df_bank = loading_files_df("bank-additional-full.csv")

# 2. Remove duplicate rows and the duration column.
df_bank = removing_instances('duration', df_bank)

# 3. Group the basic education levels into one label.
df_bank = grouping_basic_education("education",df_bank)

# 4. Map the target column to binary values and one-hot encode the categorical columns.
df_bank_processed = mapping_target('y',df_bank)

# 5. Reduce the skewness of selected features with a log transform.
df_bank_processed = fixing_skewness('age',df_bank_processed)
df_bank_processed = fixing_skewness('campaign',df_bank_processed)

# 6. Scale the numerical features.
df_bank_processed = scaling_features(df_bank_processed)

# 7. Store the features as X.
X = extracting_feature('y',df_bank_processed)

# 8. Store the target as y.
y = extracting_target('y',df_bank_processed)

# 9. Train and test instances. Note: keep the order of the returned variables.
X_train, X_test, y_train, y_test = split_features_target(X,y)

# 10. Resampled training instances and their DataFrames.
X_smt, y_smt, df_X_smt, df_y_smt = sampling_feature_target('y',X_train, y_train,df_bank_processed)

# 11. Candidate models, cross-validated on the resampled training data.
model_lr   = LogisticRegression(random_state = 42, penalty = "l2")
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is no longer accepted by
# current scikit-learn. The seed keeps the comparison reproducible.
model_rfc  = RandomForestClassifier(n_estimators= 200, max_features= 'sqrt', max_depth= 20 , criterion= 'gini', random_state= 42)
list_model = [model_lr, model_rfc]

for model in list_model:
    Accuracy_Model, Standard_Deviation_Model = inspecting_model(X_smt, y_smt,X_test,model)
    print("For the model accuracy is {} and standard devaition is {}".format( Accuracy_Model, Standard_Deviation_Model))


# 12. Fit the final model on the resampled training data and predict on the test set.
#     Note: this rebinds model_rfc to the fitted final model.
model_rfc, rfe, X_test_rfe, y_pred, conf_matrix_rfe,classi_repo = initiating_final_model(X_smt,y_smt,X_test,y_test)

# 13. Confusion matrix, accuracy, precision and recall of the test predictions.
TP, FN, FP, TN, conf_matrix_rfe,accuracy_sc,recall_sc, precision_sc = metrics_model(y_test, y_pred)

# 14. Metrics returned by model_threshold for the final model on the test set.
TP_thres, FN_thres, FP_thres, TN_thres, thres_conf_matrix_rfe,thres_accuracy_score, thres_recall_score, thres_precision_score = model_threshold(model_rfc,X_test_rfe,y_test,y_pred)


# 15. Features the RFE selector kept for prediction.
features_chosen_rfe = selected_features(rfe, df_X_smt)