# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 30 18:37:16 2017

@author: ssunkara1
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

from matplotlib import pyplot as plt
import warnings
warnings.simplefilter('ignore')

import time
# Wall-clock reference for the "Time Taken" report at the end of the script.
# time.clock() was removed in Python 3.8.  It is still preferred where it
# exists so this reading stays comparable with a time.clock() reading taken
# later; newer interpreters fall back to time.perf_counter().
start_time = time.clock() if hasattr(time, 'clock') else time.perf_counter()

#%%
# `seed` may already exist in this namespace (presumably injected by whatever
# executes the script -- confirm with the caller).  When it does not, default
# to None so NumPy's global RNG is seeded from fresh entropy.
if 'seed' in globals():
    _ = seed
else:
    seed = None

# Everything below that draws from NumPy's global RNG starts from this state.
np.random.seed(seed)

# Root folder that contains the data directory.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_root = 'C:/Users/ssunkara1/AppData/Local/bipy/12007280/notebooks/My Notebooks/team_notebooks/'
# Full labelled data set; the first CSV column becomes the row index.
data_df_total = pd.read_csv(data_root + 'Data/Credit Data/cs-training.csv', index_col=0)
# Name of the target column (used for stratification here and as the label later).
result_column = 'SeriousDlqin2yrs'

# Split the row labels 70/30, stratified on the target column.  No
# random_state is passed, so the shuffle draws from NumPy's global RNG.
train_idx, test_idx = train_test_split(data_df_total.index.values, test_size=0.3,
                                       stratify=data_df_total[result_column]
                                       )
# Shuffled 5-fold stratified splitter; it is not used in the visible part of
# this script.
strat_cv = StratifiedKFold(n_splits=5, shuffle=True)

# Materialise the two partitions by index label.
train_data = data_df_total.loc[train_idx]
test_data = data_df_total.loc[test_idx]

# `reset_seed` follows the same convention as `seed`: use the value already
# present in this namespace, otherwise default to False.
if 'reset_seed' in globals():
    _ = reset_seed
else:
    reset_seed = False

# When requested, re-seed NumPy's global RNG from fresh entropy so that only
# the train/test split above is governed by `seed`.
if reset_seed:
    np.random.seed(None)

#%%
# Behaviour switches read by the cleaning functions and the plotting code below.

# True: missing / implausible MonthlyIncome values are predicted by the income
# regressor; False: they are replaced by the training median.
impute_income = True
# Not referenced in the visible part of this script.
# NOTE(review): presumably toggles model stacking elsewhere -- confirm.
stack_models = True
# True: out-of-range values in the training data are replaced by a per-class
# statistic of the remaining rows; False: by the overall column median.
fill_smart = True
# The next three flags are not referenced in the visible part of this script.
# NOTE(review): meaning cannot be confirmed from here.
fit_expanded = False
group_models = False
rescale_models = False
# True: plot the per-epoch train/test AUC curves after the network is trained.
plot_figure = True

# %%
# Columns that count past-due events; values of 90 or more in them are treated
# as invalid by the cleaning functions in this file.
overdue_cols = ['NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate']
# Revolving-utilisation column; values of 4 or more are treated as invalid.
rev_lines_col = 'RevolvingUtilizationOfUnsecuredLines'

def clean_data_for_prediction(data_frame):
    """Return the rows of ``data_frame`` that are clean enough to train the income model.

    A row is kept only when all of the following hold:

    * ``MonthlyIncome`` is present and at least 100,
    * every column in ``overdue_cols`` is below 90,
    * ``rev_lines_col`` is below 4.

    These are the exact complements of the conditions that ``clean_train_data``
    and ``clean_test_data`` use to flag a value for replacement
    (``< 100``, ``>= 90``, ``>= 4.0``), so a value sitting exactly on a
    threshold is classified the same way everywhere.  ``DebtRatio`` is not
    filtered here.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain ``MonthlyIncome``, the ``overdue_cols`` columns and
        ``rev_lines_col``.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, in their original order.
    """
    # Comparisons against NaN are False, so rows with a missing income (or a
    # missing value in any filtered column) are dropped by the mask as well.
    keep = data_frame['MonthlyIncome'] >= 100.

    for c in overdue_cols:
        keep &= data_frame[c] < 90.

    keep &= data_frame[rev_lines_col] < 4.
    return data_frame[keep].copy()

def train_income_model(data_frame, fill_median=None):
    """Fit a gradient-boosting regressor that predicts ``MonthlyIncome``.

    The model is trained on the rows selected by ``clean_data_for_prediction``.
    ``MonthlyIncome`` is the target; ``DebtRatio`` and the ``result_column``
    label are removed from the predictors.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Raw training data.
    fill_median : pandas.Series or mapping, optional
        Per-column values used to fill missing predictors.  Defaults to
        ``data_frame.median()``.  The function previously read a module-level
        ``data_median`` variable, which had to be defined before the call and
        is rebound later in the script; at the existing call site the default
        yields the same values.

    Returns
    -------
    GradientBoostingRegressor
        The fitted income model.
    """
    if fill_median is None:
        fill_median = data_frame.median()

    col_train_data = clean_data_for_prediction(data_frame)
    col_train_X = col_train_data.drop(['MonthlyIncome', 'DebtRatio', result_column], axis=1)
    col_train_y = col_train_data['MonthlyIncome']
    # Only columns present in the predictors are affected by the fill.
    col_train_X = col_train_X.fillna(fill_median)

    col_model = GradientBoostingRegressor(n_estimators=300, max_depth=7, max_features=4,
                                          learning_rate=0.1,
                                          min_weight_fraction_leaf=0.0001)
    col_model.fit(col_train_X, col_train_y)
    return col_model

#%%:
def clean_train_data(train_df):
    """Repair implausible values in the training frame and record test-time fills.

    Works on a copy of ``train_df`` and performs three repairs, in order:

    1. ``MonthlyIncome`` that is missing or below 100 is replaced -- by the
       prediction of the module-level ``income_model`` when ``impute_income``
       is true, otherwise by the column median -- and the row's ``DebtRatio``
       is divided by the substituted income.
    2. Every value >= 90 in each of ``overdue_cols`` is replaced.  With
       ``fill_smart`` the replacement is the column mean of the unflagged rows
       that share the row's ``result_column`` class; otherwise it is the
       column median.
    3. Every value >= 4.0 in ``rev_lines_col`` is replaced.  With
       ``fill_smart`` the replacement is the per-class median of the unflagged
       rows; otherwise it is the column median.

    Missing values in other columns are left untouched.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Labelled training data (must contain ``result_column``).

    Returns
    -------
    (pandas.DataFrame, dict)
        The repaired copy, and ``fill_values``: one label-independent
        replacement value for ``'MonthlyIncome'``, each overdue column and
        ``rev_lines_col``, for use by ``clean_test_data``.
    """
    train_df = train_df.copy()

    # Medians of the data as received, taken before any repair.
    data_median = train_df.median()
    bad_income = np.logical_or(train_df['MonthlyIncome'].isnull(), train_df['MonthlyIncome'] < 100.)
    nan_income = train_df.index[bad_income]

    # Skip the repair when no row needs it: scikit-learn estimators reject an
    # empty batch in predict(), and the assignments would be no-ops anyway.
    if len(nan_income) > 0:
        if impute_income:
            test_income_data = train_df.loc[nan_income].drop(['MonthlyIncome', 'DebtRatio', result_column], axis=1)
            test_income_data = test_income_data.fillna(data_median)
            income_fill_values = income_model.predict(test_income_data)
        else:
            income_fill_values = data_median['MonthlyIncome']

        train_df.loc[nan_income, 'MonthlyIncome'] = income_fill_values
        train_df.loc[nan_income, 'DebtRatio'] = train_df.loc[nan_income, 'DebtRatio'] / income_fill_values

    fill_values = {}
    fill_values['MonthlyIncome'] = data_median['MonthlyIncome']

    for col in overdue_cols:
        flagged = train_df[col] >= 90
        if fill_smart:
            # Per-class mean of this column over the unflagged rows only.
            class_means = train_df.loc[~flagged].groupby(result_column)[col].mean()
            # Labels are unavailable at test time, so store the unweighted
            # average of the class means as the single fill value.
            fill_values[col] = class_means.mean()
            train_df.loc[flagged, col] = train_df.loc[flagged, result_column].map(lambda label: class_means.loc[label])
        else:
            col_median = train_df[col].median()
            fill_values[col] = col_median
            train_df.loc[flagged, col] = col_median

    ## filling the value for revolving unsecured lines.
    rev_flagged = train_df[rev_lines_col] >= 4.0
    if fill_smart:
        class_medians = train_df.loc[~rev_flagged].groupby(result_column)[rev_lines_col].median()
        fill_values[rev_lines_col] = class_medians.mean()
        train_df.loc[rev_flagged, rev_lines_col] = train_df.loc[rev_flagged, result_column].map(lambda label: class_medians.loc[label])
    else:
        # The recorded median is taken after the replacement, as before.
        train_df.loc[rev_flagged, rev_lines_col] = train_df[rev_lines_col].median()
        fill_values[rev_lines_col] = train_df[rev_lines_col].median()
    return train_df, fill_values


def clean_test_data(test_df, fill_values, fill_values_other):
    """Apply the training-time repairs to held-out data without using its labels.

    Works on a copy of ``test_df``:

    1. ``MonthlyIncome`` that is missing or below 100 is replaced -- by the
       prediction of the module-level ``income_model`` when ``impute_income``
       is true, otherwise by ``fill_values['MonthlyIncome']`` -- and the row's
       ``DebtRatio`` is divided by the substituted income.
    2. Values >= 90 in each of ``overdue_cols`` and values >= 4.0 in
       ``rev_lines_col`` are replaced by the matching ``fill_values`` entry.
    3. Every remaining NaN is filled from ``fill_values_other``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Data to repair; must contain ``result_column`` because it is dropped
        before the income model is applied.
    fill_values : dict
        Replacement values as returned by ``clean_train_data``.
    fill_values_other : pandas.Series or mapping
        Per-column fallback values for remaining missing entries.

    Returns
    -------
    pandas.DataFrame
        The repaired copy.
    """
    test_df = test_df.copy()
    bad_income = np.logical_or(test_df['MonthlyIncome'].isnull(), test_df['MonthlyIncome'] < 100.)
    nan_income_idxs = test_df.index[bad_income]

    # Skip the repair when no row needs it: scikit-learn estimators reject an
    # empty batch in predict(), and the assignments would be no-ops anyway.
    if len(nan_income_idxs) > 0:
        if impute_income:
            test_income_data = test_df.loc[nan_income_idxs].drop(['MonthlyIncome', 'DebtRatio', result_column], axis=1)
            test_income_data = test_income_data.fillna(fill_values_other)
            income_fill_values = income_model.predict(test_income_data)
        else:
            income_fill_values = fill_values['MonthlyIncome']

        test_df.loc[nan_income_idxs, 'MonthlyIncome'] = income_fill_values
        test_df.loc[nan_income_idxs, 'DebtRatio'] = test_df.loc[nan_income_idxs, 'DebtRatio'] / income_fill_values

    for c in overdue_cols:
        fill_idxs = test_df.index[test_df[c] >= 90]
        test_df.loc[fill_idxs, c] = fill_values[c]

    fill_rev_idxs = test_df.index[test_df[rev_lines_col] >= 4.0]
    test_df.loc[fill_rev_idxs, rev_lines_col] = fill_values[rev_lines_col]

    test_df = test_df.fillna(fill_values_other)
    return test_df

def add_features(data_frame):
    """Return a copy of ``data_frame`` with two boolean indicator columns appended.

    * ``rev_lines_col + 'ind'`` -- True where the revolving-utilisation value
      is exactly zero.
    * ``'overdue_ind'`` -- True where the row-wise sum of the ``overdue_cols``
      columns is zero.

    The input frame is not modified.
    """
    augmented = data_frame.copy()
    zero_utilisation = augmented[rev_lines_col] == 0.
    overdue_total = augmented[overdue_cols].sum(axis=1)
    augmented[rev_lines_col + 'ind'] = zero_utilisation
    augmented['overdue_ind'] = (overdue_total == 0)
    return augmented

#%%
# Column medians of the raw training partition, computed before the income
# model is trained.
data_median = train_data.median()
# The income regressor is only needed when model-based imputation is enabled.
if impute_income:
    income_model = train_income_model(train_data)

# Repair the training data and collect the label-independent fill values that
# will be applied to the test partition.
train_data_clean, fill_dict = clean_train_data(train_data)
# From here on `data_median` holds the medians of the *repaired* training data.
data_median = train_data_clean.median()

## fill any values that are still missing with the (repaired) training median
train_data_clean = train_data_clean.fillna(data_median)
train_data_clean = add_features(train_data_clean)

# The test partition is repaired using training-derived values only.
test_data_cleaned = clean_test_data(test_data, fill_dict, data_median)
test_data_cleaned = add_features(test_data_cleaned)

# Separate predictors from the target for both partitions.
X_train = train_data_clean.drop(result_column, axis=1)
y_train = train_data_clean[result_column]

X_test = test_data_cleaned.drop(result_column, axis=1)
y_test = test_data_cleaned[result_column]

#%%
## common utility functions
def average_model_preds(*probs):
    """Average several probability vectors element-wise and threshold the result.

    Returns a ``(mean_probs, mean_preds)`` pair where ``mean_preds`` is True
    wherever the averaged probability is strictly greater than 0.5.
    """
    mean_probs = np.mean(probs, axis=0)
    return (mean_probs, mean_probs > 0.5)

def get_model_preds(model, predictors):
    """Return ``(probabilities, predictions)`` of ``model`` for ``predictors``.

    The probabilities are column 1 of ``model.predict_proba``; the predictions
    come from ``model.predict``.
    """
    second_column_probs = model.predict_proba(predictors)[:, 1]
    hard_predictions = model.predict(predictors)
    return (second_column_probs, hard_predictions)

def get_total_model_preds(model, train_predictors, test_predictors):
    """Score both partitions with ``model``.

    Returns ``((train_probs, train_preds), (test_probs, test_preds))``, each
    inner pair being the result of ``get_model_preds`` for that partition
    (training partition first).
    """
    return tuple(
        tuple(get_model_preds(model, partition))
        for partition in (train_predictors, test_predictors)
    )

def eval_preds(y_true, y_probs, y_preds):
    """Compute precision, accuracy, recall and ROC AUC for one set of predictions.

    The first three metrics compare ``y_true`` with the hard predictions
    ``y_preds``; the AUC is computed from the scores ``y_probs``.  Returns a
    dict keyed ``'precision'``, ``'accuracy'``, ``'recall'`` and ``'auc'``.
    """
    label_metrics = (
        ('precision', precision_score),
        ('accuracy', accuracy_score),
        ('recall', recall_score),
    )
    scores = {name: metric(y_true, y_preds) for name, metric in label_metrics}
    scores['auc'] = roc_auc_score(y_true, y_probs)
    return scores

def get_model_eval(true_train, train_predictions, true_test=None, test_predictions=None):
    """Summarise model quality on the training data and, optionally, the test data.

    ``train_predictions`` / ``test_predictions`` are ``(probabilities,
    predictions)`` sequences that are unpacked into ``eval_preds``.

    Returns a ``pandas.Series`` of training metrics when ``true_test`` is
    None; otherwise a ``pandas.DataFrame`` with rows ``'Train'`` and ``'Test'``.
    """
    train_eval = eval_preds(true_train, *train_predictions)
    if true_test is not None:
        test_eval = eval_preds(true_test, *test_predictions)
        return pd.DataFrame([train_eval, test_eval], index=['Train', 'Test'])
    return pd.Series(train_eval)
    
def get_sample_weights(y_train, power=1.0):
    """Weight each sample by how rare its class is.

    A sample's weight is ``(1 - share of its class in y_train) ** power``.
    Returns a ``pandas.Series`` aligned with ``pd.Series(y_train)``.
    """
    labels = pd.Series(y_train)
    class_share = labels.value_counts() / len(labels)
    rarity = 1. - class_share
    return labels.map(rarity) ** power

def probas_to_classes(probas):
    """Convert probabilities to 0.0 / 1.0 labels using an inclusive 0.5 threshold."""
    at_or_above_half = probas >= 0.5
    return at_or_above_half.astype(float)

#%%
from keras.models import Sequential
# from keras.utils.np_utils import probas_to_classes
from keras.layers import Dense
from keras.layers import Dropout
from keras import regularizers

import keras
import pandas as pd

class LossCallback(keras.callbacks.Callback):
    """Keras callback that records the train and test ROC AUC after every epoch.

    The scores are appended to ``self.train_auc`` and ``self.test_auc``.  The
    evaluation data is read from the module-level ``X_train_norm`` /
    ``y_train`` and ``X_test_norm`` / ``y_test`` at the time each epoch ends.
    """

    def on_train_begin(self, *args):
        # Start every training run with empty histories.
        self.train_auc, self.test_auc = [], []

    def on_epoch_end(self, *args):
        partitions = (
            (self.train_auc, X_train_norm, y_train),
            (self.test_auc, X_test_norm, y_test),
        )
        for history, features, labels in partitions:
            epoch_scores = self.model.predict(features)
            history.append(roc_auc_score(labels.values.flatten(), epoch_scores))

# Callback instance that will collect the per-epoch AUC history during fit().
test_call_back = LossCallback()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# The scaler is fitted on the training predictors only; the same fitted
# transform is then applied to the test predictors.
X_train_norm = scaler.fit_transform(X_train.values)
X_test_norm = scaler.transform(X_test.values)
# L2 penalty coefficients for the two hidden layers; both are 0 here.
reg_param = 0.
reg_param_2 = 0.

first_reg = regularizers.l2(reg_param)
second_reg = regularizers.l2(reg_param_2)

# Rate passed to the Dropout layer that follows the first hidden layer.
dropout_prob = 0.2
# Earlier, larger architecture kept for reference (never executed):
#
#   model = Sequential()
#   model.add(Dense(100, input_dim=X_train_norm.shape[1], activation='relu',
#                   kernel_regularizer=regularizers.l2(reg_param)))
#
#   model.add(Dense(100, activation='relu',
#                   kernel_regularizer=regularizers.l2(reg_param_2)))
#   model.add(Dense(10, activation='sigmoid'))
#   model.add(Dense(1, activation='sigmoid'))

# Network in use: 20 ReLU units -> dropout -> 10 ReLU units -> 1 sigmoid output.
# `kernel_regularizer` is the Keras 2 name of the Keras 1 `W_regularizer`
# argument, which current Keras releases no longer accept.
model = Sequential()
model.add(Dense(20, input_dim=X_train_norm.shape[1], activation='relu',
                kernel_regularizer=first_reg))
model.add(Dropout(dropout_prob))

# Deeper variants that were tried, kept for reference:
#model.add(Dense(80, activation='relu', kernel_regularizer=first_reg))
#model.add(Dropout(dropout_prob))
#model.add(Dense(60, activation='relu', kernel_regularizer=first_reg))
#model.add(Dropout(dropout_prob))
#
#model.add(Dense(30, activation='relu', kernel_regularizer=second_reg))
model.add(Dense(10, activation='relu', kernel_regularizer=second_reg))
model.add(Dense(1, activation='sigmoid'))

# Binary classification set-up: cross-entropy loss, Adam optimiser.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model silently for 50 epochs with batches of 5000 rows; the callback
# records the train/test AUC after each epoch.  `epochs` is the Keras 2 name
# of the Keras 1 `nb_epoch` argument, which current Keras releases no longer
# accept.
model.fit(X_train_norm, y_train.values, verbose=0,
          epochs=50, batch_size=5000,
          callbacks=[test_call_back])

# Score both partitions with the trained network.  The model output is
# flattened to 1-D before being thresholded at 0.5 by probas_to_classes.
train_probs = model.predict(X_train_norm).flatten()
train_preds = probas_to_classes(train_probs)

test_probs = model.predict(X_test_norm).flatten()
test_preds = probas_to_classes(test_probs)

# Metrics table with one row per partition ('Train', 'Test').
model_eval = get_model_eval(y_train, [train_probs, train_preds],
                            y_test, [test_probs, test_preds])
print(model_eval)
# Keep the two AUC values as standalone module-level variables.
train_auc = model_eval.loc['Train', 'auc']
test_auc = model_eval.loc['Test', 'auc']

if plot_figure:
    # One figure with two curves: per-epoch training AUC first, then test AUC.
    auc_axes = plt.figure().add_subplot(111)
    for auc_history in (test_call_back.train_auc, test_call_back.test_auc):
        auc_axes.plot(auc_history)
    plt.show()

#%%
# time.clock() was removed in Python 3.8.  It is still preferred where it
# exists so this reading stays comparable with the time.clock() reading taken
# at the start of the script; newer interpreters use time.perf_counter().
end_time = time.clock() if hasattr(time, 'clock') else time.perf_counter()
print('Time Taken:', end_time - start_time)
