In [1]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, svm, metrics, tree, \
decomposition, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, \
OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
f1_score, roc_auc_score, precision_recall_curve
import random
import pylab as pl
import matplotlib.pyplot as plt
from scipy import optimize
import csv
import time


def define_clfs_params():


    clfs = {
        'GB': GradientBoostingClassifier(learning_rate=0.1, subsample=0.5, \
            max_depth=5, n_estimators=100),
        
            }

    grid = { 
    
    'GB': {'n_estimators': [100], 'learning_rate' : [0.1], 'subsample' : [0.5], 'max_depth': [5]},
    
           }

    return clfs, grid

def clf_loop(models_to_run,clfs,grid,X,y):

    best_model = ''
    best_params = ''
    best_auc = -1


    with open ('output/results2.csv', 'w') as csvfile:
        w = csv.writer(csvfile, delimiter=',')
        w.writerow(['Classification_Model', 'Parameters', 'auc', 'time'])

        X_train, X_test, y_train, y_test = train_test_split(X, y, \
            test_size=0.2, random_state=0)
        for index,clf in enumerate([clfs[x] for x in models_to_run]):
            
            current_model = models_to_run[index]
            current_params = grid[current_model]

            start_time = time.time()
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    print(clf)

                    if hasattr(clf,'predict_proba'):
                       y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
                    else: 
                       y_pred_probs = clf.fit(X_train, y_train).decision_function(X_test)

                    threshold = np.sort(y_pred_probs)[::-1][int(.05*len(y_pred_probs))]
                    #print (threshold)
                    end_time=time.time()
                    print(models_to_run[index], "used:",end_time-start_time)
                    #print (precision_at_k(y_test,y_pred_probs,.05))
                    #plot_precision_recall_n(y_test,y_pred_probs,clf)
                    current_auc = roc_auc_score(y_test, y_pred_probs)

                    if current_auc > best_auc:
                        best_model = current_model
                        best_params = current_params
                        best_pred_y = y_pred_probs
                    print("AUC:",current_auc)
                    print()
                except IndexError as e:
                    print ('Error:',e)
                    continue
                w.writerow([current_model, clf, current_auc, end_time-start_time])
    plot_precision_recall_n(y_test, y_pred_probs, best_model)
    print("~"*101)
    print(best_model,best_params, best_auc)
    return best_model, best_params, best_auc, best_pred_y

def evaluate(y_true, y_predict):
    evaluation = dict()

    try:
        evaluation['accuracy'] = accuracy_score(y_true, y_predict)
        evaluation['precision'] = precision_score(y_true, y_predict)
        evaluation['recall'] = recall_score(y_true, y_predict)
        evaluation['f1'] = f1_score(y_true, y_predict)
        evaluation['area_under_curve'] = roc_auc_score(y_true, y_predict),
        evaluation['precision_at_k'] = precision_at_k(y_true,y_predict, 0.05)

    except:
        print("No metrics.")

    return evaluation

def plot_precision_recall_n(y_true, y_prob, model_name):
    from sklearn.metrics import precision_recall_curve
    y_score = y_prob
    precision_curve, recall_curve, \
    pr_thresholds = precision_recall_curve(y_true, y_score)
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    pct_above_per_thresh = []
    number_scored = len(y_score)
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score>=value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)
    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    
    name = model_name
    plt.title(name)
    #plt.savefig(name)
    plt.show()

def precision_at_k(y_true, y_scores, k):
    threshold = np.sort(y_scores)[::-1][int(k*len(y_scores))]
    y_pred = np.asarray([1 if i >= threshold else 0 for i in y_scores])
    return metrics.precision_score(y_true, y_pred)

def get_y_x(df):
    y = df['SeriousDlqin2yrs']
    df.drop('SeriousDlqin2yrs', axis = 1, inplace = True)
    return y, df

def main(filename): 
    clfs,grid = define_clfs_params()
    models_to_run=['KNN', 'RF', 'LR', 'GB','DT', 'SVM']

    #get X and y
    df = pd.read_csv(filename, index_col = 0)
    y, X = get_y_x(df)

    best_model, best_params, best_auc, best_pred_y = clf_loop(models_to_run,clfs,grid,X,y)




In [2]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, svm, metrics, tree, \
decomposition, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, \
OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
f1_score, roc_auc_score, precision_recall_curve
import random
import pylab as pl
import matplotlib.pyplot as plt
from scipy import optimize
import csv
import time


def define_clfs_params():


    clfs = {
        'GB': GradientBoostingClassifier(learning_rate=0.1, subsample=0.5, \
            max_depth=5, n_estimators=100),
        
            }

    grid = { 
    
    'GB': {'n_estimators': [100], 'learning_rate' : [0.1], 'subsample' : [0.5], 'max_depth': [5]},
    
           }

    return clfs, grid

def clf_loop(models_to_run,clfs,grid,X,y):

    best_model = ''
    best_params = ''
    best_auc = -1


    with open ('output/results2.csv', 'w') as csvfile:
        w = csv.writer(csvfile, delimiter=',')
        w.writerow(['Classification_Model', 'Parameters', 'auc', 'time'])

        X_train, X_test, y_train, y_test = train_test_split(X, y, \
            test_size=0.2, random_state=0)
        for index,clf in enumerate([clfs[x] for x in models_to_run]):
            
            current_model = models_to_run[index]
            current_params = grid[current_model]

            start_time = time.time()
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    print(clf)

                    if hasattr(clf,'predict_proba'):
                       y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
                    else: 
                       y_pred_probs = clf.fit(X_train, y_train).decision_function(X_test)

                    threshold = np.sort(y_pred_probs)[::-1][int(.05*len(y_pred_probs))]
                    #print (threshold)
                    end_time=time.time()
                    print(models_to_run[index], "used:",end_time-start_time)
                    #print (precision_at_k(y_test,y_pred_probs,.05))
                    #plot_precision_recall_n(y_test,y_pred_probs,clf)
                    current_auc = roc_auc_score(y_test, y_pred_probs)

                    if current_auc > best_auc:
                        best_model = current_model
                        best_params = current_params
                        best_pred_y = y_pred_probs
                    print("AUC:",current_auc)
                    print()
                except IndexError as e:
                    print ('Error:',e)
                    continue
                w.writerow([current_model, clf, current_auc, end_time-start_time])
    plot_precision_recall_n(y_test, y_pred_probs, best_model)
    print("~"*101)
    print(best_model,best_params, best_auc)
    return best_model, best_params, best_auc, best_pred_y

def evaluate(y_true, y_predict):
    evaluation = dict()

    try:
        evaluation['accuracy'] = accuracy_score(y_true, y_predict)
        evaluation['precision'] = precision_score(y_true, y_predict)
        evaluation['recall'] = recall_score(y_true, y_predict)
        evaluation['f1'] = f1_score(y_true, y_predict)
        evaluation['area_under_curve'] = roc_auc_score(y_true, y_predict),
        evaluation['precision_at_k'] = precision_at_k(y_true,y_predict, 0.05)

    except:
        print("No metrics.")

    return evaluation

def plot_precision_recall_n(y_true, y_prob, model_name):
    from sklearn.metrics import precision_recall_curve
    y_score = y_prob
    precision_curve, recall_curve, \
    pr_thresholds = precision_recall_curve(y_true, y_score)
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    pct_above_per_thresh = []
    number_scored = len(y_score)
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score>=value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)
    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    
    name = model_name
    plt.title(name)
    #plt.savefig(name)
    plt.show()

def precision_at_k(y_true, y_scores, k):
    threshold = np.sort(y_scores)[::-1][int(k*len(y_scores))]
    y_pred = np.asarray([1 if i >= threshold else 0 for i in y_scores])
    return metrics.precision_score(y_true, y_pred)

def get_y_x(df):
    y = df['SeriousDlqin2yrs']
    df.drop('SeriousDlqin2yrs', axis = 1, inplace = True)
    return y, df

def main(filename): 
    clfs,grid = define_clfs_params()
    models_to_run=[ 'GB']

    #get X and y
    df = pd.read_csv(filename, index_col = 0)
    y, X = get_y_x(df)

    best_model, best_params, best_auc, best_pred_y = clf_loop(models_to_run,clfs,grid,X,y)





In [3]:
main('training_imputed.csv')

GB
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=5, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=0.5, verbose=0,
              warm_start=False)
('GB', 'used:', 37.34626913070679)
('AUC:', 0.85500104865259563)
()
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
('GB', {'n_estimators': [100], 'subsample': [0.5], 'learning_rate': [0.1], 'max_depth': [5]}, -1)


In [4]:
df_test = pd.read_csv('cs-test.csv')

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab
import sys
import random
from __future__ import division
from numpy import nan

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)

#Evaluation metrics is a separate pyhton file containing some useful functions for evaluation of different methods. One of the function is evaluating a binary classifier
from EvaluationMetrics import bin_classif_eval

#This function reads in a csv file as a dataframe
def readcsvfile(file_name):
    df =  pd.read_csv(file_name, header = 0)
#We discussed in the previous TA session that NA values in some fields were coded numerically.
#I replace these numerical values with 'nan'

    df = df.replace({'NumberofTimes90DaysLate':{98:nan, 97:nan, 96:nan}, 'NumberofTime60-89DaysPastDueNotWorse':{98:nan, 97:nan, 96:nan}, 'NumberofTime30-59DaysPastDueNotWorse':{98:nan, 97:nan, 96:nan}, 'NumberofDependents':{20:nan}, 'age':{0:nan}})
    return df


#This function describes a datarame i.e. it prints column names, head and tail of the data,summary statistics, number of missing values in each column and the correlation matrix
def summary_statistics(df):
    pd.set_option('display.width', 18)
    print 'Column Names:', "\n", df.columns.values
    print 'First Few Rows of Data:', "\n", df.head()
    print 'Last Few Rows of Data:', "\n", df.tail()
    print 'Summary Statistics:', "\n", df.describe(include = 'all')
    print 'Number of Missing Values:', "\n", df.isnull().sum()
    
    for col_name in df:
        print ('Data Type %s: %s' %(col_name, df[col_name].dtype))
        
    print 'Correlation Matrix :', "\n", df.corr().unstack()
    
    
    
    
def plot_histogram(df, hist_var):
    fig = df[hist_var].hist()
    fig.set_title('Histogram for ' + hist_var)
    plt.draw()
    plt.savefig(hist_var)
    plt.close()


def plot_bar(df, bar_var):
    fig =df.groupby(bar_var).size().plot(kind='bar')
    fig.set_xlabel(bar_var) #defines the x axis label
    fig.set_ylabel('Number of Observations') #defines y axis label
    fig.set_title(bar_var+' Distribution') #defines graph title
    plt.draw()
    plt.savefig(bar_var)
    plt.close('all')
    
histogram_variables = ['serious_dlqin2yrs','revolving_utilization_of_unsecured_lines', 'age', 'number_of_time30-59_days_past_due_not_worse', 'debt_ratio', 'monthly_income', 'number_of_open_credit_lines_and_loans', 'number_of_times90_days_late', 'number_real_estate_loans_or_lines', 'number_of_time60-89_days_past_due_not_worse', 'number_of_dependents']





bar_variables = ['serious_dlqin2yrs']








#This function prints histograms for each column of a data frame
def data_histogram(df):
    df.hist()
    plt.savefig('histograms.png')
    
#This function plots grouped columns with mean of the group
def plot_by_group_mean(df,columns, group_by_col):
    df[columns].groupby(group_by_col).mean().plot()
    file_name = 'plot_by_' + group_by_col + '.png'
    plt.savefig(file_name)
    

#This function converts a categorical variable in a data frame into binary dummies and then drops the original categorical variable
def categorical_to_binary_dummies(df,Category):
    dummies = pd.get_dummies(df['Category'], Category, drop_first = True)
    df = df.join(dummies)
    return df
    
#This function takes a dataframe and a column name and discretizes a continuous variable into bins
def discretize_bins_values(df,col_name, bins, verbose = False):
    new_col = 'bins_' + str(col_name)
    df[new_col] = pd.cut(df[col_name], bins = bins, include_lowest = True, labels = False)
    
    if verbose:
        print pd.value_counts(data[new_col])
        
    return new_col

#This function takes a dataframe and a column name and discretizes a continuous variable into  bins based on quantiles
def discretize_bins_quantiles(df,col_name,number_of_bins, verbose = False):
    new_col = 'bins_' + str(col_name)
    df[new_col] = pd.qcut(df[col_name],number_of_bins, labels = False)
    
    if verbose:
        print pd.value_counts(data[new_col])
        
    return new_col


#This function returns the log of a column. useful to get log income
def log_column(df,col_name):
    log_col = 'log_' + str(col_name)
    df[log_col] = df[col_name].apply(lambda x: np.log(x+1))
    return log_col
#This function plots the histogram of a log variable

def plot_log(df,var):
    lb = 0
    ub = 15
    increment = 0.5
    plt.gca().set_xscale('log')
    fig = df[var].hist(bins = np.exp(np.arrange(lb,ub,increment)))
    fig.set_xlabel('log'+var)
    plt.savefig('log'+var)
    plt.close()
    

##Imputing Missing values in Training Data Set and Filling in Missing values in testing dataset with stored values in Testing Dataset

#This function fills the missing values in a column fn a datafraframe with mean, median or mode

def impute_missing_values(df,var,method):
   
        if method == 'mean':
               mean = df[var].mean()
               return mean

        elif method == 'median':
               median = df[var].median()
               return median

        elif method == 'mode':
               mode = df[var].mode[0]
               return mode


#This function fills the missing values in a column fn a datafraframe with a specified value

def replace_missing_values(df,var,value):
         df[var] = df[var].fillna(value)
         return df
        
        


In [6]:
df_train = pd.read_csv('cs-training.csv')

In [7]:
ImputedMonthlyIncome = df_train['MonthlyIncome'].median()
ImputedNumberOfDependents = df_train['NumberOfDependents'].median()

In [8]:
df_test = replace_missing_values(df_test, ['MonthlyIncome'], ImputedMonthlyIncome)

In [9]:
df_test = replace_missing_values(df_test, ['NumberOfDependents'], ImputedNumberOfDependents)

In [10]:
y_var_name = 'SeriousDlqin2yrs'
X_var_names = [
  'RevolvingUtilizationOfUnsecuredLines',
  'age',
  'NumberOfTime30-59DaysPastDueNotWorse',
  'DebtRatio',
  'MonthlyIncome',
  'NumberOfOpenCreditLinesAndLoans',
  'NumberOfTimes90DaysLate',
  'NumberRealEstateLoansOrLines',
  'NumberOfTime60-89DaysPastDueNotWorse',
  'NumberOfDependents']

In [11]:
final_model = GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=5, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=0.5, verbose=0,
              warm_start=False)

In [12]:
df = pd.read_csv('training_imputed.csv')

In [13]:
final_model.fit(X=df[X_var_names], y=df.SeriousDlqin2yrs)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=5, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=0.5, verbose=0,
              warm_start=False)

In [14]:
final_test_pred_probs =final_model.predict_proba(X=df_test[X_var_names])