In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_categorical_dtype

from pandas_summary import DataFrameSummary
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss,auc,precision_score, recall_score, f1_score, roc_auc_score, accuracy_score,precision_recall_curve, classification_report,confusion_matrix
from sklearn.model_selection import StratifiedKFold,train_test_split,cross_val_score,cross_validate
from sklearn.preprocessing import RobustScaler,PowerTransformer
from imblearn.over_sampling import RandomOverSampler,SMOTE
# import xgboost
# from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
bold='\033[1m'


In [3]:
PATH= "https://s3.amazonaws.com/datarobot_public_datasets/DR_Demo_Lending_Club.csv"

In [4]:
df_raw = pd.read_csv(PATH, low_memory=False,parse_dates=['earliest_cr_line'])

In [5]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [6]:
df_train, df_test = train_test_split(df_raw, test_size=0.30,random_state=42)

In [7]:
def preview_data(df):   

    # Make a backup copy
    df_cpu_cpy = df.copy()
    
    #Columns available in dataset
    print(bold+"\nAttributes of the dataset\n", df.columns.values)
    
    #Dimensions of the dataset
    print(bold+"\nNo of rows:", df.shape[0])
    print(bold+"No of columns:", df.shape[1])
    
    #View the datatypes
    df.info()
    
    #Check for missing values
    print(bold+"\nMissing values in the dataset:",df.isnull().sum().max())
    print(bold+"\nMissing values in the dataset:",df.columns[df.isnull().any()].tolist())
    
    
    

In [8]:
def summary_stat(df):
    
    print("--"*40)    
    print(bold+"Summary Statistics of numeric features:" )
    print("--"*40)
    print(df.describe())
    print("--"*40)
    print(bold+"Summary Statistics of categorical features:")
    print("--"*40)
#     print(df.describe(include=['O']))

In [9]:
import re
def add_datepart(df, fldname, drop=True):
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, 
                                     infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
   
#     for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 
#             'Dayofyear', 'Is_month_end', 'Is_month_start', 
#             'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 
#             'Is_year_start'):    
    df[targ_pre+'Year'] = getattr(fld.dt,'year')
    
#     df[targ_pre+'Elapsed'] = fld.astype(np.int64) // 10**9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [10]:
def fill_missing(df,col,name):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum():
            df[name+'_na']=np.isnan(df[name]) * 1
            df[name] = col.fillna(col.median()) 
    else:
        df[name] =col.fillna(col.mode().iloc[0])   
        

In [11]:
def fix_cat_levels(df,col,name,max_cat_num):
    df_len=len(df)
    val_list=col.value_counts().nlargest(max_cat_num).index
    df[name] = col.where(col.isin(val_list), 'others')

    return df        
             

In [12]:
def num_cat_cols(df):
    numerics = ['float64', 'int64']
    num_ds =df.select_dtypes(include=numerics)  
    cat_ds =df.select_dtypes(exclude=numerics)
    num_cols=num_ds.keys().tolist()
    cat_cols=cat_ds.keys().tolist()
    return num_cols,cat_cols

In [13]:
def transform_cat_features(df,n,c,max_cat_num):
#     cat_cols=num_cat_cols(df,isnum=False)
    if is_string_dtype(c):
        if c.nunique()>max_cat_num :df=fix_cat_levels(df,c,n,max_cat_num)    
    return df
       


In [14]:
def vectorize(df,col,isVector=None):
        df[col]=df[col].fillna('na')
        if isVector:
            vectoriser = TfidfVectorizer()
            features = vectoriser.fit_transform(df) 
        else:
            df[col]=df[col].apply(len)
    

In [15]:
def normalise(df):
    num_cols,cat_cols=num_cat_cols(df)
    std = RobustScaler()
    x = df[num_cols].values
    x_scaled = std.fit_transform(x)
    df_temp = pd.DataFrame(x_scaled, columns=num_cols, index = df.index)
    df[num_cols] = df_temp
    return df

In [16]:
def process_df(df,y_fld,drop_flds=None,do_scale=None,max_cat_num=None,text_flds=None):
    '''

     :param df: train/test
     :param y_fld: 'is_bad'
     :param drop_flds: 'id'
     :param do_scale: None
     :param max_cat_num: 10
     :param text_flds: 'Notes,Purpose'
     :return: [df,labels]
    '''

    if not drop_flds:
        drop_flds=[]
    df=df.copy()
    y=df[y_fld].values
    df.drop(drop_flds+[y_fld],axis=1,inplace=True)
    for n,c in df.items():fill_missing(df,c,n)
    for col in text_flds:
        if col in df.columns:vectorize(df,col)
    df=normalise(df)
    for n,c in df.items():transform_cat_features(df,n,c,max_cat_num)
    res=[pd.get_dummies(df,drop_first=True),y]  
    return res
    
        

In [17]:
def align_testdataset(train_cols,test):
    missing_cols = set(train_cols ) - set( test.columns )
    # Add a missing column in test set with default value equal to 0
    for c in missing_cols:
        test[c] = 0
    # Ensure the order of column in the test set is in the same order than in train set
    test = test[train_cols]
    return test

In [18]:
def Prepare_data(df):
    '''
    -Extract year from the 'earliest_cr_line' date
    -Perform the preprocessing on the data
      -Drop id column and target column from the train/test data
      -Fill missing values of numeric columns with median and categorical columns with mode
      -Normalise numeric columns using robustscalar
      -Transformed categorical columns with levels > 10
      -Converted text columns to numeric columns by replacing with length of text
      -dummy encoding on categorical columns

    :param df: source df_train /df_test
    :return: df_train/df_test,train_labels,test_labels
    '''
    add_datepart(df, 'earliest_cr_line')
    df, labels = process_df(df, 'is_bad', drop_flds=['Id'], max_cat_num=10, text_flds=['Notes', 'purpose'])
    return df, labels

In [19]:
def resample_data(df, labels):
    '''
    Oversample data using SMOTE to handle the imbalanced data
    :param df: source:df_train/df_test
    :param labels: train_labels/test_labels
    :return: x_sampled, y_sampled
    '''
    sampler = SMOTE()
    x_sampled, y_sampled = sampler.fit_sample(df, labels)
    return x_sampled, y_sampled

In [20]:
def crossvalidate(model, df, labels):
    '''
    Crossvalidate train data using StratifiedKFold with 5 splits

    :param model: log_model/gbt_model
    :param df: train/test
    :param labels: train_labels/test_labels
    :return:
    '''
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = ['neg_log_loss', 'f1']

    scores = cross_validate(model, df, labels, cv=skf, scoring=scoring)
    print('Log loss score# (1) mean: {} (2)variance: {}'.format(-np.mean(scores['test_neg_log_loss']),
                                                                np.var(scores['test_neg_log_loss'])))
    print('F1 score# (1) mean: {} (2)variance: {}'.format(np.mean(scores['test_f1']), np.var(scores['test_f1'])))

In [21]:
def train_model(model, df, labels):
    '''
    Crossvalidate and fit the model on training data
    :param model: log_model/gbt_model
    :param df: train/test
    :param labels: train_labels/test_labels
    :return:
    '''
    crossvalidate(model, df, labels)
    model.fit(df, labels)
    return model
    

In [22]:
def test_model(model, df, labels):
    '''
    Make predictions using trained models on test data
    Calculate logloss and f1 score for each model
    :param model: log_model/gbt_model
    :param df: train/test
    :param labels: train_labels/test_labels
    :return:
    '''
    predictions = model.predict(df)
    f1 = f1_score(predictions, labels)
    pred_probs = model.predict_proba(df)
    logloss = log_loss(labels, pred_probs, eps=1e-15)
    print('Prediction Log loss score# : %.2f' % (logloss))
    print("Prediction F1 score # : %.2f " % (f1))

In [23]:
def train_test(df,df_test):
    
      
    '''
    -Preprocess and resample train data
    -Crossvalidate the '12 regularised logistic regression' and  'GradientBoostingClassifier' models using stratified 5 folds
    -Calculate logloss and f1 score for the models on train data predictions
    -Preprocess and resample test data
    -Predict the labels of test data using trained models
    -Calculate logloss and f1 score for the predictions
    :param df: Train dataset
    :param df_test: Test dataset
    '''

    log_model = LogisticRegression(class_weight='balanced', penalty='l2', random_state=42)
    gbt_model = GradientBoostingClassifier(max_features='sqrt',
                                           n_estimators=100,
                                           learning_rate=0.02,
                                           max_depth=10,
                                           subsample=0.8)
    print('Preprocessing training data:')
    df_train, train_labels = Prepare_data(df)
    df_train_re, train_labels_re = resample_data(df_train, train_labels)

    print('Preprocessing testing data:')
    df_test, test_labels = Prepare_data(df_test)
    df_test = align_testdataset(df_train.columns, df_test)
    df_test_re, test_labels_re = resample_data(df_test, test_labels)

    # Logistic regression
    print('Training Logistic regression model...')
    print('Logistic regression cross validation scores:')
    print("--" * 40)
    log_model = train_model(log_model, df_train_re, train_labels_re)

    print('Testing Logistic regression model...')
    print('Logistic regression test scores:')
    print("--" * 40)
    test_model(log_model, df_test_re, test_labels_re)


    # GradientBoosting Classifier
    print('Training GradientBoosting Classifier model model...')
    print('GradientBoosting Classifier cross validation scores:')
    print("--" * 40)
    gbt_model=train_model(gbt_model,df_train_re,train_labels_re)
    print('Testing GradientBoosting Classifier model...')
    print('GradientBoosting Classifier test scores:')
    print("--" * 40)
    test_model(gbt_model,df_test_re,test_labels_re)
    

    
    
    

In [None]:
train_test(df_train,df_test)


Preprocessing training data:
Preprocessing testing data:
Training Logistic regression model...
Logistic regression cross validation scores:
--------------------------------------------------------------------------------
Log loss score# (1) mean: 0.6068071797292562 (2)variance: 7.934491249527645e-05
F1 score# (1) mean: 0.6656572847416996 (2)variance: 4.531882457848716e-05
Testing Logistic regression model...
Logistic regression test scores:
--------------------------------------------------------------------------------
Prediction Log loss score# : 0.63
Prediction F1 score # : 0.64 
Training GradientBoosting Classifier model model...
GradientBoosting Classifier cross validation scores:
--------------------------------------------------------------------------------
