#### Loading the packages and creating the SparkSession and SparkContext objects

In [1]:
from pyspark.sql import SparkSession
import numpy as np

# Get the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()
# SparkContext that belongs to the session above.
sc = spark.sparkContext

import pyspark
from pyspark.ml import feature, regression, Pipeline, evaluation
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import functions as fn, Row
from pyspark import sql


from pyspark.ml.stat import Correlation 
from pyspark.ml.feature import VectorAssembler

import matplotlib.pyplot as plt
import pandas as pd

from pyspark.ml.stat import Correlation
# NOTE(review): `Imputer` is not referenced in the visible cells, and recent
# scikit-learn releases no longer provide sklearn.preprocessing.Imputer
# (sklearn.impute.SimpleImputer is its replacement) — confirm the installed
# version, otherwise this import fails on a fresh kernel.
from sklearn.preprocessing import Imputer

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline

#### Loading the dataset

In [2]:
#df = spark.createDataFrame(pd.read_csv('new_data.csv', low_memory = False))
# Read the CSV with Spark's reader: the header row supplies the column names
# and column types are inferred from the data.
df = spark.read.csv("data_master.csv", header=True, inferSchema=True)

#### Excluding extra column added during spark dataframe creation

In [3]:
# Remove the '_c0' and 'Unnamed: 0' columns in a single drop call.
df = df.drop('_c0', 'Unnamed: 0')

#### Converting loan_status values to 0s & 1s

In [4]:
# Binary label: rows whose loan_status equals 'Default' become 0, all other rows become 1.
is_default = fn.col('loan_status') == 'Default'
df = df.withColumn('loan_status', fn.when(is_default, 0).otherwise(1))

#### Dropping features with Null value count greater than 100000

In [5]:
# Columns removed from the dataset in this step. Keeping them in one list makes
# the selection easy to review and lets a single drop() call replace forty
# chained one-column drops.
high_null_columns = [
    'mths_since_last_delinq',
    'mths_since_last_record',
    'mths_since_last_major_derog',
    'mths_since_recent_bc_dlq',
    'mths_since_recent_revol_delinq',
    'deferral_term',
    'hardship_amount',
    'hardship_length',
    'hardship_dpd',
    'orig_projected_additional_accrued_interest',
    'hardship_payoff_balance_amount',
    'hardship_last_payment_amount',
    'settlement_amount',
    'settlement_percentage',
    'settlement_term',
    'annual_inc_joint',
    'next_pymnt_d',
    'hardship_type',
    'dti_joint',
    'verification_status_joint',
    'revol_bal_joint',
    'sec_app_earliest_cr_line',
    'sec_app_inq_last_6mths',
    'sec_app_mort_acc',
    'sec_app_open_acc',
    'sec_app_revol_util',
    'sec_app_open_act_il',
    'sec_app_num_rev_accts',
    'sec_app_chargeoff_within_12_mths',
    'sec_app_collections_12_mths_ex_med',
    'sec_app_mths_since_last_major_derog',
    'hardship_reason',
    'hardship_status',
    'hardship_start_date',
    'hardship_end_date',
    'payment_plan_start_date',
    'hardship_loan_status',
    'debt_settlement_flag_date',
    'settlement_status',
    'settlement_date',
]

# drop() accepts several column names at once.
df = df.drop(*high_null_columns)

In [6]:
# Dataset dimensions as (row count, column count)
n_rows, n_cols = df.count(), len(df.columns)
print((n_rows, n_cols))

(564400, 100)


#### Dropping na values across the dataset

In [7]:
# Discard rows containing a null (dropna() is the DataFrame-level alias of na.drop()).
df = df.dropna()

In [8]:
# Dataset dimensions as (row count, column count)
n_rows, n_cols = df.count(), len(df.columns)
print((n_rows, n_cols))

(404922, 100)


In [9]:
# Show the distinct values found in policy_code.
df.select(fn.col('policy_code')).distinct().show()

+-----------+
|policy_code|
+-----------+
|          1|
+-----------+



#### Dropping the feature policy_code as it takes only a single value and so contributes nothing to the model

In [10]:
# Remove policy_code by name.
df = df.drop('policy_code')

#### Balancing the number of data points in the dataframe based on loan_status value
#### Here, we are trying to have equal number of data points for loan_status values of 0 and 1

In [11]:
# One frame per label value, so each class can be handled separately.
label = fn.col('loan_status')
df_zero = df.filter(label == 0)
df_one = df.filter(label == 1)

In [12]:
# Number of rows with loan_status = 0.
df_zero.count()

329912

In [13]:
# Number of rows with loan_status = 1.
df_one.count()

75010

In [14]:
# Seeded random split of the loan_status = 0 rows with weights 0.24 / 0.76;
# the first part is kept, the second is bound to df_removed.
# NOTE(review): 0.24 is hard-coded — presumably chosen so the kept part is close
# in size to df_one; confirm, and revisit if the input data changes.
df_zero_to_add, df_removed = df_zero.randomSplit([0.24, 0.76], seed=0)
# Combine all loan_status = 1 rows with the retained loan_status = 0 rows.
df_new = df_one.union(df_zero_to_add)
# Show the row count per loan_status value in the combined frame.
df_new.groupBy("loan_status").count().show()

+-----------+-----+
|loan_status|count|
+-----------+-----+
|          1|75010|
|          0|79400|
+-----------+-----+



In [15]:
# Rebind df to df_new so the following cells operate on it.
df = df_new

#### Selecting columns with the type 'int' and 'double' as numericColumnList and replacing na values with zeros

In [16]:
# Names of the columns whose Spark dtype string starts with 'int' or 'double'.
numericColumnList = [name for name, dtype in df.dtypes if dtype.startswith(('int', 'double'))]
# Numeric-only projection of df, with nulls replaced by 0.
df_numeric = df.select(numericColumnList).na.fill(0)

#### Number of features with type 'int' and 'double'

In [17]:
# How many column names were collected in numericColumnList.
len(numericColumnList)

80

#### Generating a correlation matrix of data with all the numeric features

In [18]:
# Assemble every numeric column into a single 'features' vector column.
va_corr = feature.VectorAssembler(inputCols=numericColumnList, outputCol='features')
features_sample_df = va_corr.transform(df_numeric)

# Correlation of the 'features' column; head() returns the first row of the
# result and its first field is converted to an array below.
corr = Correlation.corr(features_sample_df, 'features').head()
x = corr[0].toArray()

# Wrap the array in a pandas frame labelled on both axes with the column names.
y = pd.DataFrame(x, index=numericColumnList, columns=numericColumnList)

In [19]:
# Display the correlation frame.
y

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,loan_status,dti,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
loan_amnt,1.000000,1.000000,0.999995,0.391881,0.106926,0.950547,0.075514,0.284569,-0.076721,-0.007618,...,-0.023639,-0.020709,0.076661,0.012029,-0.054320,0.015152,0.316839,0.253257,0.355129,0.217327
funded_amnt,1.000000,1.000000,0.999995,0.391881,0.106926,0.950547,0.075514,0.284569,-0.076721,-0.007618,...,-0.023639,-0.020709,0.076661,0.012029,-0.054320,0.015152,0.316839,0.253257,0.355129,0.217327
funded_amnt_inv,0.999995,0.999995,1.000000,0.392185,0.107049,0.950440,0.075552,0.284568,-0.076728,-0.007656,...,-0.023652,-0.020720,0.076702,0.012040,-0.054367,0.015142,0.316868,0.253251,0.355124,0.217316
term,0.391881,0.391881,0.392185,1.000000,0.354823,0.160917,0.054964,0.060293,-0.081272,0.021630,...,-0.008854,0.006536,0.033330,0.022854,0.001511,-0.005983,0.107022,0.095893,0.071464,0.086403
int_rate,0.106926,0.106926,0.107049,0.354823,1.000000,0.157380,-0.014232,-0.062633,0.037223,0.185094,...,0.039272,0.161753,-0.091295,0.275667,0.062420,0.019401,-0.120980,0.022160,-0.218117,-0.009050
installment,0.950547,0.950547,0.950440,0.160917,0.157380,1.000000,0.063831,0.270291,-0.049501,0.017485,...,-0.018117,-0.002636,0.059620,0.045651,-0.046835,0.021518,0.280684,0.240404,0.319877,0.200814
emp_length,0.075514,0.075514,0.075552,0.054964,-0.014232,0.063831,1.000000,0.066119,0.016885,0.047817,...,-0.007729,0.027338,-0.004398,0.028388,0.025048,0.010359,0.129138,-0.007881,0.075376,-0.014964
annual_inc,0.284569,0.284569,0.284568,0.060293,-0.062633,0.270291,0.066119,1.000000,0.000362,-0.172891,...,-0.001246,0.041798,0.007685,0.014646,-0.032442,0.036759,0.380172,0.284055,0.248058,0.253611
loan_status,-0.076721,-0.076721,-0.076728,-0.081272,0.037223,-0.049501,0.016885,0.000362,1.000000,-0.055328,...,0.002584,0.080536,0.017386,-0.034582,0.038665,0.000972,0.043455,-0.000095,-0.008425,-0.000945
dti,-0.007618,-0.007618,-0.007656,0.021630,0.185094,0.017485,0.047817,-0.172891,-0.055328,1.000000,...,-0.024534,0.015462,0.111453,0.146740,-0.027987,-0.038538,0.011195,0.250423,0.058756,0.291526


#### Sorting and displaying the correlation values with respect to loan_status

In [20]:
# Row of the correlation frame for the label. It is looked up by name rather
# than by a hard-coded position, so it stays correct if the order or number of
# numeric columns changes.
loan_st_corr = y.loc['loan_status']
# Turn the row into a two-column frame: 'index' holds the feature name and
# 'loan_status' holds its correlation with the label.
loan_st_corr = loan_st_corr.to_frame().reset_index()

In [21]:
# Remove pandas' display limits on columns and rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)
# Correlations with loan_status, largest first.
loan_st_corr.sort_values('loan_status', ascending=False)

Unnamed: 0,index,loan_status
8,loan_status,1.0
25,last_pymnt_amnt,0.641893
20,total_rec_prncp,0.59977
19,total_pymnt_inv,0.53465
18,total_pymnt,0.534632
33,open_il_24m,0.094473
45,acc_open_past_24mths,0.089173
55,mort_acc,0.082175
32,open_il_12m,0.081099
71,num_tl_op_past_12m,0.080536


#### Selecting columns of type 'string' as stringColumnList

In [22]:
# Columns whose Spark dtype string starts with 'string', plus loan_status,
# which is included explicitly by name.
# NOTE(review): this puts the label in the list that the next cell indexes and
# one-hot encodes — confirm that is intended.
stringColumnList = [name for name, dtype in df.dtypes if name == 'loan_status' or dtype.startswith('string')]

#### String Indexing the categorical variables and applying OneHotEncoding to convert string categorical features to numerics

In [23]:
categorical_columns = stringColumnList
string_indexer_models = []
one_hot_encoders = []
for col_name in categorical_columns:
    indexed_col = "{0}_indexed".format(col_name)
    encoded_col = "{0}_encoded".format(col_name)

    # StringIndexer is fitted on the current frame and adds the '<name>_indexed' column.
    indexer_model = feature.StringIndexer(inputCol=col_name, outputCol=indexed_col).fit(df)
    df = indexer_model.transform(df)
    string_indexer_models.append(indexer_model)

    # OneHotEncoder turns the index column into the '<name>_encoded' column;
    # dropLast=False is passed explicitly.
    # NOTE(review): calling transform() directly assumes a Spark release in
    # which OneHotEncoder is a Transformer — confirm against the installed version.
    encoder = feature.OneHotEncoder(inputCol=indexed_col, outputCol=encoded_col, dropLast=False)
    df = encoder.transform(df)
    one_hot_encoders.append(encoder)

#### Creating Train, Validation and Test Splits

In [24]:
# Seeded random split with weights 0.6 / 0.3 / 0.1 into training, validation
# and testing frames (a single hold-out split, not k-fold cross-validation).
training_df, validation_df, testing_df = df.randomSplit([0.6, 0.3, 0.1], seed=0)

In [25]:
# Row counts of the training, validation and testing frames, in that order.
[training_df.count(), validation_df.count(), testing_df.count()]

[92575, 46167, 15668]

#### Considering highly correlated features, i.e. those with correlation values > 0.1
#### Also considered a few features, chosen by intuition, that should best explain the loan_status

##### We also have eliminated certain features that had similar correlation with the output label

In [26]:
# Model inputs: plain columns first, then the '_encoded' columns.
selected_features = [
    'loan_amnt',
    'term',
    'mort_acc',
    'funded_amnt_inv',
    'int_rate',
    'installment',
    'total_rec_int',
    'tot_hi_cred_lim',
    'sub_grade_encoded',
    'home_ownership_encoded',
    'verification_status_encoded',
    'purpose_encoded',
    'revol_util_encoded',
]
# Assembler that packs the selected columns into a single 'features' vector column.
va = feature.VectorAssembler(inputCols=selected_features, outputCol='features')

#### Logistic regression implementation based on correlation values & intuitive feature selection

In [27]:
# Logistic regression reading the 'features' vector, with loan_status as the label.
lr = LogisticRegression(featuresCol='features', labelCol='loan_status')
# Assemble features and fit the classifier on the training frame in one pipeline.
pipe = Pipeline(stages=[va, lr])
pipe_model = pipe.fit(training_df)
# Score the validation predictions with the evaluator's default metric.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='loan_status')
validation_predictions = pipe_model.transform(validation_df)
evaluator.evaluate(validation_predictions)#Model 8 as per poster

0.6589241606323537

#### Random Forest Classifier implementation based on correlation values and intuitive feature selection

In [28]:
# Random forest reading the 'features' vector, with loan_status as the label.
# The seed is fixed (matching the seed=0 used for the data splits) so that the
# fitted forest, and therefore the score below, is the same on every run.
rf = RandomForestClassifier(featuresCol='features', labelCol='loan_status', seed=0)
# Note: despite its name, rf_pipeline holds the fitted pipeline model.
rf_pipeline = Pipeline(stages=[va, rf]).fit(training_df)
# Score the validation predictions with the evaluator's default metric.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='loan_status')
evaluator.evaluate(rf_pipeline.transform(validation_df))#model 7 as per poster

0.612630073888343