In [1]:
# This notebook predicts whether a loan will be charged off or not; it is based on Lending Club data.
# Data can be downloaded from here: https://www.kaggle.com/wendykan/lending-club-loan-data/data
# A DecisionTreeClassifier is used to predict bad loans
# For feature transformation, one-hot encoding and vector assembly (VectorAssembler) are applied

# These files contain complete loan data for all loans issued from 2007 through 2015, including the current loan status (Current, Late, Fully Paid, etc.) and the latest payment information. The file containing loan data through the "present" contains complete loan data for all loans issued through the previous completed calendar quarter. Additional features include credit scores, number of finance inquiries, address (including zip code and state), and collections, among others. The file is a matrix of about 890 thousand observations and 75 variables. A data dictionary is provided in a separate file.

## BinaryClassificationEvaluator is used to check the accuracy of the predictions

1. Linear Regression
2. Logistic Regression
3. Linear Discriminant Analysis
4. Classification and Regression Trees
5. Naive Bayes
6. K-Nearest Neighbors
7. Learning Vector Quantization
8. Support Vector Machines
9. Bagging and Random Forest
10. Boosting and AdaBoost

In [2]:
# Load the Lending Club data.
# NOTE(review): `spark` is not defined in this notebook; it is presumably the session
# object injected by the notebook runtime (the /FileStore path suggests Databricks) - confirm.
data = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/FileStore/tables/LendingClub_loan_data.csv")
)
data.cache()  # Cache data for faster reuse

# Work on a subset of the data: draw a 15% sample and cap it at 10% of the row count.
#  - The sample is seeded so the same subset is drawn when the notebook is re-run.
#  - Sampling is done WITHOUT replacement: sampling with replacement can emit the same
#    loan more than once, and such duplicates could later land on both sides of the
#    train/test split and inflate the test score.
SAMPLE_SEED = 100
df = data.sample(withReplacement=False, fraction=0.15, seed=SAMPLE_SEED) \
         .limit(int(0.1 * data.count()))

# Keep the columns used further down and cast the numeric ones explicitly, so their
# types do not depend on what schema inference decided for each column.
df = df.select(
    df.id, df.member_id, df.loan_amnt, df.funded_amnt, df.funded_amnt_inv, df.term, df.int_rate,
    df.installment, df.grade, df.sub_grade, df.emp_title, df.emp_length, df.home_ownership,
    df.annual_inc.cast("float"), df.verification_status, df.issue_d, df.loan_status, df.pymnt_plan,
    df.url, df.desc, df.purpose, df.title, df.zip_code, df.addr_state, df.dti.cast("float"),
    df.delinq_2yrs.cast("float"), df.earliest_cr_line, df.inq_last_6mths.cast("float"),
    df.mths_since_last_delinq.cast("int"), df.mths_since_last_record.cast("int"),
    df.open_acc.cast("float"), df.pub_rec.cast("float"), df.revol_bal.cast("float"),
    df.revol_util.cast("float"), df.total_acc.cast("float"), df.initial_list_status,
    df.out_prncp.cast("float"), df.out_prncp_inv.cast("float"),
    df.total_pymnt.cast("float"), df.total_pymnt_inv.cast("float"),
    df.total_rec_prncp.cast("float"), df.total_rec_int.cast("float"),
    df.total_rec_late_fee.cast("float"), df.recoveries.cast("float"),
    df.collection_recovery_fee.cast("float"), df.last_pymnt_d, df.last_pymnt_amnt.cast("float"),
    df.next_pymnt_d, df.last_credit_pull_d, df.collections_12_mths_ex_med.cast("float"),
    df.mths_since_last_major_derog.cast("float"), df.policy_code.cast("float"), df.application_type,
    df.annual_inc_joint.cast("float"), df.dti_joint.cast("float"),
    df.verification_status_joint, df.acc_now_delinq.cast("float"),
    df.tot_coll_amt.cast("float"), df.tot_cur_bal.cast("float"),
    df.open_acc_6m.cast("float"), df.open_il_6m.cast("float"),
    df.open_il_12m.cast("float"), df.open_il_24m.cast("float"),
    df.mths_since_rcnt_il.cast("float"), df.total_bal_il.cast("float"),
    df.il_util.cast("float"), df.open_rv_12m.cast("float"),
    df.open_rv_24m.cast("float"), df.max_bal_bc.cast("float"),
    df.all_util.cast("float"), df.total_rev_hi_lim.cast("float"),
    df.inq_fi.cast("float"), df.total_cu_tl.cast("float"), df.inq_last_12m.cast("float")
)


In [3]:
## Drop the columns that are not passed to the model.
## (The names in drop_list are identifiers, free-text fields, dates and a few
## categorical columns; none of them is referenced by the later feature lists.)
print(df.count())
drop_list = [
    "id", "member_id", "url", "purpose", "title", "zip_code", "emp_title",
    "earliest_cr_line", "term", "sub_grade", "last_pymnt_d", "next_pymnt_d",
    "last_credit_pull_d", "issue_d", "desc", "addr_state", "pymnt_plan",
]

df = df.drop(*drop_list)

# Remember the surviving column names; they are re-selected after the pipeline runs.
cols = df.columns

# Replace nulls with the sentinel -1. A numeric fill value is only applied to
# numeric columns, so nulls in string columns are left as they are.
df = df.na.fill(-1)

In [4]:
###One-Hot Encoding
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# Categorical (string) columns that are indexed and then one-hot encoded.
categorical_columns = ["grade", "emp_length", "home_ownership", "verification_status",
                       "initial_list_status", "application_type", "verification_status_joint"]

stages = []  # stages in our Pipeline
for categoricalCol in categorical_columns:
    # Category indexing with StringIndexer.
    # handleInvalid="keep" sends null / unseen values to one extra index instead of
    # dropping the row. With "skip", a null in ANY of these columns removes the whole
    # row from the transformed dataset, and the earlier numeric fillna(-1) does not
    # fill string columns, so sparsely populated columns would silently discard
    # most of the data.
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index",
                                  handleInvalid="keep")
    # Use OneHotEncoder to convert the category index into a binary SparseVector.
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index",
                            outputCol=categoricalCol + "classVec")
    # Add stages. These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]



In [5]:
# Index the target column: every distinct loan_status string is mapped to a numeric
# "label" index. Rows whose status cannot be indexed are skipped.
label_stringIdx = StringIndexer(
    inputCol="loan_status",
    outputCol="label",
    handleInvalid="skip",
)
stages.append(label_stringIdx)

In [6]:
# Combine the one-hot encoded categorical vectors and the numeric columns into a
# single "features" vector column using VectorAssembler.
numericCols = [
    "loan_amnt", "funded_amnt", "funded_amnt_inv", "int_rate", "installment",
    "annual_inc", "dti", "delinq_2yrs", "inq_last_6mths",
    "mths_since_last_delinq", "mths_since_last_record",
    "open_acc", "pub_rec", "revol_bal", "revol_util", "total_acc",
    "out_prncp", "out_prncp_inv",
    "total_pymnt", "total_pymnt_inv", "total_rec_prncp", "total_rec_int",
    "total_rec_late_fee", "recoveries", "collection_recovery_fee",
    "last_pymnt_amnt", "collections_12_mths_ex_med",
    "mths_since_last_major_derog", "policy_code",
    "annual_inc_joint", "dti_joint", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal",
    "open_acc_6m", "open_il_6m", "open_il_12m", "open_il_24m",
    "mths_since_rcnt_il", "total_bal_il", "il_util",
    "open_rv_12m", "open_rv_24m", "max_bal_bc", "all_util",
    "total_rev_hi_lim", "inq_fi", "total_cu_tl", "inq_last_12m",
]

# Encoded categorical columns come first, followed by the numeric columns.
assemblerInputs = [name + "classVec" for name in categorical_columns] + numericCols

assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
print(assembler)
stages.append(assembler)

In [7]:
# Chain all collected stages (indexers, encoders, label indexer, assembler) into one Pipeline.
pipeline = Pipeline(stages=stages)

# fit() learns whatever each stage needs from df (e.g. the category-to-index mappings);
# transform() then appends the derived columns to df.
pipelineModel = pipeline.fit(df)
print(pipelineModel)

# Keep only the label, the assembled feature vector and the original columns.
selectedcols = ["label", "features"] + cols
dataset = pipelineModel.transform(df).select(selectedcols)


In [8]:
# Row counts before and after the pipeline, to reveal rows dropped by the transformers.
for frame in (df, dataset):
    print(frame.count())

### Randomly split data into training (70%) and test (30%) sets; seeded for reproducibility
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)

# Sizes of the two partitions.
for part in (trainingData, testData):
    print(part.count())

In [9]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure a shallow decision tree (depth limited to 3) on the assembled features.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=3,
)

# Fit the tree on the training partition.
dtModel = dt.fit(trainingData)

In [10]:
# Report the size of the fitted tree: total node count and depth.
for caption, value in (("numNodes = ", dtModel.numNodes), ("depth = ", dtModel.depth)):
    print(caption, value)

In [11]:
# Score the held-out test partition with the fitted tree; transform() returns the
# test rows with the model's output columns appended.
predictions = dtModel.transform(testData)

# Show the resulting schema, including the columns added by the model.
predictions.printSchema()

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Evaluate model.
# The notebook header mentions BinaryClassificationEvaluator, but that evaluator's default
# metric is areaUnderROC, not accuracy, and the "label" column is a StringIndexer
# encoding of loan_status (one index per distinct status, not a 0/1 flag).
# accuracyDt is reported below as "1 - accuracy", so compute real classification
# accuracy: the fraction of test rows whose predicted index equals the label index.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracyDt = evaluator.evaluate(predictions)

In [13]:
# Report the complement of the evaluation score as the test error.
test_error = 1.0 - accuracyDt
print("Test Error = %g " % test_error)
