In [None]:
# Add Origination Date and first payment data to check if the AUC improves

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer, OneHotEncoderEstimator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import IntegerType,DoubleType

In [3]:
spark = SparkSession.builder.appName("Train").getOrCreate()

In [10]:
# df = spark.read.option("sep",",").parquet("parquet/*.parquet")
df = spark.read.load("parquet/*.parquet",sep=",",inferSchema="true")

In [11]:
df.show(10)

+------------+-------+--------------------+-------------+-------+---------+----------------+------------------+---+----+--------------+---+---------------------+--------------------+------------+-------------+----------+----------------+--------------+---+--------------------+------------+------------------------+-------------------+------------------+-----------------+----------------+------------------+
|          id|channel|              seller|interest_rate|balance|loan_term|origination_date|first_payment_date|ltv|cltv|borrower_count|dti|borrower_credit_score|first_time_homebuyer|loan_purpose|property_type|unit_count|occupancy_status|property_state|zip|insurance_percentage|product_type|co_borrower_credit_score|first_payment_month|first_payment_year|origination_month|origination_year|foreclosure_status|
+------------+-------+--------------------+-------------+-------+---------+----------------+------------------+---+----+--------------+---+---------------------+--------------------+

In [12]:
df.columns

['id',
 'channel',
 'seller',
 'interest_rate',
 'balance',
 'loan_term',
 'origination_date',
 'first_payment_date',
 'ltv',
 'cltv',
 'borrower_count',
 'dti',
 'borrower_credit_score',
 'first_time_homebuyer',
 'loan_purpose',
 'property_type',
 'unit_count',
 'occupancy_status',
 'property_state',
 'zip',
 'insurance_percentage',
 'product_type',
 'co_borrower_credit_score',
 'first_payment_month',
 'first_payment_year',
 'origination_month',
 'origination_year',
 'foreclosure_status']

In [22]:
df = df.select(
"channel",
"seller",
"interest_rate",
"balance",
"loan_term",
"ltv",
"cltv",
"borrower_count",
"dti",
"borrower_credit_score",
"first_time_homebuyer",
"loan_purpose",
"property_type",
"unit_count",
"occupancy_status",
"property_state",
"zip",
"insurance_percentage",
"foreclosure_status"
)

In [37]:
#Move this sc

df = df.withColumn("interest_rate_double", df["interest_rate"].cast(DoubleType()))

df = df.withColumn("balance_double", df["balance"].cast(DoubleType()))

df = df.withColumn("loan_term_int", df["loan_term"].cast(IntegerType()))

df = df.withColumn("ltv_int", df["ltv"].cast(IntegerType()))

df = df.withColumn("cltv_int", df["cltv"].cast(IntegerType()))

df = df.withColumn("borrower_count_int", df["borrower_count"].cast(IntegerType()))

df = df.withColumn("dti_int", df["dti"].cast(IntegerType()))

df = df.withColumn("borrower_credit_score_int", df["borrower_credit_score"].cast(IntegerType()))

df = df.withColumn("unit_count_int", df["unit_count"].cast(IntegerType()))

df = df.withColumn("zip_int", df["zip"].cast(IntegerType()))

df = df.withColumn("insurance_percentage_double", df["insurance_percentage"].cast(IntegerType()))

df = df.withColumn("foreclosure_status_int", df["foreclosure_status"].cast(IntegerType()))

In [38]:
channel_indexer = StringIndexer(inputCol = "channel", outputCol= "channel_index")
channel_encoder = OneHotEncoder(inputCol = "channel_index", outputCol = "channelVec")

seller_indexer = StringIndexer(inputCol = "seller", outputCol = "seller_index")
seller_encoder = OneHotEncoder(inputCol = "seller_index", outputCol = "sellerVec")

first_time_homebuyer_indexer = StringIndexer(inputCol = "first_time_homebuyer", outputCol = "first_time_homebuyer_index")
first_time_homebuyer_encoder = OneHotEncoder(inputCol = "first_time_homebuyer_index", outputCol = "firsttimehomebuyerVec")

loan_purpose_indexer = StringIndexer(inputCol = "loan_purpose", outputCol = "loan_purpose_index")
loan_purpose_encoder = OneHotEncoder(inputCol = "loan_purpose_index", outputCol = "loanpurposeVec")

property_type_indexer = StringIndexer(inputCol = "property_type", outputCol = "property_type_index")
property_type_encoder = OneHotEncoder(inputCol = "property_type_index", outputCol = "propertytypeVec")

occupancy_status_indexer = StringIndexer(inputCol = "occupancy_status", outputCol = "occupancy_status_index")
occupancy_status_encoder = OneHotEncoder(inputCol = "occupancy_status_index", outputCol = "occupancystatusVec")

property_state_indexer = StringIndexer(inputCol = "property_state", outputCol = "property_state_index")
property_state_encoder = OneHotEncoder(inputCol = "property_state_index", outputCol = "propertystateVec")

In [39]:
assembler = VectorAssembler(inputCols = ['channelVec','sellerVec','interest_rate_double','balance_double','loan_term_int',
                                         'ltv_int','cltv_int','borrower_count_int','dti_int','borrower_credit_score_int',
                                         'firsttimehomebuyerVec','loanpurposeVec','propertytypeVec',
                                         'unit_count_int', 'occupancystatusVec','propertystateVec',
                                         'zip_int','insurance_percentage_double'],outputCol = "features")

In [40]:
log_reg = LogisticRegression(featuresCol="features",labelCol='foreclosure_status_int')

#css = ChiSqSelector(featuresCol='Scaled_features',outputCol='Aspect',labelCol='Outcome',fpr=0.05)

In [41]:
pipeline = Pipeline(stages=[channel_indexer, seller_indexer, first_time_homebuyer_indexer, loan_purpose_indexer,
                            property_type_indexer, occupancy_status_indexer, property_state_indexer,
                            channel_encoder, seller_encoder, first_time_homebuyer_encoder, loan_purpose_encoder,
                            property_type_encoder,occupancy_status_encoder,property_state_encoder,
                            assembler, log_reg])

In [42]:
train_data, test_data = df.randomSplit([0.7,0.3])

In [43]:
train_data

DataFrame[channel: string, seller: string, interest_rate: string, balance: string, loan_term: string, ltv: string, cltv: string, borrower_count: string, dti: string, borrower_credit_score: string, first_time_homebuyer: string, loan_purpose: string, property_type: string, unit_count: string, occupancy_status: string, property_state: string, zip: string, insurance_percentage: string, foreclosure_status: string, interest_rate_double: double, balance_double: double, loan_term_int: int, ltv_int: int, cltv_int: int, borrower_count_int: int, borrower_credit_score_int: int, unit_count_int: int, zip_int: int, insurance_percentage_double: int, foreclosure_status_int: int, dti_int: int]

In [44]:
fit_model = pipeline.fit(train_data)

In [45]:
results = fit_model.transform(test_data)

In [51]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='foreclosure_status_int')

In [52]:
results.select('foreclosure_status_int','prediction').show()

+----------------------+----------+
|foreclosure_status_int|prediction|
+----------------------+----------+
|                     1|       1.0|
|                     0|       1.0|
|                     1|       1.0|
|                     0|       1.0|
|                     1|       0.0|
|                     0|       0.0|
|                     0|       0.0|
|                     0|       0.0|
|                     0|       1.0|
|                     1|       1.0|
|                     0|       1.0|
|                     1|       1.0|
|                     0|       1.0|
|                     1|       0.0|
|                     0|       1.0|
|                     1|       1.0|
|                     1|       1.0|
|                     0|       1.0|
|                     0|       1.0|
|                     1|       1.0|
+----------------------+----------+
only showing top 20 rows



In [53]:
AUC = my_eval.evaluate(results)

In [54]:
AUC

0.5910139462220257