In [2]:
import findspark
findspark.init()

In [3]:
# import libraries
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
from pyspark.sql.functions import mean, stddev, col, log
from pyspark.sql.functions import to_date, dayofweek, to_timestamp
from pyspark.sql import types
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import year, month
from pyspark.sql.functions import dayofmonth, weekofyear
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import coalesce, first, lit
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import datediff
from pyspark.sql.functions import when

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel

In [4]:
sc = SparkContext()

In [5]:
spark = SparkSession(sc)

## Chuẩn bị, chuẩn hóa dữ liệu, xác định input, output

In [6]:
# Use Spark to read in the Ecommerce Customers csv file.
data = spark.read.csv("Du lieu cung cap/HomeLoan/loan_sanction_train.csv", inferSchema=True,header=True)


In [7]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- Loan_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- Loan_Amount_Term: integer (nullable = true)
 |-- Credit_History: integer (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Loan_Status: string (nullable = true)



In [8]:
print((data.count(), len(data.columns)))

(614, 13)


In [9]:
# Khi in bằng head thì định dạng hiển thị là row (khác với head ở pandas dataframe)
data.head(1)

[Row(Loan_ID='LP001002', Gender='Male', Married='No', Dependents='0', Education='Graduate', Self_Employed='No', ApplicantIncome=5849, CoapplicantIncome=0.0, LoanAmount=None, Loan_Amount_Term=360, Credit_History=1, Property_Area='Urban', Loan_Status='Y')]

In [10]:
data.show(3) 

+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001002|  Male|     No|         0| Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|
|LP001003|  Male|    Yes|         1| Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0| Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y|
+--------+------

In [11]:
for item in data.head():
    print(item) 

LP001002
Male
No
0
Graduate
No
5849
0.0
None
360
1
Urban
Y


In [12]:
data.count()

614

In [13]:
from pyspark.sql.functions import col,isnan, when, count
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]
   ).show()

+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|      0|    13|      3|        15|        0|           32|              0|                0|        22|              14|            50|            0|          0|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+



In [14]:
data = data.na.drop()
data.count()

480

### Spark Formatting of Data

In [15]:
# Train data

In [16]:
indexer = StringIndexer(inputCol='Gender', outputCol='Gender_inx')
indexer_model = indexer.fit(data)
data_indexed = indexer_model.transform(data)

In [17]:
data_indexed =  StringIndexer(inputCol='Married', 
                              outputCol='Married_inx').fit(data_indexed).transform(data_indexed)

In [18]:
data_indexed =  StringIndexer(inputCol='Education', 
                              outputCol='Education_inx').fit(data_indexed).transform(data_indexed)

In [19]:
data_indexed =  StringIndexer(inputCol='Self_Employed', 
                              outputCol='Self_Employed_inx').fit(data_indexed).transform(data_indexed)

In [20]:
data_indexed =  StringIndexer(inputCol='Property_Area', 
                              outputCol='Property_Area_inx').fit(data_indexed).transform(data_indexed)

In [21]:
data_indexed =  StringIndexer(inputCol='Loan_Status', 
                              outputCol='Loan_Status_inx').fit(data_indexed).transform(data_indexed)

In [22]:
from pyspark.sql.functions import col, regexp_replace
# Remove + in column Dependents
data_indexed = data_indexed.withColumn("Dependents", regexp_replace(col("Dependents"), "\\+", ""))
data_indexed = data_indexed.withColumn("Dependents", data_indexed["Dependents"].cast("double"))


In [23]:
data_indexed.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+----------+-----------+-------------+-----------------+-----------------+---------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|Gender_inx|Married_inx|Education_inx|Self_Employed_inx|Property_Area_inx|Loan_Status_inx|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+----------+-----------+-------------+-----------------+-----------------+---------------+
|LP001003|  Male|    Yes|       1.0|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|       0.0|        0.0|          0.0|              0.0|           

In [24]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [25]:
data_indexed.columns

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status',
 'Gender_inx',
 'Married_inx',
 'Education_inx',
 'Self_Employed_inx',
 'Property_Area_inx',
 'Loan_Status_inx']

In [26]:
assembler = VectorAssembler(
    inputCols=['Gender_inx', 'Married_inx', 'Dependents',
               'Education_inx', 'Self_Employed_inx','ApplicantIncome',
               'CoapplicantIncome','LoanAmount','Loan_Amount_Term',
               'Credit_History','Property_Area_inx'],
    outputCol='features'
)

In [27]:
data_pre = assembler.transform(data_indexed)

In [28]:
data_pre.select('features').show(2, truncate=False)

+-------------------------------------------------------+
|features                                               |
+-------------------------------------------------------+
|[0.0,0.0,1.0,0.0,0.0,4583.0,1508.0,128.0,360.0,1.0,2.0]|
|(11,[4,5,7,8,9,10],[1.0,3000.0,66.0,360.0,1.0,1.0])    |
+-------------------------------------------------------+
only showing top 2 rows



In [29]:
data_pre.show(2)

+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+----------+-----------+-------------+-----------------+-----------------+---------------+--------------------+
| Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|Gender_inx|Married_inx|Education_inx|Self_Employed_inx|Property_Area_inx|Loan_Status_inx|            features|
+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+----------+-----------+-------------+-----------------+-----------------+---------------+--------------------+
|LP001003|  Male|    Yes|       1.0| Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|       0.0|    

In [30]:
final_data = data_pre.select('features','Loan_Status_inx')

In [31]:
final_data.count()

480

In [32]:
train_data, test_data = final_data.randomSplit([0.8,0.2])

In [33]:
train_data.describe().show()

+-------+-------------------+
|summary|    Loan_Status_inx|
+-------+-------------------+
|  count|                390|
|   mean|0.31794871794871793|
| stddev|0.46627760461053047|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [34]:
test_data.describe().show()

+-------+-------------------+
|summary|    Loan_Status_inx|
+-------+-------------------+
|  count|                 90|
|   mean|0.26666666666666666|
| stddev| 0.4446940622369092|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [35]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier,RandomForestClassifier

In [36]:
logistic = LogisticRegression(featuresCol='features', labelCol='Loan_Status_inx', predictionCol='prediction')
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='Loan_Status_inx')
rfc = RandomForestClassifier(featuresCol='features', labelCol='Loan_Status_inx')
gbt = GBTClassifier(featuresCol='features', labelCol='Loan_Status_inx')

In [37]:
logisticModel = logistic.fit(train_data)
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

### Model Comparison

In [38]:
logistic_predictions = logisticModel.transform(test_data)
dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,  BinaryClassificationEvaluator

In [40]:
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol="Loan_Status_inx", 
                                                  predictionCol="prediction", 
                                                  metricName="accuracy")

In [42]:
logistic_acc = acc_evaluator.evaluate(logistic_predictions)
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
gbt_acc = acc_evaluator.evaluate(gbt_predictions)

In [44]:
print("Results:")
print('-'*80)
print('Logistic - accuracy: {0:2.2f}%'.format(logistic_acc*100))
print('-'*80)
print('A single decision tree - accuracy: {0:2.2f}%'.format(dtc_acc*100))
print('-'*80)
print('A random forest ensemble - accuracy: {0:2.2f}%'.format(rfc_acc*100))
print('-'*80)
print('A ensemble using GBT - accuracy: {0:2.2f}%'.format(gbt_acc*100))

Results:
--------------------------------------------------------------------------------
Logistic - accuracy: 83.33%
--------------------------------------------------------------------------------
A single decision tree - accuracy: 77.78%
--------------------------------------------------------------------------------
A random forest ensemble - accuracy: 82.22%
--------------------------------------------------------------------------------
A ensemble using GBT - accuracy: 76.67%


 => Logistic và random forest có kết quả tốt nhất

## Test with loan_sanction_test

In [45]:
val_data = spark.read.csv("Du lieu cung cap/HomeLoan/loan_sanction_test.csv", inferSchema=True,header=True)

In [46]:
val_data.count()

367

In [47]:
val_data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in val_data.columns]
   ).show()

+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+
|Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+
|      0|    11|      0|        10|        0|           23|              0|                0|         5|               6|            29|            0|
+-------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+



In [48]:
val_data = val_data.na.drop()
val_data.count()

289

In [49]:
indexer = StringIndexer(inputCol='Gender', outputCol='Gender_inx')
indexer_model = indexer.fit(val_data)
data_indexed = indexer_model.transform(val_data)

In [50]:
data_indexed =  StringIndexer(inputCol='Married', 
                              outputCol='Married_inx').fit(data_indexed).transform(data_indexed)

In [51]:
data_indexed =  StringIndexer(inputCol='Education', 
                              outputCol='Education_inx').fit(data_indexed).transform(data_indexed)

In [52]:
data_indexed =  StringIndexer(inputCol='Self_Employed', 
                              outputCol='Self_Employed_inx').fit(data_indexed).transform(data_indexed)

In [53]:
data_indexed =  StringIndexer(inputCol='Property_Area', 
                              outputCol='Property_Area_inx').fit(data_indexed).transform(data_indexed)

In [54]:
from pyspark.sql.functions import col, regexp_replace
# Remove + in column Dependents
data_indexed = data_indexed.withColumn("Dependents", regexp_replace(col("Dependents"), "\\+", ""))
data_indexed = data_indexed.withColumn("Dependents", data_indexed["Dependents"].cast("double"))


In [55]:
data_indexed.show(5)

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+----------+-----------+-------------+-----------------+-----------------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Gender_inx|Married_inx|Education_inx|Self_Employed_inx|Property_Area_inx|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+----------+-----------+-------------+-----------------+-----------------+
|LP001015|  Male|    Yes|       0.0|    Graduate|           No|           5720|                0|       110|             360|             1|        Urban|       0.0|        0.0|          0.0|              0.0|              0.0|
|LP001022|  Male|    Yes|       1.0|    Graduate|           No|           3076|         

In [56]:
val_data_pre = assembler.transform(data_indexed)

In [57]:
val_data_pre.select('features').show(2, truncate=False)

+------------------------------------------------------+
|features                                              |
+------------------------------------------------------+
|(11,[5,7,8,9],[5720.0,110.0,360.0,1.0])               |
|(11,[2,5,6,7,8,9],[1.0,3076.0,1500.0,126.0,360.0,1.0])|
+------------------------------------------------------+
only showing top 2 rows



In [58]:
val_data_pre.show(2)

+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+----------+-----------+-------------+-----------------+-----------------+--------------------+
| Loan_ID|Gender|Married|Dependents|Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Gender_inx|Married_inx|Education_inx|Self_Employed_inx|Property_Area_inx|            features|
+--------+------+-------+----------+---------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+----------+-----------+-------------+-----------------+-----------------+--------------------+
|LP001015|  Male|    Yes|       0.0| Graduate|           No|           5720|                0|       110|             360|             1|        Urban|       0.0|        0.0|          0.0|              0.0|              0.0|(11,[5,7,8,9],[57...|
|LP001022|  Male

In [59]:
val_final_data = val_data_pre.select('features')

In [60]:
val_final_data.count()

289

In [61]:
# Dùng logisticModel để dự đoán.

In [62]:
predictions = logisticModel.transform(val_final_data)

In [63]:
predictions[['features','probability','prediction']].show(5,False)

+-------------------------------------------------------+----------------------------------------+----------+
|features                                               |probability                             |prediction|
+-------------------------------------------------------+----------------------------------------+----------+
|(11,[5,7,8,9],[5720.0,110.0,360.0,1.0])                |[0.9133358070200766,0.08666419297992345]|0.0       |
|(11,[2,5,6,7,8,9],[1.0,3076.0,1500.0,126.0,360.0,1.0]) |[0.9048844889864587,0.09511551101354132]|0.0       |
|(11,[2,5,6,7,8,9],[2.0,5000.0,1800.0,208.0,360.0,1.0]) |[0.8929858675050534,0.10701413249494662]|0.0       |
|(11,[1,3,5,7,8,9],[1.0,1.0,3276.0,78.0,360.0,1.0])     |[0.8320395078284291,0.16796049217157094]|0.0       |
|[0.0,0.0,0.0,1.0,1.0,2165.0,3422.0,152.0,360.0,1.0,0.0]|[0.8211106005822686,0.1788893994177314] |0.0       |
+-------------------------------------------------------+----------------------------------------+----------+
only showi

In [64]:
# Dùng random forest để dự đoán.

In [65]:
predictions = rfc_model.transform(val_final_data)

In [66]:
predictions[['features','probability','prediction']].show(20,False)

+-------------------------------------------------------+----------------------------------------+----------+
|features                                               |probability                             |prediction|
+-------------------------------------------------------+----------------------------------------+----------+
|(11,[5,7,8,9],[5720.0,110.0,360.0,1.0])                |[0.8469692475023163,0.15303075249768386]|0.0       |
|(11,[2,5,6,7,8,9],[1.0,3076.0,1500.0,126.0,360.0,1.0]) |[0.8823642274750746,0.11763577252492537]|0.0       |
|(11,[2,5,6,7,8,9],[2.0,5000.0,1800.0,208.0,360.0,1.0]) |[0.8140493095475206,0.18595069045247944]|0.0       |
|(11,[1,3,5,7,8,9],[1.0,1.0,3276.0,78.0,360.0,1.0])     |[0.780975312890536,0.2190246871094641]  |0.0       |
|[0.0,0.0,0.0,1.0,1.0,2165.0,3422.0,152.0,360.0,1.0,0.0]|[0.8489625512400671,0.1510374487599329] |0.0       |
|[1.0,1.0,1.0,1.0,0.0,2226.0,0.0,59.0,360.0,1.0,2.0]    |[0.5813893317918595,0.4186106682081404] |0.0       |
|(11,[2,3,