# CUSTOMER CHURN PREDICTION

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('churn').getOrCreate()
df = spark.read.csv('/content/Churn_Modelling.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- HasCrCard: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [None]:
import pandas as pd
pd.DataFrame(df.take(5), columns=df.columns).transpose()

Unnamed: 0,0,1,2,3,4
RowNumber,1,2,3,4,5
CustomerId,15634602,15647311,15619304,15701354,15737888
Surname,Hargrave,Hill,Onio,Boni,Mitchell
CreditScore,619,608,502,699,850
Geography,France,Spain,France,France,Spain
Gender,Female,Female,Female,Female,Female
Age,42,41,42,39,43
Tenure,2,1,8,1,2
Balance,0.0,83807.86,159660.8,0.0,125510.82
NumOfProducts,1,1,3,2,1


In [None]:
import warnings
# import findspark
import pandas as pd
import seaborn as sns
from pyspark.ml.classification import GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

In [None]:
spark_df = spark.read.csv('/content/Churn_Modelling.csv', inferSchema=True, header=True)
spark_df.show(10)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|

In [None]:
spark_df = spark_df.toDF(*[c.lower() for c in spark_df.columns])
spark_df.show(5)

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|rownumber|customerid| surname|creditscore|geography|gender|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|        0|             0|       93826.63|

In [None]:
spark_df.groupby("exited").count().show()

+------+-----+
|exited|count|
+------+-----+
|     1| 2037|
|     0| 7963|
+------+-----+



In [None]:
num_cols = [col[0] for col in spark_df.dtypes if col[1] != 'string']
spark_df.select(num_cols).describe().show()

+-------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-------------------+-------------------+-----------------+-------------------+
|summary|         rownumber|       customerid|      creditscore|               age|            tenure|          balance|     numofproducts|          hascrcard|     isactivemember|  estimatedsalary|             exited|
+-------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-------------------+-------------------+-----------------+-------------------+
|  count|             10000|            10000|            10000|             10000|             10000|            10000|             10000|              10000|              10000|            10000|              10000|
|   mean|            5000.5|  1.56909405694E7|         650.5288|           38.9218|            5.0128|76485.88928799961|        

In [None]:
from pyspark.sql.functions import when, count, col
spark_df.select([count(when(col(c).isNull(), c)).alias(c) for c in spark_df.columns]).toPandas().T

Unnamed: 0,0
rownumber,0
customerid,0
surname,0
creditscore,0
geography,0
gender,0
age,0
tenure,0
balance,0
numofproducts,0


In [None]:
spark_df = spark_df.drop('rownumber', "customerid", "surname")

In [None]:
spark_df = spark_df.withColumn('creditscore_salary', spark_df.creditscore / spark_df.estimatedsalary)
spark_df = spark_df.withColumn('creditscore_tenure', spark_df.creditscore * spark_df.tenure)
spark_df = spark_df.withColumn('balance_salary', spark_df.balance / spark_df.estimatedsalary)
spark_df.show(5)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+
|creditscore|geography|gender|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|               608|0.7446769036217226|
|        502|   France|Female| 42|     8| 159660.8|            3|        1|             0|      113931.57|     1|0.004406153623618106| 

In [None]:
spark_df.select('age').describe().toPandas().transpose()
spark_df.select("age").summary("count", "min", "25%", "50%","75%", "max").show()
bucketizer = Bucketizer(splits=[0, 35, 55, 75, 95], inputCol="age", outputCol="age_cat")
spark_df = bucketizer.setHandleInvalid("keep").transform(spark_df)
spark_df = spark_df.withColumn('age_cat', spark_df.age_cat + 1)

+-------+-----+
|summary|  age|
+-------+-----+
|  count|10000|
|    min|   18|
|    25%|   32|
|    50%|   37|
|    75%|   44|
|    max|   92|
+-------+-----+



In [None]:
spark_df.groupby("age_cat").count().show()

+-------+-----+
|age_cat|count|
+-------+-----+
|    1.0| 3679|
|    4.0|   54|
|    3.0|  828|
|    2.0| 5439|
+-------+-----+



In [None]:
spark_df.groupby("age_cat").agg({'exited': "mean"}).show()

+-------+--------------------+
|age_cat|         avg(exited)|
+-------+--------------------+
|    1.0| 0.07882576787170427|
|    4.0|0.018518518518518517|
|    3.0| 0.41183574879227053|
|    2.0| 0.25831954403382973|
+-------+--------------------+



In [None]:
spark_df = spark_df.withColumn("age_cat", spark_df["age_cat"].cast("integer"))

In [None]:
spark_df.withColumn('creditscore_2',
                    when(spark_df['creditscore'] < 301, "deep").
                    when((301 < spark_df['creditscore']) & (spark_df['creditscore'] < 601), "very poor").
                    when((500 < spark_df['creditscore']) & (spark_df['creditscore'] < 601), "poor").
                    when((601 < spark_df['creditscore']) & (spark_df['creditscore'] < 661), "fair").
                    when((661 < spark_df['creditscore']) & (spark_df['creditscore'] < 781), "good").
                    when((781 < spark_df['creditscore']) & (spark_df['creditscore'] < 851), "excellent").
                    otherwise("top")).show()

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+
|creditscore|geography|gender|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|creditscore_2|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|         fair|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|               608|0.7446769036217226|      2|         fair|
|        502|   France|Fe

In [None]:
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import udf

# udf ile fonksiyon yazma
def segment(tenure):
    if tenure < 5:
        return "segment_b"
    else:
        return "segment_a"

func_udf = udf(segment, StringType())
spark_df = spark_df.withColumn('segment', func_udf(spark_df['tenure']))
spark_df.show(5)

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+---------+
|creditscore|geography|gender|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|  segment|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+---------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|segment_b|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|               608|0.7446769036217226|      2|segment_b|
|        502|   France|Female| 42|     8| 159

In [None]:
spark_df.groupby("segment").count().show()

+---------+-----+
|  segment|count|
+---------+-----+
|segment_a| 5506|
|segment_b| 4494|
+---------+-----+



In [None]:
indexer = StringIndexer(inputCol="segment", outputCol="segment_label")
indexer.fit(spark_df).transform(spark_df).show(5)
temp_sdf = indexer.fit(spark_df).transform(spark_df)
spark_df = temp_sdf.withColumn("segment_label", temp_sdf["segment_label"].cast("integer"))
spark_df = spark_df.drop('segment')

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+---------+-------------+
|creditscore|geography|gender|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|  segment|segment_label|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+---------+-------------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|segment_b|          1.0|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|               608|0.7446769036217226|      2|

In [None]:
indexer = StringIndexer(inputCol="gender", outputCol="gender_label")
indexer.fit(spark_df).transform(spark_df).show(5)
temp_sdf = indexer.fit(spark_df).transform(spark_df)
spark_df = temp_sdf.withColumn("gender_label", temp_sdf["gender_label"].cast("integer"))
spark_df = spark_df.drop('gender')

+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+
|creditscore|geography|gender|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|segment_label|gender_label|
+-----------+---------+------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+
|        619|   France|Female| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|            1|         1.0|
|        608|    Spain|Female| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|               608|0.7446769036217

In [None]:
indexer = StringIndexer(inputCol="geography", outputCol="geography_label")
indexer.fit(spark_df).transform(spark_df).show(5)
temp_sdf = indexer.fit(spark_df).transform(spark_df)
spark_df = temp_sdf.withColumn("geography_label", temp_sdf["geography_label"].cast("integer"))
spark_df = spark_df.drop('geography')

+-----------+---------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+
|creditscore|geography|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|segment_label|gender_label|geography_label|
+-----------+---------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+
|        619|   France| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|            1|           1|            0.0|
|        608|    Spain| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|     

In [None]:
spark_df.show(5)

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+
|creditscore|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|segment_label|gender_label|geography_label|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+
|        619| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|            1|           1|              0|
|        608| 41|     1| 83807.86|            1|        0|             1|      112542.58|     0|0.005402399696186101|               608|0.7446769036217226|      2|         

In [None]:
encoder = OneHotEncoder(inputCols=["age_cat", "geography_label"], outputCols=["age_cat_ohe", "geography_label_ohe"])
spark_df = encoder.fit(spark_df).transform(spark_df)

In [None]:
stringIndexer = StringIndexer(inputCol='exited', outputCol='label')

temp_sdf = stringIndexer.fit(spark_df).transform(spark_df)
temp_sdf.show()

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+-------------+-------------------+-----+
|creditscore|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|segment_label|gender_label|geography_label|  age_cat_ohe|geography_label_ohe|label|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+-------------+-------------------+-----+
|        619| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|            1|           1|              0|(4,[2],[1.0])|      (2,[0],[1.0])|  1.0|
|        608

In [None]:
spark_df = temp_sdf.withColumn("label", temp_sdf["label"].cast("integer"))
spark_df.show(5)

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+-------------+-------------------+-----+
|creditscore|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|segment_label|gender_label|geography_label|  age_cat_ohe|geography_label_ohe|label|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+-------------+-------------------+-----+
|        619| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|            1|           1|              0|(4,[2],[1.0])|      (2,[0],[1.0])|    1|
|        608

In [None]:
cols = ['creditscore', 'age', 'tenure', 'balance','numofproducts', 'hascrcard',
        'isactivemember', 'estimatedsalary', 'creditscore_salary', 'creditscore_tenure',
        'balance_salary', 'segment_label', 'gender_label',
        'age_cat_ohe', 'geography_label_ohe']

In [None]:
va = VectorAssembler(inputCols=cols, outputCol="features")
va_df = va.transform(spark_df)
va_df.show()

+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+-------------+-------------------+-----+--------------------+
|creditscore|age|tenure|  balance|numofproducts|hascrcard|isactivemember|estimatedsalary|exited|  creditscore_salary|creditscore_tenure|    balance_salary|age_cat|segment_label|gender_label|geography_label|  age_cat_ohe|geography_label_ohe|label|            features|
+-----------+---+------+---------+-------------+---------+--------------+---------------+------+--------------------+------------------+------------------+-------+-------------+------------+---------------+-------------+-------------------+-----+--------------------+
|        619| 42|     2|      0.0|            1|        1|             1|      101348.88|     1|0.006107615594765329|              1238|               0.0|      2|            1|           1|      

In [None]:
final_df = va_df.select("features", "label")
final_df.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[619.0,42.0,2.0,0...|    1|
|[608.0,41.0,1.0,8...|    0|
|[502.0,42.0,8.0,1...|    1|
|(19,[0,1,2,4,7,8,...|    0|
|[850.0,43.0,2.0,1...|    0|
+--------------------+-----+
only showing top 5 rows



In [None]:
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
final_df = scaler.fit(final_df).transform(final_df)

In [None]:
train_df, test_df = final_df.randomSplit([0.7, 0.3], seed=17)
train_df.show(5)

+--------------------+-----+--------------------+
|            features|label|     scaled_features|
+--------------------+-----+--------------------+
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
|(19,[0,1,2,3,4,5,...|    1|(19,[0,1,2,3,4,5,...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [None]:
test_df.show(5)

+--------------------+-----+--------------------+
|            features|label|     scaled_features|
+--------------------+-----+--------------------+
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [None]:
print("Training Dataset Count: " + str(train_df.count()))
print("Test Dataset Count: " + str(test_df.count()))

Training Dataset Count: 6949
Test Dataset Count: 3051


# MODELLING

# GBT CLASSIFIER

In [None]:
gbm = GBTClassifier(maxIter=100, featuresCol="features", labelCol="label")
gbm_model = gbm.fit(train_df)
y_pred = gbm_model.transform(test_df)
y_pred.show(5)

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|            features|label|     scaled_features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|[1.75024081719644...|[0.97070147009728...|       0.0|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|[1.26360126567415...|[0.92602695213284...|       0.0|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|[1.21202822933281...|[0.91864343014702...|       0.0|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|[1.75728443667020...|[0.97109946729348...|       0.0|
|(19,[0,1,2,3,4,5,...|    0|(19,[0,1,2,3,4,5,...|[1.14318764430859...|[0.90774234263874...|       0.0|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
y_pred.filter(y_pred.label == y_pred.prediction).count() / y_pred.count()

0.8597181252048509

# After Hyperparameter Tuning

In [None]:
evaluator = BinaryClassificationEvaluator()

gbm_params = (ParamGridBuilder()
              .addGrid(gbm.maxDepth, [2, 4, 6])
              .addGrid(gbm.maxBins, [20, 30])
              .addGrid(gbm.maxIter, [10, 20])
              .build())

cv = CrossValidator(estimator=gbm,
                    estimatorParamMaps=gbm_params,
                    evaluator=evaluator,
                    numFolds=5)

cv_model = cv.fit(train_df)
y_pred = cv_model.transform(test_df)
ac = y_pred.select("label", "prediction")
ac.filter(ac.label == ac.prediction).count() / ac.count()

0.8646345460504753

# Random Forest

In [None]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train_df)
predictions = rfModel.transform(test_df)


In [None]:
evaluator = BinaryClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Accuracy: 0.82
Test Area Under ROC: 0.8244635852497415


#After Hyperparameter Tunning

In [None]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 50, 100]) \
    .addGrid(rf.maxDepth, [5, 10, 20]) \
    .build()

# Create a CrossValidator with 5-fold cross-validation
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          parallelism=2)

In [None]:
cvModel = crossval.fit(train_df)
cvPreds = cvModel.transform(test_df)
evaluator.evaluate(cvPreds)

0.8516175244115107

# Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(test_df)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()

accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")
print('Test Area Under ROC', evaluator.evaluate(predictions))

Test Accuracy: 0.77
Test Area Under ROC 0.7713278983555402


# After HyperParamter Tuning

In [None]:
# Create a ParamGridBuilder to specify the hyperparameter values to search over
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.5, 0.75, 1.0]) \
    .addGrid(lr.maxIter, [100, 1000, 10000]) \
    .build()

# Create a CrossValidator with 5-fold cross-validation
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          parallelism=2)

In [None]:
cvModel = crossval.fit(train_df)
cvPreds = cvModel.transform(test_df)
evaluator.evaluate(cvPreds)

0.7726307766591225

# Decision Tree

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train_df)
predictions = dtModel.transform(test_df)

In [None]:
evaluator = BinaryClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Accuracy: 0.30
Test Area Under ROC: 0.30225313482229915


# After Hyperparameter Tuning of DT

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = ParamGridBuilder() .addGrid(dt.maxDepth, [3, 5, 7]) .addGrid(dt.minInstancesPerNode, [1, 3, 5]) .build()

In [None]:
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cvModel = cv.fit(train_df)
cvPreds = cvModel.transform(test_df)
evaluator.evaluate(cvPreds)

0.501689950877697