In [1]:
# Create a Spark Session Object
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('supervised_ml').getOrCreate()

In [2]:
# Read the Dataset
df = spark.read.csv('Linear_regression_dataset.csv',inferSchema=True,header=True)
print((df.count(),len(df.columns)))

(1232, 6)


In [3]:
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)



In [4]:
df.show(10)

+-----+-----+-----+-----+-----+-----+
|var_1|var_2|var_3|var_4|var_5|label|
+-----+-----+-----+-----+-----+-----+
|  734|  688|   81|0.328|0.259|0.418|
|  700|  600|   94| 0.32|0.247|0.389|
|  712|  705|   93|0.311|0.247|0.417|
|  734|  806|   69|0.315| 0.26|0.415|
|  613|  759|   61|0.302| 0.24|0.378|
|  748|  676|   85|0.318|0.255|0.422|
|  669|  588|   97|0.315|0.251|0.411|
|  667|  845|   68|0.324|0.251|0.381|
|  758|  890|   64| 0.33|0.274|0.436|
|  726|  670|   88|0.335|0.268|0.422|
+-----+-----+-----+-----+-----+-----+
only showing top 10 rows



In [5]:
# Feature Engineering
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'label']

In [6]:
vec_assembler = VectorAssembler(inputCols=['var_1','var_2','var_3','var_4','var_5'],outputCol='features')
features_df = vec_assembler.transform(df)
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [7]:
features_df.select(['features','label']).show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[734.0,688.0,81.0...|0.418|
|[700.0,600.0,94.0...|0.389|
|[712.0,705.0,93.0...|0.417|
|[734.0,806.0,69.0...|0.415|
|[613.0,759.0,61.0...|0.378|
|[748.0,676.0,85.0...|0.422|
|[669.0,588.0,97.0...|0.411|
|[667.0,845.0,68.0...|0.381|
|[758.0,890.0,64.0...|0.436|
|[726.0,670.0,88.0...|0.422|
|[583.0,794.0,55.0...|0.371|
|[676.0,746.0,72.0...|  0.4|
|[767.0,699.0,89.0...|0.433|
|[637.0,597.0,86.0...|0.374|
|[609.0,724.0,69.0...|0.382|
|[776.0,733.0,83.0...|0.437|
|[701.0,832.0,66.0...| 0.39|
|[650.0,709.0,74.0...|0.386|
|[804.0,668.0,95.0...|0.453|
|[713.0,614.0,94.0...|0.404|
+--------------------+-----+
only showing top 20 rows



In [8]:
# Split the Dataset
train, test = features_df.randomSplit([0.75,0.25])
print(f"Size of train dataset: {train.count()}")

Size of train dataset: 957


In [9]:
print(f"Size of test dataset: {test.count()}")

Size of test dataset: 275


In [10]:
# Build and Train Linear Regression Model
from pyspark.ml.regression import LinearRegression
lr = LinearRegression()
lr_model = lr.fit(train)
predictions_df = lr_model.transform(test)
predictions_df.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  464|  640|   66|0.283| 0.22|0.301|[464.0,640.0,66.0...|0.31470481617220386|
|  470|  509|   76|0.289| 0.23|0.319|[470.0,509.0,76.0...|0.31280138011519926|
|  473|  499|   73|0.281|0.228|0.315|[473.0,499.0,73.0...|0.31754655205434246|
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...|0.33084277117254834|
|  524|  665|   65|0.287|0.224|0.336|[524.0,665.0,65.0...| 0.3359456374887837|
|  528|  652|   71|  0.3| 0.23|0.319|[528.0,652.0,71.0...| 0.3314590465273831|
|  543|  615|   76|0.294|0.233|0.333|[543.0,615.0,76.0...|0.34147070532239615|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...|0.33721349956428487|
|  564|  648|   74|0.294|0.236|0.337|[564.0,648.0,74.0...|0.35150291627544694|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

In [11]:
# Evaluate Linear Regression Model on Test Data
model_predictions = lr_model.evaluate(test)
model_predictions.r2

0.894175919423698

In [12]:
print(model_predictions.meanSquaredError)

0.00012626480950094274


In [13]:
# GLM
from pyspark.ml.regression import GeneralizedLinearRegression

In [14]:
glr = GeneralizedLinearRegression()
grl_model = glr.fit(train)
grl_model.coefficients

DenseVector([0.0004, 0.0, 0.0002, -0.7018, 0.4683])

In [15]:
grl_model.summary

Coefficients:
    Feature Estimate Std Error  T Value P Value
(Intercept)   0.2049    0.0162  12.6433  0.0000
      var_1   0.0004    0.0000  24.5135  0.0000
      var_2   0.0000    0.0000   4.1691  0.0000
      var_3   0.0002    0.0001   1.7301  0.0839
      var_4  -0.7018    0.0681 -10.3070  0.0000
      var_5   0.4683    0.0618   7.5817  0.0000

(Dispersion parameter for gaussian family taken to be 0.0002)
    Null deviance: 1.0342 on 951 degrees of freedom
Residual deviance: 0.1437 on 951 degrees of freedom
AIC: -5695.1271

In [16]:
# Evaluate the Model Performance on Test Data
model_predictions = grl_model.evaluate(test)
model_predictions.predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  464|  640|   66|0.283| 0.22|0.301|[464.0,640.0,66.0...|0.31470481617220386|
|  470|  509|   76|0.289| 0.23|0.319|[470.0,509.0,76.0...|0.31280138011519926|
|  473|  499|   73|0.281|0.228|0.315|[473.0,499.0,73.0...|0.31754655205434246|
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...|0.33084277117254834|
|  524|  665|   65|0.287|0.224|0.336|[524.0,665.0,65.0...| 0.3359456374887837|
|  528|  652|   71|  0.3| 0.23|0.319|[528.0,652.0,71.0...| 0.3314590465273831|
|  543|  615|   76|0.294|0.233|0.333|[543.0,615.0,76.0...|0.34147070532239615|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...|0.33721349956428487|
|  564|  648|   74|0.294|0.236|0.337|[564.0,648.0,74.0...|0.35150291627544694|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

In [17]:
model_predictions.aic

-1674.2943350078167

In [18]:
# GLM Binomial
glr = GeneralizedLinearRegression(family="Binomial")
glr_model = glr.fit(train)
model_predictions = glr_model.evaluate(test)
model_predictions.aic

291.64672106066456

In [19]:
# GLM Poisson
glr = GeneralizedLinearRegression(family="Poisson")
glr_model = glr.fit(train)
model_predictions = glr_model.evaluate(test)
model_predictions.aic

230.78768967062442

In [20]:
# GLM Gamma
glr = GeneralizedLinearRegression(family="Gamma")
glr_model = glr.fit(train)
model_predictions = glr_model.evaluate(test)
model_predictions.aic

-1635.0246682483435

In [21]:
# Build and Train Decision Tree Regressor Model
from pyspark.ml.regression import DecisionTreeRegressor
dec_tree = DecisionTreeRegressor()
dec_tree_model = dec_tree.fit(train)
dec_tree_model.featureImportances

SparseVector(5, {0: 0.9691, 1: 0.0159, 2: 0.0014, 3: 0.004, 4: 0.0095})

In [22]:
# Evaluate the Model Performance on Test Data
model_predictions = dec_tree_model.transform(test)
model_predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  464|  640|   66|0.283| 0.22|0.301|[464.0,640.0,66.0...|0.32063636363636366|
|  470|  509|   76|0.289| 0.23|0.319|[470.0,509.0,76.0...|0.32063636363636366|
|  473|  499|   73|0.281|0.228|0.315|[473.0,499.0,73.0...|0.32063636363636366|
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...|             0.3403|
|  524|  665|   65|0.287|0.224|0.336|[524.0,665.0,65.0...|              0.327|
|  528|  652|   71|  0.3| 0.23|0.319|[528.0,652.0,71.0...|0.32063636363636366|
|  543|  615|   76|0.294|0.233|0.333|[543.0,615.0,76.0...|0.32063636363636366|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...|             0.3403|
|  564|  648|   74|0.294|0.236|0.337|[564.0,648.0,74.0...| 0.3519770114942529|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

In [23]:
from pyspark.ml.evaluation import RegressionEvaluator
dt_evaluator = RegressionEvaluator(metricName='r2')
dt_r2 = dt_evaluator.evaluate(model_predictions)
print(f'The r-square value of DecisionTreeRegressor is {dt_r2}')

The r-square value of DecisionTreeRegressor is 0.8579748961141658


In [24]:
dt_evaluator = RegressionEvaluator(metricName='rmse')
dt_rmse = dt_evaluator.evaluate(model_predictions)
print(f'The rmse value of DecisionTreeRegressor is {dt_rmse}')

The rmse value of DecisionTreeRegressor is 0.013017616401907685


In [25]:
# Build and Train Random Forest Regressor Model
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor()
rf_model = rf.fit(train)
rf_model.featureImportances

SparseVector(5, {0: 0.4763, 1: 0.0425, 2: 0.0252, 3: 0.2029, 4: 0.2531})

In [26]:
rf_model.getNumTrees

20

In [27]:
model_predictions = rf_model.transform(test)
model_predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  464|  640|   66|0.283| 0.22|0.301|[464.0,640.0,66.0...| 0.3305856141503277|
|  470|  509|   76|0.289| 0.23|0.319|[470.0,509.0,76.0...|0.32575887381108737|
|  473|  499|   73|0.281|0.228|0.315|[473.0,499.0,73.0...|   0.32514797637519|
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...| 0.3433444808195728|
|  524|  665|   65|0.287|0.224|0.336|[524.0,665.0,65.0...|0.33359836231846396|
|  528|  652|   71|  0.3| 0.23|0.319|[528.0,652.0,71.0...| 0.3332565522283787|
|  543|  615|   76|0.294|0.233|0.333|[543.0,615.0,76.0...| 0.3277685960333096|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...| 0.3485841714565344|
|  564|  648|   74|0.294|0.236|0.337|[564.0,648.0,74.0...| 0.3518305757418549|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

In [28]:
# Evaluate the Model Performance on Test Data
rf_evaluator = RegressionEvaluator(metricName='r2')
rf_r2 = rf_evaluator.evaluate(model_predictions)
print(f'The r-square value of RandomForestRegressor is {rf_r2}')

The r-square value of RandomForestRegressor is 0.8686152430464547


In [29]:
rf_evaluator = RegressionEvaluator(metricName='rmse')
rf_rmse = rf_evaluator.evaluate(model_predictions)
print(f'The rmse value of RandomForestRegressor is {rf_rmse}')

The rmse value of RandomForestRegressor is 0.012520492227806858


In [30]:
# Build and Train a GBT Regressor Model
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor()
gbt_model = gbt.fit(train)
gbt_model.featureImportances

SparseVector(5, {0: 0.636, 1: 0.1079, 2: 0.0845, 3: 0.0826, 4: 0.089})

In [31]:
model_predictions = gbt_model.transform(test)
model_predictions.show()

+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|var_1|var_2|var_3|var_4|var_5|label|            features|         prediction|
+-----+-----+-----+-----+-----+-----+--------------------+-------------------+
|  464|  640|   66|0.283| 0.22|0.301|[464.0,640.0,66.0...|0.31843832127019095|
|  470|  509|   76|0.289| 0.23|0.319|[470.0,509.0,76.0...|0.31620392522954444|
|  473|  499|   73|0.281|0.228|0.315|[473.0,499.0,73.0...|0.31705322387073964|
|  513|  698|   61|0.298|0.236|0.339|[513.0,698.0,61.0...|0.33877562674319683|
|  524|  665|   65|0.287|0.224|0.336|[524.0,665.0,65.0...|0.32261735073282405|
|  528|  652|   71|  0.3| 0.23|0.319|[528.0,652.0,71.0...|0.31903917637757384|
|  543|  615|   76|0.294|0.233|0.333|[543.0,615.0,76.0...|0.32123471341196946|
|  550|  631|   76|0.306|0.235|0.318|[550.0,631.0,76.0...|0.33779005747499546|
|  564|  648|   74|0.294|0.236|0.337|[564.0,648.0,74.0...|0.35301969082307955|
|  567|  587|   84|0.301|0.238|0.349|[567.0,587.0,84

In [32]:
# Evaluate the Model Performance on Test Data
gbt_evaluator = RegressionEvaluator(metricName='r2')
gbt_r2 = gbt_evaluator.evaluate(model_predictions)
print(f"The r-square value of GradientBoostedRegressor is {gbt_r2}")

The r-square value of GradientBoostedRegressor is 0.8631390831694722


In [33]:
gbt_evaluator = RegressionEvaluator(metricName='rmse')
gbt_rmse = gbt_evaluator.evaluate(model_predictions)
print(f"The rmse value of GradientBoostedRegressor is {gbt_rmse}")

The rmse value of GradientBoostedRegressor is 0.01277875766980651


In [34]:
# Reas the Dataset Bank
df = spark.read.csv('bank_data.csv',inferSchema=True,header=True)
df.count()

41188

In [35]:
df.columns

['age',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed',
 'target_class']

In [36]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp.var.rate: double (nullable = true)
 |-- cons.price.idx: double (nullable = true)
 |-- cons.conf.idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr.employed: double (nullable = true)
 |-- target_class: string (nullable = true)



In [37]:
df.groupBy('target_class').count().show()

+------------+-----+
|target_class|count|
+------------+-----+
|          no|36548|
|         yes| 4640|
+------------+-----+



In [38]:
# Feature Engineering for Model
from pyspark.sql import functions as F
from pyspark.sql import *
df = df.withColumn("label",F.when(df.target_class=='no',F.lit(0)).otherwise(F.lit(1)))
df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 4640|
|    0|36548|
+-----+-----+



In [39]:
from pyspark.ml.feature import OneHotEncoder,StringIndexer, VectorAssembler
def cat_to_num(df):
    for col in df.columns:
        # print(col)
        if (col != 'emp.var.rate' and col != 'cons.price.idx' and col != 'cons.conf.idx' and col != 'nr.employed'):
            stringIndexer = StringIndexer(inputCol=col,outputCol=col + "_index")
            model = stringIndexer.fit(df)
            indexed = model.transform(df)
            encoder = OneHotEncoder(inputCol=col + "_index",outputCol=col + "_vec").fit(indexed)
            df = encoder.transform(indexed)
    df_assembler = VectorAssembler(inputCols=['age','marital_vec','education_vec','default_vec','housing_vec','loan_vec'],outputCol='features')
    df = df_assembler.transform(df)
    return df.select(['features','label'])
df_new = cat_to_num(df)
df_new.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(17,[0,1,8,11,14,...|    0|
|(17,[0,1,5,12,14,...|    0|
|(17,[0,1,5,11,13,...|    0|
|(17,[0,1,9,11,14,...|    0|
|(17,[0,1,5,11,14,...|    0|
|(17,[0,1,6,12,14,...|    0|
|(17,[0,1,7,11,14,...|    0|
|(17,[0,1,10,12,14...|    0|
|(17,[0,2,7,11,13,...|    0|
|(17,[0,2,5,11,13,...|    0|
|(17,[0,1,10,12,14...|    0|
|(17,[0,2,5,11,13,...|    0|
|(17,[0,2,5,11,14,...|    0|
|(17,[0,3,8,11,13,...|    0|
|(17,[0,1,9,11,13,...|    0|
|(17,[0,1,6,12,13,...|    0|
|(17,[0,1,9,11,13,...|    0|
|(17,[0,1,9,12,13,...|    0|
|(17,[0,1,6,11,13,...|    0|
|(17,[0,2,6,12,14,...|    0|
+--------------------+-----+
only showing top 20 rows



In [40]:
# Split the Data into Train and Test Datasets
train,test = df_new.randomSplit([0.75,0.25])
print(f"Size of train dataset: {train.count()}")

Size of train dataset: 30887


In [41]:
print(f"Size of test dataset: {test.count()}")

Size of test dataset: 10301


In [42]:
# Build and Train the Logistic Regression Model
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
lr_model = lr.fit(train)
print(lr_model.coefficients)

[0.021969224929965316,-0.3211286097539499,0.12547294125201688,-0.4963978392897877,-0.3304028108696147,-0.527120025446747,-0.8150666209586042,-0.47371416721417997,-0.5295415287515769,-0.6399138184009561,-0.14415512245618622,1.0117559777777694,0.018863021626325336,0.03420872270238865,-0.01925771103285988,0.056344728387398825,-0.03433008885419535]


In [43]:
# Evaluate Performance on Training Data
lr_summary = lr_model.summary
print(f"Accuracy: {lr_summary.accuracy}")

Accuracy: 0.8864894615857805


In [44]:
print(f"Area Under ROC: {lr_summary.areaUnderROC}")

Area Under ROC: 0.6279086061422297


In [45]:
print(f"Recall: {lr_summary.weightedRecall}")

Recall: 0.8864894615857805


In [46]:
print(f"Precision: {lr_summary.weightedPrecision}")

Precision: 0.7858635655026469


In [47]:
lr_summary.precisionByThreshold.show()

+-------------------+-------------------+
|          threshold|          precision|
+-------------------+-------------------+
|0.33747925831104286| 0.6666666666666666|
|0.31636295294391115|                0.5|
|0.30683403913477036|             0.3125|
| 0.3018735488966755| 0.2727272727272727|
| 0.2937777849370645|0.38461538461538464|
|0.28924051593720757| 0.3333333333333333|
| 0.2845483882837043| 0.3611111111111111|
| 0.2773256738134998|0.36585365853658536|
|0.27275265862114284|0.34782608695652173|
|0.26841669438331783|0.39215686274509803|
|0.26476201251216336|0.36363636363636365|
|0.26272428861254626|              0.375|
| 0.2606100069019035|0.38235294117647056|
| 0.2588907539043346|0.38666666666666666|
| 0.2584910815096143| 0.3902439024390244|
|0.25767572140279715| 0.3723404255319149|
| 0.2543025563314383|0.37755102040816324|
| 0.2521325985541839|0.37383177570093457|
|0.25034772076714373| 0.3508771929824561|
|0.24811181262720183|0.34959349593495936|
+-------------------+-------------

In [48]:
lr_summary.roc.show()

+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|7.304335122895438E-5|0.001140901312036...|
|1.826083780723859...|0.001426126640045636|
|4.017384317592491...|0.001426126640045636|
| 5.84346809831635E-4|0.001711351968054...|
| 5.84346809831635E-4|0.002852253280091272|
|7.304335122895439E-4|0.002852253280091272|
|8.399985391329755E-4|0.003707929264118...|
| 9.49563565976407E-4|0.004278379920136908|
|0.001095650268434...|0.004563605248146035|
|0.001132171944048...|0.005704506560182544|
|0.001278258646506...|0.005704506560182544|
|0.001460867024579...|0.006845407872219...|
|0.001533910375808...|0.007415858528237307|
|0.001679997078265951| 0.00827153451226469|
|0.001826083780723...| 0.00912721049629207|
|0.002154778861254...|0.009982886480319452|
|0.002227822212483...|0.010553337136337706|
|0.002446952266169972|0.011409013120365089|
|0.002702603995471312|0.01140901

In [49]:
lr_summary.recallByThreshold.show()

+-------------------+--------------------+
|          threshold|              recall|
+-------------------+--------------------+
|0.33747925831104286|0.001140901312036...|
|0.31636295294391115|0.001426126640045636|
|0.30683403913477036|0.001426126640045636|
| 0.3018735488966755|0.001711351968054...|
| 0.2937777849370645|0.002852253280091272|
|0.28924051593720757|0.002852253280091272|
| 0.2845483882837043|0.003707929264118...|
| 0.2773256738134998|0.004278379920136908|
|0.27275265862114284|0.004563605248146035|
|0.26841669438331783|0.005704506560182544|
|0.26476201251216336|0.005704506560182544|
|0.26272428861254626|0.006845407872219...|
| 0.2606100069019035|0.007415858528237307|
| 0.2588907539043346| 0.00827153451226469|
| 0.2584910815096143| 0.00912721049629207|
|0.25767572140279715|0.009982886480319452|
| 0.2543025563314383|0.010553337136337706|
| 0.2521325985541839|0.011409013120365089|
|0.25034772076714373|0.011409013120365089|
|0.24811181262720183| 0.01226468910439247|
+----------

In [50]:
lr_summary.pr.show()

+--------------------+-------------------+
|              recall|          precision|
+--------------------+-------------------+
|                 0.0| 0.6666666666666666|
|0.001140901312036...| 0.6666666666666666|
|0.001426126640045636|                0.5|
|0.001426126640045636|             0.3125|
|0.001711351968054...| 0.2727272727272727|
|0.002852253280091272|0.38461538461538464|
|0.002852253280091272| 0.3333333333333333|
|0.003707929264118...| 0.3611111111111111|
|0.004278379920136908|0.36585365853658536|
|0.004563605248146035|0.34782608695652173|
|0.005704506560182544|0.39215686274509803|
|0.005704506560182544|0.36363636363636365|
|0.006845407872219...|              0.375|
|0.007415858528237307|0.38235294117647056|
| 0.00827153451226469|0.38666666666666666|
| 0.00912721049629207| 0.3902439024390244|
|0.009982886480319452| 0.3723404255319149|
|0.010553337136337706|0.37755102040816324|
|0.011409013120365089|0.37383177570093457|
|0.011409013120365089| 0.3508771929824561|
+----------

In [51]:
# Evaluate Performance for Test Data 
model_predictions = lr_model.transform(test)
model_predictions.columns

['features', 'label', 'rawPrediction', 'probability', 'prediction']

In [52]:
model_predictions.select(['label','probability','prediction']).show(10,False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[0.9021081812733895,0.09789181872661046]|0.0       |
|0    |[0.9001508887478817,0.09984911125211826]|0.0       |
|0    |[0.8961317145480946,0.1038682854519054] |0.0       |
|0    |[0.8940689578552654,0.10593104214473459]|0.0       |
|0    |[0.8898349323043317,0.11016506769566825]|0.0       |
|0    |[0.8898349323043317,0.11016506769566825]|0.0       |
|0    |[0.8876628016128597,0.11233719838714029]|0.0       |
|0    |[0.8854533560652911,0.11454664393470892]|0.0       |
|1    |[0.8854533560652911,0.11454664393470892]|0.0       |
|0    |[0.8832061728059836,0.11679382719401643]|0.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [53]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
lr_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
lr_auroc = lr_evaluator.evaluate(model_predictions)
print(f'The auroc value of Logistic Regression Model is {lr_auroc}')

The auroc value of Logistic Regression Model is 0.6217478094591657


In [54]:
lr_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
lr_aupr = lr_evaluator.evaluate(model_predictions)
print(f'The aupr value of Logistic Regression Model is {lr_aupr}')

The aupr value of Logistic Regression Model is 0.1728623284873477


In [55]:
true_pos = model_predictions.filter(model_predictions['label'] == 1).filter(model_predictions['prediction'] == 1).count()
actual_pos = model_predictions.filter(model_predictions['label'] == 1).count()
pred_pos = model_predictions.filter(model_predictions['prediction'] == 1).count()
print('Recall')
if (actual_pos != 0):
    float(true_pos)/(actual_pos)
print('Precision')
if (pred_pos != 0):
    float(true_pos)/(pred_pos)

Recall
Precision


In [56]:
# Build and Train Decision Tree Classifier Model
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt_model = dt.fit(train)
model_predictions = dt_model.transform(test)
model_predictions.select(['label','probability','prediction']).show(10,False)

+-----+---------------------------------------+----------+
|label|probability                            |prediction|
+-----+---------------------------------------+----------+
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
|1    |[0.895271524519896,0.10472847548010396]|0.0       |
|0    |[0.895271524519896,0.10472847548010396]|0.0       |
+-----+---------------------------------------+----------+
only showing top 10 rows



In [57]:
# Evaluate Performance of Test Data
dt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
dt_auroc = dt_evaluator.evaluate(model_predictions)
print(f'The auc value of Decision Tree Classifier Model is {dt_auroc}')

The auc value of Decision Tree Classifier Model is 0.45296650107384256


In [58]:
dt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
dt_aupr = dt_evaluator.evaluate(model_predictions)
print(f'The aupr value of Decision Tree Classifier Model is {dt_aupr}')

The aupr value of Decision Tree Classifier Model is 0.10112460095978193


In [59]:
true_pos = model_predictions.filter(model_predictions['label'] == 1).filter(model_predictions['prediction'] == 1).count()
actual_pos = model_predictions.filter(model_predictions['label'] == 1).count()
pred_pos = model_predictions.filter(model_predictions['prediction'] == 1).count()
print('Recall')
if (actual_pos != 0):
    float(true_pos)/(actual_pos)
print('Precision')
if (pred_pos != 0):
    float(true_pos)/(pred_pos)

Recall
Precision


In [60]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC()
lsvc_model = lsvc.fit(train)
model_predictions = lsvc_model.transform(test)
model_predictions.columns

['features', 'label', 'rawPrediction', 'prediction']

In [61]:
model_predictions.select(['label','prediction']).show(10,False)

+-----+----------+
|label|prediction|
+-----+----------+
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|0    |0.0       |
|1    |0.0       |
|0    |0.0       |
+-----+----------+
only showing top 10 rows



In [62]:
# Evaluate Performance on Test Data
svc_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
svc_auroc = svc_evaluator.evaluate(model_predictions)
print(f'The auc value of SupportVectorClassifier is {svc_auroc}')

The auc value of SupportVectorClassifier is 0.5968848848016877


In [63]:
svc_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
svc_aupr = svc_evaluator.evaluate(model_predictions)
print(f'The aupr value of SupportVectorClassifier is {svc_aupr}')

The aupr value of SupportVectorClassifier is 0.14725813117965697


In [64]:
true_pos = model_predictions.filter(model_predictions['label'] == 1).filter(model_predictions['prediction'] == 1).count()
actual_pos = model_predictions.filter(model_predictions['label'] == 1).count()
pred_pos = model_predictions.filter(model_predictions['prediction'] == 1).count()
print('Recall')
if (actual_pos != 0):
    float(true_pos)/(actual_pos)
print('Precision')
if (pred_pos != 0):
    float(true_pos)/(pred_pos)

Recall
Precision


In [65]:
# Build and Train SVM Train
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
nb_model = nb.fit(train)
model_predictions = nb_model.transform(test)
model_predictions.select(['label','probability','prediction']).show(10,False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[0.8562370871568835,0.14376291284311649]|0.0       |
|0    |[0.8559284221619687,0.14407157783803126]|0.0       |
|0    |[0.8553094387653087,0.14469056123469135]|0.0       |
|0    |[0.8549991193581558,0.1450008806418442] |0.0       |
|0    |[0.8543768226252731,0.14562317737472683]|0.0       |
|0    |[0.8543768226252731,0.14562317737472683]|0.0       |
|0    |[0.8540648443042509,0.14593515569574905]|0.0       |
|0    |[0.8537523120194936,0.14624768798050644]|0.0       |
|1    |[0.8537523120194936,0.14624768798050644]|0.0       |
|0    |[0.8534392252776417,0.14656077472235818]|0.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [66]:
# Evaluate the Performance on Test Data
nb_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
nb_auroc = nb_evaluator.evaluate(model_predictions)
print(f'The auc value of NB Classifier is {nb_auroc}')

The auc value of NB Classifier is 0.5385761345090099


In [67]:
nb_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
nb_aupr = nb_evaluator.evaluate(model_predictions)
print(f'The aupr value of NB Classifier is {nb_aupr}')

The aupr value of NB Classifier is 0.12686823143109424


In [68]:
true_pos = model_predictions.filter(model_predictions['label'] == 1).filter(model_predictions['prediction'] == 1).count()
actual_pos = model_predictions.filter(model_predictions['label'] == 1).count()
pred_pos = model_predictions.filter(model_predictions['prediction'] == 1).count()
print('Recall')
if (actual_pos != 0):
    float(true_pos)/(actual_pos)
print('Precision')
if (pred_pos != 0):
    float(true_pos)/(pred_pos)

Recall
Precision


In [69]:
# Build and Train the GBT Model
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier()
gbt_model = gbt.fit(train)
model_predictions = gbt_model.transform(test)
model_predictions.select(['label','probability','prediction']).show(10,False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[0.85012614422865,0.14987385577135004]  |0.0       |
|0    |[0.85012614422865,0.14987385577135004]  |0.0       |
|0    |[0.8406966539959689,0.15930334600403107]|0.0       |
|0    |[0.8667593815300664,0.13324061846993362]|0.0       |
|0    |[0.8667593815300664,0.13324061846993362]|0.0       |
|0    |[0.8667593815300664,0.13324061846993362]|0.0       |
|0    |[0.8673628690517133,0.13263713094828666]|0.0       |
|0    |[0.8699471400084235,0.13005285999157645]|0.0       |
|1    |[0.8699471400084235,0.13005285999157645]|0.0       |
|0    |[0.8699471400084235,0.13005285999157645]|0.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [70]:
# Evaluate Performance on Test Data
gbt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
gbt_auroc = gbt_evaluator.evaluate(model_predictions)
print(f'The auc value of GradientBoostedTreesClassifier {gbt_auroc}')

The auc value of GradientBoostedTreesClassifier 0.6594454285356434


In [71]:
gbt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
gbt_aupr = gbt_evaluator.evaluate(model_predictions)
print(f'The aupr value of GradientBoostedTreesClassifier {gbt_aupr}')

The aupr value of GradientBoostedTreesClassifier 0.22409626829491014


In [72]:
true_pos = model_predictions.filter(model_predictions['label'] == 1).filter(model_predictions['prediction'] == 1).count()
actual_pos = model_predictions.filter(model_predictions['label'] == 1).count()
pred_pos = model_predictions.filter(model_predictions['prediction'] == 1).count()
print('Recall')
if (actual_pos != 0):
    float(true_pos)/(actual_pos)
print('Precision')
if (pred_pos != 0):
    float(true_pos)/(pred_pos)

Recall
Precision


In [73]:
# Build and Train the Random Forest Model
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=50,maxDepth=30)
rf_model = rf.fit(train)
model_predictions = rf_model.transform(test)
model_predictions.select(['label','probability','prediction']).show(10,False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|0    |[0.8578064570054239,0.14219354299457604]|0.0       |
|0    |[0.8727855867596094,0.1272144132403907] |0.0       |
|0    |[0.8717816939817045,0.12821830601829556]|0.0       |
|0    |[0.855876336582516,0.144123663417484]   |0.0       |
|0    |[0.8801341435027843,0.11986585649721569]|0.0       |
|0    |[0.8801341435027843,0.11986585649721569]|0.0       |
|0    |[0.8755445242781492,0.12445547572185087]|0.0       |
|0    |[0.8721588419744968,0.12784115802550314]|0.0       |
|1    |[0.8721588419744968,0.12784115802550314]|0.0       |
|0    |[0.8721588419744968,0.12784115802550314]|0.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [74]:
# Evaluate Performance on Test Data
rf_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
rf_auroc = rf_evaluator.evaluate(model_predictions)
print(f'The auc value of RandomForestClassifier model is {rf_auroc}')

The auc value of RandomForestClassifier model is 0.6542310919333579


In [75]:
rf_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
rf_aupr = rf_evaluator.evaluate(model_predictions)
print(f'The aupr value of RandomForestClassifier model is {rf_aupr}')

The aupr value of RandomForestClassifier model is 0.21757247145983355


In [76]:
true_pos = model_predictions.filter(model_predictions['label'] == 1).filter(model_predictions['prediction'] == 1).count()
actual_pos = model_predictions.filter(model_predictions['label'] == 1).count()
pred_pos = model_predictions.filter(model_predictions['prediction'] == 1).count()
print('Recall')
if (actual_pos != 0):
    float(true_pos)/(actual_pos)
print('Precision')
if (pred_pos != 0):
    float(true_pos)/(pred_pos)

Recall
Precision


In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
rf = RandomForestClassifier()
paramGrid = (ParamGridBuilder()
            .addGrid(rf.maxDepth,[5,10,20,25,30])
            .addGrid(rf.maxBins,[20,60])
            .addGrid(rf.numTrees,[5,20,50,100])).build()
cv = CrossValidator(estimator=rf,estimatorParamMaps=paramGrid,evaluator=rf_evaluator,numFolds=5)
cv_model = cv.fit(train)
best_rf_model = cv_model.bestModel
model_predictions = best_rf_model.transform(test)
rf_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
rf_auroc = rf_evaluator.evaluate(model_predictions)
print(rf_auroc)