In [1]:
# Load the credit CSV into a Spark DataFrame.
# The remote file is first registered with the SparkContext so that SparkFiles
# can resolve a local path for it, then read with a header row and an
# inferred schema.
from pyspark import SparkFiles

url = "https://raw.githubusercontent.com/raul-arrabales/BigData-Hands-on/master/Spark/Spark3/data/GimeMeCredit.csv"
spark.sparkContext.addFile(url)

local_csv_uri = "file://" + SparkFiles.get("GimeMeCredit.csv")
df = spark.read.csv(local_csv_uri, header=True, inferSchema=True, sep=",")

In [2]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

In [3]:
# Missing values?
# The CSV is read with inferSchema=True, so an empty field is loaded as null,
# not as the string ''. Comparing the column with '' therefore never matches
# (and can fail the implicit cast under ANSI mode). Count nulls instead.
df.filter(df.NumberOfTimes90DaysLate.isNull()).count()

In [4]:
# Class balance: number of rows for each value of the label column.
label_counts = df.groupBy("label").count()
label_counts.show()

In [5]:
# Expose the DataFrame to the SQL API under the name "traindata".
# registerTempTable has been deprecated since Spark 2.0;
# createOrReplaceTempView is its documented replacement.
df.createOrReplaceTempView("traindata")

In [6]:
# Collect the rows with label 1 and age above 50 into a local pandas
# DataFrame and preview the first few of them.
import pandas as pd

mature_delinquent_filter = (df["label"] == 1) & (df["age"] > 50)
matureDelinquents = df.filter(mature_delinquent_filter).toPandas()
matureDelinquents.head()

Unnamed: 0,_id,label,Revolving,age,Num3059,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumLoans,Num6089,NumberOfDependents
0,96226,1.0,0.974184,55,1,0.565208,4500.0,7,0,2,0,0
1,43024,1.0,0.820257,73,1,0.588343,4374.0,9,2,2,0,0
2,17205,1.0,1.0,53,0,0.0,2900.0,2,1,0,0,0
3,71057,1.0,0.739464,67,0,1.083739,6364.0,22,0,9,0,0
4,120425,1.0,1.001052,54,0,0.371898,5600.0,6,0,2,0,0


In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [8]:
# Build the label indexer and fit it on the complete dataset so that every
# label value present in the data receives an index. The fitted model writes
# its output to the "indexedLabel" column.
label_indexer_estimator = StringIndexer(inputCol="label", outputCol="indexedLabel")
labelIndexer = label_indexer_estimator.fit(df)

# Show the label values in index order.
labelIndexer.labels

In [9]:
# Unfitted StringIndexer mapping 'intl_plan' to 'intl_plan_indexed'.
# NOTE(review): no 'intl_plan' column appears among the columns displayed for
# this dataset, and this indexer is not added to the pipeline stages below —
# it looks like a leftover from another notebook. Confirm and remove if unused.
plan_indexer = StringIndexer(inputCol = 'intl_plan', outputCol = 'intl_plan_indexed')

In [10]:
# Combine the ten predictor columns into a single vector column, "features",
# which is the input column the classifiers below are configured to read.
feature_columns = [
    'Revolving',
    'age',
    'Num3059',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumLoans',
    'Num6089',
    'NumberOfDependents',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [11]:
# Training the model
# Random forests are stochastic (bootstrap samples, feature subsets). Pin the
# seed — same value as the one used for the train/test split — so re-running
# the notebook rebuilds the same forest and reports the same metrics.
classifier = RandomForestClassifier(labelCol='label', featuresCol='features', seed=12345)

# Stages run in order: label indexing, feature assembly, then the classifier.
pipeline = Pipeline(stages=[labelIndexer, assembler, classifier])
model = pipeline.fit(df)

In [12]:
# Score the fitted pipeline and compute the area under the ROC curve.
# Caveat: the rows scored here are the same rows the model was trained on, so
# this number is optimistic; a held-out test set is needed for a fair estimate.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(df)

evaluator = BinaryClassificationEvaluator()
roc_metric = {evaluator.metricName: "areaUnderROC"}
auroc = evaluator.evaluate(predictions, roc_metric)

In [13]:
# Peek at the first three scored rows returned by the fitted pipeline.
predictions.take(3)

In [14]:
# Display the area under the ROC curve computed by the evaluator above.
auroc

In [15]:
# Re-print the schema of the source DataFrame as a reminder of the available
# columns before setting up hyper-parameter tuning.
df.printSchema()

In [16]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [17]:
# Randomly split the data with weights 0.9 / 0.1 into a training set and a
# held-out test set; the fixed seed keeps the split repeatable.
train, test = df.randomSplit([0.9, 0.1], seed=12345)

In [19]:
# Random forest whose params define the hyper-parameter grid below.
# The seed is pinned (same value as the train/test split) so that repeated
# runs grow the same trees and tuning results are comparable between runs.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=12345)

In [20]:
# Render the list of tunable parameters exposed by the random forest, with
# their documentation strings.
display(rf.params)

doc,name,parent,typeConverter
Whether bootstrap samples are used when building trees.,bootstrap,RandomForestClassifier_5f68df019994,List()
"If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.",cacheNodeIds,RandomForestClassifier_5f68df019994,List()
set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.,checkpointInterval,RandomForestClassifier_5f68df019994,List()
"The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto'",featureSubsetStrategy,RandomForestClassifier_5f68df019994,List()
features column name.,featuresCol,RandomForestClassifier_5f68df019994,List()
"Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini",impurity,RandomForestClassifier_5f68df019994,List()
label column name.,labelCol,RandomForestClassifier_5f68df019994,List()
Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.,leafCol,RandomForestClassifier_5f68df019994,List()
Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.,maxBins,RandomForestClassifier_5f68df019994,List()
"Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.",maxDepth,RandomForestClassifier_5f68df019994,List()


In [21]:
# Hyper-parameter grid: 4 tree counts x 4 maximum depths x 2 impurity
# criteria = 32 combinations.
# The grid is keyed on the Param objects of `rf`, so it only takes effect when
# `rf` itself is a stage of the estimator being tuned.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 50, 100])
    .addGrid(rf.maxDepth, [0, 1, 4, 8])
    .addGrid(rf.impurity, ["entropy", "gini"])
    .build()
)


In [22]:
# paramGrid is keyed on the Param objects of `rf`, but `pipeline` wraps a
# different RandomForestClassifier instance (`classifier`). Params owned by
# another instance are ignored when the pipeline stages are copied for each
# grid point, so every combination would train the same default forest.
# Tune a pipeline that actually contains `rf` so the grid is applied.
tuning_pipeline = Pipeline(stages=[labelIndexer, assembler, rf])

tvs = TrainValidationSplit(estimator=tuning_pipeline,
                           estimatorParamMaps=paramGrid,
                           evaluator=BinaryClassificationEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8,
                           # Fixed seed so the train/validation split is repeatable.
                           seed=12345)

In [23]:
# Run TrainValidationSplit, and choose the best set of parameters.
# Note: this rebinds `model`, replacing the pipeline model fitted earlier on
# the full dataset.
model = tvs.fit(train)

In [24]:
# Score the held-out test set with the tuned model (the one fitted with the
# best-performing parameter combination) and keep only the feature vector,
# the true label and the predicted label.
test_predictions = model.transform(test)
testResults = test_predictions.select("features", "label", "prediction")

In [25]:
# Number of rows in the scored test set.
testResults.count()

In [26]:
# Area under the ROC curve of the tuned model on the held-out test set.
predictions = model.transform(test)

evaluator = BinaryClassificationEvaluator()
roc_metric = {evaluator.metricName: "areaUnderROC"}
auroc = evaluator.evaluate(predictions, roc_metric)

auroc