In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, RobustScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
import numpy as np
import matplotlib as plt

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName("AppName") \
    .getOrCreate()

spark.conf.set("spark.sql.debug.maxToStringFields", 200)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/30 16:28:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data = spark.read.option('escape','"').csv('kickstarter_cleaned.csv', header=True, inferSchema=True, mode="DROPMALFORMED")
data = data.dropna()

                                                                                

In [4]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

## Pipeline
- String Indexer on categorical columns
- Label indexer on "state" target column
- Vector Assembler on numerical columns -> Robust Scaler
- Vector Assembler on scaled features + categorical features

In [5]:
categorical_cols = ["main_category", "currency", "year", "month", "day_of_week", "continent", "use_of_?!"]
numerical_cols = ["goal", "time_interval", "length_of_title"]

In [6]:
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in categorical_cols]

label_indexer = StringIndexer(inputCol="state", outputCol="state_index")

In [7]:
num_assembler = VectorAssembler(inputCols=numerical_cols, outputCol="features")
scaler = RobustScaler(inputCol="features", outputCol="scaled_features")

In [8]:
assembler_all = VectorAssembler(inputCols=["scaled_features"] + [col+"_index" for col in categorical_cols], outputCol="final_features")

In [9]:
pipeline = Pipeline(stages=indexers + [label_indexer, num_assembler, scaler, assembler_all])
pipeline_model = pipeline.fit(trainingData)
trainingData = pipeline_model.transform(trainingData)

                                                                                

In [10]:
for cols in categorical_cols:
    name_col = cols+"_index"
    trainingData = trainingData.withColumn(name_col, col(name_col).cast("integer"))

## MultiLayer Perceptron

#### Model

In [11]:
mlp = MultilayerPerceptronClassifier(labelCol="state_index", featuresCol="final_features", \
                                     layers=[len(numerical_cols) + len(categorical_cols), 2, 2, 2], \
                                     stepSize=0.05, \
                                     maxIter=80, \
                                     seed=42)

#### Train

In [12]:
%%time
mlp_model = mlp.fit(trainingData)

                                                                                

CPU times: user 37.5 ms, sys: 21.5 ms, total: 59 ms
Wall time: 51 s


#### Test

In [13]:
testData = pipeline_model.transform(testData)
predictions = mlp_model.transform(testData)

#### Performance

In [14]:
binary_evaluator = BinaryClassificationEvaluator(labelCol="state_index", metricName="areaUnderROC")
auroc = binary_evaluator.evaluate(predictions)

# Evaluate F-measure
multiclass_evaluator = MulticlassClassificationEvaluator(labelCol="state_index", metricName="f1")
f_measure = multiclass_evaluator.evaluate(predictions)

print(f"Area under ROC (AUROC): {auroc}")
print(f"F-measure: {f_measure}")

[Stage 130:>                                                        (0 + 1) / 1]

Area under ROC (AUROC): 0.671037641494408
F-measure: 0.6218828699601252


                                                                                

In [15]:
input_layer_weights = mlp_model.weights.toArray()[:len(numerical_cols) + len(categorical_cols)]

features_name = ["goal", "time_interval", "length_of_title", "main_category", "currency", "year", "month", "day_of_week", "continent", "use_of_?!"]

print("Input Layer Weights:")
for i, weight in enumerate(input_layer_weights):
    print(f"Weight of {features_name[i]}: "+f"{weight}")

Input Layer Weights:
Weight of goal: 0.25396692955299305
Weight of time_interval: -2.518343131071752
Weight of length_of_title: 0.18499214622141333
Weight of main_category: -0.0880339192874887
Weight of currency: -1.1620272816577222
Weight of year: -0.16586887360398933
Weight of month: 0.06295690333799958
Weight of day_of_week: 0.02181254815024463
Weight of continent: 0.2363525889079787
Weight of use_of_?!: 0.18670245633045898


## GridSearch with CrossValidation

In [16]:
mlp_grid = MultilayerPerceptronClassifier(labelCol="state_index", featuresCol="final_features", \
                                     layers=[len(numerical_cols) + len(categorical_cols), 2, 2, 2], \
                                         maxIter=100)

In [17]:
param_grid = ParamGridBuilder() \
    .addGrid(mlp_grid.layers, [[len(numerical_cols) + len(categorical_cols), 2, 2, 2], \
                               [len(numerical_cols) + len(categorical_cols), 4, 4, 2],\
                              [len(numerical_cols) + len(categorical_cols), 6, 4, 2]]) \
    .addGrid(mlp_grid.stepSize, [0.01, 0.05, 0.1]) \
    .build()

In [18]:
evaluator = BinaryClassificationEvaluator(labelCol="state_index", metricName="areaUnderROC")

crossval = CrossValidator(estimator=mlp_grid,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=4) 

In [19]:
%%time
cv_model = crossval.fit(trainingData)

                                                                                

CPU times: user 2.73 s, sys: 608 ms, total: 3.34 s
Wall time: 29min 56s


In [22]:
#Print hyperparameters of the model
best_model = cv_model.bestModel
best_model_params = best_model.extractParamMap()

for param, value in best_model_params.items():
    print(f"{param.name}: {value}")

blockSize: 128
featuresCol: final_features
labelCol: state_index
maxIter: 100
predictionCol: prediction
probabilityCol: probability
rawPredictionCol: rawPrediction
seed: 3625276727404315868
solver: l-bfgs
stepSize: 0.01
tol: 1e-06
layers: [10, 6, 4, 2]


In [20]:
#testData = pipeline_model.transform(testData)
predictions = cv_model.transform(testData)

In [21]:
binary_evaluator = BinaryClassificationEvaluator(labelCol="state_index", metricName="areaUnderROC")
auroc = binary_evaluator.evaluate(predictions)

# Evaluate F-measure
multiclass_evaluator = MulticlassClassificationEvaluator(labelCol="state_index", metricName="f1")
f_measure = multiclass_evaluator.evaluate(predictions)

# Evaluate Accuracy
acc_evaluator = MulticlassClassificationEvaluator(labelCol="state_index", metricName="accuracy")
acc_measure = acc_evaluator.evaluate(predictions)

print(f"Area under ROC (AUROC): {auroc}")
print(f"F-measure: {f_measure}")
print(f"Accuracy: {acc_measure}")

[Stage 4698:>                                                       (0 + 1) / 1]

Area under ROC (AUROC): 0.6744945687664258
F-measure: 0.6241596088716185


                                                                                