In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, RobustScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.sql.functions import length, col, expr
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [2]:
# Start (or reuse) a Spark session running on the local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("AppName")
    .getOrCreate()
)

# Set the Spark SQL debug option "maxToStringFields" to 200 for this session.
spark.conf.set("spark.sql.debug.maxToStringFields", 200)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/14 16:36:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/14 16:37:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Read the CSV with a header row and inferred schema, dropping malformed
# records, then discard every row that still contains a null value.
data = (
    spark.read
    .option('escape', '"')
    .csv('kickstarter_cleaned.csv', header=True, inferSchema=True, mode="DROPMALFORMED")
    .dropna()
)

                                                                                

In [4]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split (and therefore every metric computed below)
# stable across kernel restarts for the same input data and partitioning.
SPLIT_SEED = 42
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

## Pipeline
- String Indexer on categorical columns
- Label indexer on "state" target column
- Vector Assembler on numerical columns -> Robust Scaler
- Vector Assembler on scaled features + categorical features

In [5]:
# Columns that are string-indexed before being assembled into the feature vector.
categorical_cols = [
    "main_category",
    "currency",
    "year",
    "month",
    "day_of_week",
    "continent",
    "use_of_?!",
]

# Columns that are assembled into a vector and scaled.
numerical_cols = [
    "goal",
    "time_interval",
    "length_of_title",
]

In [6]:
# One StringIndexer per categorical column, writing to "<column>_index".
# The pipeline is fitted on the training split only, so the held-out split may
# contain a category that was never seen during fitting. With the default
# handleInvalid="error" that would make the transform fail; "keep" assigns such
# values to an extra index bucket instead.
# The loop variable is deliberately not called `col`, to avoid confusion with
# the `col` function imported from pyspark.sql.functions.
indexers = [
    StringIndexer(
        inputCol=column_name,
        outputCol=column_name + "_index",
        handleInvalid="keep",
    )
    for column_name in categorical_cols
]

# Index the "state" target column into "state_index". The default (strict)
# invalid-value handling is kept for the label so unexpected target values
# are not silently accepted.
label_indexer = StringIndexer(inputCol="state", outputCol="state_index")

In [7]:
# Pack the numerical columns into a single "features" vector ...
num_assembler = VectorAssembler(
    inputCols=numerical_cols,
    outputCol="features",
)

# ... and scale that vector into "scaled_features" with RobustScaler defaults.
scaler = RobustScaler(
    inputCol="features",
    outputCol="scaled_features",
)

In [8]:
# Combine the scaled numeric vector with every "<column>_index" categorical
# column into the single "final_features" vector.
assembler_all = VectorAssembler(
    inputCols=["scaled_features", *(name + "_index" for name in categorical_cols)],
    outputCol="final_features",
)

In [9]:
# Preprocessing pipeline: categorical indexers, label indexer, numeric
# assembler, scaler, then the final assembler — in that order.
pipeline = Pipeline(
    stages=[*indexers, label_indexer, num_assembler, scaler, assembler_all]
)

# Fit the preprocessing stages on the training split only.
pipeline_model = pipeline.fit(trainingData)

# NOTE(review): this rebinds trainingData to the transformed frame, so the cell
# is presumably not safe to re-run without re-running the split cell first
# (the output columns would already exist) — confirm.
trainingData = pipeline_model.transform(trainingData)

                                                                                

## Logistic Regression Model

The model defined here uses the hyperparameters found with the grid search. This is useful for testing the model while avoiding the long computation time due to cross-validation.

In [10]:
# Logistic regression on the indexed "state" label, capped at 10 iterations.
# NOTE(review): featuresCol is 'scaled_features' (the scaled numeric vector),
# not the 'final_features' column that the preprocessing pipeline also
# assembles from the categorical indexes — confirm this is intended.
lr = LogisticRegression(
    labelCol='state_index',
    featuresCol='scaled_features',
    maxIter=10,
)
lr_model = lr.fit(trainingData)

                                                                                

In [12]:
# Apply the preprocessing pipeline that was fitted on the training split to the
# held-out split, so both frames carry the same derived columns.
# NOTE(review): this rebinds testData, so re-running the cell presumably fails
# unless the split cell is re-run first — confirm.
testData = pipeline_model.transform(testData)

In [23]:
# Area under the ROC curve, scored against the indexed label.
evaluator_ROC = BinaryClassificationEvaluator(
    labelCol="state_index",
    metricName="areaUnderROC",
)

# Accuracy and F1 share the same evaluator class and label column; only the
# metric name differs.
evaluator_acc, evaluator_f1 = (
    MulticlassClassificationEvaluator(labelCol="state_index", metricName=metric)
    for metric in ("accuracy", "f1")
)

In [24]:
# Score the held-out split. The three evaluators below each trigger a separate
# Spark action on `predictions`; caching the frame lets the second and third
# evaluation reuse the materialised result instead of recomputing the whole
# lineage (CSV read, preprocessing pipeline, model transform) every time.
predictions = lr_model.transform(testData).cache()

# Area under the ROC curve.
area_under_roc = evaluator_ROC.evaluate(predictions)
print(f"Area under ROC on test data = {area_under_roc}")

# Accuracy.
accuracy = evaluator_acc.evaluate(predictions)
print(f"Accuracy on test data = {accuracy}")

# F1 score.
f1 = evaluator_f1.evaluate(predictions)
print(f"F1 on test data = {f1}")

                                                                                

Area under ROC on test data = 0.6425832217153836


                                                                                

Accuracy on test data = 0.6128318138823482


[Stage 112:>                                                        (0 + 1) / 1]

F1 on test data = 0.5721266708046735


                                                                                