# Sparkify notebook "Machine Learning" on AWS

This notebook connects to the full "Sparkify" dataset stored in S3 on AWS. It is based on the local evaluation of the small dataset provided by Udacity. The PySpark kernel available in EMR Notebooks on AWS lacks some of the Python libraries I needed to process my data, and the code for plotting the data differs from my local notebook. A very good introduction on how to adjust the code is provided by Amazon: https://aws.amazon.com/de/blogs/big-data/install-python-libraries-on-a-running-cluster-with-emr-notebooks/

## install missing libraries

In [None]:
# Libraries to install on the cluster for this notebook. Each entry
# holds the positional arguments passed to sc.install_pypi_package: the package
# spec and, optionally, the repository URL to install it from.
pypi_requirements = [
    ("pandas==0.25.1",),  # pandas pinned to version 0.25.1
    ("matplotlib==3.1.1", "https://pypi.org/simple"),  # matplotlib from the given PyPI repository
    ("plotly",),
    ("openpyxl",),
    ("seaborn",),
]
for install_args in pypi_requirements:
    sc.install_pypi_package(*install_args)

## import all libraries needed

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc, count, countDistinct
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import mean as Fmean
from pyspark.sql.functions import round as Fround
from pyspark.sql.functions import max as Fmax
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, Normalizer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import datetime

import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

## define metrics for model evaluation

In [None]:
def model_evaluator(results):
    """
    Print standard binary-classification metrics and a confusion matrix.

    The four confusion-matrix cells are counted from the ``prediction`` and
    ``churn`` columns of ``results``. Accuracy, precision, recall and F1-score
    are derived from those counts and printed, followed by the matrix itself.
    A ratio whose denominator is zero (for example precision when the model
    never predicts churn) is reported as 0.0 instead of raising
    ``ZeroDivisionError``.

    INPUT:
    results: test-dataset including the prediction column and the label column "churn"

    OUTPUT:
    None - the metrics and the confusion matrix are printed
    """

    def count_cases(predicted, actual):
        # One filter + count per confusion-matrix cell; multiplying by 1.0
        # turns the count into a float so the printed values keep their format.
        matches = (results.prediction == predicted) & (results.churn == actual)
        return results.filter(matches).count() * 1.0

    def safe_ratio(numerator, denominator):
        # An empty denominator means the metric is undefined for this data;
        # report 0.0 rather than aborting the whole evaluation.
        return numerator / denominator if denominator else 0.0

    #extract right and wrong predicted values and count their numbers
    true_negative = count_cases(0, 0)
    false_positive = count_cases(1, 0)
    false_negative = count_cases(0, 1)
    true_positive = count_cases(1, 1)

    #calculate standard measures for evaluating the model
    total = true_negative + false_positive + false_negative + true_positive
    accuracy = safe_ratio(true_positive + true_negative, total)
    precision = safe_ratio(true_positive, true_positive + false_positive)
    recall = safe_ratio(true_positive, true_positive + false_negative)
    f1 = safe_ratio(2.0 * (precision * recall), precision + recall)

    #report the evaluation metrics
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("F1-Score: {}".format(f1))
    print("Recall: {}".format(recall))

    #create confusion matrix to illustrate model quality
    print("\n Confusion Matrix \n")
    print("TRUE_NEGATIVE:{} 	 FALSE_POSITIVE:{}".format(true_negative,false_positive))
    print("FALSE_NEGATIVE:{} 	 TRUE_POSITIVE: {}".format(false_negative, true_positive))

## setup spark session

In [None]:
# Get (or create) the Spark session used throughout the notebook
spark = (
    SparkSession.builder
    .appName("Sparkify")
    .getOrCreate()
)

## import data from S3-repository

In [None]:
# Load the full Sparkify event log (JSON) from S3 and display its schema
event_data = "s3n://udacity-dsnd/sparkify/sparkify_event_data.json"
user_log = spark.read.json(path=event_data)
user_log.printSchema()

## clean dataset

In [None]:
# Keep only events that carry a userId; an empty string marks an invalid (guest) user
has_user_id = user_log.userId != ""
user_log = user_log.where(has_user_id)
print(
    "dataframe w/o empty userId's has", user_log.count(),
    "rows and", len(user_log.columns), "columns.",
)

In [None]:
# Remove rows that are exact duplicates of another row
user_log = user_log.dropDuplicates()
print(
    "dataframe w/o duplicates has", user_log.count(),
    "rows and", len(user_log.columns), "columns.",
)

## feature creation and selection

### create new feature "membership_days" as mutual time reference for all users

In [None]:
# Whole days between a user's registration and each event.
# NOTE(review): ts and registration are divided by 1000 before use, so they are
# presumably epoch milliseconds - confirm against the schema. 86400 s = 1 day.
ts_seconds = col('ts') / 1000
registration_seconds = col('registration') / 1000
days_since_registration = (ts_seconds - registration_seconds) / 86400
user_log = user_log.withColumn(
    "membership_days",
    Fround(days_since_registration).cast(IntegerType()),
)

### convert columns with categorical variables into 0 and 1 columns

In [None]:
# Categorical columns that get expanded into 0/1 indicator columns
column_list = [
    'auth',
    'gender',
    'level',
    'status',
    'page',
]

In [None]:
# Build one 0/1 indicator expression per distinct value of every categorical
# column. The new column is named "<column>_<value>", with the value lower-cased
# and its spaces replaced by underscores.
cols_add = []
for column in column_list:
    distinct_values = user_log.select(column).distinct()
    categories = distinct_values.rdd.flatMap(lambda x: x).collect()
    cols_add.extend(
        F.when(F.col(column) == cat, 1)
        .otherwise(0)
        .alias(column + "_" + str(cat).lower().replace(" ", "_"))
        for cat in categories
    )

In [None]:
# Keep the user id, the time reference and all generated indicator columns
user_log_features = user_log.select("userId", "membership_days", *cols_add)
# Show the schema of the frame just built. This previously called
# df.printSchema(), but `df` is only created by the later aggregation step,
# so running the notebook top to bottom raised a NameError here.
user_log_features.printSchema()

In [None]:
# Report the size of the event-level feature table
print(
    "dataframe has", user_log_features.count(),
    "rows and", len(user_log_features.columns), "columns.",
)

### aggregate data by "userId" to prepare for ML

In [None]:
# Collapse the event-level indicators into one row per user.
# Column order matters: the feature list is later taken from df.columns by
# position, with userId first and sum_page_cancel / churn as the last two columns.

# Aggregates with their own function or name
leading_aggregates = [
    Fmax("membership_days").alias("max_membership_days"),
    countDistinct("membership_days").alias("active_days"),
    Fmax("gender_m").alias("gender_m"),
    Fmax("level_paid").alias("level_paid"),
]

# Indicator columns that are summed per user and exposed as "sum_<name>"
summed_indicators = [
    "status_200",
    "status_307",
    "status_404",
    "page_nextsong",
    "page_add_to_playlist",
    "page_roll_advert",
    "page_thumbs_up",
    "page_home",
    "page_logout",
    "page_help",
    "page_upgrade",
    "page_add_friend",
    "page_settings",
    "page_submit_upgrade",
    "page_about",
    "page_submit_downgrade",
    "page_error",
    "page_save_settings",
    "page_cancel",
]
event_totals = [Fsum(name).alias("sum_" + name) for name in summed_indicators]

# The label: number of "cancellation confirmation" events per user
churn_label = Fsum("page_cancellation_confirmation").alias("churn")

df = (
    user_log_features
    .groupBy("userId")
    .agg(*(leading_aggregates + event_totals + [churn_label]))
    .dropna()
)

In [None]:
# Report the size of the per-user table
print(
    "dataframe 'df' has", df.count(),
    "rows and", len(df.columns), "columns.",
)

## machine learning section
### Split In Test And Train Dataset

In [None]:
# Split the per-user data 80/20 into training and test sets with a fixed seed
split_weights = [0.8, 0.2]
training_data, test_data = df.randomSplit(split_weights, seed=42)

training_rows = training_data.count()
print("Training Dataset Count: " + str(training_rows))
test_rows = test_data.count()
print("Test Dataset Count: " + str(test_rows))

### Setting up ML Pipeline

In [None]:
# Features are selected by position: everything except the first column and
# the last two columns of df
input_cols = df.columns[1:-2]
print('Feature overview:', input_cols)

# Each pipeline has three stages: assemble the feature vector, standardise it,
# then fit the classifier.
assembler = VectorAssembler(outputCol='features', inputCols=input_cols)

scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="ScaledFeatures",
)

# Both classifiers read the scaled features and predict the "churn" label
classifier_columns = {"labelCol": "churn", "featuresCol": "ScaledFeatures"}
rf = RandomForestClassifier(**classifier_columns)
dt = DecisionTreeClassifier(**classifier_columns)

# The assembling and scaling stages are shared by both pipelines
preprocessing_stages = [assembler, scaler]
pipeline_rf = Pipeline(stages=preprocessing_stages + [rf])
pipeline_dt = Pipeline(stages=preprocessing_stages + [dt])

### decision tree classifier

In [None]:
# Fit the decision-tree pipeline (assembler, scaler, classifier) on the training
# data, then add predictions to the test data with the fitted pipeline
model_dt = pipeline_dt.fit(training_data)
dt_predictions = model_dt.transform(test_data)
# Print accuracy, precision, F1-score, recall and the confusion matrix for the test predictions
model_evaluator(dt_predictions)

In [None]:
# Rank the input features by the importance reported by the fitted decision tree
# (the classifier is the last stage of the fitted pipeline)
importances_dt = model_dt.stages[-1].featureImportances.toArray()
ranking_data_dt = {'Features': np.array(input_cols), 'Importance': importances_dt}
feature_ranking_dt = pd.DataFrame(data=ranking_data_dt)
feature_ranking_dt = feature_ranking_dt.sort_values('Importance', ascending=False)
feature_ranking_dt = feature_ranking_dt.reset_index(drop=True)
feature_ranking_dt

### random forest classifier

In [None]:
# Fit the random-forest pipeline (assembler, scaler, classifier) on the training
# data, then add predictions to the test data with the fitted pipeline
model_rf = pipeline_rf.fit(training_data)
rf_predictions = model_rf.transform(test_data)

# Print accuracy, precision, F1-score, recall and the confusion matrix for the test predictions
model_evaluator(rf_predictions)

In [None]:
# Rank the input features by the importance reported by the fitted random forest
# (the classifier is the last stage of the fitted pipeline)
importances_rf = model_rf.stages[-1].featureImportances.toArray()
ranking_data_rf = {'Features': np.array(input_cols), 'Importance': importances_rf}
feature_ranking_rf = pd.DataFrame(data=ranking_data_rf)
feature_ranking_rf = feature_ranking_rf.sort_values('Importance', ascending=False)
feature_ranking_rf = feature_ranking_rf.reset_index(drop=True)
feature_ranking_rf

# Hyperparameter Tuning Via CrossValidator

### Random Forest Classifier - setting up parameter space

In [None]:
# Parameter space for the random forest: 3 x 4 x 3 x 2 = 72 combinations
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(rf.maxBins, [20, 30, 40])
grid_builder = grid_builder.addGrid(rf.maxDepth, [3, 4, 5, 6])
grid_builder = grid_builder.addGrid(rf.numTrees, [10, 20, 30])
grid_builder = grid_builder.addGrid(rf.impurity, ['gini', 'entropy'])
paramGrid = grid_builder.build()

# Candidates are compared by the evaluator's 'f1' metric on the "churn" label
f1_evaluator = MulticlassClassificationEvaluator(labelCol="churn", metricName='f1')

# 3-fold cross-validation over the whole random-forest pipeline
crossval_rf = CrossValidator(
    estimator=pipeline_rf,
    estimatorParamMaps=paramGrid,
    evaluator=f1_evaluator,
    numFolds=3,
)

# Run the cross-validation to choose the best set of parameters, then predict
# on the test data with the resulting model
cvModel_rf = crossval_rf.fit(training_data)
prediction_rf = cvModel_rf.transform(test_data)

### Create Report Of Performance

In [None]:
# Tabulate the average cross-validation metric for every parameter combination
# One {parameter name: value} dict per evaluated parameter map
params_rf_cv = [{p.name: v for p, v in m.items()} for m in cvModel_rf.getEstimatorParamMaps()]

# The metric name is the column header for the scores. This previously read
# cvModel.getEvaluator(), an undefined name that raised a NameError; the fitted
# cross-validation model in this notebook is cvModel_rf.
metric_name = cvModel_rf.getEvaluator().getMetricName()

# avgMetrics is aligned with the parameter maps, so zip pairs each combination with its score
results_rf_cv = pd.DataFrame.from_dict([
    {metric_name: metric, **ps}
    for ps, metric in zip(params_rf_cv, cvModel_rf.avgMetrics)
])

results_rf_cv

In [None]:
# Check best model performance: print accuracy, precision, F1-score, recall and the
# confusion matrix for the test predictions produced by the cross-validated model
model_evaluator(prediction_rf)

In [None]:
# Rank the input features by the importance reported by the best cross-validated
# model (the classifier is the last stage of the best pipeline)
importances_rf_cv = cvModel_rf.bestModel.stages[-1].featureImportances.toArray()
ranking_data_rf_cv = {'Features': np.array(input_cols), 'Importance': importances_rf_cv}
feature_ranking_rf_cv = pd.DataFrame(data=ranking_data_rf_cv)
feature_ranking_rf_cv = feature_ranking_rf_cv.sort_values('Importance', ascending=False)
feature_ranking_rf_cv = feature_ranking_rf_cv.reset_index(drop=True)
feature_ranking_rf_cv