# Prediction Of Churn

## Preparing Data
### Import Of All Libraries

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc, count, countDistinct
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import mean as Fmean
from pyspark.sql.functions import round as Fround
from pyspark.sql.functions import max as Fmax
from pyspark.sql.functions import col

import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Initiate Spark Session

In [2]:
spark = SparkSession \
    .builder \
    .appName("Sparkify_Data_Exploration") \
    .getOrCreate()

### Import Data

In [3]:
path = "C:/Users/krs1bbh/OneDrive - Robert Bosch GmbH/Documents/udacity/Final/data/mini_sparkify_event_data.json"
user_log = spark.read.json(path)
print("initial dataframe has",user_log.count(),"rows and",len(user_log.columns),"columns.")

initial dataframe has 286500 rows and 18 columns.


## Preparing Data for ML

### Clean Data

In [4]:
#drop invalid (guest) userId's
user_log = user_log.where(user_log.userId != "")
print("dataframe w/o empty userId's has",user_log.count(),"rows and",len(user_log.columns),"columns.")

dataframe w/o empty userId's has 278154 rows and 18 columns.


In [5]:
#drop duplicate rows
user_log = user_log.dropDuplicates()
print("dataframe w/o duplicates has",user_log.count(),"rows and",len(user_log.columns),"columns.")

dataframe w/o duplicates has 278154 rows and 18 columns.


### create column with days since registration (membership days) and dummy relevant columns

In [6]:
user_log=user_log.withColumn("membership_days", Fround((col('ts')/1000-col('registration')/1000)/86400).cast(IntegerType()))

In [7]:
column_list=['auth','gender','level','status','page']

In [8]:
cols_add=[]
for column in column_list:
    categories = user_log.select(column).distinct().rdd.flatMap(lambda x: x).collect()
    cols_add = cols_add + [F.when(F.col(column) == cat, 1).otherwise(0).alias(column + "_" + str(cat).replace(" ","_")) for cat in categories]

In [9]:
user_log_features =user_log.select("userId","membership_days",*cols_add)
user_log_features.printSchema()

root
 |-- userId: string (nullable = true)
 |-- membership_days: integer (nullable = true)
 |-- auth_Cancelled: integer (nullable = false)
 |-- auth_Logged_In: integer (nullable = false)
 |-- gender_F: integer (nullable = false)
 |-- gender_M: integer (nullable = false)
 |-- level_free: integer (nullable = false)
 |-- level_paid: integer (nullable = false)
 |-- status_307: integer (nullable = false)
 |-- status_404: integer (nullable = false)
 |-- status_200: integer (nullable = false)
 |-- page_Cancel: integer (nullable = false)
 |-- page_Submit_Downgrade: integer (nullable = false)
 |-- page_Thumbs_Down: integer (nullable = false)
 |-- page_Home: integer (nullable = false)
 |-- page_Downgrade: integer (nullable = false)
 |-- page_Roll_Advert: integer (nullable = false)
 |-- page_Logout: integer (nullable = false)
 |-- page_Save_Settings: integer (nullable = false)
 |-- page_Cancellation_Confirmation: integer (nullable = false)
 |-- page_About: integer (nullable = false)
 |-- page_Set

In [10]:
print("dataframe has",user_log_features.count(),"rows and",len(user_log_features.columns),"columns.")

dataframe has 278154 rows and 30 columns.


### Aggregate Data By "userId" To Prepare For ML

In [11]:
df=user_log_features\
        .groupBy("userId") \
        .agg( \
             Fmax("membership_days").alias("max_membership_days"), \
             countDistinct("membership_days").alias("active_days"), \
             Fmax("gender_m").alias("gender_m"), \
             Fmax("level_paid").alias("level_paid"), \
             Fsum("status_200").alias("sum_status_200"), \
             Fsum("status_307").alias("sum_status_307"), \
             Fsum("status_404").alias("sum_status_404"), \
             Fsum("page_nextsong").alias("sum_page_nextsong"), \
             Fsum("page_add_to_playlist").alias("sum_page_add_to_playlist"), \
             Fsum("page_roll_advert").alias("sum_page_roll_advert"), \
             Fsum("page_thumbs_up").alias("sum_page_thumbs_up"), \
             Fsum("page_home").alias("sum_page_home"), \
             Fsum("page_logout").alias("sum_page_logout"), \
             Fsum("page_help").alias("sum_page_help"), \
             Fsum("page_upgrade").alias("sum_page_upgrade"), \
             Fsum("page_add_friend").alias("sum_page_add_friend"), \
             Fsum("page_settings").alias("sum_page_settings"), \
             Fsum("page_submit_upgrade").alias("sum_page_submit_upgrade"), \
             Fsum("page_about").alias("sum_page_about"), \
             Fsum("page_submit_downgrade").alias("sum_page_submit_downgrade"), \
             Fsum("page_error").alias("sum_page_error"), \
             Fsum("page_save_settings").alias("sum_page_save_settings"), \
             Fsum("page_cancel").alias("sum_page_cancel"), \
             Fsum("page_cancellation_confirmation").alias("churn") \
             )

In [12]:
print("dataframe 'df' has",df.count(),"rows and",len(df.columns),"columns.")

dataframe 'df' has 225 rows and 25 columns.


## Machine Learning section

In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, Normalizer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def model_evaluator(results):
    """
    This function calculates the true/false positive/negative prediction from the predicted test dataset.
    Those are then used to calculate the model evaluation metrics and a confusion matrix.
    
    INPUT:
    results: test-dataset including the prediction column
    
    OUTPUT:
    standard metrics for model evaluation and a confusion matrix
    """
    
    #extract right and wrong predicted values and count their numbers 
    true_negative = results.filter((results.prediction == 0)&(results.churn==0)).count() * 1.0 
    false_positive = results.filter((results.prediction == 1)&(results.churn==0)).count() * 1.0 
    false_negative = results.filter((results.prediction == 0)&(results.churn==1)).count() * 1.0 
    true_positive = results.filter((results.prediction == 1)&(results.churn==1)).count() * 1.0 
    
    #calculate standard measures for evaluating the model
    accuracy = (true_positive+true_negative)/(true_negative+false_positive+false_negative+true_positive)
    precision = true_positive/(true_positive+false_positive)
    recall = true_positive/(true_positive+false_negative)
    f1 = 2.0 * (precision * recall)/(precision + recall)
    
    #
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("F1-Score: {}".format(f1))
    print("Recall: {}".format(recall))
    
    #create confusion matrix to illustrate model quality
    print("\n Confusion Matrix \n")
    print("TRUE_NEGATIVE:{} 	 FALSE_POSITIVE:{}".format(true_negative,false_positive))
    print("FALSE_NEGATIVE:{} 	 TRUE_POSITIVE: {}".format(false_negative, true_positive))

### Split In Test And Train Dataset

In [14]:
#Split the data
(training_data, test_data) = df.randomSplit([0.8,0.2], seed = 42)
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 191
Test Dataset Count: 34


### Setting up ML Pipeline

In [15]:
#definition of features
input_cols=df.columns[1:-2]
print('Feature overview:',input_cols)

#Configure an ML pipeline, which consists of three stages: assemble, normalize, estimator
assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

scaler = StandardScaler(inputCol="features", outputCol="ScaledFeatures", withMean=True, withStd=True) 

rf = RandomForestClassifier(labelCol="churn", 
                            featuresCol="ScaledFeatures")
dt = DecisionTreeClassifier(featuresCol = "ScaledFeatures", 
                            labelCol = "churn")

pipeline_rf=Pipeline(stages=[assembler, scaler, rf])
pipeline_dt=Pipeline(stages=[assembler, scaler, dt])

Feature overview: ['max_membership_days', 'active_days', 'gender_m', 'level_paid', 'sum_status_200', 'sum_status_307', 'sum_status_404', 'sum_page_nextsong', 'sum_page_add_to_playlist', 'sum_page_roll_advert', 'sum_page_thumbs_up', 'sum_page_home', 'sum_page_logout', 'sum_page_help', 'sum_page_upgrade', 'sum_page_add_friend', 'sum_page_settings', 'sum_page_submit_upgrade', 'sum_page_about', 'sum_page_submit_downgrade', 'sum_page_error', 'sum_page_save_settings']


### Decision Tree Classifier

In [16]:
#Fit Decision Tree Classifier to training data and transform test data
model_dt = pipeline_dt.fit(training_data)
dt_predictions = model_dt.transform(test_data)

In [17]:
#Determine accuracy, f1, and precision of prediction
model_evaluator(dt_predictions)

Accuracy: 0.7352941176470589
Precision: 0.42857142857142855
F1-Score: 0.39999999999999997
Recall: 0.375

 Confusion Matrix 

TRUE_NEGATIVE:22.0 	 FALSE_POSITIVE:4.0
FALSE_NEGATIVE:5.0 	 TRUE_POSITIVE: 3.0


In [18]:
importances_dt = model_dt.stages[-1].featureImportances.toArray()
feature_ranking_dt=pd.DataFrame(data={'Features': np.array(input_cols),'Importance':importances_dt})\
                    .sort_values('Importance', ascending=False).reset_index(drop=True)
feature_ranking_dt

Unnamed: 0,Features,Importance
0,max_membership_days,0.400418
1,sum_page_add_to_playlist,0.153633
2,sum_status_404,0.097254
3,sum_page_roll_advert,0.087234
4,active_days,0.074382
5,sum_page_about,0.074068
6,sum_page_logout,0.049393
7,sum_page_save_settings,0.036354
8,gender_m,0.027265
9,sum_page_nextsong,0.0


### Random Forest Classifier

In [19]:
#Fit Random Forest Classifier to training data and transform test data
model_rf = pipeline_rf.fit(training_data)
rf_predictions = model_rf.transform(test_data)

In [20]:
#Determine accuracy, f1, and precision of prediction
model_evaluator(rf_predictions)

Accuracy: 0.8235294117647058
Precision: 1.0
F1-Score: 0.4
Recall: 0.25

 Confusion Matrix 

TRUE_NEGATIVE:26.0 	 FALSE_POSITIVE:0.0
FALSE_NEGATIVE:6.0 	 TRUE_POSITIVE: 2.0


In [21]:
importances_rf = model_rf.stages[-1].featureImportances.toArray()
feature_ranking_rf=pd.DataFrame(data={'Features': np.array(input_cols),'Importance':importances_rf})\
                    .sort_values('Importance', ascending=False).reset_index(drop=True)
feature_ranking_rf

Unnamed: 0,Features,Importance
0,max_membership_days,0.23138
1,sum_page_nextsong,0.081799
2,sum_page_add_friend,0.070532
3,sum_page_logout,0.068933
4,active_days,0.062892
5,sum_page_add_to_playlist,0.053455
6,sum_status_307,0.048807
7,sum_status_200,0.048251
8,sum_page_about,0.048144
9,sum_page_settings,0.04605


# Hyperparameter Tuning Via CrossValidator

In [22]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 

### Random Forest Classifier - setting up parameter space

In [23]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxBins, [20, 30, 40]) \
    .addGrid(rf.maxDepth, [3, 4, 5, 6]) \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.impurity, ['gini','entropy']) \
    .build()

crossval_rf = CrossValidator(estimator=pipeline_rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol = "churn", metricName = 'f1'),
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel_rf = crossval_rf.fit(training_data)
prediction_rf = cvModel_rf.transform(test_data)

### Create Report Of Performance

In [25]:
#Show results for all model parameters
params_rf_cv = [{p.name: v for p, v in m.items()} for m in cvModel_rf.getEstimatorParamMaps()]
results_rf_cv=pd.DataFrame.from_dict([
    {cvModel_rf.getEvaluator().getMetricName(): metric, **ps} 
    for ps, metric in zip(params_rf_cv, cvModel_rf.avgMetrics)
])

results_rf_cv

Unnamed: 0,f1,maxBins,maxDepth,numTrees,impurity
0,0.665067,20,3,10,gini
1,0.670522,20,3,10,entropy
2,0.670554,20,3,20,gini
3,0.670522,20,3,20,entropy
4,0.670554,20,3,30,gini
...,...,...,...,...,...
67,0.707972,40,6,10,entropy
68,0.702833,40,6,20,gini
69,0.722079,40,6,20,entropy
70,0.725209,40,6,30,gini


In [28]:
results_rf_cv.to_excel("random_forest_hyperparameter_results.xlsx")

In [26]:
model_evaluator(prediction_rf)

Accuracy: 0.8235294117647058
Precision: 1.0
F1-Score: 0.4
Recall: 0.25

 Confusion Matrix 

TRUE_NEGATIVE:26.0 	 FALSE_POSITIVE:0.0
FALSE_NEGATIVE:6.0 	 TRUE_POSITIVE: 2.0


In [27]:
importances_rf_cv = cvModel_rf.bestModel.stages[-1].featureImportances.toArray()
feature_ranking_rf_cv=pd.DataFrame(data={'Features': np.array(input_cols),'Importance':importances_rf_cv})\
                    .sort_values('Importance', ascending=False).reset_index(drop=True)
feature_ranking_rf_cv

Unnamed: 0,Features,Importance
0,max_membership_days,0.269922
1,active_days,0.080381
2,sum_page_add_friend,0.067848
3,sum_page_logout,0.061558
4,sum_status_307,0.055228
5,sum_page_thumbs_up,0.054233
6,sum_page_roll_advert,0.052999
7,sum_page_settings,0.050676
8,sum_page_add_to_playlist,0.046795
9,sum_page_about,0.035144
