# Sparkify Project Workspace
This workspace contains a tiny subset (128MB) of the full dataset available (12GB).

In [311]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, concat, count, desc, explode, lit, min, max, sum, split, stddev, udf, when, lag
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.sql import Window

import datetime

import os

# Set spark environments
os.environ['PYSPARK_PYTHON'] = '/Users/stillqe/anaconda/envs/Sparkify/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/Users/stillqe/anaconda/envs/Sparkify/bin/python3'

In [89]:
# create a Spark session
spark = SparkSession.builder \
    .master("local") \
    .appName("Sparkify") \
    .getOrCreate()

# Load and Clean Dataset
In this workspace, the mini-dataset file is `mini_sparkify_event_data.json`. Load and clean the dataset, checking for invalid or missing data - for example, records without userids or sessionids. 

In [90]:
sample = 'mini_sparkify_event_data.json'
df = spark.read.json(sample)
df.printSchema()
print((df.count(), len(df.columns)))

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)

(286500, 18)


# Exploratory Data Analysis.

### Define Churn

Churn is more like dynamic feature rather than static. Some users are stick to the service from the begining but they can be likely to churn at certain time period. So focus on behavioral events for certain period, not just entire service life time. 

User characteristics change over time. The probability of a user churn also changes over time. Therefore, it is more appropriate to view the user's characteristics in multiple time windows rather than consolidating them with one input without considering changes over time.

You need to know in advance when predicting churn. That way, you can act ahead. It is practically meaningless to make predictions by looking at the records up to the moment of churn. After a certain period of time, for example, by looking at the records up to 4 weeks ago, you have to learn whether it is churn or not after 4 weeks. 

### Explore Data
Once you've defined churn, perform some exploratory data analysis to observe the behavior for users who stayed vs users who churned. You can start by exploring aggregates on these two groups of users, observing how much of a specific action they experienced per a certain time unit or number of songs played.

In [91]:
df.select('page').dropDuplicates().show(30)

+--------------------+
|                page|
+--------------------+
|              Cancel|
|    Submit Downgrade|
|         Thumbs Down|
|                Home|
|           Downgrade|
|         Roll Advert|
|              Logout|
|       Save Settings|
|Cancellation Conf...|
|               About|
| Submit Registration|
|            Settings|
|               Login|
|            Register|
|     Add to Playlist|
|          Add Friend|
|            NextSong|
|           Thumbs Up|
|                Help|
|             Upgrade|
|               Error|
|      Submit Upgrade|
+--------------------+



In [82]:
df.where((df.page=='Cancel') | (df.page=='Cancellation Confirmation')).orderBy('userID', 'ts').show()

+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+--------------------+-------------+---------+----+------+-------------+--------------------+------+
|artist|     auth|firstName|gender|itemInSession|lastName|length|level|            location|method|                page| registration|sessionId|song|status|           ts|           userAgent|userId|
+------+---------+---------+------+-------------+--------+------+-----+--------------------+------+--------------------+-------------+---------+----+------+-------------+--------------------+------+
|  null|Logged In|  Delaney|     F|           22|   Perez|  null| free|Miami-Fort Lauder...|   PUT|              Cancel|1534627466000|       53|null|   307|1538498074000|"Mozilla/5.0 (Mac...|100001|
|  null|Cancelled|  Delaney|     F|           23|   Perez|  null| free|Miami-Fort Lauder...|   GET|Cancellation Conf...|1534627466000|       53|null|   200|1538498205000|"Mozilla/5.0 (Mac...|100001|
|  nu

In [208]:
(1538498074000-1534627466000)/(1000*60*60*24)

44.7987037037037

In [210]:
datetime.date.fromtimestamp(1538498074000/1000).strftime('%Y-%m-%d %H:%M:%S')

'2018-10-02 00:00:00'

In [211]:
datetime.date.fromtimestamp(1534627466000/1000).strftime('%Y-%m-%d %H:%M:%S')

'2018-08-18 00:00:00'

In [19]:
df.where(df.page=='NextSong').groupBy('gender').agg({'song':'count'}).show()

+------+-----------+
|gender|count(song)|
+------+-----------+
|     F|     126696|
|     M|     101412|
+------+-----------+



In [92]:
df.filter(df.page == 'NextSong') \
    .select('Artist') \
    .groupBy('Artist') \
    .agg({'Artist':'count'}) \
    .withColumnRenamed('count(Artist)', 'Artistcount') \
    .sort(desc('Artistcount')) \
    .show(5)

+--------------------+-----------+
|              Artist|Artistcount|
+--------------------+-----------+
|       Kings Of Leon|       1841|
|            Coldplay|       1813|
|Florence + The Ma...|       1236|
|       Dwight Yoakam|       1135|
|            BjÃÂ¶rk|       1133|
+--------------------+-----------+
only showing top 5 rows



In [147]:
df.select('page').dropDuplicates().show(30)

+--------------------+
|                page|
+--------------------+
|              Cancel|
|    Submit Downgrade|
|         Thumbs Down|
|                Home|
|           Downgrade|
|         Roll Advert|
|              Logout|
|       Save Settings|
|Cancellation Conf...|
|               About|
| Submit Registration|
|            Settings|
|               Login|
|            Register|
|     Add to Playlist|
|          Add Friend|
|            NextSong|
|           Thumbs Up|
|                Help|
|             Upgrade|
|               Error|
|      Submit Upgrade|
+--------------------+



In [184]:
df = df.withColumn('cancel', when(df.page == 'Cancellation Confirmation', 1).otherwise(0)) \
    .withColumn('thumpsup', when(df.page == 'Thumbs Up', 1).otherwise(0)) \
    .withColumn('thumpsdown', when(df.page == 'Thumbs Down', 1).otherwise(0)) \
    .withColumn('ad', when(df.page == 'Roll Advert', 1).otherwise(0)) \
    .withColumn('playlist', when(df.page == 'Add to Playlist', 1).otherwise(0)) \
    .withColumn('friend', when(df.page == 'Add Friend', 1).otherwise(0)) \
    .withColumn('error', when(df.page == 'Error', 1).otherwise(0)) \
    .withColumn('home', when(df.page == 'Home', 1).otherwise(0)) \
    .withColumn('downgrade', when(df.page == 'Downgrade', 1).otherwise(0)) \
    .withColumn('upgrade', when(df.page == 'Upgrade', 1).otherwise(0)) \
    .withColumn('setting', when(df.page == 'Save Settings', 1).otherwise(0)) \
    .withColumn('ts', col('ts')/1000) \
    .withColumn('registration', col('registration')/1000) \
    .withColumn('day', ((col('ts')-col('registration'))/(60*60*24)).cast(IntegerType()))

### Aggregate

How to aggregate historical data into a single row?


In [224]:
df = df.where(df.userId != "")

count_missings(df)

Unnamed: 0,count
artist,50046
length,50046
song,50046
userAgent,0
setting,0
upgrade,0
downgrade,0
home,0
error,0
friend,0


In [225]:
df.where(df.userId=="").count()

0

In [254]:


sessions = df.groupBy('userId', 'sessionId').agg(count('song').alias('play'),
                                                 max('cancel').alias('churn'),
                                            min('day').alias('day'),
                                            min('ts').alias('start'), 
                                            max('ts').alias('end'),
                                           max('cancel').alias('cancel'),
                                           sum('thumpsup').alias('thumpsup'),
                                           sum('thumpsdown').alias('thumpsdown'),
                                           sum('ad').alias('ads'),
                                           sum('playlist').alias('playlists'),
                                           sum('friend').alias('friends'),
                                           sum('error').alias('errors'),
                                           sum('setting').alias('setting')) \
    .withColumn('duration', (col('end') - col('start')).cast(IntegerType()))


In [255]:
windowSpec = Window.partitionBy('userId').orderBy('start')
userWindow = Window.partitionBy('userId')
sessions = sessions.withColumn('gap', col('day') - lag('day', 1, 0).over(windowSpec))
sessions = sessions.withColumn('churn', max('cancel').over(userWindow))
sessions = sessions.withColumn('last', max('day').over(userWindow))
#It is practically not useful to predict churning with entire records including right before churning. 
# Exclude the record from 30 days before last sessions.
sessions = sessions.withColumn('last', max('day').over(userWindow))

It is practically not useful to predict churning with data 

In [319]:
sessions = sessions.where(sessions.day < (sessions.last-30))
sessions.where(sessions.churn == '1').show(100)

+------+---------+----+-----+---+-------------+-------------+------+--------+----------+---+---------+-------+------+-------+--------+---+----+
|userId|sessionId|play|churn|day|        start|          end|cancel|thumpsup|thumpsdown|ads|playlists|friends|errors|setting|duration|gap|last|
+------+---------+----+-----+---+-------------+-------------+------+--------+----------+---+---------+-------+------+-------+--------+---+----+
|    54|       53|   6|    1| 67| 1.53835393E9|1.538355255E9|     0|       1|         0|  0|        0|      0|     0|      0|    1325| 67| 110|
|    54|      260| 118|    1| 68|1.538395957E9|1.538424556E9|     0|       4|         1|  1|        3|      5|     0|      0|   28599|  1| 110|
|    54|      309|  37|    1| 71|1.538623828E9|1.538633675E9|     0|       2|         1|  0|        1|      0|     0|      0|    9847|  3| 110|
|    54|      434|  82|    1| 72|1.538712831E9|1.538734617E9|     0|       3|         0|  0|        2|      1|     0|      0|   21786|  

In [315]:
sessions.where(sessions.churn == '1').show(100)

+------+---------+----+-----+---+-------------+-------------+------+--------+----------+---+---------+-------+------+-------+--------+---+
|userId|sessionId|play|churn|day|        start|          end|cancel|thumpsup|thumpsdown|ads|playlists|friends|errors|setting|duration|gap|
+------+---------+----+-----+---+-------------+-------------+------+--------+----------+---+---------+-------+------+-------+--------+---+
|   125|      174|   8|    1| 71|1.539317144E9|1.539318918E9|     1|       0|         0|  1|        0|      0|     0|      0|    1774| 71|
|    51|      236| 497|    1|  3|1.538398632E9|1.538522795E9|     0|      22|         4|  0|       19|     11|     0|      0|  124163|  3|
|    51|      362|  16|    1|  6|1.538648234E9|1.538652258E9|     0|       0|         2|  0|        0|      0|     0|      0|    4024|  3|
|    51|      442| 373|    1|  7|1.538704395E9|1.538796904E9|     0|      21|         0|  0|        6|      5|     0|      0|   92509|  1|
|    51|      528| 266|    

In [320]:
count_missings(sessions)

Unnamed: 0,count
userId,0
sessionId,0
gap,0
duration,0
setting,0
errors,0
friends,0
playlists,0
ads,0
thumpsdown,0


In [None]:
import pyspark.sql.functions as F
def count_missings(spark_df,sort=True):
    """
    Counts number of nulls and nans in each column
    """
    df = spark_df.select([F.count(F.when(F.isnan(c) | F.isnull(c), c)).alias(c) for c in spark_df.columns]).toPandas()

    if len(df) == 0:
        print("There are no any missing values!")
        return None

    if sort:
        return df.rename(index={0: 'count'}).T.sort_values("count",ascending=False)

    return df

count_missings(df)

어떤 피쳐를 추출해야하는가. 



# Feature Engineering
Once you've familiarized yourself with the data, build out the features you find promising to train your model on. To work with the full dataset, you can follow the following steps.
- Write a script to extract the necessary features from the smaller subset of data
- Ensure that your script is scalable, using the best practices discussed in Lesson 3
- Try your script on the full data set, debugging your script if necessary

If you are working in the classroom workspace, you can just extract features based on the small subset of data contained here. Be sure to transfer over this work to the larger dataset when you work on your Spark cluster.

In [257]:
sessions.printSchema()

root
 |-- userId: string (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- play: long (nullable = false)
 |-- churn: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- start: double (nullable = true)
 |-- end: double (nullable = true)
 |-- cancel: integer (nullable = true)
 |-- thumpsup: long (nullable = true)
 |-- thumpsdown: long (nullable = true)
 |-- ads: long (nullable = true)
 |-- playlists: long (nullable = true)
 |-- friends: long (nullable = true)
 |-- errors: long (nullable = true)
 |-- setting: long (nullable = true)
 |-- duration: integer (nullable = true)
 |-- gap: integer (nullable = true)



In [321]:
users = sessions.groupBy('userId').agg(count('sessionId').alias('total_session'),
                                       avg('play').alias('avg_play'),
                                       when(count('play') > 1, stddev('play')).otherwise(0).alias('std_play'),
                                       avg('thumpsup').alias('avg_up'),
                                       avg('thumpsdown').alias('avg_down'),
                                       avg('ads').alias('avg_ads'),
                                       avg('setting').alias('avg_setting'),
                                       avg('playlists').alias('avg_playlists'),
                                       avg('friends').alias('avg_friends'),
                                       avg('duration').alias('avg_duration'),
                                       when(count('duration') > 1, stddev('duration')).otherwise(0).alias('std_duration'), 
                                       avg('gap').alias('avg_gap'),
                                       when(count('gap') > 1, stddev('gap')).otherwise(0).alias('std_gap'),
                                       max('day').alias('life_time'),
                                       max('churn').alias('label'))

users.show(10)
                                                                           

+------+-------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------+-----+
|userId|total_session|          avg_play|          std_play|            avg_up|          avg_down|           avg_ads|        avg_setting|     avg_playlists|       avg_friends|      avg_duration|      std_duration|           avg_gap|           std_gap|life_time|label|
+------+-------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------+-----+
|100010|            4|              30.0| 16.22754859285078|               1.5|              0.25|               5.5|                0.0|               0.5|              0.75|            6722.0|38

In [269]:
users.printSchema()

root
 |-- userId: string (nullable = true)
 |-- total_session: long (nullable = false)
 |-- avg_play: double (nullable = true)
 |-- std_play: double (nullable = true)
 |-- avg_up: double (nullable = true)
 |-- avg_down: double (nullable = true)
 |-- avg_ads: double (nullable = true)
 |-- avg_setting: double (nullable = true)
 |-- avg_playlists: double (nullable = true)
 |-- avg_friends: double (nullable = true)
 |-- avg_duration: double (nullable = true)
 |-- std_duration: double (nullable = true)
 |-- avg_gap: double (nullable = true)
 |-- std_gap: double (nullable = true)
 |-- life_time: integer (nullable = true)
 |-- label: integer (nullable = true)



# Modeling
Split the full dataset into train, test, and validation sets. Test out several of the machine learning methods you learned. Evaluate the accuracy of the various models, tuning parameters as necessary. Determine your winning model based on test accuracy and report results on the validation set. Since the churned users are a fairly small subset, I suggest using F1 score as the metric to optimize.

### Build a pipeline

In [324]:



assembler = VectorAssembler(inputCols=["total_session",
                                       "avg_play",
                                       "std_play",
                                       "avg_up",
                                       "avg_down",
                                       "avg_ads",
                                       "avg_setting",
                                       "avg_playlists",
                                       "avg_friends",
                                       "avg_duration",
                                       "std_duration",
                                       "avg_gap",
                                       "std_gap",
                                       "life_time"
                                      ], outputCol="NumFeatures")

scaler = StandardScaler(inputCol="NumFeatures", outputCol="features", withStd=True)
#gbt = GBTClassifier(featuresCol="features", maxIter=10)
rf = RandomForestClassifier(featuresCol="features")


pipeline = Pipeline(stages=[assembler, scaler, rf])

In [325]:
rf = RandomForestClassifier(featuresCol="features")
pipeline = Pipeline(stages=[assembler, scaler, rf])
model = pipeline.fit(train)

Exception ignored in: <function JavaWrapper.__del__ at 0x7f9fae01b1f0>
Traceback (most recent call last):
  File "/Users/stillqe/anaconda/envs/Sparkify/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 42, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'StandardScaler' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7f9fae01b1f0>
Traceback (most recent call last):
  File "/Users/stillqe/anaconda/envs/Sparkify/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 42, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RandomForestClassifier' object has no attribute '_java_obj'


In [326]:
# Split the data into train and test subsets
train, test = users.randomSplit([0.8, 0.2], seed=123)

model = pipeline.fit(train)

In [308]:
paramGrid = ParamGridBuilder() \
            .addGrid(gbt.maxDepth, [3, 5, 10]) \
            .addGrid(gbt.maxBins, [16, 32, 64]).build()

crossval = CrossValidator(estimator=pipeline,
                         estimatorParamMaps=paramGrid,
                         evaluator=MulticlassClassificationEvaluator(metricName='f1'),
                         numFolds=3)

cvModel = crossval.fit(train)
cvModel.avgMetrics

NameError: name 'cvModel_SVM' is not defined

In [310]:
cvModel.avgMetrics

pred_test = cvModel.transform(test)

f1_score_evaluator = MulticlassClassificationEvaluator(metricName='f1')
f1_score = f1_score_evaluator.evaluate(pred_test.select('label','prediction'),{f1_score_evaluator.metricName: 'f1'})

print('The F1 score on the test set is {:.2%}'.format(f1_score))

The F1 score on the test set is 77.55%


In [314]:
pred_test = model.transform(test)

f1_score_evaluator = MulticlassClassificationEvaluator(metricName='f1')
f1_score = f1_score_evaluator.evaluate(pred_test.select('label','prediction'),{f1_score_evaluator.metricName: 'f1'})

print('The F1 score on the test set is {:.2%}'.format(f1_score))

The F1 score on the test set is 79.46%


In [303]:
inputCols=["total_session", "avg_play","std_play","avg_up","avg_down","avg_ads","avg_setting","avg_playlists",
           "avg_friends","avg_duration","std_duration","avg_gap","std_gap","life_time"]
# print feature importances
for i in range(len(model.stages[-1].featureImportances)):
    print("{} : {} \n".format(inputCols[i], model.stages[-1].featureImportances[i]))


Exception ignored in: <function JavaWrapper.__del__ at 0x7f9fae01b1f0>
Traceback (most recent call last):
  File "/Users/stillqe/anaconda/envs/Sparkify/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 42, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'StandardScaler' object has no attribute '_java_obj'


total_session : 0.09265597231971447 

avg_play : 0.020043266406611474 

std_play : 0.02087489622324075 

avg_up : 0.03466778331225433 

avg_down : 0.12084347612478788 

avg_ads : 0.10486518950621027 

avg_setting : 0.05857305800973874 

avg_playlists : 0.03971134170798569 

avg_friends : 0.015787617986012244 

avg_duration : 0.0026083766350204615 

std_duration : 0.008286571122397769 

avg_gap : 0.1047910747445332 

std_gap : 0.058819884056444054 

life_time : 0.3174714918450489 



In [294]:
pred_test.select("prediction", "label", "features").show(100)


+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|    1|[2.45232377485841...|
|       1.0|    0|[0.19883706282635...|
|       0.0|    0|[0.59651118847907...|
|       0.0|    1|[0.39767412565271...|
|       1.0|    0|[0.39767412565271...|
|       0.0|    0|[0.19883706282635...|
|       1.0|    0|[0.19883706282635...|
|       0.0|    0|[0.06627902094211...|
|       0.0|    0|[0.46395314659483...|
|       0.0|    0|[1.12674335601603...|
|       1.0|    1|[1.32558041884238...|
|       0.0|    0|[1.19302237695814...|
|       1.0|    1|[0.06627902094211...|
|       0.0|    0|[0.92790629318967...|
|       0.0|    0|[0.66279020942119...|
|       0.0|    1|[1.25930139790026...|
|       0.0|    0|[0.59651118847907...|
|       0.0|    0|[1.59069650261086...|
|       1.0|    1|[0.66279020942119...|
|       0.0|    0|[5.03720559160107...|
|       0.0|    0|[1.19302237695814...|
|       0.0|    0|[0.33139510471059...|


In [None]:
accuracy = evaluator.evaluate(predictions)

# Final Steps
Clean up your code, adding comments and renaming variables to make the code easier to read and maintain. Refer to the Spark Project Overview page and Data Scientist Capstone Project Rubric to make sure you are including all components of the capstone project and meet all expectations. Remember, this includes thorough documentation in a README file in a Github repository, as well as a web app or blog post.