In [1]:
import os
import boto3 
import json

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

import pandas as pd
import numpy as np
import s3fs

In [2]:
# Reuse the running SparkContext when this cell is re-executed: only one
# context may be active at a time, so calling SparkContext() a second time
# raises ValueError instead of returning the existing one.
sc = SparkContext.getOrCreate()
# Returns the existing session if there is one, otherwise creates it.
ss = SparkSession.builder.getOrCreate()

In [3]:
# List the bucket through boto3 instead of text-scraping the JSON printed by
# `aws s3api list-objects`: splitting that output on whitespace breaks for keys
# that contain spaces and can match ".csv" in fields other than "Key".
s3_client = boto3.client("s3")
# The paginator follows continuation tokens, so a bucket holding more than one
# page of objects is still listed completely.
object_pages = s3_client.get_paginator("list_objects_v2").paginate(
    Bucket="msds630-kaggle-competition"
)
files = [
    obj["Key"]
    for page in object_pages
    for obj in page.get("Contents", [])  # no "Contents" entry when nothing is listed
    if ".csv" in obj["Key"]
]
files  # files in the bucket

['attributes.csv', 'events.csv', 'messages.csv', 'sessions.csv']

### Cleaning 

In [129]:
# Column dtypes for messages.csv: 64-bit unsigned ids, 8-bit unsigned delivery
# codes, and pandas categoricals for action_type and goal_kind.
types = dict(
    app_id=np.uint64,
    message_id=np.uint64,
    action_type="category",
    delivery_type=np.uint8,
    delivery_time_mode=np.uint8,
    goal_kind="category",
)

# Read the messages table from S3 and keep only the rows whose app_id is
# 4724682771660800.
df_messages = pd.read_csv(
    's3://msds630-kaggle-competition/messages.csv', dtype=types
).loc[lambda frame: frame.app_id == 4724682771660800]

# I suggest ignoring the last column (goal_kind): it is null in 520 of the 628 rows kept for this app.

In [132]:
# Write the cleaned frame to the working directory; the row index is omitted.
df_messages.to_csv(path_or_buf='messages_cleaned.csv', index=False)

### Experimentation 

In [130]:
# dtype mapping handed to pd.read_csv for messages.csv: unsigned 64-bit ids,
# unsigned 8-bit delivery codes, categoricals for action_type and goal_kind.
types = dict(
    app_id=np.uint64,
    message_id=np.uint64,
    action_type="category",
    delivery_type=np.uint8,
    delivery_time_mode=np.uint8,
    goal_kind="category",
)

In [131]:
# Load the whole messages table (no app_id filter yet) for exploration.
df_messages = pd.read_csv(
    filepath_or_buffer='s3://msds630-kaggle-competition/messages.csv',
    dtype=types,
)

In [124]:
# Per-column dtype and non-null count; "deep" makes pandas introspect the data
# for the memory figure instead of estimating it.
df_messages.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896 entries, 0 to 2895
Data columns (total 6 columns):
app_id                2896 non-null uint64
message_id            2896 non-null uint64
action_type           2896 non-null category
delivery_type         2896 non-null uint8
delivery_time_mode    2896 non-null uint8
goal_kind             1256 non-null category
dtypes: category(2), uint64(2), uint8(2)
memory usage: 58.9 KB


In [125]:
# Preview the first five rows.
df_messages.head(n=5)

Unnamed: 0,app_id,message_id,action_type,delivery_type,delivery_time_mode,goal_kind
0,6196435404455936,5420304779837440,__Push Notification,4,3,
1,6196435404455936,5059935179767808,__Push Notification,0,3,
2,6196435404455936,6563799154425856,__Webhook,4,3,
3,6196435404455936,4680497250304000,__Push Notification,0,3,
4,4724682771660800,6286714069450752,Center Popup,0,3,3.0


In [93]:
# Number of non-null values in each column.
df_messages.notna().sum()

app_id                2896
message_id            2896
action_type           2896
delivery_type         2896
delivery_time_mode    2896
goal_kind             1256
dtype: int64

In [94]:
# True if any app_id is missing. isnull() is an alias of isna(), so the
# previous `isna().any() and isnull().any()` evaluated the same check twice.
df_messages["app_id"].isna().any()

False

In [95]:
# Row count per application id, most frequent first.
df_messages["app_id"].value_counts()

6206221868072960    1323
6196435404455936     945
4724682771660800     628
Name: app_id, dtype: int64

In [96]:
# Keep only the rows whose app_id is 4724682771660800, then show how many
# non-null values each column has left.
app_mask = df_messages["app_id"] == 4724682771660800
df_messages = df_messages.loc[app_mask]
df_messages.count()

app_id                628
message_id            628
action_type           628
delivery_type         628
delivery_time_mode    628
goal_kind             108
dtype: int64

In [97]:
# True if any message_id is missing. isnull() is an alias of isna(), so the
# previous `isna().any() and isnull().any()` evaluated the same check twice.
df_messages["message_id"].isna().any()

False

In [98]:
# Ten most frequent message ids (value_counts sorts by descending count).
df_messages["message_id"].value_counts().head(10)

5319004722167808    6
6634487500242944    4
4609541589565440    3
4857785064095744    2
6301414704021504    2
6006728682897408    2
6004640771997696    2
4592402159501312    2
5138636113444864    2
5663864431837184    2
Name: message_id, dtype: int64

In [99]:
# True if any action_type is missing. isnull() is an alias of isna(), so the
# previous `isna().any() and isnull().any()` evaluated the same check twice.
df_messages["action_type"].isna().any()

False

In [100]:
# Row count per action type. The column is categorical, so categories that do
# not occur in the current rows are still listed with a count of 0.
df_messages["action_type"].value_counts()

__Push Notification      541
Center Popup              65
Interstitial              18
Confirm                    2
Push Ask to Ask            1
__Email                    1
Alert                      0
Banner                     0
Custom Center Popup        0
Floating Interstitial      0
new_banner                 0
message or action222       0
Request App Rating         0
Web Interstitial           0
__Newsfeed Message         0
__Webhook                  0
banner                     0
3-button Confirm           0
Name: action_type, dtype: int64

In [101]:
# True if any delivery_type is missing. isnull() is an alias of isna(), so the
# previous `isna().any() and isnull().any()` evaluated the same check twice.
df_messages["delivery_type"].isna().any()

False

In [102]:
# Row count per delivery_time_mode code, most frequent first.
df_messages["delivery_time_mode"].value_counts()

3    553
2     45
0     29
1      1
Name: delivery_time_mode, dtype: int64

In [103]:
# True if any goal_kind is missing. isnull() is an alias of isna(), so the
# previous `isna().any() and isnull().any()` evaluated the same check twice.
df_messages["goal_kind"].isna().any()

True

In [104]:
# Row count per goal_kind code; missing values are not counted here.
df_messages["goal_kind"].value_counts()

6    52
3    50
4     3
0     3
Name: goal_kind, dtype: int64

In [105]:
# Number of rows with a missing goal_kind. Series.sum() adds the boolean mask
# in a vectorized way instead of iterating it element by element with the
# built-in sum().
df_messages["goal_kind"].isna().sum()

520

In [None]:
# Meaning of the goal_kind codes counted above:
# 6: user action
# 3: event
# 4: optimal time
# 0: metric