# Churn Metrics

Adding metrics to people dataset for modeling

In [95]:
import numpy as np
import pandas as pd
import re
import matplotlib
import seaborn as sns

Read the events and people data into dataframes

In [96]:
events = pd.read_csv('Cleaned_Dataset_With_ID.csv', low_memory=False)
people = pd.read_csv('labeled_data.csv', low_memory= False)

# Adding Number of Ratings Metric

In [97]:
def ratings(events_df, people_df):
    #creating dictionary with distinct id and number of ratings
    number_ratings_dict = events_df[events_df['name_x'] == 'Rating Flow - Complete'].groupby('distinct_id').count()['name_x'].to_dict()
    
    #creating df with just distinct_id and mapping ratings to each distinct id, filling in Nans with 0
    short_df = people_df[['distinct_id']]
    short_df['number_ratings'] = short_df['distinct_id'].map(number_ratings_dict)
    short_df = short_df.fillna(0)
    
    # drop 'distinct_id' column from short_df and then append to the original dataframe
    short_df = short_df.drop(columns = ['distinct_id'])
    people_df = pd.concat([people_df, short_df], axis=1)
    people_df = people_df.drop(columns = ['Unnamed: 0'])
    
    return people_df

In [98]:
ratings_df = ratings(events, people)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


# SQS

In [99]:
# weight dictionary

weight_dict = {
    'Basic' : 1,
    'Profile' : 2,
    'Explore' : 5,
    'Menu' : 5,
    'Collection' : 7,
    'Rating' : 7,
    'Share' : 10,
    'MISC' : 1,
    'AE' : 1
}

In [100]:
# events dictionary

event_dict = {
    'Basic' : ['Login', 'Logout', 'AppOpen', 'Onboarding - Allow Location', 'Onboarding - Allow Notifications', 
               'Onboarding - Always Allow Location', 'PageVisit', 'Permissions for Locations', 
               'Permissions for Notifications', 'Signup - Sign In', 'Signup - Skip', 'Signup Success', 
               'Visit Signup/Login Page',  'Visit Home Page'],
    
    'Profile' : ['Password Changed', 'Profile - Change Section', 'Profile - Go to Collection', 
                 'Profile - Go to Menu', 'Profile Updated', 'Sidebar - Change Password', 
                 'Sidebar - Contacts Permissions', 'Sidebar - Edit Profile', 'Sidebar - Go to Followers', 
                 'Sidebar - Privacy Policy', 'Sidebar - Sign Out', 'Sidebar - Terms and Conditions', 
                 'Visit Change Password Page', 'Visit Edit Profile Page', 'Followers Page - Go to Profile', 
                 'Item Page - Go to Profile'],
    
    'Explore' : ['Explore - Tap Collection', 'Explore - Visit Explore Page', 'Home - Explore Nearby', 
                 'Feed - Bookmark Icon', 'Feed - Maps Icon', 'Feed - Scroll Down', 'Feed - Tap Dish', 'Item Page - Bookmark', 
                 'Landing - Go to Delivery Feed', 'Landing - Go to Walking Feed', 'Search - Change Section', 
                 'Search - Follow User', 'Search - Invalid Location', 'Search - Results Loaded', 'Search - Tap Home Bar', 
                 'Search - Tap Nearby Restaurant', 'Search - Tap Result', 'Search - Tap Return', 'Search - Tap to Request', 
                 'Search - Unfollow User', 'Tap Searched Dish', 'Visit Item Page', 
                 'Visit Menu Page'],
    
    'Menu' : ['Maps Popup - Go to Reviews', 'Maps Popup - Maybe Later', 'Maps Popup - Open', 'Menu Page - Accuracy Popup', 
              'Menu Page - Add New Dish', 'Menu Page - Collection via Add Icon', 'Menu Page - Filter Items', 
              'Menu Page - Go to Item Page', 'Menu Page - Rate via Add Icon', 'Menu Page - See Restaurant Details', 
              'Menu Page - Submit Flag', 'Menu Page - Switch Menus', 'Menu Page - Tap Flag Icon', 'Visit Menu Page', 
              'Nearby - Bookmark Dish', 'Nearby - Delivery Link', 'Delivery Page - Visit For 10 Sec', 'Home - Order Delivery', 
              'Nearby - Open Maps', 'Nearby - Tap Dish Maps', 'Visit Carousel Wise Dish List'],
    
    'Rating' : ['Edit Rating - Add Photo', 'Rating Flow - Complete', 'Rating Flow - Rate', 
                'Rating Flow - Resubmit', 'Rating Flow - Start', 'Request Restaurant', 
                'Edit Rating - Delete', 'Edit Rating - Edit', 'Edit Rating - Exit', 
                'Edit Rating - Modify Date', 'Item Page - Open My Rating', 
                'Item Page - Tap Upload Photo', 'Profile - Edit Rating Popup'],
    
    'Collection' : ['Collection - Tap Follow Button', 'Collection - Tap Restaurant', 'Collection - Tap Return Button', 
                    'Collection - Tap Search Bar', 'Collection - Visit Collection Detail Page', 
                    'Collection Flow - Create New Collection', 'Collection Flow - Tap Collection', 
                    'Collection Flow - Tap New Collection'], 
    'Share' : ['Share Page - Instagram Story', 'Share Page - Skip', 'Provided App Feedback', 
               'Sidebar - Go to Feedback', 'Sidebar - Invite a Friend',  'Visit App Feedback Page', 
               'Invite Page - Successful Text', 'Search - Invite Button'],
    
    'AE' : ['$ae_crashed', '$ae_first_open', '$ae_session', '$ae_updated', '$campaign_bounced', 
            '$campaign_delivery', '$message_suppressed', '$unsubscribe'],
    
    'MISC': ['DMV-resident'],
}

In [101]:
# short_df only includes the event name and an empty column for category

short_df = events[['name_x']]
short_df.loc['category'] = ''

In [102]:
# create a reverse events dictionary in order to use map function 

reverse_dict = {}
for key, value in event_dict.items():
    for event in value:
        reverse_dict.setdefault(event, []).append(key)
        
# apply reverse dictionary to new column called category that corresponds to the event name
short_df['category'] = short_df['name_x'].map(reverse_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [103]:
# get the first item in category because each value is a list of one element

short_df['category'] = short_df['category'].str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [104]:
# map the weights for each category to a score column

short_df['score'] = short_df['category'].map(weight_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [105]:
# MERGE DATAFRAMES

# drop 'name_x' column from short_df and then append to the original dataframe

labeled_df = short_df.drop(columns = ['name_x'])
event_df = pd.concat([events, labeled_df], axis = 1)
event_df = event_df.drop(columns = ['Unnamed: 0'])

# filter out sessions with -1 for session_id

event_df = event_df[event_df['session_id'] != -1]

# groupby distinct id and then session id 

group_user_session = event_df.groupby(['distinct_id', 'session_id'], as_index=False).agg({'name_x' : 'count', 'score' : 'sum'})
group_user_session = group_user_session.rename(columns = {'name_x' : 'event_count', 'score' : 'Average_SQS'})

# add up all scores for one person and divide by number of sessions

SQS = group_user_session.groupby(['distinct_id'], as_index=False).agg({'Average_SQS' : np.mean})

# merge SQS with original dataframe

SQS_df = pd.merge(left=event_df, right=SQS, on='distinct_id')
#SQS_df = SQS_df.drop(columns = ['Unnamed: 0'])

## Final Dataframe with SQS

In [106]:
SQS_df

Unnamed: 0,name_x,distinct_id,time,ae_session_length,city,region,mp_country_code,locationSetting,notificationSettings,FirstTimeUploaded,...,US,Facebook,Foodie,Google,Unknown,session_id,time_delta,category,score,Average_SQS
0,AppOpen,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:48:32,-1.0,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Basic,1.0,40.181818
1,Visit Signup/Login Page,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:48:35,-1.0,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Basic,1.0,40.181818
2,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:49:02,30.1,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,AE,1.0,40.181818
3,Signup Success,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:50:06,-1.0,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,2.0,64.0,Basic,1.0,40.181818
4,Onboarding - Allow Location,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:50:17,-1.0,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,2.0,64.0,Basic,1.0,40.181818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163267,Feed - Scroll Down,xbarbarazhong@gmail.com_6704,2019-12-12 14:29:23,-1.0,Portland,Oregon,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,29813.0,3.0,Explore,5.0,42.000000
163268,Profile - Change Section,xbarbarazhong@gmail.com_6704,2019-12-12 14:29:28,-1.0,Portland,Oregon,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,29813.0,3.0,Profile,2.0,42.000000
163269,$ae_session,xbarbarazhong@gmail.com_6704,2019-12-12 14:30:54,190.4,Portland,Oregon,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,29813.0,3.0,AE,1.0,42.000000
163270,AppOpen,yoon.s.jeong@gmail.com_6485,2019-10-09 11:12:29,-1.0,Chicago,Illinois,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,29814.0,0.0,Basic,1.0,2.000000


# Merge SQS with Ratings

In [107]:
def merge_SQS_rating(SQS, ratings):
    SQS_short = SQS[['distinct_id', 'Average_SQS']]
    df = pd.merge(ratings, SQS_short, left_on = 'distinct_id', right_on = 'distinct_id', how = 'left')
    df = df.drop_duplicates('distinct_id')
    return df

In [108]:
# merged SQS with ratings

result = merge_SQS_rating(SQS_df, ratings_df)
result

Unnamed: 0,distinct_id,active_timespan,sessions_per_day,churned,number_ratings,Average_SQS
0,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1.130949,85.768672,0,1.0,40.181818
97,001210DC-54C8-43AD-B295-148F47818391,7.158310,8.521564,0,0.0,20.000000
153,0025A8A4-2590-4ECA-8CE8-419D710AE46F,113.687361,0.562947,0,1.0,27.375000
216,0038898ed9b23f7e,0.000000,0.000000,0,0.0,
217,003BC352-1FE5-462C-883F-932AC209EB93,211.945590,0.042464,0,0.0,
...,...,...,...,...,...,...
166661,wandaandreu@gmail.com_5053,188.791597,0.026484,0,0.0,1.666667
166666,willwojt@gmail.com_6868,37.992164,4.685177,0,7.0,4.171717
166843,wolphramite@gmail.com_291,79.125498,0.050553,1,0.0,1.000000
166844,xbarbarazhong@gmail.com_6704,0.584016,35.957906,0,0.0,42.000000


# Add Slope and Merge Slope with People

In [109]:
df = events.loc[(events['name_x'] == '$ae_session') & (events['time_delta'] != 0)]
df

Unnamed: 0.1,Unnamed: 0,name_x,distinct_id,time,ae_session_length,city,region,mp_country_code,locationSetting,notificationSettings,...,active_timespan,average_session_time,country,US,Facebook,Foodie,Google,Unknown,session_id,time_delta
19,19,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:50:51,65.8,Palo Alto,California,US,-1,-1,...,,,US,1,0,0,0,0,2,64.0
21,21,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:51:52,59.9,Palo Alto,California,US,-1,-1,...,,,US,1,0,0,0,0,3,7.0
24,24,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 14:38:30,72.3,Hayward,California,US,-1,-1,...,,,US,1,0,0,0,0,4,2728.0
36,36,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 15:05:10,151.9,San Jose,California,US,-1,-1,...,,,US,1,0,0,0,0,5,1466.0
50,50,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 15:08:11,115.8,San Jose,California,US,-1,-1,...,,,US,1,0,0,0,0,6,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190340,190340,$ae_session,willwojt@gmail.com_6868,2020-03-07 12:11:54,56.5,Washington,District of Columbia,US,-1,-1,...,,,US,1,0,0,0,0,29807,40862.0
190341,190341,$ae_session,willwojt@gmail.com_6868,2020-03-07 12:14:44,32.9,Washington,District of Columbia,US,-1,-1,...,,,US,1,0,0,0,0,29808,170.0
190342,190342,$ae_session,willwojt@gmail.com_6868,2020-03-08 10:47:21,42.3,Washington,District of Columbia,US,-1,-1,...,,,US,1,0,0,0,0,29809,81157.0
190343,190343,$ae_session,willwojt@gmail.com_6868,2020-03-08 10:52:05,200.6,Washington,District of Columbia,US,-1,-1,...,,,US,1,0,0,0,0,29810,284.0


In [110]:
def best_fit_slope(xs,ys):
    m = (((np.mean(xs)*np.mean(ys)) - np.mean(xs*ys)) /
         ((np.mean(xs)*np.mean(xs)) - np.mean(xs*xs)))
    return m

In [111]:
def add_slope(df):
    grouped = df.groupby('distinct_id', as_index=False).agg({'time' : 'count'})
    grouped['slope'] = None
    for index, row in grouped.iterrows():
        x = df.loc[df['distinct_id'] == row['distinct_id']]
        length = np.arange(len(x))
        slope = best_fit_slope(length, x['time_delta'])
        grouped.loc[grouped['distinct_id'] == row['distinct_id'], 'slope'] = slope
    final = grouped.drop(columns='time', axis=1)
    return final

In [112]:
# Metric: slope
# AKA: best fit slope of trend of time between session
# Given the trend of time between sessions over time, find the slope of the best fit line for that user. 

slope = add_slope(df)
slope

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,distinct_id,slope
0,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1113.36
1,001210DC-54C8-43AD-B295-148F47818391,-11716
2,0025A8A4-2590-4ECA-8CE8-419D710AE46F,615550
3,004C9C76-A530-422A-BA1B-667CDA12A08D,-44.5
4,005144D7-BD3F-405B-9621-5FACA6B1AC04,
...,...,...
2617,varun.murthy@gmail.com_5928,-436340
2618,virtrutest99@gmail.com_5848,17.2857
2619,wandaandreu@gmail.com_5053,1.37164e+07
2620,willwojt@gmail.com_6868,-122.179


In [113]:
result = result.merge(slope, how='left', on='distinct_id')
result.head(10)

Unnamed: 0,distinct_id,active_timespan,sessions_per_day,churned,number_ratings,Average_SQS,slope
0,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1.130949,85.768672,0,1.0,40.181818,1113.36
1,001210DC-54C8-43AD-B295-148F47818391,7.15831,8.521564,0,0.0,20.0,-11716.0
2,0025A8A4-2590-4ECA-8CE8-419D710AE46F,113.687361,0.562947,0,1.0,27.375,615550.0
3,0038898ed9b23f7e,0.0,0.0,0,0.0,,
4,003BC352-1FE5-462C-883F-932AC209EB93,211.94559,0.042464,0,0.0,,
5,00471d582ed13800,0.0,0.0,0,0.0,,
6,00492868-3539-4228-AF7E-04AF74447A65,,,0,0.0,,
7,004C9C76-A530-422A-BA1B-667CDA12A08D,6.605903,3.935874,0,0.0,13.25,-44.5
8,005144D7-BD3F-405B-9621-5FACA6B1AC04,197.638484,0.070836,0,0.0,22.0,
9,0056EDD0-44AF-4850-AA3C-C8D339CBF828,0.333333,9.0,0,0.0,,


# Add Average Session Time Metric #

In [114]:
# Metric: avg_session_duration
# Similar to "average_session_time", but NOT calculated the same way.
# Takes the average of all of each user's session durations.

def avg_session_time(people, events):
    events = events[events['ae_session_length'] != -1.0]
    groupby_id = events.groupby(['distinct_id']).agg(np.mean)
    people_m2 = people.merge(groupby_id['ae_session_length'], on = 'distinct_id', how = 'left')
    people_m2['ae_session_length'] = people_m2['ae_session_length'].fillna(-1)
    people_m2 = people_m2.rename(columns={"ae_session_length": "avg_session_duration"})
    return people_m2

In [115]:
result = avg_session_time(result, events)
result

Unnamed: 0,distinct_id,active_timespan,sessions_per_day,churned,number_ratings,Average_SQS,slope,avg_session_duration
0,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1.130949,85.768672,0,1.0,40.181818,1113.36,82.854545
1,001210DC-54C8-43AD-B295-148F47818391,7.158310,8.521564,0,0.0,20.000000,-11716,47.940000
2,0025A8A4-2590-4ECA-8CE8-419D710AE46F,113.687361,0.562947,0,1.0,27.375000,615550,177.325000
3,0038898ed9b23f7e,0.000000,0.000000,0,0.0,,,-1.000000
4,003BC352-1FE5-462C-883F-932AC209EB93,211.945590,0.042464,0,0.0,,,-1.000000
...,...,...,...,...,...,...,...,...
7208,wandaandreu@gmail.com_5053,188.791597,0.026484,0,0.0,1.666667,1.37164e+07,54.700000
7209,willwojt@gmail.com_6868,37.992164,4.685177,0,7.0,4.171717,-122.179,61.190909
7210,wolphramite@gmail.com_291,79.125498,0.050553,1,0.0,1.000000,,17.500000
7211,xbarbarazhong@gmail.com_6704,0.584016,35.957906,0,0.0,42.000000,,124.150000


# Cleaning #

In [116]:
# number_ratings: NaNs imputed with 0.0
# SQS: NaNs imputed with -1.0
# slope: NaNs imputed with 0.0
# avg_session_duration: NaNs i,puted with -1.0

result['Average_SQS'] = result['Average_SQS'].fillna(-1)
result['slope'] = result['slope'].fillna(0)
result = result.rename(columns={'slope':'time_in_between_sessions_slope'})
result

Unnamed: 0,distinct_id,active_timespan,sessions_per_day,churned,number_ratings,Average_SQS,time_in_between_sessions_slope,avg_session_duration
0,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1.130949,85.768672,0,1.0,40.181818,1.113364e+03,82.854545
1,001210DC-54C8-43AD-B295-148F47818391,7.158310,8.521564,0,0.0,20.000000,-1.171600e+04,47.940000
2,0025A8A4-2590-4ECA-8CE8-419D710AE46F,113.687361,0.562947,0,1.0,27.375000,6.155496e+05,177.325000
3,0038898ed9b23f7e,0.000000,0.000000,0,0.0,-1.000000,0.000000e+00,-1.000000
4,003BC352-1FE5-462C-883F-932AC209EB93,211.945590,0.042464,0,0.0,-1.000000,0.000000e+00,-1.000000
...,...,...,...,...,...,...,...,...
7208,wandaandreu@gmail.com_5053,188.791597,0.026484,0,0.0,1.666667,1.371636e+07,54.700000
7209,willwojt@gmail.com_6868,37.992164,4.685177,0,7.0,4.171717,-1.221786e+02,61.190909
7210,wolphramite@gmail.com_291,79.125498,0.050553,1,0.0,1.000000,0.000000e+00,17.500000
7211,xbarbarazhong@gmail.com_6704,0.584016,35.957906,0,0.0,42.000000,0.000000e+00,124.150000


In [None]:
result.to_csv(r'people_with_metrics.csv')