# Churn Metrics

Adding metrics to people dataset for modeling

In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib
import seaborn as sns

Read the events and people data into dataframes

In [2]:
events = pd.read_csv('Cleaned_Dataset_With_ID.csv', low_memory=False)
people = pd.read_csv('cleaned_people.csv', low_memory= False)

# Adding Number of Ratings Metric

In [3]:
def ratings(events_df, people_df):
    #creating dictionary with distinct id and number of ratings
    number_ratings_dict = events_df[events_df['name_x'] == 'Rating Flow - Complete'].groupby('distinct_id').count()['name_x'].to_dict()
    
    #creating df with just distinct_id and mapping ratings to each distinct id, filling in Nans with 0
    short_df = people_df[['distinct_id']]
    short_df['number_ratings'] = short_df['distinct_id'].map(number_ratings_dict)
    short_df = short_df.fillna(0)
    
    # drop 'distinct_id' column from short_df and then append to the original dataframe
    short_df = short_df.drop(columns = ['distinct_id'])
    people_df = pd.concat([people_df, short_df], axis=1)
    people_df = people_df.drop(columns = ['Unnamed: 0'])
    
    return people_df

In [4]:
ratings_df = ratings(events, people)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


# SQS

In [5]:
# weight dictionary

weight_dict = {
    'Basic' : 1,
    'Profile' : 2,
    'Explore' : 5,
    'Menu' : 5,
    'Collection' : 7,
    'Rating' : 7,
    'Share' : 10,
    'MISC' : 1,
    'AE' : 1
}

In [6]:
# events dictionary

event_dict = {
    'Basic' : ['Login', 'Logout', 'AppOpen', 'Onboarding - Allow Location', 'Onboarding - Allow Notifications', 
               'Onboarding - Always Allow Location', 'PageVisit', 'Permissions for Locations', 
               'Permissions for Notifications', 'Signup - Sign In', 'Signup - Skip', 'Signup Success', 
               'Visit Signup/Login Page',  'Visit Home Page'],
    
    'Profile' : ['Password Changed', 'Profile - Change Section', 'Profile - Go to Collection', 
                 'Profile - Go to Menu', 'Profile Updated', 'Sidebar - Change Password', 
                 'Sidebar - Contacts Permissions', 'Sidebar - Edit Profile', 'Sidebar - Go to Followers', 
                 'Sidebar - Privacy Policy', 'Sidebar - Sign Out', 'Sidebar - Terms and Conditions', 
                 'Visit Change Password Page', 'Visit Edit Profile Page', 'Followers Page - Go to Profile', 
                 'Item Page - Go to Profile'],
    
    'Explore' : ['Explore - Tap Collection', 'Explore - Visit Explore Page', 'Home - Explore Nearby', 
                 'Feed - Bookmark Icon', 'Feed - Maps Icon', 'Feed - Scroll Down', 'Feed - Tap Dish', 'Item Page - Bookmark', 
                 'Landing - Go to Delivery Feed', 'Landing - Go to Walking Feed', 'Search - Change Section', 
                 'Search - Follow User', 'Search - Invalid Location', 'Search - Results Loaded', 'Search - Tap Home Bar', 
                 'Search - Tap Nearby Restaurant', 'Search - Tap Result', 'Search - Tap Return', 'Search - Tap to Request', 
                 'Search - Unfollow User', 'Tap Searched Dish', 'Visit Item Page', 
                 'Visit Menu Page'],
    
    'Menu' : ['Maps Popup - Go to Reviews', 'Maps Popup - Maybe Later', 'Maps Popup - Open', 'Menu Page - Accuracy Popup', 
              'Menu Page - Add New Dish', 'Menu Page - Collection via Add Icon', 'Menu Page - Filter Items', 
              'Menu Page - Go to Item Page', 'Menu Page - Rate via Add Icon', 'Menu Page - See Restaurant Details', 
              'Menu Page - Submit Flag', 'Menu Page - Switch Menus', 'Menu Page - Tap Flag Icon', 'Visit Menu Page', 
              'Nearby - Bookmark Dish', 'Nearby - Delivery Link', 'Delivery Page - Visit For 10 Sec', 'Home - Order Delivery', 
              'Nearby - Open Maps', 'Nearby - Tap Dish Maps', 'Visit Carousel Wise Dish List'],
    
    'Rating' : ['Edit Rating - Add Photo', 'Rating Flow - Complete', 'Rating Flow - Rate', 
                'Rating Flow - Resubmit', 'Rating Flow - Start', 'Request Restaurant', 
                'Edit Rating - Delete', 'Edit Rating - Edit', 'Edit Rating - Exit', 
                'Edit Rating - Modify Date', 'Item Page - Open My Rating', 
                'Item Page - Tap Upload Photo', 'Profile - Edit Rating Popup'],
    
    'Collection' : ['Collection - Tap Follow Button', 'Collection - Tap Restaurant', 'Collection - Tap Return Button', 
                    'Collection - Tap Search Bar', 'Collection - Visit Collection Detail Page', 
                    'Collection Flow - Create New Collection', 'Collection Flow - Tap Collection', 
                    'Collection Flow - Tap New Collection'], 
    'Share' : ['Share Page - Instagram Story', 'Share Page - Skip', 'Provided App Feedback', 
               'Sidebar - Go to Feedback', 'Sidebar - Invite a Friend',  'Visit App Feedback Page', 
               'Invite Page - Successful Text', 'Search - Invite Button'],
    
    'AE' : ['$ae_crashed', '$ae_first_open', '$ae_session', '$ae_updated', '$campaign_bounced', 
            '$campaign_delivery', '$message_suppressed', '$unsubscribe'],
    
    'MISC': ['DMV-resident'],
}

In [7]:
# short_df only includes the event name and an empty column for category

short_df = events[['name_x']]
short_df.loc['category'] = ''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
# create a reverse events dictionary in order to use map function 

reverse_dict = {}
for key, value in event_dict.items():
    for event in value:
        reverse_dict.setdefault(event, []).append(key)
        
# apply reverse dictionary to new column called category that corresponds to the event name
short_df['category'] = short_df['name_x'].map(reverse_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [9]:
# get the first item in category because each value is a list of one element

short_df['category'] = short_df['category'].str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# map the weights for each category to a score column

short_df['score'] = short_df['category'].map(weight_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# MERGE DATAFRAMES

# drop 'name_x' column from short_df and then append to the original dataframe

labeled_df = short_df.drop(columns = ['name_x'])
event_df = pd.concat([events, labeled_df], axis = 1)
event_df = event_df.drop(columns = ['Unnamed: 0'])

# filter out sessions with -1 for session_id

event_df = event_df[event_df['session_id'] != -1]

# groupby distinct id and then session id 

group_user_session = event_df.groupby(['distinct_id', 'session_id'], as_index=False).agg({'name_x' : 'count', 'score' : 'sum'})
group_user_session = group_user_session.rename(columns = {'name_x' : 'event_count', 'score' : 'SQS'})

# add up all scores for one person and divide by number of sessions

SQS = group_user_session.groupby(['distinct_id'], as_index=False).agg({'SQS' : np.mean})

# merge SQS with original dataframe

SQS_df = pd.merge(left=event_df, right=SQS, on='distinct_id')
#SQS_df = SQS_df.drop(columns = ['Unnamed: 0'])

## Final Dataframe with SQS

In [12]:
SQS_df

Unnamed: 0,name_x,distinct_id,time,ae_session_length,city,region,mp_country_code,locationSetting,notificationSettings,FirstTimeUploaded,...,US,Facebook,Foodie,Google,Unknown,session_id,time_delta,category,score,SQS
0,AppOpen,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:48:32,-1.0,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Basic,1.0,40.181818
1,Visit Signup/Login Page,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:48:35,-1.0,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,Basic,1.0,40.181818
2,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:49:02,30.1,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,AE,1.0,40.181818
3,Signup Success,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:50:06,-1.0,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,2.0,64.0,Basic,1.0,40.181818
4,Onboarding - Allow Location,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:50:17,-1.0,Palo Alto,California,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,2.0,64.0,Basic,1.0,40.181818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163267,Feed - Scroll Down,xbarbarazhong@gmail.com_6704,2019-12-12 14:29:23,-1.0,Portland,Oregon,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,29813.0,3.0,Explore,5.0,42.000000
163268,Profile - Change Section,xbarbarazhong@gmail.com_6704,2019-12-12 14:29:28,-1.0,Portland,Oregon,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,29813.0,3.0,Profile,2.0,42.000000
163269,$ae_session,xbarbarazhong@gmail.com_6704,2019-12-12 14:30:54,190.4,Portland,Oregon,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,29813.0,3.0,AE,1.0,42.000000
163270,AppOpen,yoon.s.jeong@gmail.com_6485,2019-10-09 11:12:29,-1.0,Chicago,Illinois,US,-1.0,-1.0,,...,1.0,0.0,0.0,0.0,0.0,29814.0,0.0,Basic,1.0,2.000000


# Merge SQS with Ratings

In [13]:
def merge_SQS_rating(SQS, ratings):
    SQS_short = SQS[['distinct_id', 'SQS']]
    df = pd.merge(ratings, SQS_short, left_on = 'distinct_id', right_on = 'distinct_id', how = 'left')
    df = df.drop_duplicates('distinct_id')
    return df

In [19]:
# merged SQS with ratings

result = merge_SQS_rating(SQS_df, ratings_df)
result

Unnamed: 0,distinct_id,ae_total_app_session_length,name,country_code,auth_provider,ae_total_app_sessions,timezone,last_seen,ios_app_version,ios_version,ios_device_model,ios_lib_version,ae_first_app_open_date,active_timespan,average_session_time,number_ratings,SQS
0,hr@gmail.com_87,345721,Harshil Raval,IN,Foodie,1759,Asia/Kolkata,2020-03-03 06:08:32+00:00,2.5.0,13.2.3,"iPhone9,3",3.4.9,2019-05-30 01:08:51.984962560+00:00,278 days 04:59:40.015037440,196.544059,130.0,29.951417
1702,E2D9AB63-1718-4D1E-936F-10BE1D889989,4412,Theodore Wu,US,Google,59,America/Los_Angeles,2019-09-08 03:15:47+00:00,2.0.5,13.1,"iPhone9,1",3.4.4,2019-04-22 20:34:46+00:00,138 days 06:41:01.000000000,74.779661,0.0,33.000000
1729,vaibhavverma9@gmail.com_56,71430,Vaibhav Verma,IN,Foodie,621,Asia/Kolkata,2019-04-15 15:18:25+00:00,1.3.7,12.2,"iPhone7,1",3.3.3,2019-05-30 01:08:51.984962560+00:00,-45 days +14:09:33.015037440,115.024155,0.0,
1730,0C954A3F-0AB6-4D12-B4E9-D916C95B0C6E,12836,Tiffany Qi,US,Facebook,77,America/Los_Angeles,2019-11-09 18:33:06+00:00,2.1.6,13.1.3,"iPhone9,3",3.4.4,2019-05-01 06:57:08+00:00,192 days 11:35:58.000000000,166.701299,10.0,55.464286
2062,addidas23@gmail.com_139,2066,Nathan Nangia,US,Foodie,39,America/Chicago,2020-03-18 15:09:21+00:00,2.5.4,13.4,"iPhone11,2",3.4.9,2019-05-30 01:08:51.984962560+00:00,293 days 14:00:29.015037440,52.974359,3.0,18.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86193,52774481-EECA-4703-8F31-F634B24773BE,161801,Reyna Huang,US,Facebook,1168,America/New_York,2020-02-23 00:59:33+00:00,2.4.7,13.3.1,"iPhone12,1",3.4.4,2019-02-08 14:15:24+00:00,379 days 10:44:09.000000000,138.528253,2.0,30.444444
86392,F4B6EE31-E835-4C5D-8387-456DFF4E9A3D,8308,Christine Duke,US,Unknown,53,America/Chicago,2020-03-21 16:43:15+00:00,2.5.0,13.3.1,"iPhone12,3",3.4.9,2018-12-03 21:10:02+00:00,473 days 19:33:13.000000000,156.754717,0.0,
86393,04ADA4C3-9212-49DA-B600-67A8403D69C3,2283,Alexandria Ma,US,Unknown,45,America/Chicago,2020-01-19 01:41:22+00:00,2.3.3,13.2,"iPhone11,8",3.4.4,2019-06-11 14:14:02+00:00,221 days 11:27:20.000000000,50.733333,0.0,11.200000
86469,619EECB8-EA49-493B-9048-4EAB912BF98E,4227,Lucy Li,US,Facebook,67,America/Chicago,2019-12-12 03:35:49+00:00,2.3.3,13.2,"iPhone12,3",3.4.4,2019-05-23 13:42:40+00:00,202 days 13:53:09.000000000,63.089552,0.0,15.090909


# Add Slope and Merge Slope with People

In [16]:
df = events.loc[(events['name_x'] == '$ae_session') & (events['time_delta'] != 0)]
df

Unnamed: 0.1,Unnamed: 0,name_x,distinct_id,time,ae_session_length,city,region,mp_country_code,locationSetting,notificationSettings,...,active_timespan,average_session_time,country,US,Facebook,Foodie,Google,Unknown,session_id,time_delta
19,19,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:50:51,65.8,Palo Alto,California,US,-1,-1,...,,,US,1,0,0,0,0,2,64.0
21,21,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:51:52,59.9,Palo Alto,California,US,-1,-1,...,,,US,1,0,0,0,0,3,7.0
24,24,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 14:38:30,72.3,Hayward,California,US,-1,-1,...,,,US,1,0,0,0,0,4,2728.0
36,36,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 15:05:10,151.9,San Jose,California,US,-1,-1,...,,,US,1,0,0,0,0,5,1466.0
50,50,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 15:08:11,115.8,San Jose,California,US,-1,-1,...,,,US,1,0,0,0,0,6,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190340,190340,$ae_session,willwojt@gmail.com_6868,2020-03-07 12:11:54,56.5,Washington,District of Columbia,US,-1,-1,...,,,US,1,0,0,0,0,29807,40862.0
190341,190341,$ae_session,willwojt@gmail.com_6868,2020-03-07 12:14:44,32.9,Washington,District of Columbia,US,-1,-1,...,,,US,1,0,0,0,0,29808,170.0
190342,190342,$ae_session,willwojt@gmail.com_6868,2020-03-08 10:47:21,42.3,Washington,District of Columbia,US,-1,-1,...,,,US,1,0,0,0,0,29809,81157.0
190343,190343,$ae_session,willwojt@gmail.com_6868,2020-03-08 10:52:05,200.6,Washington,District of Columbia,US,-1,-1,...,,,US,1,0,0,0,0,29810,284.0


In [17]:
def best_fit_slope(xs,ys):
    m = (((np.mean(xs)*np.mean(ys)) - np.mean(xs*ys)) /
         ((np.mean(xs)*np.mean(xs)) - np.mean(xs*xs)))
    return m

In [18]:
def add_slope(df):
    grouped = df.groupby('distinct_id', as_index=False).agg({'time' : 'count'})
    grouped['slope'] = None
    for index, row in grouped.iterrows():
        x = df.loc[df['distinct_id'] == row['distinct_id']]
        length = np.arange(len(x))
        slope = best_fit_slope(length, x['time_delta'])
        grouped.loc[grouped['distinct_id'] == row['distinct_id'], 'slope'] = slope
    final = grouped.drop(columns='time', axis=1)
    return final

slope = add_slope(df)

  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
slope

Unnamed: 0,distinct_id,slope
0,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1113.36
1,001210DC-54C8-43AD-B295-148F47818391,-11716
2,0025A8A4-2590-4ECA-8CE8-419D710AE46F,615550
3,004C9C76-A530-422A-BA1B-667CDA12A08D,-44.5
4,005144D7-BD3F-405B-9621-5FACA6B1AC04,
...,...,...
2617,varun.murthy@gmail.com_5928,-436340
2618,virtrutest99@gmail.com_5848,17.2857
2619,wandaandreu@gmail.com_5053,1.37164e+07
2620,willwojt@gmail.com_6868,-122.179


In [26]:
result = result.merge(slope, how='left', on='distinct_id')
result

Unnamed: 0,distinct_id,ae_total_app_session_length,name,country_code,auth_provider,ae_total_app_sessions,timezone,last_seen,ios_app_version,ios_version,ios_device_model,ios_lib_version,ae_first_app_open_date,active_timespan,average_session_time,number_ratings,SQS,slope
0,hr@gmail.com_87,345721,Harshil Raval,IN,Foodie,1759,Asia/Kolkata,2020-03-03 06:08:32+00:00,2.5.0,13.2.3,"iPhone9,3",3.4.9,2019-05-30 01:08:51.984962560+00:00,278 days 04:59:40.015037440,196.544059,130.0,29.951417,60.7808
1,E2D9AB63-1718-4D1E-936F-10BE1D889989,4412,Theodore Wu,US,Google,59,America/Los_Angeles,2019-09-08 03:15:47+00:00,2.0.5,13.1,"iPhone9,1",3.4.4,2019-04-22 20:34:46+00:00,138 days 06:41:01.000000000,74.779661,0.0,33.000000,1.73002e+06
2,vaibhavverma9@gmail.com_56,71430,Vaibhav Verma,IN,Foodie,621,Asia/Kolkata,2019-04-15 15:18:25+00:00,1.3.7,12.2,"iPhone7,1",3.3.3,2019-05-30 01:08:51.984962560+00:00,-45 days +14:09:33.015037440,115.024155,0.0,,
3,0C954A3F-0AB6-4D12-B4E9-D916C95B0C6E,12836,Tiffany Qi,US,Facebook,77,America/Los_Angeles,2019-11-09 18:33:06+00:00,2.1.6,13.1.3,"iPhone9,3",3.4.4,2019-05-01 06:57:08+00:00,192 days 11:35:58.000000000,166.701299,10.0,55.464286,17473
4,addidas23@gmail.com_139,2066,Nathan Nangia,US,Foodie,39,America/Chicago,2020-03-18 15:09:21+00:00,2.5.4,13.4,"iPhone11,2",3.4.9,2019-05-30 01:08:51.984962560+00:00,293 days 14:00:29.015037440,52.974359,3.0,18.000000,1400.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,52774481-EECA-4703-8F31-F634B24773BE,161801,Reyna Huang,US,Facebook,1168,America/New_York,2020-02-23 00:59:33+00:00,2.4.7,13.3.1,"iPhone12,1",3.4.4,2019-02-08 14:15:24+00:00,379 days 10:44:09.000000000,138.528253,2.0,30.444444,-12236.1
177,F4B6EE31-E835-4C5D-8387-456DFF4E9A3D,8308,Christine Duke,US,Unknown,53,America/Chicago,2020-03-21 16:43:15+00:00,2.5.0,13.3.1,"iPhone12,3",3.4.9,2018-12-03 21:10:02+00:00,473 days 19:33:13.000000000,156.754717,0.0,,
178,04ADA4C3-9212-49DA-B600-67A8403D69C3,2283,Alexandria Ma,US,Unknown,45,America/Chicago,2020-01-19 01:41:22+00:00,2.3.3,13.2,"iPhone11,8",3.4.4,2019-06-11 14:14:02+00:00,221 days 11:27:20.000000000,50.733333,0.0,11.200000,39954
179,619EECB8-EA49-493B-9048-4EAB912BF98E,4227,Lucy Li,US,Facebook,67,America/Chicago,2019-12-12 03:35:49+00:00,2.3.3,13.2,"iPhone12,3",3.4.4,2019-05-23 13:42:40+00:00,202 days 13:53:09.000000000,63.089552,0.0,15.090909,235484
