# Churn Metrics

Adding metrics to people dataset for modeling

In [29]:
import numpy as np
import pandas as pd
import re
import matplotlib
import seaborn as sns
from datetime import timedelta

Read the events and people data into dataframes

In [30]:
events = pd.read_csv('Cleaned_Events_With_SessionID.csv', low_memory=False)
people = pd.read_csv('labeled_data.csv', low_memory= False)

# Adding Number of Ratings Metric

In [31]:
def ratings(events_df, people_df):
    #creating dictionary with distinct id and number of ratings
    number_ratings_dict = events_df[events_df['name_x'] == 'Rating Flow - Complete'].groupby('distinct_id').count()['name_x'].to_dict()
    
    #creating df with just distinct_id and mapping ratings to each distinct id, filling in Nans with 0
    short_df = people_df[['distinct_id']]
    short_df['number_ratings'] = short_df['distinct_id'].map(number_ratings_dict)
    short_df = short_df.fillna(0)
    
    # drop 'distinct_id' column from short_df and then append to the original dataframe
    short_df = short_df.drop(columns = ['distinct_id'])
    people_df = pd.concat([people_df, short_df], axis=1)
    people_df = people_df.drop(columns = ['Unnamed: 0'])
    
    return people_df

In [32]:
ratings_df = ratings(events, people)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


# SQS

In [33]:
# weight dictionary

weight_dict = {
    'Basic' : 1,
    'Profile' : 2,
    'Explore' : 5,
    'Menu' : 5,
    'Collection' : 7,
    'Rating' : 7,
    'Share' : 10,
    'MISC' : 1,
    'AE' : 1
}

In [34]:
# events dictionary

event_dict = {
    'Basic' : ['Login', 'Logout', 'AppOpen', 'Onboarding - Allow Location', 'Onboarding - Allow Notifications', 
               'Onboarding - Always Allow Location', 'PageVisit', 'Permissions for Locations', 
               'Permissions for Notifications', 'Signup - Sign In', 'Signup - Skip', 'Signup Success', 
               'Visit Signup/Login Page',  'Visit Home Page'],
    
    'Profile' : ['Password Changed', 'Profile - Change Section', 'Profile - Go to Collection', 
                 'Profile - Go to Menu', 'Profile Updated', 'Sidebar - Change Password', 
                 'Sidebar - Contacts Permissions', 'Sidebar - Edit Profile', 'Sidebar - Go to Followers', 
                 'Sidebar - Privacy Policy', 'Sidebar - Sign Out', 'Sidebar - Terms and Conditions', 
                 'Visit Change Password Page', 'Visit Edit Profile Page', 'Followers Page - Go to Profile', 
                 'Item Page - Go to Profile'],
    
    'Explore' : ['Explore - Tap Collection', 'Explore - Visit Explore Page', 'Home - Explore Nearby', 
                 'Feed - Bookmark Icon', 'Feed - Maps Icon', 'Feed - Scroll Down', 'Feed - Tap Dish', 'Item Page - Bookmark', 
                 'Landing - Go to Delivery Feed', 'Landing - Go to Walking Feed', 'Search - Change Section', 
                 'Search - Follow User', 'Search - Invalid Location', 'Search - Results Loaded', 'Search - Tap Home Bar', 
                 'Search - Tap Nearby Restaurant', 'Search - Tap Result', 'Search - Tap Return', 'Search - Tap to Request', 
                 'Search - Unfollow User', 'Tap Searched Dish', 'Visit Item Page', 
                 'Visit Menu Page'],
    
    'Menu' : ['Maps Popup - Go to Reviews', 'Maps Popup - Maybe Later', 'Maps Popup - Open', 'Menu Page - Accuracy Popup', 
              'Menu Page - Add New Dish', 'Menu Page - Collection via Add Icon', 'Menu Page - Filter Items', 
              'Menu Page - Go to Item Page', 'Menu Page - Rate via Add Icon', 'Menu Page - See Restaurant Details', 
              'Menu Page - Submit Flag', 'Menu Page - Switch Menus', 'Menu Page - Tap Flag Icon', 'Visit Menu Page', 
              'Nearby - Bookmark Dish', 'Nearby - Delivery Link', 'Delivery Page - Visit For 10 Sec', 'Home - Order Delivery', 
              'Nearby - Open Maps', 'Nearby - Tap Dish Maps', 'Visit Carousel Wise Dish List'],
    
    'Rating' : ['Edit Rating - Add Photo', 'Rating Flow - Complete', 'Rating Flow - Rate', 
                'Rating Flow - Resubmit', 'Rating Flow - Start', 'Request Restaurant', 
                'Edit Rating - Delete', 'Edit Rating - Edit', 'Edit Rating - Exit', 
                'Edit Rating - Modify Date', 'Item Page - Open My Rating', 
                'Item Page - Tap Upload Photo', 'Profile - Edit Rating Popup'],
    
    'Collection' : ['Collection - Tap Follow Button', 'Collection - Tap Restaurant', 'Collection - Tap Return Button', 
                    'Collection - Tap Search Bar', 'Collection - Visit Collection Detail Page', 
                    'Collection Flow - Create New Collection', 'Collection Flow - Tap Collection', 
                    'Collection Flow - Tap New Collection'], 
    'Share' : ['Share Page - Instagram Story', 'Share Page - Skip', 'Provided App Feedback', 
               'Sidebar - Go to Feedback', 'Sidebar - Invite a Friend',  'Visit App Feedback Page', 
               'Invite Page - Successful Text', 'Search - Invite Button'],
    
    'AE' : ['$ae_crashed', '$ae_first_open', '$ae_session', '$ae_updated', '$campaign_bounced', 
            '$campaign_delivery', '$message_suppressed', '$unsubscribe'],
    
    'MISC': ['DMV-resident'],
}

In [35]:
# short_df only includes the event name and an empty column for category

short_df = events[['name_x']]
short_df.loc['category'] = ''

In [36]:
# create a reverse events dictionary in order to use map function 

reverse_dict = {}
for key, value in event_dict.items():
    for event in value:
        reverse_dict.setdefault(event, []).append(key)
        
# apply reverse dictionary to new column called category that corresponds to the event name
short_df['category'] = short_df['name_x'].map(reverse_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [37]:
# get the first item in category because each value is a list of one element

short_df['category'] = short_df['category'].str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
# map the weights for each category to a score column

short_df['score'] = short_df['category'].map(weight_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
# MERGE DATAFRAMES

# drop 'name_x' column from short_df and then append to the original dataframe

labeled_df = short_df.drop(columns = ['name_x'])
event_df = pd.concat([events, labeled_df], axis = 1)
event_df = event_df.drop(columns = ['Unnamed: 0'])

# filter out sessions with -1 for session_id

event_df = event_df[event_df['session_id'] != -1]

# groupby distinct id and then session id 

group_user_session = event_df.groupby(['distinct_id', 'session_id'], as_index=False).agg({'name_x' : 'count', 'score' : 'sum'})
group_user_session = group_user_session.rename(columns = {'name_x' : 'event_count', 'score' : 'SQS'})

# add up all scores for one person and divide by number of sessions

SQS = group_user_session.groupby(['distinct_id'], as_index=False).agg({'SQS' : np.mean})

# merge SQS with original dataframe

SQS_df = pd.merge(left=event_df, right=SQS, on='distinct_id')
#SQS_df = SQS_df.drop(columns = ['Unnamed: 0'])

## Final Dataframe with SQS

In [40]:
SQS_df

Unnamed: 0,name_x,distinct_id,time,ae_session_length,city,region,mp_country_code,locationSetting,notificationSettings,FirstTimeUploaded,...,US,Facebook,Foodie,Google,Unknown,session_id,time_delta,category,score,SQS
0,AppOpen,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:48:32,-1.0,Palo Alto,California,US,1.0,0.0,,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,Basic,1.0,40.181818
1,Visit Signup/Login Page,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:48:35,-1.0,Palo Alto,California,US,1.0,0.0,,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,Basic,1.0,40.181818
2,$ae_session,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:49:02,30.1,Palo Alto,California,US,1.0,0.0,,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,AE,1.0,40.181818
3,Signup Success,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:50:06,-1.0,Palo Alto,California,US,1.0,1.0,,...,1.0,0.0,0.0,0.0,1.0,2.0,64.0,Basic,1.0,40.181818
4,Onboarding - Allow Location,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,2019-08-23 13:50:17,-1.0,Palo Alto,California,US,1.0,0.0,,...,1.0,0.0,0.0,0.0,1.0,2.0,64.0,Basic,1.0,40.181818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163267,Feed - Scroll Down,xbarbarazhong@gmail.com_6704,2019-12-12 14:29:23,-1.0,Portland,Oregon,US,1.0,0.0,,...,1.0,1.0,0.0,0.0,0.0,29813.0,3.0,Explore,5.0,42.000000
163268,Profile - Change Section,xbarbarazhong@gmail.com_6704,2019-12-12 14:29:28,-1.0,Portland,Oregon,US,1.0,0.0,,...,1.0,1.0,0.0,0.0,0.0,29813.0,3.0,Profile,2.0,42.000000
163269,$ae_session,xbarbarazhong@gmail.com_6704,2019-12-12 14:30:54,190.4,Portland,Oregon,US,1.0,1.0,,...,1.0,1.0,0.0,0.0,0.0,29813.0,3.0,AE,1.0,42.000000
163270,AppOpen,yoon.s.jeong@gmail.com_6485,2019-10-09 11:12:29,-1.0,Chicago,Illinois,US,1.0,1.0,,...,1.0,0.0,0.0,1.0,0.0,29814.0,0.0,Basic,1.0,2.000000


# Merge SQS with Ratings

In [41]:
def merge_SQS_rating(SQS, ratings):
    SQS_short = SQS[['distinct_id', 'SQS']]
    df = pd.merge(ratings, SQS_short, left_on = 'distinct_id', right_on = 'distinct_id', how = 'left')
    df = df.drop_duplicates('distinct_id')
    return df

In [42]:
# merged SQS with ratings

result = merge_SQS_rating(SQS_df, ratings_df)
result

Unnamed: 0,distinct_id,locationSetting,notificationSettings,active_timespan,average_session_time,US,Facebook,Foodie,Google,Unknown,...,America/Los_Angeles,America/Chicago,Asia/Kolkata,America/Denver,other_timezone,commentAdded,commentLength,churned,number_ratings,SQS
0,00000000-0000-0000-0000-000000000000,1,0,24.202130,,0,0,1,0,0,...,0,0,0,0,1,0,0.000000,1,0.0,
1,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1,1,1.130949,82.272727,1,0,0,1,1,...,1,0,0,0,0,0,0.000000,0,1.0,40.181818
98,001210DC-54C8-43AD-B295-148F47818391,1,0,7.158310,47.600000,1,0,0,1,1,...,0,1,0,0,0,0,0.000000,0,0.0,20.000000
154,0019C3B9-FA62-4AB4-A895-1390A1FA818C,1,1,0.000000,,1,0,1,0,0,...,0,0,0,0,1,0,0.000000,0,0.0,
155,0025A8A4-2590-4ECA-8CE8-419D710AE46F,1,0,113.687361,176.750000,1,0,1,0,1,...,0,0,0,0,0,0,0.000000,0,1.0,27.375000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166663,wandaandreu@gmail.com_5053,1,0,188.791597,,1,0,1,0,1,...,0,0,0,0,0,0,0.000000,0,0.0,1.666667
166668,willwojt@gmail.com_6868,1,0,37.992164,,1,0,1,0,1,...,0,0,0,0,0,5,2.252809,0,7.0,4.171717
166845,wolphramite@gmail.com_291,1,0,79.125498,,1,0,1,0,1,...,0,0,0,0,0,0,0.000000,1,0.0,1.000000
166846,xbarbarazhong@gmail.com_6704,1,0,0.584016,,1,0,1,0,1,...,1,0,0,0,0,0,0.000000,1,0.0,42.000000


# Add Average Session Time Metric #

In [43]:
# Metric: avg_session_duration
# Similar to "average_session_time", but NOT calculated the same way.
# Takes the average of all of each user's session durations.

def avg_session_time(people, events):
    events = events[events['ae_session_length'] != -1.0]
    groupby_id = events.groupby(['distinct_id']).agg(np.mean)
    people_m2 = people.merge(groupby_id['ae_session_length'], on = 'distinct_id', how = 'left')
    people_m2['ae_session_length'] = people_m2['ae_session_length'].fillna(-1)
    people_m2 = people_m2.rename(columns={"ae_session_length": "avg_session_duration"})
    return people_m2

In [44]:
result = avg_session_time(result, events)
result

Unnamed: 0,distinct_id,locationSetting,notificationSettings,active_timespan,average_session_time,US,Facebook,Foodie,Google,Unknown,...,America/Chicago,Asia/Kolkata,America/Denver,other_timezone,commentAdded,commentLength,churned,number_ratings,SQS,avg_session_duration
0,00000000-0000-0000-0000-000000000000,1,0,24.202130,,0,0,1,0,0,...,0,0,0,1,0,0.000000,1,0.0,,-1.000000
1,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1,1,1.130949,82.272727,1,0,0,1,1,...,0,0,0,0,0,0.000000,0,1.0,40.181818,82.854545
2,001210DC-54C8-43AD-B295-148F47818391,1,0,7.158310,47.600000,1,0,0,1,1,...,1,0,0,0,0,0.000000,0,0.0,20.000000,47.940000
3,0019C3B9-FA62-4AB4-A895-1390A1FA818C,1,1,0.000000,,1,0,1,0,0,...,0,0,0,1,0,0.000000,0,0.0,,-1.000000
4,0025A8A4-2590-4ECA-8CE8-419D710AE46F,1,0,113.687361,176.750000,1,0,1,0,1,...,0,0,0,0,0,0.000000,0,1.0,27.375000,177.325000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7210,wandaandreu@gmail.com_5053,1,0,188.791597,,1,0,1,0,1,...,0,0,0,0,0,0.000000,0,0.0,1.666667,54.700000
7211,willwojt@gmail.com_6868,1,0,37.992164,,1,0,1,0,1,...,0,0,0,0,5,2.252809,0,7.0,4.171717,61.190909
7212,wolphramite@gmail.com_291,1,0,79.125498,,1,0,1,0,1,...,0,0,0,0,0,0.000000,1,0.0,1.000000,17.500000
7213,xbarbarazhong@gmail.com_6704,1,0,0.584016,,1,0,1,0,1,...,0,0,0,0,0,0.000000,1,0.0,42.000000,124.150000


# Add Slopes and Merge Slopes with People

In [45]:
def best_fit_slope(xs,ys):
    m = (((np.mean(xs)*np.mean(ys)) - np.mean(xs*ys)) /
         ((np.mean(xs)*np.mean(xs)) - np.mean(xs*xs)))
    return m

In [46]:
def add_time_delta_slope(df):
    df = df.loc[(events['name_x'] == '$ae_session') & (events['time_delta'] != 0)]
    grouped = df.groupby('distinct_id', as_index=False).agg({'time' : 'count'})
    grouped['time_delta_slope'] = None
    for index, row in grouped.iterrows():
        subset = df.loc[df['distinct_id'] == row['distinct_id']]
        subset = subset.sort_values('time', ascending=True)
        length = np.arange(len(subset))
        slope = best_fit_slope(length, subset['time_delta'])
        grouped.loc[grouped['distinct_id'] == row['distinct_id'], 'time_delta_slope'] = slope
    final = grouped.drop(columns='time', axis=1)
    return final

In [47]:
# Metric: slope
# AKA: best fit slope of trend of time between session
# Given the trend of time between sessions over time, find the slope of the best fit line for that user. 

time_delta_slope = add_time_delta_slope(events)
time_delta_slope

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,distinct_id,time_delta_slope
0,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1113.36
1,001210DC-54C8-43AD-B295-148F47818391,-11716
2,0025A8A4-2590-4ECA-8CE8-419D710AE46F,615550
3,004C9C76-A530-422A-BA1B-667CDA12A08D,-44.5
4,005144D7-BD3F-405B-9621-5FACA6B1AC04,
...,...,...
2617,varun.murthy@gmail.com_5928,-436340
2618,virtrutest99@gmail.com_5848,17.2857
2619,wandaandreu@gmail.com_5053,1.37164e+07
2620,willwojt@gmail.com_6868,-122.179


In [48]:
def add_session_time_slope(df):
    df = df.loc[(events['name_x'] == '$ae_session')]
    grouped = df.groupby('distinct_id', as_index=False).agg({'time' : 'count'})
    grouped['session_time_slope'] = None
    for index, row in grouped.iterrows():
        subset = df.loc[df['distinct_id'] == row['distinct_id']]
        subset = subset.sort_values('time', ascending=True)
        length = np.arange(len(subset))
        slope = best_fit_slope(length, subset['ae_session_length'])
        grouped.loc[grouped['distinct_id'] == row['distinct_id'], 'session_time_slope'] = slope
    final = grouped.drop(columns='time', axis=1)
    return final

In [49]:
session_time_slope = add_session_time_slope(events)
session_time_slope

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,distinct_id,session_time_slope
0,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,6.30273
1,001210DC-54C8-43AD-B295-148F47818391,6.66788
2,0025A8A4-2590-4ECA-8CE8-419D710AE46F,-41.4762
3,004C9C76-A530-422A-BA1B-667CDA12A08D,2.55
4,005144D7-BD3F-405B-9621-5FACA6B1AC04,20.7
...,...,...
3613,wandaandreu@gmail.com_5053,-44.65
3614,willwojt@gmail.com_6868,0.021697
3615,wolphramite@gmail.com_291,
3616,xbarbarazhong@gmail.com_6704,132.5


In [50]:
result = result.merge(time_delta_slope, how='left', on='distinct_id').merge(session_time_slope, how='left', on='distinct_id')
result
#result.drop(columns=['time_delta_slope_x', 'time_delta_slope_y', "session_time_slope_x", "session_time_slope_y"], axis=1, inplace=True)

Unnamed: 0,distinct_id,locationSetting,notificationSettings,active_timespan,average_session_time,US,Facebook,Foodie,Google,Unknown,...,America/Denver,other_timezone,commentAdded,commentLength,churned,number_ratings,SQS,avg_session_duration,time_delta_slope,session_time_slope
0,00000000-0000-0000-0000-000000000000,1,0,24.202130,,0,0,1,0,0,...,0,1,0,0.000000,1,0.0,,-1.000000,,
1,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1,1,1.130949,82.272727,1,0,0,1,1,...,0,0,0,0.000000,0,1.0,40.181818,82.854545,1113.36,6.30273
2,001210DC-54C8-43AD-B295-148F47818391,1,0,7.158310,47.600000,1,0,0,1,1,...,0,0,0,0.000000,0,0.0,20.000000,47.940000,-11716,6.66788
3,0019C3B9-FA62-4AB4-A895-1390A1FA818C,1,1,0.000000,,1,0,1,0,0,...,0,1,0,0.000000,0,0.0,,-1.000000,,
4,0025A8A4-2590-4ECA-8CE8-419D710AE46F,1,0,113.687361,176.750000,1,0,1,0,1,...,0,0,0,0.000000,0,1.0,27.375000,177.325000,615550,-41.4762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7210,wandaandreu@gmail.com_5053,1,0,188.791597,,1,0,1,0,1,...,0,0,0,0.000000,0,0.0,1.666667,54.700000,1.37164e+07,-44.65
7211,willwojt@gmail.com_6868,1,0,37.992164,,1,0,1,0,1,...,0,0,5,2.252809,0,7.0,4.171717,61.190909,-122.179,0.021697
7212,wolphramite@gmail.com_291,1,0,79.125498,,1,0,1,0,1,...,0,0,0,0.000000,1,0.0,1.000000,17.500000,,
7213,xbarbarazhong@gmail.com_6704,1,0,0.584016,,1,0,1,0,1,...,0,0,0,0.000000,1,0.0,42.000000,124.150000,,132.5


In [51]:
#adding slopes for only events within 90 days
events['time'] = events['time'].apply(lambda x : pd.to_datetime(x))
most_recent = max(events['time'])
threshold = most_recent - timedelta(days=90)
threshold
within90 = events.loc[events['time'] > threshold]

In [52]:
def add_time_delta_slope_90(df):
    df = df.loc[(events['name_x'] == '$ae_session') & (events['time_delta'] != 0)]
    grouped = df.groupby('distinct_id', as_index=False).agg({'time' : 'count'})
    grouped['time_delta_slope_90'] = None
    for index, row in grouped.iterrows():
        subset = df.loc[df['distinct_id'] == row['distinct_id']]
        subset = subset.sort_values('time', ascending=True)
        length = np.arange(len(subset))
        slope = best_fit_slope(length, subset['time_delta'])
        grouped.loc[grouped['distinct_id'] == row['distinct_id'], 'time_delta_slope_90'] = slope
    final = grouped.drop(columns='time', axis=1)
    return final

In [53]:
def add_session_time_slope_90(df):
    df = df.loc[(events['name_x'] == '$ae_session')]
    grouped = df.groupby('distinct_id', as_index=False).agg({'time' : 'count'})
    grouped['session_time_slope_90'] = None
    for index, row in grouped.iterrows():
        subset = df.loc[df['distinct_id'] == row['distinct_id']]
        subset = subset.sort_values('time', ascending=True)
        length = np.arange(len(subset))
        slope = best_fit_slope(length, subset['ae_session_length'])
        grouped.loc[grouped['distinct_id'] == row['distinct_id'], 'session_time_slope_90'] = slope
    final = grouped.drop(columns='time', axis=1)
    return final

In [54]:
session_time_90_slope = add_session_time_slope_90(within90)
time_delta_90_slope = add_time_delta_slope_90(within90)
result = result.merge(time_delta_90_slope, how='left', on='distinct_id').merge(session_time_90_slope, how='left', on='distinct_id')
result

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,distinct_id,locationSetting,notificationSettings,active_timespan,average_session_time,US,Facebook,Foodie,Google,Unknown,...,commentAdded,commentLength,churned,number_ratings,SQS,avg_session_duration,time_delta_slope,session_time_slope,time_delta_slope_90,session_time_slope_90
0,00000000-0000-0000-0000-000000000000,1,0,24.202130,,0,0,1,0,0,...,0,0.000000,1,0.0,,-1.000000,,,,
1,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1,1,1.130949,82.272727,1,0,0,1,1,...,0,0.000000,0,1.0,40.181818,82.854545,1113.36,6.30273,,
2,001210DC-54C8-43AD-B295-148F47818391,1,0,7.158310,47.600000,1,0,0,1,1,...,0,0.000000,0,0.0,20.000000,47.940000,-11716,6.66788,,
3,0019C3B9-FA62-4AB4-A895-1390A1FA818C,1,1,0.000000,,1,0,1,0,0,...,0,0.000000,0,0.0,,-1.000000,,,,
4,0025A8A4-2590-4ECA-8CE8-419D710AE46F,1,0,113.687361,176.750000,1,0,1,0,1,...,0,0.000000,0,1.0,27.375000,177.325000,615550,-41.4762,-9.12784e+06,6.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7210,wandaandreu@gmail.com_5053,1,0,188.791597,,1,0,1,0,1,...,0,0.000000,0,0.0,1.666667,54.700000,1.37164e+07,-44.65,,
7211,willwojt@gmail.com_6868,1,0,37.992164,,1,0,1,0,1,...,5,2.252809,0,7.0,4.171717,61.190909,-122.179,0.021697,-122.179,0.021697
7212,wolphramite@gmail.com_291,1,0,79.125498,,1,0,1,0,1,...,0,0.000000,1,0.0,1.000000,17.500000,,,,
7213,xbarbarazhong@gmail.com_6704,1,0,0.584016,,1,0,1,0,1,...,0,0.000000,1,0.0,42.000000,124.150000,,132.5,,


# Cleaning #

In [55]:
# number_ratings: NaNs imputed with 0.0
# SQS: NaNs imputed with -1.0
# slope: NaNs imputed with 0.0
# avg_session_duration: NaNs i,puted with -1.0

result = result.drop(columns=['average_session_time'])
result['SQS'] = result['SQS'].fillna(-1)
result['time_delta_slope'] = result['time_delta_slope'].fillna(0)
result['session_time_slope'] = result['session_time_slope'].fillna(0)
result['time_delta_slope_90'] = result['time_delta_slope_90'].fillna(0)
result['session_time_slope_90'] = result['session_time_slope_90'].fillna(0)
result

Unnamed: 0,distinct_id,locationSetting,notificationSettings,active_timespan,US,Facebook,Foodie,Google,Unknown,num_sessions,...,commentAdded,commentLength,churned,number_ratings,SQS,avg_session_duration,time_delta_slope,session_time_slope,time_delta_slope_90,session_time_slope_90
0,00000000-0000-0000-0000-000000000000,1,0,24.202130,0,0,1,0,0,2,...,0,0.000000,1,0.0,-1.000000,-1.000000,0.000000e+00,0.000000,0.000000e+00,0.000000
1,000BA5B1-DBEF-414E-ACEC-1A2FCC2053DD,1,1,1.130949,1,0,0,1,1,97,...,0,0.000000,0,1.0,40.181818,82.854545,1.113364e+03,6.302727,0.000000e+00,0.000000
2,001210DC-54C8-43AD-B295-148F47818391,1,0,7.158310,1,0,0,1,1,61,...,0,0.000000,0,0.0,20.000000,47.940000,-1.171600e+04,6.667879,0.000000e+00,0.000000
3,0019C3B9-FA62-4AB4-A895-1390A1FA818C,1,1,0.000000,1,0,1,0,0,1,...,0,0.000000,0,0.0,-1.000000,-1.000000,0.000000e+00,0.000000,0.000000e+00,0.000000
4,0025A8A4-2590-4ECA-8CE8-419D710AE46F,1,0,113.687361,1,0,1,0,1,64,...,0,0.000000,0,1.0,27.375000,177.325000,6.155496e+05,-41.476190,-9.127842e+06,6.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7210,wandaandreu@gmail.com_5053,1,0,188.791597,1,0,1,0,1,5,...,0,0.000000,0,0.0,1.666667,54.700000,1.371636e+07,-44.650000,0.000000e+00,0.000000
7211,willwojt@gmail.com_6868,1,0,37.992164,1,0,1,0,1,178,...,5,2.252809,0,7.0,4.171717,61.190909,-1.221786e+02,0.021697,-1.221786e+02,0.021697
7212,wolphramite@gmail.com_291,1,0,79.125498,1,0,1,0,1,4,...,0,0.000000,1,0.0,1.000000,17.500000,0.000000e+00,0.000000,0.000000e+00,0.000000
7213,xbarbarazhong@gmail.com_6704,1,0,0.584016,1,0,1,0,1,21,...,0,0.000000,1,0.0,42.000000,124.150000,0.000000e+00,132.500000,0.000000e+00,0.000000


In [56]:
result.to_csv('labeled_data_by_metrics.csv')