In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
# from textwrap import wrap

%matplotlib inline
%run cleaning_demo.ipynb
%run cleaning_web_data.ipynb
%run cleaning_experiment_clients.ipynb

df_demo = import_df_demo() # type: ignore
df_web_data = import_df_web_data() # type: ignore
df_experiment_clients = import_df_experiment_clients() # type: ignore


In [2]:
df_demo, df_web_data, df_experiment_clients

(       client_id  clnt_tenure_yr  clnt_tenure_mnth  clnt_age gendr  num_accts  \
 0         836976               6                73        60     U          2   
 1        2304905               7                94        58     U          2   
 2        1439522               5                64        32     U          2   
 3        1562045              16               198        49     M          2   
 4        5126305              12               145        33     F          2   
 ...          ...             ...               ...       ...   ...        ...   
 70604    7993686               4                56        38     U          3   
 70605    8981690              12               148        31     M          2   
 70606     333913              16               198        61     F          2   
 70607    1573142              21               255        68     M          3   
 70608    5602139              21               254        59     F          3   
 
              

In [3]:
# Merge first two dataframes
df_merge_1 = pd.merge(df_web_data, df_demo, on="client_id", how="left")

# Merge resting dataframe
df_merge_2 = pd.merge(df_merge_1, df_experiment_clients, on="client_id", how="left")

df_merge_2

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation
0,9988021,580560515_7732621733,781255054_21935453173_531117,3,2017-04-17 15:27:07,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
1,9988021,580560515_7732621733,781255054_21935453173_531117,2,2017-04-17 15:26:51,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
2,9988021,580560515_7732621733,781255054_21935453173_531117,3,2017-04-17 15:19:22,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
3,9988021,580560515_7732621733,781255054_21935453173_531117,2,2017-04-17 15:19:13,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
4,9988021,580560515_7732621733,781255054_21935453173_531117,3,2017-04-17 15:18:04,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755400,9668240,388766751_9038881013,922267647_3096648104_968866,0,2017-05-24 18:46:10,,,,,,,,,
755401,9668240,388766751_9038881013,922267647_3096648104_968866,0,2017-05-24 18:45:29,,,,,,,,,
755402,9668240,388766751_9038881013,922267647_3096648104_968866,1,2017-05-24 18:44:51,,,,,,,,,
755403,9668240,388766751_9038881013,922267647_3096648104_968866,0,2017-05-24 18:44:34,,,,,,,,,


In [4]:
# Check random client_ids

list_of_ids = df_merge_2["client_id"].to_list()
df_merge_2[df_merge_2["client_id"] == random.choice(list_of_ids)]

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation
699722,8037480,437171338_67045283778,158079087_53010418147_840599,4,2017-06-16 12:39:45,,,,,,,,,
699723,8037480,437171338_67045283778,158079087_53010418147_840599,3,2017-06-16 12:38:26,,,,,,,,,
699724,8037480,437171338_67045283778,158079087_53010418147_840599,2,2017-06-16 12:35:48,,,,,,,,,
699725,8037480,437171338_67045283778,158079087_53010418147_840599,1,2017-06-16 12:35:02,,,,,,,,,
699726,8037480,437171338_67045283778,158079087_53010418147_840599,0,2017-06-16 12:34:59,,,,,,,,,
699727,8037480,437171338_67045283778,158079087_53010418147_840599,1,2017-06-16 12:31:55,,,,,,,,,
699728,8037480,437171338_67045283778,158079087_53010418147_840599,0,2017-06-16 12:31:51,,,,,,,,,
699729,8037480,437171338_67045283778,158079087_53010418147_840599,0,2017-06-16 12:31:13,,,,,,,,,


In [5]:
# Check for NaN values count
df_merge_2.isna().sum()

client_id                0
visitor_id               0
visit_id                 0
process_step             0
date_time                0
clnt_tenure_yr      305701
clnt_tenure_mnth    305701
clnt_age            305701
gendr               305701
num_accts           305701
bal                 305701
calls_6_mnth        305701
logons_6_mnth       305701
Variation           434096
dtype: int64

In [6]:
# Drop NaN values
df_merge_2.dropna(how="any", inplace=True)
df_merge_2.reset_index(drop=True, inplace=True)

df_merge_2

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation
0,9988021,580560515_7732621733,781255054_21935453173_531117,3,2017-04-17 15:27:07,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
1,9988021,580560515_7732621733,781255054_21935453173_531117,2,2017-04-17 15:26:51,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
2,9988021,580560515_7732621733,781255054_21935453173_531117,3,2017-04-17 15:19:22,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
3,9988021,580560515_7732621733,781255054_21935453173_531117,2,2017-04-17 15:19:13,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
4,9988021,580560515_7732621733,781255054_21935453173_531117,3,2017-04-17 15:18:04,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,Test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321190,1574008,117364417_77840596075,528720790_71583064618_169151,0,2017-05-06 23:43:27,10.0,121.0,55.0,U,2.0,153238.83,3.0,6.0,Test
321191,2908510,814969699_90652851448,562606085_36368381773_92090,0,2017-05-10 22:57:17,21.0,252.0,34.0,M,3.0,141808.05,6.0,9.0,Control
321192,2908510,814969699_90652851448,562606085_36368381773_92090,2,2017-05-10 22:56:31,21.0,252.0,34.0,M,3.0,141808.05,6.0,9.0,Control
321193,2908510,814969699_90652851448,562606085_36368381773_92090,1,2017-05-10 22:56:23,21.0,252.0,34.0,M,3.0,141808.05,6.0,9.0,Control


In [7]:
# Rename dataframe to export
df_join_clean = df_merge_2

In [8]:
# create a dummy function to send to the exploration notebook
def import_df_join_clean():
    return (
        df_join_clean
        )

In [50]:
df_control_grp=df_merge_2[df_merge_2['Variation']=='Control']

In [53]:
def backward_steps_for_client(steps , client)->dict:
    count_backward=0
    for i in range (1, len(steps)):
         if steps[i-1] - steps[i] != 1:
             count_backward=count_backward+1
    print(count_backward)
    total_steps=len(steps)-1
    print(total_steps)
    
    return {client:count_backward/total_steps if total_steps > 0 else 0}

In [51]:
grouped =df_control_grp.groupby(['client_id', 'visit_id', 'visitor_id'])['process_step'].apply(list).reset_index()

In [52]:
grouped

Unnamed: 0,client_id,visit_id,visitor_id,process_step
0,1028,557292053_87239438319_391157,42237450_62128060588,"[1, 2, 1, 1, 3, 2, 1, 1, 0]"
1,1104,543158812_46395476577_767725,194240915_18158000533,[0]
2,1104,643221571_99977972121_69283,194240915_18158000533,[0]
3,1186,507052512_11309370126_442139,446844663_31615102958,[0]
4,1186,795373564_99931517312_810896,446844663_31615102958,"[2, 1, 0]"
...,...,...,...,...
32230,9997470,655572400_94971272893_411965,91394485_75296404278,[0]
32231,9997470,761490147_96352537762_21814,395791369_55562604618,"[4, 3, 2, 1, 0, 3, 2, 1, 0, 1, 0, 1, 0, 0, 0]"
32232,9997470,904791598_9725982898_416914,395791369_55562604618,[0]
32233,9998346,189177304_69869411700_783154,292425655_16607136645,"[3, 4, 3, 3, 2, 1, 3, 2, 2, 2, 1, 0]"


In [41]:
#print(backward_steps_for_client(grouped['process_step'],grouped['client_id']))

In [54]:
grouped['error_rate'] = grouped.apply(
    lambda row: backward_steps_for_client(row['process_step'], row['client_id'])[row['client_id']],
    axis=1  # Process each row individually
)

# Now calculate the mean error rate across all rows
mean_error_rate = np.mean(grouped['error_rate'])
print("Mean Error Rate:", mean_error_rate)

4
8
0
0
0
0
0
0
0
2
0
4
1
6
0
0
0
0
0
0
0
0
1
4
0
4
0
4
0
1
0
4
1
5
1
5
0
1
3
9
1
5
0
3
0
4
0
0
3
7
0
4
1
4
0
0
4
4
1
5
0
3
3
7
0
4
1
3
3
8
3
4
1
3
0
0
0
3
1
6
1
2
0
1
0
0
0
1
1
2
1
6
1
2
0
0
0
0
0
0
0
2
3
3
1
1
0
4
0
4
0
4
0
1
0
1
0
4
0
0
2
3
1
5
0
0
0
4
1
5
0
4
2
7
0
4
0
0
0
4
2
8
0
3
3
7
0
3
1
1
0
2
1
5
2
5
0
2
2
7
4
8
2
5
2
6
0
4
2
2
0
4
0
4
1
6
0
4
0
1
1
8
2
8
0
0
0
4
2
2
0
2
2
2
0
4
0
4
1
4
0
4
0
4
4
11
0
4
1
1
0
4
1
1
1
1
1
6
1
8
0
4
0
4
0
1
0
3
0
4
3
11
1
6
0
4
0
0
4
6
0
4
2
6
1
4
0
4
0
4
0
4
2
6
0
1
2
8
2
7
0
4
0
4
0
0
0
4
1
7
0
4
0
4
1
1
0
4
0
1
1
5
1
2
1
1
1
1
1
3
0
1
2
8
2
6
0
4
0
3
1
3
0
0
0
4
0
4
1
2
0
1
10
15
0
2
1
3
2
5
0
4
2
8
2
8
0
4
0
4
1
1
2
6
0
0
0
4
0
4
0
4
1
2
0
1
1
4
1
5
1
1
0
1
3
7
0
4
0
4
0
4
0
4
0
4
0
0
0
0
0
3
0
4
0
4
0
0
0
0
0
4
0
1
3
7
2
4
1
5
0
1
0
1
0
0
1
5
0
3
3
3
6
11
1
2
0
3
1
7
0
0
0
4
2
7
1
5
0
1
1
6
3
10
0
0
0
0
0
3
1
1
0
1
0
2
0
4
1
6
2
6
0
1
0
4
3
9
0
4
0
3
0
4
1
1
0
0
1
1
0
4
0
1
0
4
1
1
0
0
3
11
0
0
0
1
0
4
2
6
3
7
3
4
0
0
2
6
0
1
3
10
0
4
0
4


In [55]:
grouped.head(30)

Unnamed: 0,client_id,visit_id,visitor_id,process_step,error_rate
0,1028,557292053_87239438319_391157,42237450_62128060588,"[1, 2, 1, 1, 3, 2, 1, 1, 0]",0.5
1,1104,543158812_46395476577_767725,194240915_18158000533,[0],0.0
2,1104,643221571_99977972121_69283,194240915_18158000533,[0],0.0
3,1186,507052512_11309370126_442139,446844663_31615102958,[0],0.0
4,1186,795373564_99931517312_810896,446844663_31615102958,"[2, 1, 0]",0.0
5,1195,393817425_39015278493_996341,766842522_69992551638,"[4, 3, 2, 1, 0]",0.0
6,1197,71862471_21202285428_848395,753759429_54481946928,"[4, 3, 2, 3, 2, 1, 0]",0.166667
7,1368,784065271_45379483290_309335,366307863_19014662045,[0],0.0
8,2439,848231744_22569944243_37711,607208067_70160939111,[0],0.0
9,2581,182925466_27021409208_83502,770616558_80928163524,[0],0.0
