In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
# from textwrap import wrap

import functions as f

%matplotlib inline

# %run cleaning_demo.ipynb
# %run cleaning_web_data.ipynb
# %run cleaning_experiment_clients.ipynb

# df_demo = import_df_demo() # type: ignore
# df_web_data = import_df_web_data() # type: ignore
# df_experiment_clients = import_df_experiment_clients() # type: ignore

# Load the three cleaned input tables from CSV.
# The paths are relative, so they resolve against the notebook's working directory.
df_demo = pd.read_csv("../data/clean/df_demo_clean.csv")
df_web_data = pd.read_csv("../data/clean/df_web_data_clean.csv")
df_experiment_clients = pd.read_csv("../data/clean/df_experiment_clients_clean.csv")

In [None]:
# Preview the three input frames.
# A bare expression is only rendered when it is the last statement of a cell,
# so the previews are displayed explicitly (first rows only, to keep the
# output short).
display(df_demo.head(), df_web_data.head(), df_experiment_clients.head())

# Column dtypes and non-null counts of the demographic table.
df_demo.info()

In [None]:
# Left-join the demographic attributes onto every web event (key: client_id).
df_merge_1 = df_web_data.merge(df_demo, how="left", on="client_id")

# Left-join the remaining table (experiment assignment) onto the result.
df_merge_2 = df_merge_1.merge(df_experiment_clients, how="left", on="client_id")

df_merge_2



In [None]:
# Summarize the merged frame: column dtypes, non-null counts and memory usage.
df_merge_2.info()

In [None]:
# Spot-check the merge: draw one client_id at random and show all of its rows.
# The draw is unseeded, so every run inspects a different client.

list_of_ids = df_merge_2["client_id"].to_list()
sampled_client_id = random.choice(list_of_ids)
df_merge_2.loc[df_merge_2["client_id"] == sampled_client_id]

In [None]:
# Count the missing (NaN) values in each column of the merged frame.
df_merge_2.isna().sum()

In [None]:
# Discard every row that has at least one missing value, then renumber the
# index so it is contiguous again.
df_merge_2 = df_merge_2.dropna(how="any").reset_index(drop=True)

df_merge_2

In [None]:
# Columns that hold whole numbers but are stored as floats; cast them to int64.
integer_columns = [
    "clnt_tenure_yr",
    "clnt_tenure_mnth",
    "clnt_age",
    "num_accts",
    "calls_6_mnth",
    "logons_6_mnth",
]
df_merge_2 = df_merge_2.astype(dict.fromkeys(integer_columns, "int64"))

df_merge_2

In [None]:
# Parse date_time into real datetimes, then order the rows chronologically
# within each client (both sort keys ascending).
df_merge_2 = (
    df_merge_2
    .assign(date_time=pd.to_datetime(df_merge_2["date_time"]))
    .sort_values(by=["client_id", "date_time"])
)
df_merge_2

In [None]:
# Round the balance (bal) to 2 decimals.
# Series.round is vectorized, so it avoids calling Python's round() once per
# row through .apply. Values that sit exactly on a half-cent boundary in
# floating point may round differently from the built-in round().
df_merge_2['bal'] = df_merge_2['bal'].round(2)
df_merge_2

In [None]:
# Add session flags, then keep a single step-4 row per session.
# session_completed: 1 on every row whose process_step is 4, otherwise 0.
df_merge_2['session_completed'] = (df_merge_2['process_step'] == 4).astype(int)
# session_started: 1 only on the first step-0 row encountered for each visit_id
# (in the current row order); repeated step-0 rows of the same visit get 0.
df_merge_2['session_started'] = ((df_merge_2['process_step'] == 0) & ~df_merge_2.duplicated(subset=['visit_id', 'process_step'], keep='first')).astype(int)

# Isolate the step-4 rows, order them by time and keep only the earliest one
# per (client_id, visit_id, visitor_id). Rows of every other step are kept as is.
df_step_4 = df_merge_2[df_merge_2['process_step'] == 4]
df_step_4_sorted = df_step_4.sort_values(by='date_time', ascending=True)
df_step_4_cleaned = df_step_4_sorted.drop_duplicates(subset=['client_id', 'visit_id', 'visitor_id'], keep='first')
df_non_step_4 = df_merge_2[df_merge_2['process_step'] != 4]
# NOTE(review): the de-duplicated step-4 rows are appended after all other rows,
# so from here on the frame is no longer ordered by client_id / date_time.
# Anything downstream that relies on row order should re-sort first.
df_merge_2 = pd.concat([df_non_step_4, df_step_4_cleaned])
df_merge_2 = df_merge_2.reset_index(drop=True)

In [None]:
# Build a separate frame for the error rate: one row per
# (client_id, visit_id, visitor_id, Variation) group, holding the list of
# process_step values of that group in the row order of df_merge_2.
merged_grouped =df_merge_2.groupby(['client_id', 'visit_id', 'visitor_id','Variation'])['process_step'].apply(list).reset_index()

# Add an error_rate column by calling f.backward_steps_for_client on each row
# with that row's step list, client_id and Variation.
# NOTE(review): the helper lives in the project's `functions` module (not shown);
# its result is indexed with client_id, so it presumably returns a mapping keyed
# by client - confirm against the helper.
merged_grouped['error_rate'] = merged_grouped.apply(
    lambda row: f.backward_steps_for_client(row['process_step'], row['client_id'],row['Variation'])[row['client_id']],
    axis=1  # Process each row individually
)

In [None]:
# Expose the cleaned frame under its export name.
# This binds a second name to the same DataFrame object (no copy is made), so
# changes made through df_join_clean are also visible through df_merge_2.
df_join_clean = df_merge_2

In [None]:
# Evaluate funnel progression: for every row, record how process_step moved
# relative to the previous event of the same visit.
#    1 -> moved forward to a later step
#   -1 -> moved back to an earlier step
#    0 -> stayed on the same step, the row is a step-0 row, or it is the first
#         recorded event of its visit (nothing to compare against)
#
# The comparison is made per visit_id and in date_time order. It therefore does
# not compare the first event of one visit with the last event of another, and
# it does not depend on the current row order of the frame (the step-4 rows
# were appended at the end when they were de-duplicated). It is vectorized
# instead of writing one cell at a time through .iloc.

# Stable sort: events sharing a timestamp keep their existing relative order.
events_by_time = df_join_clean.sort_values("date_time", kind="stable")

# Difference to the previous step inside each visit; NaN for a visit's first event.
step_change = events_by_time.groupby("visit_id")["process_step"].diff()

# sign() collapses the difference to -1 / 0 / 1; a missing predecessor counts as 0.
steps_status = np.sign(step_change).fillna(0).astype(int)

# A step-0 row is always reported as 0, exactly as in the original rule.
steps_status = steps_status.where(events_by_time["process_step"] != 0, 0)

# Column assignment aligns on the index (unique after the earlier reset_index),
# so the rows of df_join_clean keep their current order.
df_join_clean["steps_status"] = steps_status

display(df_join_clean)

In [None]:
# Persist both results through the project's save_to_csv helper: the row-level
# cleaned frame and the per-visit error-rate frame.
# NOTE(review): the destination directory is decided inside f.save_to_csv,
# which is not shown here - confirm where the files end up.
f.save_to_csv(df_join_clean, "df_join_clean.csv")
f.save_to_csv(merged_grouped,"both_groups_error_rate.csv")

In [None]:
# Convenience accessor that hands the cleaned dataframe to callers.
def import_df_join_clean():
    """Return the notebook-level ``df_join_clean`` DataFrame (same object, not a copy)."""
    return df_join_clean