In [82]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
data = pd.read_csv("ab_data.csv")

In [4]:
data.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [5]:
data.tail()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
294473,751197,2017-01-03 22:28:38.630509,control,old_page,0
294474,945152,2017-01-12 00:51:57.078372,control,old_page,0
294475,734608,2017-01-22 11:45:03.439544,control,old_page,0
294476,697314,2017-01-15 01:20:28.957438,control,old_page,0
294477,715931,2017-01-16 12:40:24.467417,treatment,new_page,0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [54]:
data.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [55]:
data.user_id.nunique()

290584

In [57]:
data["user_id"].duplicated().sum()

np.int64(3894)

In [61]:
dup = data.query("user_id.duplicated() == True")
dup.shape

(3894, 5)

On a des `user_id` dupliqués 

Vérifions s'il y a une incohérence entre `group` and `landing_page`.
En effet, je suppose que si `group == control` alors `landing_page == old_page` et `group == treatment` implique que `landing_page == new_page`. On vérifie donc si c'est bien le cas dans notre jeu de données.

In [66]:
data_mismatch = data[(data["group"]=="treatment") & (data["landing_page"]=="old_page")
                |(data["group"]=="control") & (data["landing_page"]=="new_page")]
n_mismatch = data_mismatch.shape[0]
percentage_mismatch = n_mismatch / data.shape[0] * 100
print(f"Nombre de lignes incohérentes: {n_mismatch}")
print(f"Pourcentage de lignes incohérentes : {percentage_mismatch :.2f} %")

Nombre de lignes incohérentes: 3893
Pourcentage de lignes incohérentes : 1.32 %


Comme on peut le voir, il y a 3893 lignes non correspondantes, ce qui signifie que le groupe de traitement (`treatment`) est associé à l’ancienne page (`old_page`) et que le groupe de contrôle (`control`) est associé à la nouvelle page (`new_page`). 

Il y a également 3895 doublons de `user_id`, dont certains ont chargé à la fois la nouvelle et l’ancienne page. Cela pourrait indiquer que le département concerné a répété l’expérience après avoir identifié le problème de correspondance. Par conséquent, nous devons conserver uniquement les données correctes et supprimer les données non correspondantes.

In [67]:
data.drop_duplicates(subset="user_id",inplace=True) 

In [68]:
data = data.query("group == 'control' and landing_page == 'old_page' or group == 'treatment' and landing_page == 'new_page'")

data.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [69]:
data.shape

(288540, 5)

In [70]:
data["user_id"].duplicated().sum() 

np.int64(0)

In [71]:
control_data = data[data.landing_page == "old_page"]
control_data.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1
5,936923,2017-01-10 15:20:49.083499,control,old_page,0
7,719014,2017-01-17 01:48:29.539573,control,old_page,0


In [72]:
control_data.shape

(144226, 5)

In [73]:
treatment_data = data[data.landing_page == "new_page"]
treatment_data.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
6,679687,2017-01-19 03:26:46.940749,treatment,new_page,1
8,817355,2017-01-04 17:58:08.979471,treatment,new_page,1
9,839785,2017-01-15 18:11:06.610965,treatment,new_page,1


In [74]:
treatment_data.shape

(144314, 5)

In [85]:
# Create a 1x3 subplot figure
fig = make_subplots(rows=1, cols=3, subplot_titles=("Distribution des données totales", "Conversion pour chaque groupe", "Taux de conversion pour chaque groupe"))

# 1st plot: Histogram
hist = px.histogram(data, x="converted")
fig.add_trace(hist.data[0], row=1, col=1)

# 2nd plot: Count plot
count = px.histogram(data, x="converted", color="group", barmode="group")
for trace in count.data:
    fig.add_trace(trace, row=1, col=2)

# 3rd plot: Scatter plot (simulating the point plot) for conversion rate per group
# Calculate mean conversion rates
group_means = data.groupby("group")["converted"].mean().reset_index()
scatter = go.Scatter(x=group_means["group"], y=group_means["converted"], mode="markers+lines", marker=dict(size=8))
fig.add_trace(scatter, row=1, col=3)

# Update axis titles and other parameters
fig.update_xaxes(title_text="Converted", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=2)
fig.update_yaxes(title_text="Converted Rate", range=[0.115, 0.125], row=1, col=3)

# Display the figure
fig.show()

In [75]:
def compute_conversion_rate(data: pd.DataFrame) -> float:
    return (data[data.converted == 1].shape[0] / data.shape[0]) * 100

In [76]:
conversion_rate_control = compute_conversion_rate(control_data)
print(f"Taux de conversion sur le groupe de contrôle: {conversion_rate_control:.2f} %")

Taux de conversion sur le groupe de contrôle: 12.03 %


In [77]:
conversion_rate_treatment = compute_conversion_rate(treatment_data)
print(f"Taux de conversion sur le groupe de contrôle: {conversion_rate_treatment:.2f} %")

Taux de conversion sur le groupe de contrôle: 11.87 %


In [80]:
# Conversion Rate
df_counts = pd.DataFrame({"Control_count": data[data.group=="control"].converted.value_counts(),
              "Treatment_count": data[data.group=="treatment"].converted.value_counts(),
              "Control_conversion_rate": conversion_rate_control,
              "Treatment_conversion_rate": conversion_rate_treatment})
df_counts

Unnamed: 0_level_0,Control_count,Treatment_count,Control_conversion_rate,Treatment_conversion_rate
converted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,126877,127180,12.029038,11.872722
1,17349,17134,12.029038,11.872722


Les taux de conversions semblent égaux pour les deux groupes.

Faisons un test d'hypothèse pour confirmer ou infirmer cette hypothèse.


Hypothèse nulle (`H0`): Les deux groupes ont les mêmes taux de conversions.

Hypothèse alternative (`H1`): Les deux groupes ont des teux de conversions différents.

Quel test statistique allons nous utiliser ?


In [None]:
import scipy.stats as stats