In [15]:
import os
import pandas as pd
import plotly.graph_objects as go

In [16]:
PATH = "datasets/real_test_case/"

# Collect the CSV files in a sorted order: os.listdir() returns entries in
# arbitrary order, and a stable order keeps the concatenated row order
# reproducible between runs and machines.
csv_paths = sorted(
    os.path.join(PATH, filename)
    for filename in os.listdir(PATH)
    if filename.endswith(".csv")
)
if not csv_paths:
    # pd.concat([]) would otherwise fail with a generic
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError(f"No .csv files found in {PATH!r}")

dfs = [pd.read_csv(filepath) for filepath in csv_paths]
df = pd.concat(dfs, ignore_index=True)

# Parse the timestamp column once and derive both date features from it.
from_date = pd.to_datetime(df["from_date"])
df["visit_date"] = from_date.dt.date
df["week_day"] = from_date.dt.weekday  # 0 = Monday ... 6 = Sunday
df = df.drop(["from_date", "to_date"], axis=1)

## Anomalies Detection Methodology

Building on the suspected anomalies identified during the data preparation and analysis stage, the following approach will be applied to detect anomalies both programmatically and visually.

**Central Tendency Measure**: For each metric, the median will be used in order to mitigate the impact of outliers, particularly within groups containing a small number of records.
For the *daily visits metric*, the records will be grouped by weekday, resulting in four records per group, reflecting the four-week observation period.
For the *gender proportion* and *age proportion metrics*, the daily records will be grouped by gender and age respectively, resulting in 28 records per group.
 
**Deviation Calculation:** The deviation of each record from its respective group median will be computed using the following formula:
$$
\text{Deviation} = \frac{| x_n - \text{Med}(x) |}{\text{Med}(x)}
$$

**Outlier Detection:** An adjustable threshold is defined for each metric; a record is flagged as an outlier when its calculated deviation exceeds the threshold of its metric.

In [17]:
# Relative-deviation cut-offs, one per metric. A record is flagged as an outlier
# when |value - group median| / group median is strictly greater than the
# threshold of its metric (0.4 means 40 % away from the median).
DAILY_VISITS_DEVIATION_THRESHOLD = 0.4
GENDER_REPARTITION_DEVIATION_THRESHOLD = 0.4
AGE_REPARTITION_DEVIATION_THRESHOLD = 0.4

## Daily Visits Metric

In [18]:
# Total visits per calendar day; week_day is part of the key so that it is
# carried through the aggregation as a column.
daily = (
    df.groupby(["visit_date", "week_day"])["count"]
    .sum()
    .reset_index()
)

# Median of the daily totals for each weekday.
daily_stats = (
    daily.groupby("week_day")["count"]
    .median()
    .rename("week_day_median")
    .reset_index()
)
daily_stats

Unnamed: 0,week_day,week_day_median
0,0,11885.5
1,1,11633.5
2,2,10708.0
3,3,13902.5
4,4,12302.0
5,5,23168.5
6,6,16417.5


In [19]:
# Attach the weekday median to every day. A previously merged column is dropped
# first so the cell can be re-run: merging a second time would otherwise create
# week_day_median_x / week_day_median_y and the lines below would raise KeyError.
daily = daily.drop(columns=["week_day_median"], errors="ignore").merge(
    daily_stats, on="week_day", how="left"
)

# Relative absolute deviation of the daily total from its weekday median.
daily["deviation"] = (
    (daily["week_day_median"] - daily["count"]).abs() / daily["week_day_median"]
)
daily["is_outlier"] = daily["deviation"] > DAILY_VISITS_DEVIATION_THRESHOLD

# Show only the flagged days.
daily[daily["is_outlier"]]

Unnamed: 0,visit_date,week_day,count,week_day_median,deviation,is_outlier
19,2024-01-27,5,2410,23168.5,0.895979,True
20,2024-01-28,6,2386,16417.5,0.854667,True


In [20]:
week_day_names = {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}

fig = go.Figure()

# Daily totals as bars; days flagged as outliers are drawn in red.
fig.add_trace(
    go.Bar(
        x=daily["visit_date"],
        y=daily["count"],
        marker=dict(color=daily["is_outlier"].map({True: "red", False: "#7FB3D5"})),
        name="Count",
    )
)

# Weekday median overlaid as a line for visual comparison with the bars.
fig.add_trace(
    go.Scatter(
        x=daily["visit_date"],
        y=daily["week_day_median"],
        mode="lines",
        name="Week Day Median",
    )
)

# Build exactly one tick per plotted day. The ticks come from `daily` (one row
# per date), not from the raw `df`, which holds many rows per date and would
# pass a long list of duplicated tick values and labels to the layout.
tick_dates = daily["visit_date"]
tick_labels = [
    f"{date.strftime('%Y-%m-%d')} ({week_day_names[date.weekday()]})"
    for date in tick_dates
]

fig.update_layout(
    title="Daily Visits Diagram - Real Test Case",
    xaxis=dict(
        title="Date", tickangle=45, tickvals=tick_dates, ticktext=tick_labels
    ),
    yaxis=dict(title="Visits"),
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

fig.show()

## Gender Proportion Metric

In [21]:
# Visits per (day, gender), plus the day's total and the gender share in percent.
gender_df = (
    df.groupby(["visit_date", "gender"])["count"]
    .sum()
    .reset_index()
    .assign(
        total_count=lambda g: g.groupby("visit_date")["count"].transform("sum"),
        proportion=lambda g: g["count"].div(g["total_count"]).mul(100),
    )
)

In [22]:
# Median daily share (in percent) for each gender.
gender_stats = (
    gender_df.groupby("gender")["proportion"]
    .median()
    .rename("median")
    .reset_index()
)
gender_stats

Unnamed: 0,gender,median
0,man,40.374269
1,woman,59.625731


In [23]:
# Attach the per-gender median to every row. A previously merged "median" column
# is dropped first so the cell can be re-run: merging a second time would
# otherwise create median_x / median_y and the lines below would raise KeyError.
gender_df = gender_df.drop(columns=["median"], errors="ignore").merge(
    gender_stats, on="gender", how="left"
)

# Relative absolute deviation of the daily share from the gender's median share.
gender_df["deviation"] = (
    (gender_df["median"] - gender_df["proportion"]).abs() / gender_df["median"]
)
gender_df["is_outlier"] = (
    gender_df["deviation"] > GENDER_REPARTITION_DEVIATION_THRESHOLD
)

# Show only the flagged rows.
gender_df[gender_df["is_outlier"]]

Unnamed: 0,visit_date,gender,count,total_count,proportion,median,deviation,is_outlier
14,2024-01-15,man,11801,14799,79.741874,40.374269,0.975067,True
15,2024-01-15,woman,2998,14799,20.258126,59.625731,0.660245,True
16,2024-01-16,man,9740,12247,79.529681,40.374269,0.969811,True
17,2024-01-16,woman,2507,12247,20.470319,59.625731,0.656686,True
18,2024-01-17,man,9364,11625,80.550538,40.374269,0.995096,True
19,2024-01-17,woman,2261,11625,19.449462,59.625731,0.673808,True
20,2024-01-18,man,10887,13547,80.364656,40.374269,0.990492,True
21,2024-01-18,woman,2660,13547,19.635344,59.625731,0.67069,True
22,2024-01-19,man,12167,15090,80.629556,40.374269,0.997053,True
23,2024-01-19,woman,2923,15090,19.370444,59.625731,0.675133,True


In [24]:
# One stacked bar trace per gender; outlier days are marked with a red hatch.
# The loop variable is named `gender` because it iterates over gender values.
traces = []
for gender, color in zip(["man", "woman"], ["#7FB3D5", "#99CC99"]):
    filtered_df = gender_df[gender_df["gender"] == gender]
    trace = go.Bar(
        x=filtered_df["visit_date"],
        y=filtered_df["proportion"],
        name=gender,
        marker=dict(
            color=color,
            pattern=dict(
                # "/" hatch on outlier bars, no pattern on regular bars.
                shape=filtered_df["is_outlier"].map({True: "/", False: ""}),
                solidity=0.5,
                fgcolor="red",
                fgopacity=0.8,
            ),
        ),
        hovertext=filtered_df["proportion"].round(2).astype(str) + "%",
        hoverinfo="text+name",
    )
    traces.append(trace)

layout = go.Layout(
    title="Gender Groups Diagram - Real Test Case",
    xaxis=dict(
        title="Date",
        tickangle=45,
        # One tick per distinct day (gender_df has one row per day and gender).
        tickvals=gender_df["visit_date"].unique(),
        tickformat="%Y-%m-%d",
    ),
    yaxis=dict(title="Proportion (%)"),
    barmode="stack",
)

fig = go.Figure(data=traces, layout=layout)

fig.show()

## Age Proportion Metric

In [25]:
# Visits per (day, age group), plus the day's total and the group share in percent.
age_df = (
    df.groupby(["visit_date", "age"])["count"]
    .sum()
    .reset_index()
    .assign(
        total_count=lambda g: g.groupby("visit_date")["count"].transform("sum"),
        proportion=lambda g: g["count"].div(g["total_count"]).mul(100),
    )
)

In [26]:
# Median daily share (in percent) for each age group.
age_stats = (
    age_df.groupby("age")["proportion"]
    .median()
    .rename("median")
    .reset_index()
)
age_stats

Unnamed: 0,age,median
0,adult,74.139672
1,old,15.911161
2,young,9.998024


In [27]:
# Attach the per-age-group median to every row. A previously merged "median"
# column is dropped first so the cell can be re-run: merging a second time would
# otherwise create median_x / median_y and the lines below would raise KeyError.
age_df = age_df.drop(columns=["median"], errors="ignore").merge(
    age_stats, on="age", how="left"
)

# Relative absolute deviation of the daily share from the group's median share.
age_df["deviation"] = (age_df["median"] - age_df["proportion"]).abs() / age_df["median"]
age_df["is_outlier"] = age_df["deviation"] > AGE_REPARTITION_DEVIATION_THRESHOLD

# Show only the flagged rows.
age_df[age_df["is_outlier"]]

Unnamed: 0,visit_date,age,count,total_count,proportion,median,deviation,is_outlier
42,2024-01-22,adult,1951,9525,20.48294,74.139672,0.723725,True
43,2024-01-22,old,7168,9525,75.254593,15.911161,3.729673,True
44,2024-01-22,young,406,9525,4.262467,9.998024,0.573669,True
45,2024-01-23,adult,2868,11780,24.34635,74.139672,0.671615,True
46,2024-01-23,old,8387,11780,71.196944,15.911161,3.474654,True
47,2024-01-23,young,525,11780,4.456706,9.998024,0.554241,True
48,2024-01-24,adult,1887,9791,19.272802,74.139672,0.740047,True
49,2024-01-24,old,7285,9791,74.405066,15.911161,3.676281,True
51,2024-01-25,adult,3292,15586,21.121519,74.139672,0.715112,True
52,2024-01-25,old,11409,15586,73.200308,15.911161,3.600563,True


In [28]:
# Bar colour for each age group, in stacking order.
age_colors = {"adult": "#7FB3D5", "old": "#99CC99", "young": "#FFCC99"}

# One stacked bar trace per age group; outlier days get a red "/" hatch.
traces = []
for age_group, color in age_colors.items():
    group_rows = age_df[age_df["age"] == age_group]
    hatch = dict(
        shape=group_rows["is_outlier"].map({True: "/", False: ""}),
        solidity=0.5,
        fgcolor="red",
        fgopacity=0.8,
    )
    hover_labels = group_rows["proportion"].round(2).astype(str) + "%"
    traces.append(
        go.Bar(
            x=group_rows["visit_date"],
            y=group_rows["proportion"],
            name=age_group,
            marker=dict(color=color, pattern=hatch),
            hovertext=hover_labels,
            hoverinfo="text+name",
        )
    )

# One tick per distinct day (age_df has one row per day and age group).
x_axis = dict(
    title="Date",
    tickangle=45,
    tickvals=age_df["visit_date"].unique(),
    tickformat="%Y-%m-%d",
)
layout = go.Layout(
    title="Age Groups Diagram - Real Test Case",
    xaxis=x_axis,
    yaxis=dict(title="Proportion (%)"),
    barmode="stack",
)

fig = go.Figure(data=traces, layout=layout)

fig.show()