# Expansion Potential

In [None]:
import os

import pandas as pd
import numpy as np
from scipy.stats import poisson
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from src.dataloader import load_and_save_data
from src.optimiser import main
from src.client_generator import add_new_clients_and_sessions

# K-means on Multiple Features

In [None]:
# Open the workbook once and read the three sheets this notebook works with.
file_path = "../data/ChallengeXHEC23022024.xlsx"
excel_data = pd.ExcelFile(file_path)

jan24_df = pd.read_excel(excel_data, sheet_name="JAN24")
clients_df = pd.read_excel(excel_data, sheet_name="clients")
intervenants_df = pd.read_excel(excel_data, sheet_name="intervenants")

# Reference point for the distance feature.
paris_center_coords = {"Latitude": 48.864716, "Longitude": 2.349014}

# Euclidean distance computed directly on the latitude/longitude values.
# NOTE(review): the result is in raw degrees, not kilometres, and a degree of
# longitude is shorter than a degree of latitude at this latitude - confirm
# this approximation is acceptable for the clustering feature.
lat_offset = clients_df["Latitude"] - paris_center_coords["Latitude"]
lon_offset = clients_df["Longitude"] - paris_center_coords["Longitude"]
clients_df["Distance from Paris Center"] = np.sqrt(
    lat_offset**2 + lon_offset**2
)

# Attach the same placeholder date to both time columns so they become full
# timestamps that can be subtracted from each other.
fixed_date = pd.Timestamp("2024-01-01")
date_prefix = fixed_date.strftime("%Y-%m-%d") + " "
for time_column in ("Heure de début", "Heure de fin"):
    jan24_df[time_column] = pd.to_datetime(
        date_prefix + jan24_df[time_column].astype(str)
    )

# Length of each session, in minutes.
session_length = jan24_df["Heure de fin"] - jan24_df["Heure de début"]
jan24_df["Service Duration"] = session_length.dt.total_seconds() / 60

In [None]:
# Per-client totals for the month: number of sessions and minutes of service.
sessions_by_client = jan24_df.groupby("ID Client")
client_service_count = sessions_by_client["Prestation"].count()
client_service_duration = sessions_by_client["Service Duration"].sum()

# Left join keeps every client of the "clients" sheet; clients without any
# session in jan24_df end up with NaN in the two aggregate columns.
combined_client_data = (
    clients_df.set_index("ID Client")
    .join([client_service_count, client_service_duration], how="left")
    .rename(
        columns={
            "Prestation": "Total Services",
            "Service Duration": "Total Service Duration",
        }
    )
)

In [None]:
# Number of distinct prestation types each client received. The series is
# named before the join so no rename is needed afterwards.
service_variety = (
    jan24_df.groupby("ID Client")["Prestation"]
    .nunique()
    .rename("Service Variety")
)

# Count of distinct dates present in the session sheet; dates on which no
# session took place are therefore not counted.
total_days_in_january = jan24_df["Date"].nunique()

combined_client_data = (
    combined_client_data.assign(
        **{
            "Average Service Duration": lambda frame: (
                frame["Total Service Duration"] / frame["Total Services"]
            )
        }
    )
    # Drop a copy left by a previous run of this cell: joining and renaming
    # again would otherwise create a second "Service Variety" column.
    .drop(columns=["Service Variety"], errors="ignore")
    .join(service_variety, how="left")
    .assign(
        **{
            "Service Frequency": lambda frame: (
                frame["Total Services"] / total_days_in_january
            )
        }
    )
)

In [None]:
feature_columns = [
    "Distance from Paris Center",
    "Total Services",
    "Total Service Duration",
    "Average Service Duration",
    "Service Variety",
    "Service Frequency",
]

# The per-client aggregates were attached with a left join, so a client with
# no session has NaN in the activity columns. KMeans refuses NaN input, so
# those clients are treated as having zero activity.
features_for_clustering = combined_client_data[feature_columns].fillna(0)

# Standardise so that no single feature dominates the Euclidean distance.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_for_clustering)

# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; pin it if cluster numbering must be stable across
# environments (later cells refer to clusters by number).
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(features_scaled)

# One label per row of combined_client_data, in the same row order.
combined_client_data["Cluster"] = kmeans.labels_

combined_client_data.head()

In [None]:
# Number of clients assigned to each cluster, ordered by cluster label.
cluster_labels = combined_client_data["Cluster"]
cluster_counts = cluster_labels.value_counts().sort_index()
print(cluster_counts)

In [None]:
# Print summary statistics of every column, one cluster at a time.
for cluster_id in range(kmeans.n_clusters):
    in_cluster = combined_client_data["Cluster"] == cluster_id
    print(f"Cluster {cluster_id} Statistics:")
    print(combined_client_data[in_cluster].describe())
    print("\n")

# Generate Random Clients

In [None]:
# Display the per-client feature table, including the "Cluster" label column.
combined_client_data

In [None]:
# Attach each client's cluster label to the client table. Any "Cluster" column
# left by a previous run is dropped first; without that, re-running this cell
# fails because the joined column name already exists.
clients_df = clients_df.drop(columns="Cluster", errors="ignore").join(
    combined_client_data["Cluster"], on="ID Client"
)

# Generate Random Sessions from Clients
- From the clients in the same cluster, create a new client by randomly sampling their properties.
- Group the sessions by client cluster and prestation, and count how many times each event occurred. Then randomly generate events for the new client according to the average number of occurrences over the period.


In [None]:
# Re-read the session sheet so the time columns are back in their original
# form, then attach the client attributes (including "Cluster") to every session.
jan24_df = pd.read_excel(excel_data, sheet_name="JAN24")
df = jan24_df.merge(clients_df, how="left", on="ID Client").assign(
    weekday=lambda sessions: sessions["Date"].dt.weekday
)
df

## Model events with a Poisson Distribution

### Method 1
For each cluster, we know: 
- How many events a given day has
- Probability of the prestation.

For every day, we assign prestations based on probabilities. 

### Method 2

For each cluster: 
1) Get the total number of events in the whole month per client
2) Calculate the probability of an event occurring on a given day (a vector of length 31)
    - Model this as a Poisson distribution
3) Using this Poisson distribution, draw a new series of events for a new client, distributed over the days of the month.
4) Once we have the events for the client, assign them prestations based on the empirical probabilities

In [None]:
# Average number of sessions per client over the whole period, per cluster,
# rounded to a whole number of events.
# Stored under its own name: a later cell binds `mean_events_per_cluster` to a
# per-day rate, which is a different quantity.
events_per_client = df.groupby(["Cluster", "ID Client"])["Prestation"].count()
mean_monthly_events_per_cluster = (
    events_per_client.groupby("Cluster").mean().round().astype(int)
)
mean_monthly_events_per_cluster

In [None]:
# For each cluster, the share of that cluster's sessions falling on each date.
session_dates_by_cluster = df.groupby("Cluster")["Date"]
date_probabilities = session_dates_by_cluster.value_counts(normalize=True)

In [None]:
# Sessions per (cluster, date), divided by the number of distinct clients in
# the cluster: events per client for each date on which the cluster had sessions.
sessions_per_cluster_day = df.groupby(["Cluster", "Date"])["Prestation"].count()
clients_per_cluster = df.groupby("Cluster")["ID Client"].nunique()
event_counts_per_day = sessions_per_cluster_day / clients_per_cluster

# Average of those per-date values within each cluster.
mean_events_per_cluster = event_counts_per_day.groupby(level=0).mean()
mean_events_per_cluster

In [None]:
persona_group = 2

# Draw one Poisson event count per observed date, using the cluster's average
# daily rate, and label each draw with its date.
daily_rate = mean_events_per_cluster.loc[persona_group]
n_observed_days = df["Date"].nunique()
observed_dates = df["Date"].unique()
pd.Series(
    poisson.rvs(mu=daily_rate, size=n_observed_days),
    index=observed_dates,
)

In [None]:
# Empirical share of each prestation type within every cluster; the shares of
# one cluster sum to 1 because the counts are normalised per group.
prestations_by_cluster = df.groupby("Cluster")["Prestation"]
event_probabilities = prestations_by_cluster.value_counts(normalize=True)
event_probabilities

- For each persona type, we write a function that:
    - Generates a new client ID
    - Randomly chooses the location from one of the clients 
    - Generates similar number of events for every persona - client combo (Uniform random that has +-3 events)
    - For each event, chooses a pre-existing time and adds a +- 30 minute uniform to start time and multiplies the duration with a random factor
- Add client ID and Coordinates to clients_df
- Add All sessions to dataframe

In [None]:
# Simulate one additional client of the chosen cluster and a month of sessions
# for that client, then append both to copies of the original tables.
persona_group = 2

# Single seeded generator shared by every random draw below, so that a fresh
# run of this cell reproduces the same simulated client.
rng = np.random.default_rng(0)

# Raw copies of the workbook sheets, read by position from the already-open
# workbook (presumably 1 = clients and 0 = JAN24 - confirm the sheet order).
df_clients = pd.read_excel(excel_data, sheet_name=1)
df_sessions = pd.read_excel(excel_data, sheet_name=0)
df_persona = df[df["Cluster"] == persona_group].copy()

# Borrow the coordinates of one randomly chosen session of this cluster.
client_loc = (
    df_persona[["Latitude", "Longitude"]]
    .sample(1, random_state=rng)
    .iloc[0]
    .to_dict()
)

# Generate a key that no existing client uses; reusing the sampled client's
# own ID would create a duplicate key and attribute the simulated sessions to
# a real client.
# NOTE(review): assumes "ID Client" holds integers in the 8-digit range used
# here - confirm against the clients sheet.
existing_client_ids = set(df_clients["ID Client"])
new_client_id = int(rng.integers(10_000_000, 100_000_000))
while new_client_id in existing_client_ids:
    new_client_id = int(rng.integers(10_000_000, 100_000_000))

# Join new client to client dataset
new_client = pd.DataFrame(
    {"ID Client": new_client_id} | client_loc, index=[len(df_clients)]
)
new_df_clients = pd.concat([df_clients, new_client])

# Events per client for each date of this cluster, and their average. The
# average is computed here so the cell does not rely on a value left behind
# by another cell.
event_counts_per_day = (
    df.groupby(["Cluster", "Date"])["Prestation"].count()
    / df.groupby("Cluster")["ID Client"].nunique()
)
mu = event_counts_per_day.loc[persona_group]
mean_daily_rate = mu.mean()

# One Poisson draw per observed date. The size is taken from the same array
# that provides the index, so the two cannot disagree in length.
simulation_dates = df["Date"].dropna().unique()
sim_event_counts = pd.Series(
    poisson.rvs(
        mu=mean_daily_rate,
        size=len(simulation_dates),
        random_state=rng,
    ),
    index=simulation_dates,
)

# Get event probabilities
probs = event_probabilities.loc[persona_group]
prestation_names = probs.index.to_numpy()
time_columns = ["Heure de début", "Heure de fin"]

# Build the sessions date by date; batches are collected in a list and
# concatenated once instead of growing a DataFrame inside the loop.
session_batches = []
for date, count in sim_event_counts.items():
    count = int(count)
    if count == 0:
        continue

    # Assign events based on probabilities
    events = rng.choice(prestation_names, size=count, p=probs.values)

    # For every drawn prestation, copy the start/end time of one random
    # existing session of the same type within this cluster.
    times = pd.DataFrame(
        [
            df_persona.loc[df_persona["Prestation"] == event, time_columns]
            .sample(1, random_state=rng)
            .iloc[0]
            .to_list()
            for event in events
        ],
        columns=time_columns,
    )

    # A single randomly chosen intervenant serves all of the day's sessions.
    intervenant_id = (
        intervenants_df["ID Intervenant"].sample(1, random_state=rng).iloc[0]
    )

    session_batches.append(
        pd.DataFrame(
            {
                "ID Client": [new_client_id] * count,
                "ID Intervenant": [intervenant_id] * count,
                "Date": [date] * count,
                "Heure de début": times["Heure de début"].to_list(),
                "Heure de fin": times["Heure de fin"].to_list(),
                "Prestation": events,
            }
        )
    )

# All simulated sessions; an empty frame with the session columns when every
# Poisson draw was zero.
if session_batches:
    new_events = pd.concat(session_batches, ignore_index=True)
else:
    new_events = pd.DataFrame(columns=df_sessions.columns)

new_df_sessions = (
    pd.concat([df_sessions, *session_batches])
    .sort_values(by="Date")
    .reset_index(drop=True)
)
new_df_sessions

In [None]:
# Display the simulated number of events per date drawn in the previous cell.
sim_event_counts

# Test

- Generate random scenarios of clients for n_clients equal to 1,2,3,4,5
- Run the optimisation and see which ones have a feasible scenario

In [None]:
# The calls below use paths relative to the parent directory
# ("data/ChallengeXHEC23022024.xlsx"). Move up only when that file is not
# already reachable, so re-running this cell does not climb one level further
# each time.
if not os.path.exists("data/ChallengeXHEC23022024.xlsx"):
    os.chdir("..")
os.getcwd()

In [None]:
# Generate 5 extra clients and their sessions with the project helper.
# NOTE(review): "13212" presumably lists one cluster/persona id per new
# client (five characters for five clients) - confirm against
# src.client_generator.
generator_options = {
    "excel_file": "data/ChallengeXHEC23022024.xlsx",
    "random_client_segment": False,
    "client_personas_sequence": "13212",
}
df_clients, df_sessions = add_new_clients_and_sessions(5, **generator_options)

In [None]:
# Display the session table returned by add_new_clients_and_sessions.
df_sessions

In [None]:
# NOTE(review): generate_new_clients=False is passed together with n_clients=5;
# confirm in src.dataloader how the two options interact.
load_and_save_data(generate_new_clients=False, n_clients=5)

In [None]:
# Run the optimiser entry point imported from src.optimiser with both options
# switched off.
# NOTE(review): presumably this relaxes the availability and competence
# constraints - confirm in src.optimiser.
main(include_availability=False, filter_for_competence=False)