# Exploratory Data Analysis

In [39]:
from pathlib import Path

import pandas as pd
import folium
import matplotlib.pyplot as plt

In [40]:
# Challenge workbook, addressed relative to the kernel's working directory.
DATA_DIR = Path("..") / "data"
excel_file = DATA_DIR / "ChallengeXHEC23022024.xlsx"

In [41]:
# Parse the workbook a single time instead of re-opening it for every sheet.
# With a list passed as `sheet_name`, read_excel returns a dict keyed by the
# requested sheet positions: 0 -> schedule, 1 -> clients, 2 -> caregivers.
workbook_sheets = pd.read_excel(excel_file, sheet_name=[0, 1, 2])
schedule = workbook_sheets[0]
clients = workbook_sheets[1]
caregivers = workbook_sheets[2]

## Simple statistics

### ...on the clients

In [15]:
# Column names, dtypes and non-null counts of the clients frame.
clients.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID Client  118 non-null    int64  
 1   Latitude   118 non-null    float64
 2   Longitude  118 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 2.9 KB


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric client columns.
clients.describe()

Unnamed: 0,ID Client,Latitude,Longitude
count,118.0,118.0,118.0
mean,540141200.0,48.733283,1.361849
std,324996600.0,0.033354,0.077534
min,78690890.0,48.586883,1.186807
25%,297709300.0,48.724503,1.337277
50%,559805900.0,48.731459,1.367189
75%,756172100.0,48.742804,1.383124
max,1453084000.0,48.850785,1.756772


In [17]:
# Number of distinct client IDs; compare with the row count to check that IDs are unique.
clients["ID Client"].nunique()

118

### ...on the caregivers

In [18]:
# Column names, dtypes and non-null counts of the caregivers frame.
caregivers.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID Intervenant      24 non-null     int64  
 1   Latitude            24 non-null     float64
 2   Longitude           24 non-null     float64
 3   Compétences         24 non-null     object 
 4   Permis              23 non-null     object 
 5   Véhicule personnel  23 non-null     object 
 6   Dispo / Indispo     24 non-null     object 
dtypes: float64(2), int64(1), object(4)
memory usage: 1.4+ KB


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric caregiver columns.
caregivers.describe()

Unnamed: 0,ID Intervenant,Latitude,Longitude
count,24.0,24.0,24.0
mean,626682800.0,48.727411,1.330559
std,345536200.0,0.042719,0.097643
min,78005440.0,48.640555,1.073195
25%,413536100.0,48.712092,1.250159
50%,743151900.0,48.729706,1.357099
75%,823602800.0,48.746492,1.379708
max,1452747000.0,48.843207,1.523077


In [20]:
# Number of distinct caregiver IDs; compare with the row count to check that IDs are unique.
caregivers["ID Intervenant"].nunique()

24

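ATTENTION: "Permis" and "Véhicule personnel" each contain one NaN (23 of 24 values are non-null). The value counts below total 23 rows, so both NaNs belong to the same caregiver.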

In [21]:
# Count each ("Permis", "Véhicule personnel") combination.
# Rows containing NaN are dropped by default; pass dropna=False to include them.
caregivers[["Permis", "Véhicule personnel"]].value_counts()

Permis  Véhicule personnel
Oui     Oui                   19
Non     Non                    4
Name: count, dtype: int64

In [22]:
# TODO: for a more advanced analysis, check out each of the competences and the dispo / indispo values

### ...on the schedule

In [23]:
# Column names, dtypes and non-null counts of the schedule frame.
schedule.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2870 entries, 0 to 2869
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ID Client       2870 non-null   int64         
 1   ID Intervenant  2870 non-null   int64         
 2   Date            2870 non-null   datetime64[ns]
 3   Heure de début  2870 non-null   object        
 4   Heure de fin    2870 non-null   object        
 5   Prestation      2870 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 134.7+ KB


In [24]:
# Summary statistics of the numeric and datetime schedule columns.
schedule.describe()

Unnamed: 0,ID Client,ID Intervenant,Date
count,2870.0,2870.0,2870
mean,489523500.0,612156500.0,2024-01-15 23:26:22.996515840
min,78690890.0,78005440.0,2024-01-01 00:00:00
25%,197278700.0,480302400.0,2024-01-09 00:00:00
50%,530138800.0,746414900.0,2024-01-16 00:00:00
75%,712664800.0,838320700.0,2024-01-24 00:00:00
max,1453084000.0,1452747000.0,2024-01-31 00:00:00
std,275922500.0,288514900.0,


In [25]:
# Number of distinct values in the "Prestation" column.
schedule["Prestation"].nunique()

12

In [26]:
# Frequency of each "Prestation" value, most frequent first.
schedule["Prestation"].value_counts()

Prestation
TOILETTE                      1217
REPAS                          910
AIDE MENAGERE                  443
VIE SOCIALE                     96
ACCOMPAGNEMENTS COURSES PA      83
ADMINISTRATION                  79
FORMATION                       15
GARDE D'ENFANTS                 10
COORDINATION                    10
FEMME DE MENAGE                  4
HOMMES TOUTES MAINS              2
VISITE MEDICALE                  1
Name: count, dtype: int64

In [27]:
# TODO: for a more advanced analysis, check out start / end time of service, avg. duration of service, services by weekend, etc.

## Map clients and caregivers

In [28]:
# Centre the map on the mean latitude / longitude of the clients.
# NOTE: the name `map` shadows Python's built-in `map`; it is kept because
# the marker and display cells refer to the map by this name.
map_center = [clients["Latitude"].mean(), clients["Longitude"].mean()]
map = folium.Map(location=map_center, zoom_start=10, control_scale=True)

In [29]:
# Add one circle marker per row: clients in blue, caregivers in red.
# A single loop over (frame, colour) pairs replaces two copy-pasted loops, and
# zipping the coordinate columns avoids building a Series per row with iterrows().
# NOTE: re-running this cell adds the markers to `map` again.
for frame, color in ((clients, "blue"), (caregivers, "red")):
    for latitude, longitude in zip(frame["Latitude"], frame["Longitude"]):
        folium.CircleMarker(
            [latitude, longitude],
            color=color,
            fill_color=color,
        ).add_to(map)

In [30]:
# Display the map as the cell output.
map

## Number of times each service (Prestation) occurs per client and date

In [46]:
# For every (client, date) pair, count how often each "Prestation" value occurs,
# then tally how many (client, date, service) combinations occur more than once
# (True) versus at most once (False). No sorting is needed: row order affects
# neither the comparison nor the final tally.
service_counts = schedule.groupby(["ID Client", "Date"])["Prestation"].value_counts()
service_counts.gt(1).value_counts()

count
False    1486
True      653
Name: count, dtype: int64

In [47]:
# Display the schedule frame (pandas abbreviates long frames in the rendered output).
schedule

Unnamed: 0,ID Client,ID Intervenant,Date,Heure de début,Heure de fin,Prestation
0,559475456,162858075,2024-01-01,07:15:00,07:45:00,REPAS
1,559277088,162858075,2024-01-01,07:45:00,08:30:00,TOILETTE
2,87852633,78007018,2024-01-01,07:45:00,08:30:00,TOILETTE
3,243033408,810259688,2024-01-01,07:45:00,08:15:00,TOILETTE
4,814940942,710283561,2024-01-01,07:45:00,09:20:00,TOILETTE
...,...,...,...,...,...,...
2865,559277088,710283561,2024-01-31,19:00:00,19:15:00,REPAS
2866,714782168,810259688,2024-01-31,19:00:00,20:00:00,REPAS
2867,559475456,710283561,2024-01-31,19:15:00,20:00:00,TOILETTE
2868,803656603,854577575,2024-01-31,19:20:00,19:50:00,TOILETTE


In [48]:
# Display the caregivers frame.
caregivers

Unnamed: 0,ID Intervenant,Latitude,Longitude,Compétences,Permis,Véhicule personnel,Dispo / Indispo
0,838320706,48.738516,1.391971,"AIDE MENAGERE, REPAS, TOILETTE",Oui,Oui,"Indispo 01/01, 30/01, 31/01"
1,609468992,48.640555,1.232581,"TOILETTE, REPAS, VIE SOCIALE, AIDE MENAGERE",Oui,Oui,"Indispo Tous les mercredis + 13/01, 14/01, 27/..."
2,78012267,48.729206,1.371985,"HOMMES TOUTES MAINS, JARDINAGE",Oui,Oui,Dispo le 25/01
3,818696864,48.744702,1.357921,"REPAS, AIDE MENAGERE, ACCOMPAGNEMENTS COURSES,...",Oui,Oui,Indispo tous les samedis et dimanche
4,746414886,48.769455,1.197644,"TOILETTE, REPAS, AIDE MENAGERE, ACCOMPAGNEMENT...",Oui,Oui,Indispo tous les mercredis + le 05/01
5,78005437,48.712418,1.356278,"AIDE MENAGERE, ADMINISTRATION, REPAS, TOILETTE...",Oui,Oui,Indispo tous les samedis et dimanche
6,213237245,48.763226,1.24112,"TOILETTE, REPAS, AIDE MENAGERE",Oui,Oui,"Indispo 01/01, 02/01, 03/01, 13/01, 14/01, 16/..."
7,813991780,48.692715,1.073195,"TOILETTE, REPAS, AIDE MENAGERE, ACCOMPAGNEMENT...",Oui,Oui,"Indispo tous les mercredis, vendredi, samedi, ..."
8,856089133,48.645418,1.523077,"ACCOMPAGNEMENTS COURSES PA, REPAS, TOILETTE, A...",Non,Non,"Indispo 05/01, 09/01, 11/01 + du 18/01 au 31/0..."
9,700168298,48.758099,1.210611,"TOILETTE, REPAS, AIDE MENAGERE, ACCOMPAGNEMENT...",Oui,Oui,"Dispo le 01/01, 30/01 et 31/01"
