In [2]:
import pandas as pd
import plotly.express as px

# Getaround EDA : analyse exploratoire des données

Cette analyse de données doit permettre de répondre aux deux questions suivantes.
**Notre Product Manager doit encore trancher sur les points suivants :**
- **seuil** : quelle doit être la durée minimale du délai entre deux locations ?
- **périmètre** : faut-il activer cette fonctionnalité pour tous les véhicules ou uniquement pour les véhicules Connect ?
On peut également approfondir l'exploration en s'appuyant sur les questions :
- **impact** : quel est l'impact de cette fonctionnalité sur le nombre de locations ?
- Quelle part des revenus des propriétaires serait potentiellement affectée par cette fonctionnalité ?
- Combien de locations seraient impactées en fonction du seuil et du périmètre choisis ?
- À quelle fréquence les conducteurs sont-ils en retard pour le check-in suivant ? Quel est l’impact pour le conducteur suivant ?
- Combien de situations problématiques seraient résolues selon le seuil et le périmètre retenus ?



## Analyse

| field name	|   Comment	|   
| ----------- | ----------- |
| rental_id	|       Unique identifier of the rental	|   
| car_id	    |       Unique identifier of the car	|   
| checkin_type |   	Flow used for both checkin and checkout. (i.e. access and return the car) <br> mobile = rental agreement signed on the owner's smartphone <br> connect = car equipped with the Connect technology, opened by the driver with his smartphone.<br> **Note:** paper contracts were excluded from the data as we have no data on their delay at checkout and it's a negligible use case |   
| state	   |        canceled means that the rental did not happen (was canceled by the driver or the owner).	|   
| delay_at_checkout_in_minutes	|    Difference in minutes between the rental end time requested by the driver when booking the car and the actual time the driver completed the checkout. Negative values mean that the driver returned the car in advance.|   
| previous_ended_rental_id	|    id of the previous ended rental of the car (NULL when no previous rental or delay with previous rental higher than 12 hours)	|   
| time_delta_with_previous_rental_in_minutes	|    Difference in minutes between this rental planned start time and the previous rental planned end time (when lower than 12 hours, NULL if higher)	|   
        

In [3]:
# Load the raw delay dataset from its two exports.
# `df` (semicolon-separated CSV) is the frame the rest of the notebook works on;
# `delays_df` (Excel workbook) is only previewed here.
df = pd.read_csv(
    "../data/raw/get_around_delay_analysis.csv",
    sep=";",
)
delays_df = pd.read_excel("../data/raw/get_around_delay_analysis.xlsx")

# Preview the first rows of the Excel version.
delays_df.head()

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


In [4]:
# Quick structural overview of the CSV-loaded frame: summary statistics,
# column dtypes / non-null counts, then the first ten rows.
display(df.describe())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" to the output.
df.info()

display(df.head(10))

Unnamed: 0,rental_id,car_id,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,Unnamed: 7,Unnamed: 8
count,21310.0,21310.0,16346.0,1841.0,1841.0,0.0,0.0
mean,549712.880338,350030.603426,59.701517,550127.411733,279.28843,,
std,13863.446964,58206.249765,1002.561635,13184.023111,254.594486,,
min,504806.0,159250.0,-22433.0,505628.0,0.0,,
25%,540613.25,317639.0,-36.0,540896.0,60.0,,
50%,550350.0,368717.0,9.0,550567.0,180.0,,
75%,560468.5,394928.0,67.0,560823.0,540.0,,
max,576401.0,417675.0,71084.0,575053.0,720.0,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21310 entries, 0 to 21309
Data columns (total 9 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   rental_id                                   21310 non-null  int64  
 1   car_id                                      21310 non-null  int64  
 2   checkin_type                                21310 non-null  object 
 3   state                                       21310 non-null  object 
 4   delay_at_checkout_in_minutes                16346 non-null  float64
 5   previous_ended_rental_id                    1841 non-null   float64
 6   time_delta_with_previous_rental_in_minutes  1841 non-null   float64
 7   Unnamed: 7                                  0 non-null      float64
 8   Unnamed: 8                                  0 non-null      float64
dtypes: float64(5), int64(2), object(2)
memory usage: 1.5+ MB


None

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,Unnamed: 7,Unnamed: 8
0,505000,363965,mobile,canceled,,,,,
1,507750,269550,mobile,ended,-81.0,,,,
2,508131,359049,connect,ended,70.0,,,,
3,508865,299063,connect,canceled,,,,,
4,511440,313932,mobile,ended,,,,,
5,511626,398802,mobile,ended,-203.0,,,,
6,511639,370585,connect,ended,-15.0,563782.0,570.0,,
7,512303,371242,mobile,ended,-44.0,,,,
8,512475,322502,mobile,canceled,,,,,
9,513434,256528,connect,ended,23.0,,,,


In [6]:
# Distribution of the checkout delay (in minutes): first over every rental,
# then restricted to rentals whose check-in type is "mobile".
# The subset is named after the filter actually applied (check-in type).
df_mobile = df.loc[
    df["checkin_type"] == "mobile",
    ["rental_id", "delay_at_checkout_in_minutes"],
]

# One loop instead of two copy-pasted blocks; each figure gets a title so the
# charts can be told apart when the notebook is skimmed.
for title, frame in [
    ("Checkout delay - all rentals", df),
    ("Checkout delay - mobile check-in", df_mobile),
]:
    fig = px.histogram(frame, x="delay_at_checkout_in_minutes", nbins=100, title=title)
    # Explicit 5-minute bins between -720 and +720 minutes (i.e. +/- 12 hours).
    fig.update_traces(xbins=dict(start=-720.0, end=720.0, size=5))
    fig.show()

In [9]:
# Three histograms sharing the same binning:
#   1. checkout delay for rentals in state "ended",
#   2. checkout delay for rentals with check-in type "connect",
#   3. planned gap to the previous rental for rentals in state "canceled".
# Each subset gets its own name, so `df_ended` really holds ended rentals
# once the cell has run instead of being overwritten by the other filters.
df_ended = df.loc[
    df["state"] == "ended",
    ["rental_id", "delay_at_checkout_in_minutes"],
]
df_connect = df.loc[
    df["checkin_type"] == "connect",
    ["rental_id", "delay_at_checkout_in_minutes"],
]
df_canceled_gap = df.loc[
    df["state"] == "canceled",
    ["rental_id", "time_delta_with_previous_rental_in_minutes"],
]

# One loop instead of three copy-pasted blocks; titles make the figures
# distinguishable from one another.
for title, frame, column in [
    ("Checkout delay - ended rentals", df_ended, "delay_at_checkout_in_minutes"),
    ("Checkout delay - connect check-in", df_connect, "delay_at_checkout_in_minutes"),
    (
        "Gap with previous rental - canceled rentals",
        df_canceled_gap,
        "time_delta_with_previous_rental_in_minutes",
    ),
]:
    fig = px.histogram(frame, x=column, nbins=100, title=title)
    # Explicit 5-minute bins between -720 and +720 minutes (i.e. +/- 12 hours).
    fig.update_traces(xbins=dict(start=-720.0, end=720.0, size=5))
    fig.show()

In [28]:
# Inspect every rental whose state is "canceled" (all columns kept).
df.loc[df["state"].eq("canceled")]

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,Unnamed: 7,Unnamed: 8
0,505000,363965,mobile,canceled,,,,,
3,508865,299063,connect,canceled,,,,,
8,512475,322502,mobile,canceled,,,,,
10,513743,330658,mobile,canceled,,,,,
11,514161,366037,connect,canceled,,,,,
...,...,...,...,...,...,...,...,...,...
21283,569325,345079,mobile,canceled,,,,,
21287,569764,405347,mobile,canceled,,,,,
21288,570001,386413,connect,canceled,,,,,
21297,571481,311841,mobile,canceled,,,,,


In [24]:
# Checkout delay restricted to rentals in state "canceled".
df_canceled = df.loc[
    df["state"] == "canceled",
    ["rental_id", "delay_at_checkout_in_minutes"],
]
fig = px.histogram(df_canceled, x="delay_at_checkout_in_minutes", nbins=100)

fig.show()

# Display the frame built above. The name must match the assignment exactly:
# a `df_cancelled` spelling (double "l") is not defined anywhere in this
# notebook and fails with NameError on a fresh kernel.
df_canceled

Unnamed: 0,rental_id,delay_at_checkout_in_minutes
0,505000,
3,508865,
8,512475,
10,513743,
11,514161,
...,...,...
21283,569325,
21287,569764,
21288,570001,
21297,571481,


In [20]:
# Rentals whose checkout happened within 60 minutes of the scheduled end time,
# early or late (bounds included). Rows without a recorded delay are excluded
# because a missing value never satisfies the range test.
# The frame is named after the filter actually applied: it selects on checkout
# delay, not on rental state, so it must not reuse (and overwrite) the
# `df_canceled` name that holds the canceled rentals.
df_within_hour = df[df["delay_at_checkout_in_minutes"].between(-60, 60)]
df_within_hour

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,Unnamed: 7,Unnamed: 8
6,511639,370585,connect,ended,-15.0,563782.0,570.0,,
7,512303,371242,mobile,ended,-44.0,,,,
9,513434,256528,connect,ended,23.0,,,,
13,515147,257466,mobile,ended,15.0,,,,
19,519491,312389,mobile,ended,58.0,545639.0,420.0,,
...,...,...,...,...,...,...,...,...,...
21292,570440,292303,mobile,ended,32.0,,,,
21293,570957,390479,mobile,ended,7.0,,,,
21294,571154,400293,mobile,ended,2.0,,,,
21296,571359,357612,mobile,ended,52.0,,,,


In [34]:
# Canceled rentals that follow a previous rental of the same car with a
# strictly positive planned gap (rows with no recorded gap are dropped, since
# a missing value is never > 0).
# Both comparisons are parenthesised: `&` binds tighter than `>`, so without
# the second pair the expression is evaluated as
# `((state == "canceled") & time_delta) > 0`, which only gives the intended
# rows as long as no gap is negative.
df_analysis = df[
    (df["state"] == "canceled")
    & (df["time_delta_with_previous_rental_in_minutes"] > 0)
]

In [38]:
# Show only the identifying columns and the gap to the previous rental.
df_analysis.loc[
    :,
    [
        "rental_id",
        "car_id",
        "checkin_type",
        "state",
        "time_delta_with_previous_rental_in_minutes",
    ],
]

Unnamed: 0,rental_id,car_id,checkin_type,state,time_delta_with_previous_rental_in_minutes
204,543768,374169,connect,canceled,210.0
242,546160,352528,connect,canceled,630.0
504,564627,341431,mobile,canceled,150.0
637,568657,317378,connect,canceled,210.0
669,516550,377700,mobile,canceled,720.0
...,...,...,...,...,...
21022,560787,413181,mobile,canceled,150.0
21172,566228,390871,connect,canceled,60.0
21230,569706,245154,connect,canceled,660.0
21269,568049,381499,connect,canceled,720.0


In [49]:
# Histogram of the planned gap (in minutes) between a rental and the previous
# one, for the rows selected in `df_analysis`.
fig = px.histogram(
    df_analysis,
    x="time_delta_with_previous_rental_in_minutes",
    nbins=200,
)
# Explicit 30-minute bins starting at 0 and running up to 1000 minutes.
fig.update_traces(xbins={"start": 0, "end": 1000.0, "size": 30})
fig.show()