# Getaround

## EDA

In [2]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

import os

In [3]:
# All input files are read from the "data" folder of the current working directory.
data_path = f"{os.getcwd()}/data/"

# The Excel workbook provides two sheets: the rentals data and the field documentation.
df_delay = pd.read_excel(
    data_path + "get_around_delay_analysis.xlsx",
    sheet_name="rentals_data",
)
doc = pd.read_excel(
    data_path + "get_around_delay_analysis.xlsx",
    sheet_name="Documentation",
)

# The pricing dataset is stored separately as a CSV file.
df_prices = pd.read_csv(data_path + "get_around_pricing_project.csv")

In [4]:
# Quick look at the rentals data: first rows, summary statistics for every
# column, and the number of missing values per column.
display(
    df_delay.head(),
    df_delay.describe(include="all"),
    df_delay.isna().sum(),
)

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
count,21310.0,21310.0,21310,21310,16346.0,1841.0,1841.0
unique,,,2,2,,,
top,,,mobile,ended,,,
freq,,,17003,18045,,,
mean,549712.880338,350030.603426,,,59.701517,550127.411733,279.28843
std,13863.446964,58206.249765,,,1002.561635,13184.023111,254.594486
min,504806.0,159250.0,,,-22433.0,505628.0,0.0
25%,540613.25,317639.0,,,-36.0,540896.0,60.0
50%,550350.0,368717.0,,,9.0,550567.0,180.0
75%,560468.5,394928.0,,,67.0,560823.0,540.0


rental_id                                         0
car_id                                            0
checkin_type                                      0
state                                             0
delay_at_checkout_in_minutes                   4964
previous_ended_rental_id                      19469
time_delta_with_previous_rental_in_minutes    19469
dtype: int64

In [5]:
# Useful function
def remove_outliers_from_column(df, column, std_ratio=3) :
    """Keep the rows whose `column` value is close enough to the column mean.

    A row is kept when its value lies strictly between
    `mean - std_ratio * std` and `mean + std_ratio * std`, where the mean and
    standard deviation are computed on `df[column]`.

    Parameters
    ----------
    df : DataFrame to filter.
    column : label of the column used to detect outliers.
    std_ratio : number of standard deviations tolerated around the mean.

    Returns
    -------
    The rows of `df` selected by the bounds above. Rows with a missing value
    in `column` fail both comparisons and are therefore dropped.
    """
    values = df[column]
    center = values.mean()
    spread = std_ratio * values.std()

    # Both bounds are exclusive.
    within_bounds = values.gt(center - spread) & values.lt(center + spread)

    return df.loc[within_bounds]

def compute_percentage(column) :
    """Express every value of `column` as a percentage of the column total.

    Parameters
    ----------
    column : object exposing `.sum()` and iteration over its values
        (called with a pandas Series in this notebook).

    Returns
    -------
    A list with one percentage per value, in the same order as `column`.
    """
    # Compute the total once instead of once per element.
    total = column.sum()

    return [value / total * 100 for value in column]

def delay_checkout_interval(delay) :
    """Map a checkout delay in minutes to a labelled interval.

    Parameters
    ----------
    delay : numeric delay; values less than or equal to zero count as "Early".

    Returns
    -------
    One of "Early", "Late 0' - 30'", "Late 30' - 60'", "Late 60' - 120'",
    "Late more than 120'", or "NA" when no comparison holds (e.g. NaN).
    """
    if delay <= 0:
        return "Early"

    # Upper bounds are exclusive and checked in increasing order.
    for upper_bound, label in (
        (30, "Late 0' - 30'"),
        (60, "Late 30' - 60'"),
        (120, "Late 60' - 120'"),
    ):
        if delay < upper_bound:
            return label

    # NaN fails every comparison, including this last one, and ends up as "NA".
    return "Late more than 120'" if delay >= 120 else "NA"

In [6]:
# Derived columns used by the rest of the analysis.

# Labelled delay bucket for each rental; missing delays are labelled "NA" by the helper.
df_delay["delay_checkout_interval"] = df_delay["delay_at_checkout_in_minutes"].map(delay_checkout_interval)

# Vectorized comparison instead of an element-wise lambda: NaN > 0 is False,
# so rentals without a recorded delay are flagged as not late.
df_delay["delay_at_checkout"] = df_delay["delay_at_checkout_in_minutes"].gt(0)

# The documentation sheet states that previous_ended_rental_id is NULL when there
# is no previous rental (or it ended more than 12 hours before), so a non-null id
# is what marks a previous rental.
df_delay["has_previous_rental"] = df_delay["previous_ended_rental_id"].notna()

display(df_delay.head())
display(df_delay["delay_checkout_interval"].value_counts())
display(df_delay["delay_at_checkout"].value_counts())
display(df_delay["has_previous_rental"].value_counts())

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,delay_checkout_interval,delay_at_checkout,has_previous_rental
0,505000,363965,mobile,canceled,,,,,False,False
1,507750,269550,mobile,ended,-81.0,,,Early,False,False
2,508131,359049,connect,ended,70.0,,,Late 60' - 120',True,False
3,508865,299063,connect,canceled,,,,,False,False
4,511440,313932,mobile,ended,,,,,False,False


delay_checkout_interval
Early                  6942
NA                     4964
Late 0' - 30'          3254
Late more than 120'    2568
Late 60' - 120'        1860
Late 30' - 60'         1722
Name: count, dtype: int64

delay_at_checkout
False    11906
True      9404
Name: count, dtype: int64

has_previous_rental
False    19469
True      1841
Name: count, dtype: int64

In [7]:
# Lift the column-width limit for this display only, so the long field
# descriptions are shown in full.
with pd.option_context("display.max_colwidth", None):
    display(doc)

Unnamed: 0,field name,Comment
0,rental_id,Unique identifier of the rental
1,car_id,Unique identifier of the car
2,checkin_type,"Flow used for both checkin and checkout. (ie. access and return the car)\nmobile = rental agreement signed on the owner's smartphone\nconnect = car equiped with the Connect technology , opened by the driver with his smartphone.\nNote: paper contracts were excluded from the data as we have no data on their delay at checkout and it's negligible use case"
3,state,canceled means that the rental did not happen (was canceled by the driver or the owner).
4,delay_at_checkout_in_minutes,Difference in minutes between the rental end time requested by the driver when booking the car and the actual time the driver completed the checkout. Negative values mean that the driver returned the car in advance.
5,previous_ended_rental_id,id of the previous ended rental of the car (NULL when no previous rental or delay with previous rental higher than 12 hours)
6,time_delta_with_previous_rental_in_minutes,"Difference in minutes between this rental planned start time and the previous rental planned end time (when lower than 12 hours, NULL if higher)"


### Share

What share of our owners' revenue would potentially be affected by the feature?

### Late habit

How often are drivers late for the next check-in? How does it impact the next driver?

In [None]:
# Count rentals per (delay interval, checkin type), leaving out the rentals
# whose delay interval is "NA".
df_delay_habit = (
    df_delay.loc[df_delay["delay_checkout_interval"].ne("NA")]
    .groupby(["delay_checkout_interval", "checkin_type"])
    .size()
    .reset_index(name="count")
)
# Each percentage is a share of all counted rentals (both checkin types
# together), not a share within one checkin type.
df_delay_habit["percentage"] = compute_percentage(df_delay_habit["count"])

fig_4 = px.bar(
    df_delay_habit,
    x="checkin_type",
    y="percentage",
    color="delay_checkout_interval",
    barmode="group",
    labels=dict(
        checkin_type="Checkin type",
        percentage="Percentage",
        delay_checkout_interval="Delay interval at checkout",
    ),
)

fig_4.show()

In [19]:
# Count rentals per (late or not, checkin type), leaving out the rentals
# whose delay interval is "NA".
df_delay_habit = (
    df_delay.loc[df_delay["delay_checkout_interval"].ne("NA")]
    .groupby(["delay_at_checkout", "checkin_type"])
    .size()
    .reset_index(name="count")
)
# Each percentage is a share of all counted rentals (both checkin types
# together), not a share within one checkin type.
df_delay_habit["percentage"] = compute_percentage(df_delay_habit["count"])

fig_2 = px.bar(
    df_delay_habit,
    x="checkin_type",
    y="percentage",
    color="delay_at_checkout",
    barmode="group",
    labels=dict(
        checkin_type="Checkin type",
        percentage="Percentage",
        delay_at_checkout="Delay at checkout",
    ),
)

fig_2.show()

In [17]:
# Count rentals per (delay interval, has a previous rental), leaving out the
# rentals whose delay interval is "NA".
df_delay_impact = (
    df_delay.loc[df_delay["delay_checkout_interval"].ne("NA")]
    .groupby(["delay_checkout_interval", "has_previous_rental"])
    .size()
    .reset_index(name="count")
)
# Each percentage is a share of all counted rentals, not a share within one
# "has previous rental" group.
df_delay_impact["percentage"] = compute_percentage(df_delay_impact["count"])

fig_3 = px.bar(
    df_delay_impact,
    x="has_previous_rental",
    y="percentage",
    color="delay_checkout_interval",
    barmode="group",
    labels=dict(
        has_previous_rental="Has previous rental",
        percentage="Percentage",
        delay_checkout_interval="Delay interval at checkout",
    ),
)

fig_3.show()

In [None]:
# Count rentals per (late or not, has a previous rental), leaving out the
# rentals whose delay interval is "NA".
df_delay_impact = (
    df_delay.loc[df_delay["delay_checkout_interval"].ne("NA")]
    .groupby(["delay_at_checkout", "has_previous_rental"])
    .size()
    .reset_index(name="count")
)
# Each percentage is a share of all counted rentals, not a share within one
# "has previous rental" group.
df_delay_impact["percentage"] = compute_percentage(df_delay_impact["count"])

fig_5 = px.bar(
    df_delay_impact,
    x="has_previous_rental",
    y="percentage",
    color="delay_at_checkout",
    barmode="group",
    labels=dict(
        has_previous_rental="Has previous rental",
        percentage="Percentage",
        delay_at_checkout="Delay at checkout",
    ),
)

fig_5.show()

**TODO (to develop):** describe late habits by delay interval; having a previous rental does not appear to be the main reason behind a delay at checkout.

### Scope

Should we enable the feature for all cars, or only for Connect cars?

In [8]:
# Pie chart of the number of rentals per checkin type (every rental state included).
fig_2 = px.pie(
    df_delay.groupby("checkin_type").size().reset_index(name="count"),
    names="checkin_type",
    values="count",
    width=500,
    height=500,
)

fig_2.show()

In [15]:
# Canceled rentals that have a previous rental recorded.
df_canceled = df_delay.loc[
    df_delay["state"].eq("canceled") & df_delay["has_previous_rental"].eq(True)
]

# Pie chart of those canceled rentals per checkin type.
fig_3 = px.pie(
    df_canceled.groupby("checkin_type").size().reset_index(name="count"),
    names="checkin_type",
    values="count",
    width=500,
    height=500,
)

fig_3.show()

First observations and intuitions concerning the threshold, leading into the more precise threshold analysis below.

### Threshold

How long should the minimum delay be?

How many rentals would be affected by the feature depending on the threshold and scope we choose?

How many problematic cases will it solve depending on the chosen threshold and scope?

=> Graph the evolution depending on the chosen threshold and scope.

In [None]:
# Trim extreme checkout delays: values below the 1st percentile or above the
# 99th percentile are replaced with NaN, every other value is kept as is.
# This creates the `delays_checkout_min_cleaned` column that the violin plot
# of cleaned delays reads; without it that plot cannot find its x column.
delay_low_cut = df_delay["delay_at_checkout_in_minutes"].quantile(0.01)
delay_high_cut = df_delay["delay_at_checkout_in_minutes"].quantile(0.99)

# `between` is inclusive on both bounds and False for NaN, so missing delays
# stay NaN in the cleaned column.
df_delay["delays_checkout_min_cleaned"] = df_delay["delay_at_checkout_in_minutes"].where(
    df_delay["delay_at_checkout_in_minutes"].between(delay_low_cut, delay_high_cut)
)

In [114]:
# Violin plot of the cleaned checkout delays. The `delays_checkout_min_cleaned`
# column must already exist on df_delay when this cell runs.
fig_1 = px.violin(
    df_delay,
    x="delays_checkout_min_cleaned",
    color_discrete_sequence=[px.colors.qualitative.G10[0]],
).update_layout(
    # No title here: per the original note, "Delay at checkout distribution"
    # is shown as the title on Streamlit.
    xaxis_title="Delay at checkout ",
    yaxis_title="Total",
)

fig_1.show()

In [115]:
# Violin plot of the raw checkout delays (no outlier trimming).
fig_2 = px.violin(
    df_delay,
    x="delay_at_checkout_in_minutes",
    color_discrete_sequence=[px.colors.qualitative.G10[0]],
).update_layout(
    # No title here: per the original note, "Delay at checkout distribution"
    # is shown as the title on Streamlit.
    xaxis_title="Delay at checkout ",
    yaxis_title="Total",
)

fig_2.show()