# Getaround

## EDA

In [None]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Both sheets live in the same workbook: download and parse it once, then split the sheets out
# (passing a list to `sheet_name` makes `read_excel` return a dict keyed by sheet name).
DATA_URL = "https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/get_around_delay_analysis.xlsx"
sheets = pd.read_excel(DATA_URL, sheet_name=["rentals_data", "Documentation"])

df_delay = sheets["rentals_data"]  # rental records
doc = sheets["Documentation"]      # description of each field

In [3]:
# Quick look at the data: first rows, summary statistics and missing values per column.
display(
    df_delay.head(),
    df_delay.describe(include="all"),
    df_delay.isna().sum(),
)

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,


Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
count,21310.0,21310.0,21310,21310,16346.0,1841.0,1841.0
unique,,,2,2,,,
top,,,mobile,ended,,,
freq,,,17003,18045,,,
mean,549712.880338,350030.603426,,,59.701517,550127.411733,279.28843
std,13863.446964,58206.249765,,,1002.561635,13184.023111,254.594486
min,504806.0,159250.0,,,-22433.0,505628.0,0.0
25%,540613.25,317639.0,,,-36.0,540896.0,60.0
50%,550350.0,368717.0,,,9.0,550567.0,180.0
75%,560468.5,394928.0,,,67.0,560823.0,540.0


rental_id                                         0
car_id                                            0
checkin_type                                      0
state                                             0
delay_at_checkout_in_minutes                   4964
previous_ended_rental_id                      19469
time_delta_with_previous_rental_in_minutes    19469
dtype: int64

In [4]:
# Useful function
def remove_outliers_column_based(df, column, std_ratio=3):
    """Keep only the rows whose `column` value is close enough to the column mean.

    A row is kept when its value lies strictly between
    `mean - std_ratio * std` and `mean + std_ratio * std`.
    Rows where `column` is missing are dropped as well, because
    comparisons against NaN evaluate to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter (not modified).
    column : str
        Name of the numeric column used to detect outliers.
    std_ratio : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The selected rows of `df`, original index preserved.
    """
    center = df[column].mean()
    spread = df[column].std()

    lower_bound = center - std_ratio * spread
    upper_bound = center + std_ratio * spread

    is_above_lower = df[column] > lower_bound
    is_below_upper = df[column] < upper_bound

    return df.loc[is_above_lower & is_below_upper]


def compute_percentage_on_column(column):
    """Express every value of `column` as a percentage of the column total.

    Parameters
    ----------
    column : iterable exposing a ``.sum()`` method (e.g. a pandas Series)
        Values to convert.

    Returns
    -------
    list
        One percentage per input value, in the same order.
    """
    # The total is loop-invariant: compute it once rather than once per element.
    total = column.sum()

    return [value / total * 100 for value in column]


def create_column_delay_checkout_range(delay):
    """Map a checkout delay (in minutes) to a human-readable range label.

    Delays of zero or less are labelled "Early"; positive delays fall into
    the buckets [0, 30), [30, 60), [60, 120) and 120 or more. Any value for
    which none of the comparisons holds (e.g. NaN) is labelled "NA".
    """
    if delay <= 0:
        return "Early"
    if delay < 30:
        return "Late 0' - 30'"
    if delay < 60:
        return "Late 30' - 60'"
    if delay < 120:
        return "Late 60' - 120'"
    if delay >= 120:
        return "Late more than 120'"

    # Only reached when every comparison above is False, which is the case for NaN.
    return "NA"

In [5]:
# NOTE: this cell rebinds `df_delay`; run it once per kernel session (reload the data before re-running it).

# Drop extreme checkout delays (rows without a checkout delay are dropped by the same filter).
df_delay = remove_outliers_column_based(df_delay, "delay_at_checkout_in_minutes")

# Lookup table giving the checkout delay of each rental, keyed so that it can be
# joined back as "the delay of the previous rental".
df_delay_previous = df_delay[["rental_id", "delay_at_checkout_in_minutes"]].rename(
    columns={
        "rental_id": "previous_ended_rental_id",
        "delay_at_checkout_in_minutes": "previous_delay_at_checkout_in_minutes",
    }
)
df_delay = df_delay.merge(df_delay_previous, how="left", on="previous_ended_rental_id")

df_delay = df_delay.assign(
    # Positive overlap: the previous rental was returned after this one was planned to start.
    overlap=lambda d: d["previous_delay_at_checkout_in_minutes"] - d["time_delta_with_previous_rental_in_minutes"],
    # Unknown overlap (NaN) compares as False, i.e. is treated as "not delayed".
    rental_started_with_delay=lambda d: d["overlap"] > 0,
    delay_at_checkout=lambda d: d["delay_at_checkout_in_minutes"] > 0,
    delay_checkout_range=lambda d: d["delay_at_checkout_in_minutes"].transform(create_column_delay_checkout_range),
)

display(
    df_delay.head(),
    df_delay["delay_checkout_range"].value_counts(),
    df_delay["rental_started_with_delay"].value_counts(),
)

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes,previous_delay_at_checkout_in_minutes,overlap,rental_started_with_delay,delay_at_checkout,delay_checkout_range
0,507750,269550,mobile,ended,-81.0,,,,,False,False,Early
1,508131,359049,connect,ended,70.0,,,,,False,True,Late 60' - 120'
2,511626,398802,mobile,ended,-203.0,,,,,False,False,Early
3,511639,370585,connect,ended,-15.0,563782.0,570.0,136.0,-434.0,False,False,Early
4,512303,371242,mobile,ended,-44.0,,,,,False,False,Early


delay_checkout_range
Early                  6932
Late 0' - 30'          3254
Late more than 120'    2508
Late 60' - 120'        1860
Late 30' - 60'         1722
Name: count, dtype: int64

rental_started_with_delay
False    16107
True       169
Name: count, dtype: int64

In [6]:
# (rows, columns) of the rentals table after the preprocessing cell above
df_delay.shape

(16276, 12)

In [21]:
# Lift the column-width limit so that the long field descriptions are shown in full.
with pd.option_context("display.max_colwidth", None):
    display(doc)

Unnamed: 0,field name,Comment
0,rental_id,Unique identifier of the rental
1,car_id,Unique identifier of the car
2,checkin_type,"Flow used for both checkin and checkout. (ie. access and return the car)\nmobile = rental agreement signed on the owner's smartphone\nconnect = car equiped with the Connect technology , opened by the driver with his smartphone.\nNote: paper contracts were excluded from the data as we have no data on their delay at checkout and it's negligible use case"
3,state,canceled means that the rental did not happen (was canceled by the driver or the owner).
4,delay_at_checkout_in_minutes,Difference in minutes between the rental end time requested by the driver when booking the car and the actual time the driver completed the checkout. Negative values mean that the driver returned the car in advance.
5,previous_ended_rental_id,id of the previous ended rental of the car (NULL when no previous rental or delay with previous rental higher than 12 hours)
6,time_delta_with_previous_rental_in_minutes,"Difference in minutes between this rental planned start time and the previous rental planned end time (when lower than 12 hours, NULL if higher)"


### Overview

#### Scope

**Should we enable the feature for all cars, or only for Connect cars?**

In [None]:
# Number of rentals per check-in flow.
checkin_type_counts = df_delay.groupby("checkin_type").size().reset_index(name="count")

fig_1 = px.pie(
    checkin_type_counts,
    values="count",
    names="checkin_type",
    title="Percentage of checkin type",
)
fig_1.show()

In [8]:
# Count late vs. not-late checkouts. The column names are set explicitly because the names
# produced by `value_counts().reset_index()` differ between pandas 1.x and pandas 2.x
# (the counts column is only called "count" from pandas 2.0 onwards).
delay_at_checkout_counts = (
    df_delay["delay_at_checkout"]
    .value_counts()
    .rename_axis("delay_at_checkout")
    .reset_index(name="count")
)

fig_2 = px.pie(
    data_frame=delay_at_checkout_counts,
    names="delay_at_checkout",
    values="count",
    labels={
        "count": "Count",
        "delay_at_checkout": "Delay at checkout"
    },
    title="Percentage of delays"
)

fig_2.show()

From this global view, we can see that the mobile check-in procedure accounts for 80% of all rentals, compared with 20% for connected cars. We also see that over 57% of rentals end late, representing 9344 rentals out of 16276. Given the importance of these figures, we're going to take a closer look at these delays.

Analysis of the data set shows that only 9.3% of rentals have information on the previous rental; this limitation will be pointed out wherever it affects the results.

#### Late habit

**How often are drivers late for the next check-in? How does it impact the next driver ?**

In [None]:
# Distribution of checkout delays (in minutes) over the filtered rentals.
histogram_color = px.colors.qualitative.G10[0]

fig_3 = px.histogram(
    df_delay,
    x="delay_at_checkout_in_minutes",
    title="Delay at checkout distribution",
    color_discrete_sequence=[histogram_color],
)
fig_3.update_layout(xaxis_title="Delay at checkout", yaxis_title="Total")
fig_3.show()

A closer look reveals that delays are normally distributed. However, it's important to note that outliers more than three standard deviations from the mean have been removed, as some values seemed to indicate delays of up to almost 49 days, which seems highly unlikely.

In [273]:
# Number of rentals per (delay range, check-in type), ignoring rentals without a known delay range.
df_delay_habit = (
    df_delay[df_delay["delay_checkout_range"] != "NA"]
    .groupby(["delay_checkout_range", "checkin_type"])
    .size()
    .reset_index(name="count")
)

# Total number of rentals of each check-in type, broadcast back onto every row of that type.
# Using a grouped transform works for any set of check-in types instead of hard-coding
# "connect" and "mobile".
df_delay_habit["total"] = df_delay_habit.groupby("checkin_type")["count"].transform("sum")

# Share of each delay range within its own check-in type.
df_delay_habit["percentage"] = df_delay_habit["count"] * 100 / df_delay_habit["total"]

fig_4 = px.bar(
    data_frame=df_delay_habit,
    x="checkin_type",
    y="percentage",
    color="delay_checkout_range",
    labels={
        "checkin_type": "Checkin type",
        "percentage": "percentage",
        "delay_checkout_range": "Delay range at checkout"
    },
    title="Distribution of delays by range and type of checkin",
    barmode="group"
)

fig_4.show()

We already know that more than half of all rentals end late, but this chart takes our analysis a step further. We can see that connected cars tend to be proportionally less late at the checkout, probably thanks to their faster procedure. Also, it's clear that delay at checkout is mainly within the range 0 to 120 minutes (2 hours), with a majority under 30.

In [97]:
# Restrict to rentals with a known delay range and a known previous rental.
has_known_range = df_delay["delay_checkout_range"] != "NA"
has_previous_rental = df_delay["previous_ended_rental_id"].notna()

# Count rentals for each (late checkout, late start) combination.
df_delay_impact = (
    df_delay[has_known_range & has_previous_rental]
    .groupby(["delay_at_checkout", "rental_started_with_delay"])
    .size()
    .reset_index(name="count")
)
# Percentages are relative to all the rentals kept above, not to each group separately.
df_delay_impact["percentage"] = compute_percentage_on_column(df_delay_impact["count"])

fig_5 = px.bar(
    df_delay_impact,
    x="delay_at_checkout",
    y="percentage",
    color="rental_started_with_delay",
    barmode="group",
    title="Percentage of rentals with late checkout by late start",
    labels={
        "delay_at_checkout": "Delay at checkout",
        "percentage": "Percentage",
        "rental_started_with_delay": "Rental started with delay",
    },
)
fig_5.show()

Surprisingly, a late check-in does not appear to be the main reason for late checkouts, as only 7.47% of them seem to be due to it. But we have to be careful with these figures, as they only concern rentals for which we have information on previous checkouts (i.e. only 9.3% of all data).

In view of our owners' profit, as well as the dissatisfaction of customers who have to wait, it seems necessary to think about setting a minimum delay between two rentals, both to avoid cancellations caused by a late return from the previous driver and to reduce our customers' frustration. For this delay to be relevant, we will try several thresholds and take the check-in type into account.

### Threshold

In [None]:
def create_plot_from_thresholds(df, thresholds, column, x_label, y_label, title, only_positive_values, compute_percentage):
    """Plot, for each check-in type, how many rentals fall at or below each threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Rentals; must contain a "checkin_type" column and `column`.
    thresholds : iterable of numbers
        Candidate thresholds, used as the x axis. The value plotted for the
        first threshold is always forced to 0 (see below).
    column : str
        Numeric column compared against each threshold.
    x_label, y_label, title : str
        Axis titles and figure title.
    only_positive_values : bool
        When True, only rows with a strictly positive `column` value are
        considered, matching the `> 0` definition of a late rental used for
        "rental_started_with_delay" and "delay_at_checkout".
    compute_percentage : bool
        When True, counts are expressed as a percentage of the rows considered
        for the check-in type, and no "total" trace is drawn.

    Returns
    -------
    plotly.graph_objects.Figure
        One line per check-in type, plus a "total" line in count mode.
    """
    fig = go.Figure()

    thresholds = list(thresholds)
    total_values = np.zeros(len(thresholds))

    for checkin_type in df["checkin_type"].unique():
        # Rows considered for this check-in type; this does not depend on the threshold,
        # so it is built once per type.
        mask = df["checkin_type"] == checkin_type
        if only_positive_values:
            mask = mask & (df[column] > 0)

        population = int(mask.sum())

        values = []
        for threshold in thresholds:
            count = int((mask & (df[column] <= threshold)).sum())

            if compute_percentage:
                # No row to compare against: the percentage is undefined rather than an error.
                count = count * 100 / population if population else float("nan")

            values.append(count)

        # Set first value to 0 since we don't want to include early and on time rentals
        # when there's no threshold (i.e. set to 0)
        if values:
            values[0] = 0

        total_values += np.array(values)

        fig.add_trace(go.Scatter(x=thresholds, y=values, mode="lines+markers", name=checkin_type))

    # A sum of per-type percentages is meaningless, so the total is only drawn for raw counts.
    if not compute_percentage:
        fig.add_trace(go.Scatter(x=thresholds, y=total_values, mode="lines+markers", name="total"))

    fig.update_layout(title=title, xaxis_title=x_label, yaxis_title=y_label)

    return fig

In [304]:
# Candidate delays between two rentals: 0 to 180 minutes inclusive, in 10-minute steps
# (the stop value of `range` is exclusive, hence 181).
thresholds = list(range(0, 181, 10))

To determine the optimum delay between two rentals, we test several thresholds from 0 to 180 minutes (3 hours) in 10-minute steps.

**How many problematic cases will it solve depending on the chosen threshold and scope ?**

In [None]:
# Late starts (positive overlap with the previous rental) covered by each threshold, as raw counts.
fig_1 = create_plot_from_thresholds(
    df=df_delay,
    thresholds=thresholds,
    column="overlap",
    x_label="Threshold (min)",
    y_label="Problematic cases solved",
    title="Number of problematic cases solved depending the threshold and checkin type",
    only_positive_values=True,
    compute_percentage=False,
)
fig_1.show()

Before commenting on the results, it's important to note that this graph concerns only those rentals for which we have information on the previous one (i.e. only 9.3% of all data) and which started late, which represents only 169 cases in total (around 1.03%). We can see that with a threshold of 20 minutes, it would already be possible to resolve half the problematic cases, and more than three-quarters if it were 60 minutes (1 hour). However, we rely on too few data to establish a reliable threshold.

As it seems unreliable to use only rentals for which we have information on the previous rental, we will use all of them for the following analysis, with the bias that we don't know the time delta between two rentals.

In [None]:
# Late checkouts covered by each threshold, as raw counts, over all rentals.
fig_2 = create_plot_from_thresholds(
    df=df_delay,
    thresholds=thresholds,
    column="delay_at_checkout_in_minutes",
    x_label="Threshold (min)",
    y_label="Problematic cases solved",
    title="Number of problematic cases solved depending the threshold and checkin type",
    only_positive_values=True,
    compute_percentage=False,
)
fig_2.show()

Unlike before, we're talking here about all rentals that finished late, without taking into account whether they started late, as we don't have the information. In total, there were 9344 problematic cases, which could be halved with an overall threshold of 50 minutes.

However, although setting up a delay between two rentals solves some cases, it does have an impact on rentals returned early and on time, as owners won't be able to return their car to the rental market immediately afterwards, which can lead to lost sales. In the next section, we'll measure the number of rentals affected by the threshold and, above all, what share of the owners' revenue this represents.

**How many rentals would be affected by the feature depending on the threshold and scope we choose?**

In [None]:
# Rentals (early, on time or late) whose checkout delay is at or below each threshold, as raw counts.
fig_3 = create_plot_from_thresholds(
    df=df_delay,
    thresholds=thresholds,
    column="delay_at_checkout_in_minutes",
    x_label="Threshold (min)",
    y_label="Rentals affected",
    title="Number of rentals affected depending the threshold and checkin type",
    only_positive_values=False,
    compute_percentage=False,
)
fig_3.show()

As mentioned above, setting a delay between two rentals is not trivial, as it affects all rentals below the threshold, especially those that are early or on time. We can see from the graph that with a threshold of just 10 minutes, half of all rentals are already affected. And if we go back to the 50-minute threshold, which would resolve half the problem cases, this would affect 70% of all rentals.

Given the large proportion of rentals affected by the introduction of a threshold, it would be interesting to see what this represents in terms of percentage of revenue for owners.

**Which share of our owner’s revenue would potentially be affected by the feature ?**

In [None]:
# Same selection as the "rentals affected" plot, expressed as a percentage within each check-in type.
fig_4 = create_plot_from_thresholds(
    df=df_delay,
    thresholds=thresholds,
    column="delay_at_checkout_in_minutes",
    x_label="Threshold (min)",
    y_label="Potential loss of rental income (%)",
    title="Percentage of potential loss of rental income depending the threshold and checkin type",
    only_positive_values=False,
    compute_percentage=True,
)
fig_4.show()

First of all, it's important to note that market shares are calculated by checkin type and not for all rentals. As the distribution of checkin types is very unbalanced, connect rentals would be under-represented, whereas in reality they are less prone to delays, and the introduction of a threshold would therefore be more detrimental to their owners. We can see from the graph that even with a relatively low threshold, a large proportion of revenues can be affected by the introduction of a delay between two rentals. Another important detail is that we're talking about potential loss of rental income here, as we don't have information on all previous rentals, so we don't know if there's already a time delta between two rentals (if so, the rental is potentially less impacted by the threshold).

**How long should the minimum delay be ?**

From a purely business point of view, given that there are few delays due to others, having the lowest possible threshold would enable us to resolve a proportion of problem cases, while avoiding too many affected rentals and therefore too much potential loss of revenue for owners. For mobile rentals, a threshold of 20 minutes would resolve over 25% of problem cases, while affecting just over 54% of all rentals of this type. However, it doesn't seem appropriate to deploy this feature on connect rentals: they are much less likely to be late, so although it would certainly resolve some problem cases (around 30%), the number of rentals impacted (and therefore the possible loss of revenue for owners) would be too high - almost 70%.

However, if only customer satisfaction is taken into account, the threshold would have to be high enough to resolve a large proportion of problem cases, and thus avoid waiting and customer frustration. To resolve around 50% of problem cases, we'd need a threshold of 1 hour for mobile rentals and 40 minutes for connect rentals. It is possible to go further and avoid 75% of problem cases with a threshold of 2 hours 20 minutes for mobile rentals and 1 hour 30 minutes for connect.

To sum up, a high threshold for all types of rental will result in greater customer satisfaction, but a potentially high loss of rental income for owners. The scope of functionality and the ideal threshold will depend on the desired vision, according to objectives in terms of cases solved and sales.