# Getaround

## EDA

In [1]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

import os

In [2]:
# Folder holding the CSV inputs, resolved against the current working directory.
data_path = f"{os.getcwd()}/data/"

# Pricing dataset, loaded as-is (cleaning happens in later cells).
df_prices = pd.read_csv(f"{data_path}get_around_pricing_project.csv")

In [3]:
# Useful function
def remove_outliers_column_based(df, column, std_ratio=3) :
    """Return the rows of ``df`` whose ``column`` value is not an outlier.

    A value is kept when it lies strictly within ``std_ratio`` standard
    deviations of the column mean. Rows with a missing value in ``column``
    are dropped, because they can never satisfy the comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column used to detect outliers.
    std_ratio : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that passed the filter, with their original index.
    """
    values = df[column]

    # Compute the statistics once instead of once per comparison.
    center = values.mean()
    spread = values.std()

    # ``not spread > 0`` is true for both 0 (constant column) and NaN (fewer
    # than two values). There is no dispersion to measure in either case, so
    # nothing can be called an outlier; the strict comparison below would
    # otherwise discard every row.
    if not spread > 0:
        return df.loc[values.notna()]

    tolerance = std_ratio * spread
    mask = (values > center - tolerance) & (values < center + tolerance)

    return df.loc[mask]

In [4]:
# First look at the raw pricing data: sample rows, summary statistics for
# every column, and the number of missing values per column.
display(
    df_prices.head(),
    df_prices.describe(include="all"),
    df_prices.isna().sum(),
)

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843.0,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,,28,,,4,10,8,2,2,2,2,2,2,2,
top,,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,2421.0,,140962.8,128.98823,,,,,,,,,,,121.214536
std,1398.198007,,60196.74,38.99336,,,,,,,,,,,33.568268
min,0.0,,-64.0,0.0,,,,,,,,,,,10.0
25%,1210.5,,102913.5,100.0,,,,,,,,,,,104.0
50%,2421.0,,141080.0,120.0,,,,,,,,,,,119.0
75%,3631.5,,175195.5,135.0,,,,,,,,,,,136.0


Unnamed: 0                   0
model_key                    0
mileage                      0
engine_power                 0
fuel                         0
paint_color                  0
car_type                     0
private_parking_available    0
has_gps                      0
has_air_conditioning         0
automatic_car                0
has_getaround_connect        0
has_speed_regulator          0
winter_tires                 0
rental_price_per_day         0
dtype: int64

Reformulate: The pricing dataframe looks fairly clean, with no missing data. However, the minimum mileage is negative and the maximum is 1000376, so the data contains some outliers and even incoherent values.

In [5]:
# Clean the pricing frame in a single reassigned chain:
#   1. drop the "Unnamed: 0" column (it repeats the row index in the preview
#      above). Reassigning with errors="ignore" instead of inplace=True keeps
#      the cell from raising a KeyError when it is run a second time;
#   2. remove outliers column by column. The order matters: each pass computes
#      its mean/std on the rows that survived the previous pass.
# NOTE: running this cell again filters the already-filtered frame once more;
# reload the CSV first to start from the raw data.
df_prices = (
    df_prices
    .drop(columns="Unnamed: 0", errors="ignore")
    .pipe(remove_outliers_column_based, "engine_power")
    .pipe(remove_outliers_column_based, "mileage")
    .pipe(remove_outliers_column_based, "rental_price_per_day")
)

In [9]:
# Median engine power of the current df_prices (quick sanity check).
df_prices["engine_power"].median()

120.0

In [6]:
# Summary statistics again, to compare against the first describe() output above.
display(df_prices.describe(include="all"))

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4704,4704.0,4704.0,4704,4704,4704,4704,4704,4704,4704,4704,4704,4704,4704.0
unique,26,,,4,10,8,2,2,2,2,2,2,2,
top,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,947,,,4526,1582,1583,2562,3738,3777,3795,2524,3581,4383,
mean,,139544.619685,126.979804,,,,,,,,,,,120.387117
std,,54192.803185,35.024317,,,,,,,,,,,29.740118
min,,476.0,25.0,,,,,,,,,,,26.0
25%,,103680.5,100.0,,,,,,,,,,,104.0
50%,,141084.5,120.0,,,,,,,,,,,119.0
75%,,174762.5,135.0,,,,,,,,,,,135.25


In [16]:
# Distribution of the daily rental price.
# No title here: per the original note, the title is added on the Streamlit side.
fig_1 = px.histogram(
    x=df_prices["rental_price_per_day"],
    color_discrete_sequence=px.colors.qualitative.G10[:1],
).update_layout(
    xaxis_title="Rental price per day",
    yaxis_title="Total",
)

fig_1.show()

Reformulate: Most rentals cost between 100 and 140 per day.

In [17]:
# Distribution of the mileage.
# No title here: per the original note, the title is added on the Streamlit side.
fig_2 = px.histogram(
    x=df_prices["mileage"],
    color_discrete_sequence=px.colors.qualitative.G10[:1],
).update_layout(
    xaxis_title="Mileage",
    yaxis_title="Total",
)

fig_2.show()

In [7]:
# Distribution of the engine power, with the number of bins capped at 20.
# No title here: per the original note, the title is added on the Streamlit side.
fig_3 = px.histogram(
    x=df_prices["engine_power"],
    nbins=20,
    color_discrete_sequence=px.colors.qualitative.G10[:1],
).update_layout(
    xaxis_title="Engine power",
    yaxis_title="Total",
)

fig_3.show()

Reformulate: A lot of cars are far from new, but only a few have more than 200,000 km (I am assuming kilometers since we're in Europe...), and most seem to get retired at that point.

## TODO: take the EDA a bit further (not the priority!)

## ML

In [33]:
# Import ML
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from xgboost import XGBRegressor



In [10]:
# Mean daily rental price: always predicting this value is the naive ("dummy")
# baseline that any trained model has to beat.
df_prices["rental_price_per_day"].mean()

120.38711734693878

In [75]:
target_variable = "rental_price_per_day"

# Separate the target from the explanatory variables.
y = df_prices[target_variable]
X = df_prices.drop(columns=target_variable)

# Column groups for preprocessing: int64 columns are treated as numeric,
# every other column (strings and booleans in the preview above) as categorical.
numerical_features = X.select_dtypes(include="int64").columns
categorical_features = X.select_dtypes(exclude="int64").columns

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train_unproc, X_test_unproc, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)

# Standardise numeric columns and one-hot encode categorical ones, dropping
# the first level of each category.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
    ]
)

# Fit on the training rows only, then apply the fitted transformation to both sets.
X_train = preprocessor.fit_transform(X_train_unproc)
X_test = preprocessor.transform(X_test_unproc)

# One entry per evaluated model; turned into a comparison table later on.
models_performance = []

### Baseline

In [76]:
# Plain least-squares regression, used as the reference for the other models.
linear_regression = LinearRegression().fit(X_train, y_train)

# 10-fold cross-validation on the training set only.
scores = cross_val_score(linear_regression, X_train, y_train, cv=10)

models_performance.append(
    [
        "Linear",
        "",  # no hyperparameters were tuned
        linear_regression.score(X_train, y_train),
        linear_regression.score(X_test, y_test),
        scores.mean(),
        scores.std(),
    ]
)

### Regularized linear models

In [77]:
ridge = Ridge()

# Regularisation strengths 0.05, 0.15, ..., 2.95.
# np.arange with a float step accumulates rounding error (the raw grid holds
# values such as 0.7500000000000002), so the values are rounded back to two
# decimals and converted to plain Python floats.
params = {
    "alpha": np.round(np.arange(0.05, 3.0, 0.1), 2).tolist()
}

# 10-fold cross-validated search; the best estimator is refitted on X_train.
gridsearch_ridge = GridSearchCV(ridge, param_grid=params, cv=10)
gridsearch_ridge.fit(X_train, y_train)

# Row layout: model, best hyperparameters, train score, test score,
# mean and standard deviation of the CV score of the best candidate.
models_performance.append([
    "Ridge",
    gridsearch_ridge.best_params_,
    gridsearch_ridge.score(X_train, y_train),
    gridsearch_ridge.score(X_test, y_test),
    gridsearch_ridge.cv_results_["mean_test_score"][gridsearch_ridge.best_index_],
    gridsearch_ridge.cv_results_["std_test_score"][gridsearch_ridge.best_index_]
])


In [78]:
lasso = Lasso()

# Candidate regularisation strengths: every integer from 1 to 49.
# NOTE(review): the results table recorded below reports alpha=1, the lowest
# value of this grid, as the best one; values below 1 may be worth trying.
params = {"alpha": list(range(1, 50))}

# 10-fold cross-validated search over the grid above.
gridsearch_lasso = GridSearchCV(lasso, param_grid=params, cv=10)
gridsearch_lasso.fit(X_train, y_train)

# Row layout: model, best hyperparameters, train score, test score,
# mean and standard deviation of the CV score of the best candidate.
models_performance.append(
    [
        "Lasso",
        gridsearch_lasso.best_params_,
        gridsearch_lasso.score(X_train, y_train),
        gridsearch_lasso.score(X_test, y_test),
        gridsearch_lasso.cv_results_["mean_test_score"][gridsearch_lasso.best_index_],
        gridsearch_lasso.cv_results_["std_test_score"][gridsearch_lasso.best_index_],
    ]
)

### Ensemble learning

In [79]:
# NOTE(review): RandomizedSearchCV is not used in the visible cells; consider
# moving this import to the ML import cell or removing it.
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed so the forest's random sampling, and therefore the reported
# scores, can be reproduced; 3 is the seed already used for the train/test split.
rfr = RandomForestRegressor(random_state=3)

# Same candidate values as before, written out explicitly:
# range(150, 350, 100) produced [150, 250] and range(2, 3, 4) produced only [2].
params = {
    "n_estimators": [150, 250],
    "min_samples_split": [2],
    "min_samples_leaf": [1, 2, 4]
}

# 10-fold cross-validated search; the best estimator is refitted on X_train.
gridsearch_rfr = GridSearchCV(rfr, param_grid=params, cv=10)
gridsearch_rfr.fit(X_train, y_train)

# Row layout: model, best hyperparameters, train score, test score,
# mean and standard deviation of the CV score of the best candidate.
models_performance.append([
    "RandomForestRegressor",
    gridsearch_rfr.best_params_,
    gridsearch_rfr.score(X_train, y_train),
    gridsearch_rfr.score(X_test, y_test),
    gridsearch_rfr.cv_results_["mean_test_score"][gridsearch_rfr.best_index_],
    gridsearch_rfr.cv_results_["std_test_score"][gridsearch_rfr.best_index_]
])

In [None]:
xgbr = XGBRegressor()

# Candidate tree depths, learning rates and row-subsampling ratios.
params = {
    "max_depth": [5, 7, 9],
    "learning_rate": [0.1, 0.01, 0.001],
    "subsample": [0.5, 0.7, 1]
}

# 10-fold cross-validated search; the best estimator is refitted on X_train.
gridsearch_xgbr = GridSearchCV(xgbr, param_grid=params, cv=10)
gridsearch_xgbr.fit(X_train, y_train)

# Row layout must match the other models: model, best hyperparameters,
# train score, test score, CV mean, CV std. The test score was missing here,
# which shifted the CV values into the wrong columns of the comparison table.
models_performance.append([
    "XGBRegressor",
    gridsearch_xgbr.best_params_,
    gridsearch_xgbr.score(X_train, y_train),
    gridsearch_xgbr.score(X_test, y_test),
    gridsearch_xgbr.cv_results_["mean_test_score"][gridsearch_xgbr.best_index_],
    gridsearch_xgbr.cv_results_["std_test_score"][gridsearch_xgbr.best_index_]
])

In [81]:
# One row per evaluated model, in the order the models were appended.
score_comparaison = pd.DataFrame(
    models_performance,
    columns=[
        "model",
        "best_hyperparameters",
        "train_score",
        "test_score",
        "cv_mean_score",
        "cv_mean_std",
    ],
)
score_comparaison

Unnamed: 0,model,best_hyperparameters,train_score,test_score,cv_mean_score,cv_mean_std
0,Linear,,0.716848,0.677586,0.705073,0.042826
1,Ridge,{'alpha': 0.7500000000000002},0.715927,0.678592,0.70532,0.041717
2,Lasso,{'alpha': 1},0.626239,0.59226,0.623894,0.031649
3,RandomForestRegressor,"{'min_samples_leaf': 1, 'min_samples_split': 2...",0.9659,0.755143,0.747597,0.025467
4,XGBRegressor,"{'learning_rate': 0.1, 'max_depth': 7, 'subsam...",0.914811,0.775233,0.77272,0.030903


### MLFlow model test

In [10]:
import requests

# Example car sent to the prediction endpoint; the keys are the feature
# columns of the pricing dataset (everything except the target).
payload = {
    "model_key": "Alfa Romeo",
    "mileage": 140000,
    "engine_power": 120,
    "fuel": "diesel",
    "paint_color": "silver",
    "car_type": "convertible",
    "private_parking_available": False,
    "has_gps": False,
    "has_air_conditioning": False,
    "automatic_car": False,
    "has_getaround_connect": False,
    "has_speed_regulator": False,
    "winter_tires": False
}

# Without a timeout, requests waits indefinitely when the endpoint does not
# answer, which would freeze the notebook.
# NOTE: despite its name, `request` holds the HTTP response object.
request = requests.post(
    "https://qxzjy-get-around-fastapi.hf.space/predict",
    json=payload,
    timeout=60,
)

# Fail with an explicit HTTP error instead of a confusing JSON/KeyError below
# when the service answers with a 4xx/5xx status.
request.raise_for_status()
response = request.json()

print(f"Estimated daily rental price : {round(response['prediction'], 2)} $")

Estimated daily rental price : 109.11 $
