### Pricing Optimization - Predictive ML Model

In [1]:
import pandas as pd
import numpy as np
 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [2]:
# Load the raw pricing dataset and discard its first column
# (presumably an exported row index -- TODO confirm against the CSV).
df = pd.read_csv("get_around_pricing_project.csv").iloc[:, 1:]

In [3]:
# Dimensions of the dataset as (rows, columns).
# Printed explicitly: a notebook cell only echoes its LAST expression, so a
# bare `df.shape` in the middle of the cell would display nothing.
print("Shape:", df.shape)

# Column dtypes and non-null counts (df.info() prints its own report)
df.info()

# Percentage of missing values per column (last expression -> rendered as output)
100 * df.isnull().sum() / df.shape[0]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   model_key                  4843 non-null   object
 1   mileage                    4843 non-null   int64 
 2   engine_power               4843 non-null   int64 
 3   fuel                       4843 non-null   object
 4   paint_color                4843 non-null   object
 5   car_type                   4843 non-null   object
 6   private_parking_available  4843 non-null   bool  
 7   has_gps                    4843 non-null   bool  
 8   has_air_conditioning       4843 non-null   bool  
 9   automatic_car              4843 non-null   bool  
 10  has_getaround_connect      4843 non-null   bool  
 11  has_speed_regulator        4843 non-null   bool  
 12  winter_tires               4843 non-null   bool  
 13  rental_price_per_day       4843 non-null   int64 
dtypes: bool(

model_key                    0.0
mileage                      0.0
engine_power                 0.0
fuel                         0.0
paint_color                  0.0
car_type                     0.0
private_parking_available    0.0
has_gps                      0.0
has_air_conditioning         0.0
automatic_car                0.0
has_getaround_connect        0.0
has_speed_regulator          0.0
winter_tires                 0.0
rental_price_per_day         0.0
dtype: float64

In [4]:
# Preview the first five rows of the dataset
df.head(5)

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [5]:
# Summary statistics for every column, whatever its dtype
df.describe(include="all")

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,28,,,4,10,8,2,2,2,2,2,2,2,
top,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,,140962.8,128.98823,,,,,,,,,,,121.214536
std,,60196.74,38.99336,,,,,,,,,,,33.568268
min,,-64.0,0.0,,,,,,,,,,,10.0
25%,,102913.5,100.0,,,,,,,,,,,104.0
50%,,141080.0,120.0,,,,,,,,,,,119.0
75%,,175195.5,135.0,,,,,,,,,,,136.0


### EDA

In [16]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# All variables in a single visualization: box plots for numeric columns,
# bar charts of value counts for categorical and boolean columns.
num_features = ['mileage', 'engine_power', 'rental_price_per_day']
cat_features = ['model_key', 'fuel', 'paint_color', 'car_type']
binary_features = ['private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car',
                   'has_getaround_connect', 'has_speed_regulator', 'winter_tires']

all_titles = num_features + cat_features + binary_features

# Size the grid from the number of variables so that none is silently dropped
# (a fixed 3x4 grid only has 12 slots, fewer than the number of variables).
n_cols = 4
n_rows = (len(all_titles) + n_cols - 1) // n_cols  # ceiling division

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=all_titles)

# Fill the grid in row-major order, which is also the order of the subplot titles
for i, feature in enumerate(all_titles):
    row, col = i // n_cols + 1, i % n_cols + 1
    if feature in num_features:
        trace = go.Box(y=df[feature], name=feature, showlegend=False)
    else:
        counts = df[feature].value_counts()
        trace = go.Bar(x=counts.index, y=counts.values, name=feature, showlegend=False)
    fig.add_trace(trace, row=row, col=col)

# 400 px per row keeps the same panel height as a 1200 px figure with 3 rows
fig.update_layout(title="All Variable Distributions", height=400 * n_rows)
fig.show()

Key take-aways:

- There are quite a few outliers in the numeric variables, so we will remove the values lying more than 3 standard deviations from the mean (a z-score rule, rather than Tukey's IQR fences).

In [25]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One scatter panel per feature, each plotted against the target, 3 panels per row
target = 'rental_price_per_day'
features = [name for name in df.columns if name != target]
cols = 3
rows = -(-len(features) // cols)  # ceiling division

fig = make_subplots(rows=rows, cols=cols, subplot_titles=features)

for i, feature in enumerate(features):
    # Convert the flat position into 1-based (row, col) grid coordinates
    row, col = (n + 1 for n in divmod(i, cols))
    points = go.Scatter(
        x=df[feature],
        y=df[target],
        mode='markers',
        showlegend=False,
        opacity=0.6,
    )
    fig.add_trace(points, row=row, col=col)
    fig.update_xaxes(title_text=feature, row=row, col=col)
    fig.update_yaxes(title_text=target, row=row, col=col)

fig.update_layout(
    title=dict(text="Bivariate analysis - Features vs Target", x=0.5),
    height=300 * rows,
    width=1400,
)
fig.show()

There appears to be a clear linear relationship between engine power and rental price per day, and a slight linear relationship with mileage as well.

In [24]:
import plotly.figure_factory as ff
import numpy as np

target = 'rental_price_per_day'

# Pairwise correlations between the numeric columns, rounded for display
corr_matrix = (
    df.select_dtypes(include=[np.number])
      .corr()
      .round(2)
)

# Keep only the row and the column of the target; blank out every other cell
target_idx = corr_matrix.columns.get_loc(target)
mask = np.zeros_like(corr_matrix.values, dtype=bool)
mask[target_idx, :] = True
mask[:, target_idx] = True

masked_corr = np.where(mask, corr_matrix.values, np.nan)

# Annotated heatmap centred on 0 so the sign of a correlation maps to a colour
fig = ff.create_annotated_heatmap(
    masked_corr,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    colorscale='RdBu',
    zmid=0,
)
fig.update_layout(title=dict(text="Correlation with Target", x=0.5))

# Hidden cells would otherwise be labelled with the literal text 'nan'
for ann in fig.layout.annotations:
    if ann.text == 'nan':
        ann.text = ''

fig.show()

As expected, engine power and mileage are correlated with the target.

### Machine Learning Preprocessing

In [6]:
# Remove extreme outliers in the numeric features engine_power and mileage:
# a row is kept only if BOTH values lie strictly within 3 standard deviations
# of their column mean.
# NOTE: this cell overwrites `df`, so re-running it filters the already
# filtered data again with freshly computed bounds.

cols = ["engine_power", "mileage"]
center = df[cols].mean()
spread = 3 * df[cols].std()

mask = (
    (df[cols] < center + spread) &
    (df[cols] > center - spread)
).all(axis=1)

# .copy() makes the filtered frame independent of the original, so the later
# in-place `.loc` assignment on `mileage` does not trigger SettingWithCopyWarning.
df = df[mask].copy()


In [7]:
df.describe()

Unnamed: 0,mileage,engine_power,rental_price_per_day
count,4750.0,4750.0,4750.0
mean,139029.249053,127.470105,120.980421
std,54743.072628,35.616158,32.202871
min,-64.0,25.0,10.0
25%,102863.0,100.0,104.0
50%,140627.0,120.0,119.0
75%,174686.75,135.0,136.0
max,321498.0,240.0,422.0


Even after removing extreme outliers, there is still one negative value for mileage. We will set it to 0.

In [9]:
# Show the rows whose mileage is below zero
mask = df['mileage'].lt(0)
df_negative = df.loc[mask]
df_negative


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
2938,Renault,-64,230,diesel,black,sedan,True,True,False,True,False,False,True,274


In [10]:
# Replace negative mileage readings with 0, then re-check the summary statistics
df.loc[df['mileage'].lt(0), 'mileage'] = 0
df.describe()

Unnamed: 0,mileage,engine_power,rental_price_per_day
count,4750.0,4750.0,4750.0
mean,139029.262526,127.470105,120.980421
std,54743.038394,35.616158,32.202871
min,0.0,25.0,10.0
25%,102863.0,100.0,104.0
50%,140627.0,120.0,119.0
75%,174686.75,135.0,136.0
max,321498.0,240.0,422.0


In [11]:
# Preview the cleaned frame. Row labels are the original ones: the index was
# not reset when the outlier rows were filtered out, so labels can have gaps.
df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183
5,Citroën,152352,225,petrol,black,convertible,True,True,False,False,True,True,True,131


In [12]:
# Separate target variable Y from features X
target_name = "rental_price_per_day"

Y = df.loc[:, target_name] 
X = df.drop(target_name, axis=1)  # All columns are kept, except the target

In [13]:
# Display the feature matrix (every column of df except the target)
X

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True
5,Citroën,152352,225,petrol,black,convertible,True,True,False,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,Toyota,39743,110,diesel,black,van,False,True,False,False,False,False,True
4839,Toyota,49832,100,diesel,grey,van,False,True,False,False,False,False,True
4840,Toyota,19633,110,diesel,grey,van,False,True,False,False,False,False,True
4841,Toyota,27920,110,diesel,brown,van,True,True,False,False,False,False,True


In [14]:
# Hold out 20% of the rows as a test set (fixed seed for a reproducible split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Infer the column groups from the dtypes: numbers on one side,
# strings and booleans on the other
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include=[object, bool]).columns)

# Numeric branch: standardise each column
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encode, dropping the first level of each variable
# to avoid creating correlations between the dummy columns
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown='ignore', drop="first"))]
)

# Route each column group to its branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessing on the training split only, then apply it to both splits
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [15]:
# Columns routed to the numeric (StandardScaler) branch of the preprocessor
numeric_features

['mileage', 'engine_power']

In [16]:
# Columns routed to the categorical (OneHotEncoder) branch of the preprocessor;
# the boolean flags are selected here together with the string columns
categorical_features

['model_key',
 'fuel',
 'paint_color',
 'car_type',
 'private_parking_available',
 'has_gps',
 'has_air_conditioning',
 'automatic_car',
 'has_getaround_connect',
 'has_speed_regulator',
 'winter_tires']

### Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit a linear regression on the preprocessed training matrix
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Predictions on both splits
Y_train_pred, Y_test_pred = (regressor.predict(part) for part in (X_train, X_test))

# Report the coefficient of determination on each split
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.6870200561650507
R2 score on test set :  0.739275542989394


### Feature selection

In [18]:
from sklearn.feature_selection import  SequentialFeatureSelector

# Names of the columns produced by the fitted preprocessor
transformed_feature_names = preprocessor.get_feature_names_out()
features_list = pd.Index(transformed_feature_names)

# Select 10 of the transformed columns using the current regressor
feature_selector = SequentialFeatureSelector(regressor, n_features_to_select=10)
feature_selector.fit(X_train, Y_train)

# Boolean support mask -> names of the retained columns
best_features = features_list[feature_selector.get_support()]

print("According to the forward selection algorithm, the following features should be kept: ")
print(best_features.to_list())

According to the forward selection algorithm, the following features should be kept: 
['num__mileage', 'num__engine_power', 'cat__model_key_BMW', 'cat__model_key_Nissan', 'cat__model_key_Renault', 'cat__fuel_petrol', 'cat__car_type_estate', 'cat__car_type_suv', 'cat__has_gps_True', 'cat__has_getaround_connect_True']


#### 10 features

In [None]:
# Select 10 columns with a fresh linear model, then refit and score on that subset
regressor = LinearRegression()
selector = SequentialFeatureSelector(regressor, n_features_to_select=10).fit(X_train, Y_train)

# Apply the same boolean column mask to both splits
keep = selector.get_support()
X_train_selected, X_test_selected = X_train[:, keep], X_test[:, keep]

regressor.fit(X_train_selected, Y_train)
print("R2 score on training set:", regressor.score(X_train_selected, Y_train))
print("R2 score on test set:", regressor.score(X_test_selected, Y_test))

R2 score on training set: 0.6488159093535943
R2 score on test set: 0.7014658471295965


#### 7 features

In [21]:
# Select 7 columns with a fresh linear model, then refit and score on that subset
regressor = LinearRegression()
selector = SequentialFeatureSelector(regressor, n_features_to_select=7).fit(X_train, Y_train)

# Apply the same boolean column mask to both splits
keep = selector.get_support()
X_train_selected, X_test_selected = X_train[:, keep], X_test[:, keep]

regressor.fit(X_train_selected, Y_train)
print("R2 score on training set:", regressor.score(X_train_selected, Y_train))
print("R2 score on test set:", regressor.score(X_test_selected, Y_test))

R2 score on training set: 0.6260095626349309
R2 score on test set: 0.6761958754336284


Feature selection does not improve performance: R² decreases on both the training and test sets as features are removed. This may be because the linear regression assumptions are not met, which suggests that a linear model is not well suited to this dataset.

### Random Forest Regression

In [22]:
from sklearn.ensemble import RandomForestRegressor

# Train the model.
# random_state is pinned (same seed as the train/test split) because a random
# forest is stochastic: without it the scores below change on every run.
RFregressor = RandomForestRegressor(n_estimators=10, random_state=0)
RFregressor.fit(X_train, Y_train)

# Predictions on both splits
y_train_pred = RFregressor.predict(X_train)
y_test_pred = RFregressor.predict(X_test)

# A large gap between the two scores is a sign of overfitting
print("R2 score on training set:", r2_score(Y_train, y_train_pred))
print("R2 score on test set:", r2_score(Y_test, y_test_pred))


R2 score on training set: 0.946221680467423
R2 score on test set: 0.7650647897996268


In [23]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the random forest on the training split, scored with R²
scores = cross_val_score(RFregressor, X_train, Y_train, scoring='r2', cv=5)

print("Cross-validation R² scores:", scores)
print("Mean R² score:", scores.mean())

Cross-validation R² scores: [0.73315292 0.71146046 0.66796166 0.59577294 0.73610048]
Mean R² score: 0.6888896899706206


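The model overfits: the training R² (≈ 0.95) is far above the cross-validated R² (≈ 0.69), which lowers generalizability. The number of trees is one suspect, although in a random forest overfitting usually comes from fully grown individual trees (no limit on depth or leaf size) rather than from having too many trees.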

In [24]:
# Train the model with fewer trees - not much of a difference.
# random_state is pinned (same seed as the train/test split) so that this
# comparison is reproducible rather than dependent on the random draw.
RFregressor = RandomForestRegressor(n_estimators=5, random_state=0)
RFregressor.fit(X_train, Y_train)

# Predictions on both splits
y_train_pred = RFregressor.predict(X_train)
y_test_pred = RFregressor.predict(X_test)

print("R2 score on training set:", r2_score(Y_train, y_train_pred))
print("R2 score on test set:", r2_score(Y_test, y_test_pred))

R2 score on training set: 0.9301845927278527
R2 score on test set: 0.7220180075823315


In [25]:
# =========================
# 1) Build a single pipeline
# =========================
# Preprocessing and model are bundled in one Pipeline so that the saved
# artifact accepts RAW feature columns at inference time.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import joblib
import numpy as np
import pandas as pd

# Single source of truth for the artifact location: used both for the dump
# and for the confirmation message, so the two cannot disagree.
MODEL_PATH = "RF_model.joblib"

# ---- Train test split----
# Re-split the RAW features (X_train / X_test were previously overwritten with
# their preprocessed versions); same seed, hence the same partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# ---- Preprocessing ----
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=[object, bool]).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# ---- Model ----
RFregressor = RandomForestRegressor(n_estimators=10, random_state=0)

# ---- Full pipeline ----
pipe = Pipeline([
    ("pre", preprocessor),
    ("model", RFregressor),
])

# =========================
# 2) Train/Eval on the split
# =========================
pipe.fit(X_train, Y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred  = pipe.predict(X_test)

print("R2 score on training set:", r2_score(Y_train, y_train_pred))
print("R2 score on test set:", r2_score(Y_test, y_test_pred))

# =========================
# 3) Refit on ALL data for serving
# =========================
# The scores printed above describe the model fitted on the training split;
# the exported model is refitted on every available row.
pipe.fit(X, Y)

# Save the artifact with feature order (raw columns expected at /predict)
artifacts = {
    "model": pipe,
    "feature_names": list(X.columns),           # << raw column order expected by API
    "numeric_features": numeric_features,
    "categorical_features": categorical_features,
}

joblib.dump(artifacts, MODEL_PATH)
print(f"Saved {MODEL_PATH} ✓")
print("Expected raw feature order for inference:", artifacts["feature_names"])


R2 score on training set: 0.948826833717862
R2 score on test set: 0.7516882705514488
Saved model.joblib ✓
Expected raw feature order for inference: ['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']
