In [11]:
import pandas as pd      
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from scipy.stats import skew

from sklearn.model_selection import cross_validate
import warnings
# Notebook-wide settings: silence every warning, set the default figure size,
# and raise pandas' display limits to 500 rows / 500 columns.
warnings.simplefilter('ignore')
plt.rc("figure", figsize=(10, 6))
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [12]:
# Load the car listings from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='car_price_prediction.csv')

In [13]:
# Show the loaded frame; the bare last expression is rendered as the cell output.
df

Unnamed: 0,make_model,hp_kW,km,age,price,Gearing_Type
0,Audi A1,66.0,56013.000000,3.0,15770,Automatic
1,Audi A1,141.0,80000.000000,2.0,14500,Automatic
2,Audi A1,85.0,83450.000000,3.0,14640,Automatic
3,Audi A1,66.0,73000.000000,3.0,14500,Automatic
4,Audi A1,66.0,16200.000000,3.0,16790,Automatic
...,...,...,...,...,...,...
15490,Renault Espace,147.0,1647.362609,0.0,39950,Automatic
15491,Renault Espace,165.0,9900.000000,0.0,39885,Automatic
15492,Renault Espace,146.0,15.000000,0.0,39875,Automatic
15493,Renault Espace,147.0,10.000000,0.0,39700,Automatic


In [14]:
# Give the columns human-readable names. The result is reassigned instead of
# mutating with inplace=True, so the cell reads as "new frame from old frame"
# and stays safe to re-run (labels that are already renamed are simply ignored).
df = df.rename(
    columns={
        'make_model': 'Car Model',
        'hp_kW': 'HP',
        'km': 'KM',
        'age': 'Age',
        'Gearing_Type': 'Gearing Type',
    }
)

In [15]:
# Split the frame into the target (price) and the feature columns (everything else).
y = df["price"]
X = df.drop("price", axis=1)

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [17]:
def trans_2(X_train, X_test):
    """One-hot encode the categorical columns of a train/test split.

    The encoder is fitted on ``X_train`` only and then applied to both frames.
    Because it is created with ``handle_unknown="ignore"``, a category that
    was not seen during fitting does not raise when ``X_test`` is transformed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames. Columns of dtype ``object`` are treated as
        categorical; numeric columns are carried over unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` with the one-hot columns first, followed by the
        numeric columns. The original row indexes are preserved.
    """
    cat = list(X_train.select_dtypes("object").columns)

    # No categorical columns (e.g. the calling cell was re-run on frames that
    # are already encoded): skip the encoder instead of relying on how it
    # handles an empty column selection, and return the numeric columns.
    if not cat:
        return X_train.select_dtypes("number"), X_test.select_dtypes("number")

    # `sparse=False` was deprecated in scikit-learn 1.2 (renamed to
    # `sparse_output`) and removed in 1.4. Leaving the encoder at its default
    # and densifying the result below works on every version.
    enc = OneHotEncoder(handle_unknown="ignore")
    enc.fit(X_train[cat])
    encoded_names = enc.get_feature_names_out(cat)

    def _encode(frame):
        """Return the dense one-hot frame for ``frame``, keeping its index."""
        encoded = enc.transform(frame[cat])
        if hasattr(encoded, "toarray"):  # scipy sparse matrix -> ndarray
            encoded = encoded.toarray()
        return pd.DataFrame(encoded, index=frame.index, columns=encoded_names)

    X_train = _encode(X_train).join(X_train.select_dtypes("number"))
    X_test = _encode(X_test).join(X_test.select_dtypes("number"))

    return X_train, X_test

In [18]:
def train_val(model, X_train, y_train, X_test, y_test):
    """Score an already-fitted model on the train and test splits.

    Returns a DataFrame with one column per split ("train", "test") and one
    row per metric ("R2", "mae", "mse", "rmse").
    """

    def _metrics(y_true, y_hat):
        # Same four regression metrics for whichever split is passed in.
        return {
            "R2": r2_score(y_true, y_hat),
            "mae": mean_absolute_error(y_true, y_hat),
            "mse": mean_squared_error(y_true, y_hat),
            "rmse": np.sqrt(mean_squared_error(y_true, y_hat)),
        }

    test_predictions = model.predict(X_test)
    train_predictions = model.predict(X_train)

    report = {
        "train": _metrics(y_train, train_predictions),
        "test": _metrics(y_test, test_predictions),
    }
    return pd.DataFrame(report)

In [19]:
# Replace the raw splits with their one-hot encoded versions.
# NOTE: this overwrites X_train / X_test, so the raw splits are only
# recoverable by re-running the train_test_split cell.
X_train, X_test = trans_2(X_train, X_test)

In [20]:
# Inspect the encoded training features (one-hot columns first, then numeric ones).
X_train

Unnamed: 0,Car Model_Audi A1,Car Model_Audi A3,Car Model_Opel Astra,Car Model_Opel Corsa,Car Model_Opel Insignia,Car Model_Renault Clio,Car Model_Renault Duster,Car Model_Renault Espace,Gearing Type_Automatic,Gearing Type_Manual,Gearing Type_Semi-automatic,HP,KM,Age
9866,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,66.0,10.0,0.0
3430,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,81.0,54000.0,3.0
3756,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,110.0,39246.0,2.0
103,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,85.0,9082.0,1.0
9753,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,66.0,10.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5695,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,100.0,47000.0,1.0
8006,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,77.0,10.0,0.0
1361,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,70.0,26650.0,1.0
1547,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,70.0,15577.0,1.0


In [21]:
# Inspect the encoded test features; the columns should match X_train's.
X_test

Unnamed: 0,Car Model_Audi A1,Car Model_Audi A3,Car Model_Opel Astra,Car Model_Opel Corsa,Car Model_Opel Insignia,Car Model_Renault Clio,Car Model_Renault Duster,Car Model_Renault Espace,Gearing Type_Automatic,Gearing Type_Manual,Gearing Type_Semi-automatic,HP,KM,Age
9624,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,51.0,24487.0,1.0
14184,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,56.0,500.0,0.0
12919,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,87.0,24473.0,1.0
6448,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,81.0,70000.0,3.0
1472,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,70.0,24336.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7363,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,77.0,11500.0,1.0
4342,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,85.0,14270.0,1.0
551,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66.0,57000.0,3.0
10447,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,101.0,94220.0,3.0


In [22]:
# Learn the min/max of every feature from the training split only, then apply
# that same scaling to both splits so no test-set statistics leak into it.
scaler = MinMaxScaler().fit(X_train)

X_test_scaled = scaler.transform(X_test)
X_train_scaled = scaler.transform(X_train)

In [23]:
from sklearn.linear_model import Lasso
# Baseline: Lasso with scikit-learn's default settings on the scaled features.
lasso_model = Lasso().fit(X_train_scaled, y_train)

# Train-vs-test metrics; the resulting table is the cell output.
train_val(
    lasso_model,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
)

Unnamed: 0,train,test
R2,0.8720886,0.8658849
mae,1722.821,1687.795
mse,5953604.0,5709160.0
rmse,2440.001,2389.385


In [24]:
# 10-fold cross-validation of a fresh default Lasso on the training split,
# collecting R2, MAE, MSE and RMSE (the error metrics come back negated).
model = Lasso()
scores = cross_validate(
    estimator=model,
    X=X_train_scaled,
    y=y_train,
    cv=10,
    scoring=[
        'r2',
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_root_mean_squared_error',
    ],
)

In [25]:
# One row per fold, labelled 1..10 to match cv=10 in the previous cell.
scores = pd.DataFrame(data=scores, index=range(1, 11))

# Average each metric over the folds, skipping the first two columns so that
# only the test_* score columns are summarised.
metric_columns = scores.iloc[:, 2:]
metric_columns.mean(axis=0)

test_r2                             8.715852e-01
test_neg_mean_absolute_error       -1.725259e+03
test_neg_mean_squared_error        -5.971711e+06
test_neg_root_mean_squared_error   -2.442752e+03
dtype: float64

## Grid Search


In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
# Candidate regularisation strengths: 100 evenly spaced values from 0.01 to 100.
alpha_space = np.linspace(start=0.01, stop=100, num=100)
alpha_space

array([1.000e-02, 1.020e+00, 2.030e+00, 3.040e+00, 4.050e+00, 5.060e+00,
       6.070e+00, 7.080e+00, 8.090e+00, 9.100e+00, 1.011e+01, 1.112e+01,
       1.213e+01, 1.314e+01, 1.415e+01, 1.516e+01, 1.617e+01, 1.718e+01,
       1.819e+01, 1.920e+01, 2.021e+01, 2.122e+01, 2.223e+01, 2.324e+01,
       2.425e+01, 2.526e+01, 2.627e+01, 2.728e+01, 2.829e+01, 2.930e+01,
       3.031e+01, 3.132e+01, 3.233e+01, 3.334e+01, 3.435e+01, 3.536e+01,
       3.637e+01, 3.738e+01, 3.839e+01, 3.940e+01, 4.041e+01, 4.142e+01,
       4.243e+01, 4.344e+01, 4.445e+01, 4.546e+01, 4.647e+01, 4.748e+01,
       4.849e+01, 4.950e+01, 5.051e+01, 5.152e+01, 5.253e+01, 5.354e+01,
       5.455e+01, 5.556e+01, 5.657e+01, 5.758e+01, 5.859e+01, 5.960e+01,
       6.061e+01, 6.162e+01, 6.263e+01, 6.364e+01, 6.465e+01, 6.566e+01,
       6.667e+01, 6.768e+01, 6.869e+01, 6.970e+01, 7.071e+01, 7.172e+01,
       7.273e+01, 7.374e+01, 7.475e+01, 7.576e+01, 7.677e+01, 7.778e+01,
       7.879e+01, 7.980e+01, 8.081e+01, 8.182e+01, 

In [28]:
# Grid search over alpha for a fresh Lasso: 10-fold CV scored by (negated) RMSE,
# using every available core.
# NOTE(review): the features were min-max scaled on the whole training split
# before this search, so each validation fold took part in fitting the scaler;
# putting the scaler and the model in one Pipeline would avoid that.
lasso_model = Lasso()

param_grid = dict(alpha=alpha_space)

lasso_grid = GridSearchCV(
    lasso_model,
    param_grid,
    cv=10,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
)

In [29]:
# Run the search: every alpha in param_grid is cross-validated on the scaled training split.
lasso_grid.fit(X_train_scaled,y_train)

In [30]:
# Alpha value selected by the grid search.
lasso_grid.best_params_

{'alpha': 0.01}

In [31]:
# Cross-validated score of the selected alpha; negative because the search
# was configured with scoring='neg_root_mean_squared_error'.
lasso_grid.best_score_

-2442.605473274941

In [32]:
# Score the fitted search object on both splits; train_val only needs
# something with a predict() method, which the fitted search provides.
train_val(lasso_grid, X_train_scaled, y_train, X_test_scaled, y_test)

Unnamed: 0,train,test
R2,0.8721047,0.8658268
mae,1724.295,1689.333
mse,5952854.0,5711633.0
rmse,2439.847,2389.902


## Pipeline

In [33]:
# Names of the object-dtype (categorical) feature columns, as a plain list.
cat = X.select_dtypes("object").columns.tolist()
cat

['Car Model', 'Gearing Type']

In [34]:
# Peek at one raw feature row to recall the column names the pipeline will receive.
X.head(1)

Unnamed: 0,Car Model,HP,KM,Age,Gearing Type
0,Audi A1,66.0,56013.0,3.0,Automatic


In [35]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns and min-max scale everything else.
# `OneHotEncoder(sparse=False)` was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output`) and removed in 1.4, so the encoder is left at its default
# and `sparse_threshold=0` makes the column transformer always return a dense
# array. This gives the same dense result on old and new versions.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), cat),
    remainder=MinMaxScaler(),
    sparse_threshold=0,
)

In [36]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline

# Final model: preprocessing (column transformer) followed by a Lasso with
# alpha=0.01, fitted on the complete data set (X, y) rather than a split.
operations = [
    ("OneHotEncoder", column_trans),
    ("Lasso", Lasso(alpha=0.01)),
]

pipe_model = Pipeline(operations)

pipe_model.fit(X, y)

In [40]:
# A single hand-written car, keyed by the renamed feature columns,
# used below to try out the fitted pipeline.
my_dict = {
    "HP": 66,
    "Age": 2,
    "KM": 17000,
    "Car Model": 'Audi A3',
    "Gearing Type": "Automatic",
}

In [41]:
# Wrap the dict in a list so it becomes a one-row frame (one column per key).
new_obs = pd.DataFrame(data=[my_dict])
new_obs

Unnamed: 0,HP,Age,KM,Car Model,Gearing Type
0,66,2,17000,Audi A3,Automatic


In [42]:
# Predict the price of the hand-written observation with the fitted pipeline.
pipe_model.predict(new_obs)

array([19366.01548998])

In [37]:
import pickle
# Persist the fitted pipeline. The context manager closes the file as soon as
# the dump finishes; the original passed a bare open(...) whose handle was
# never closed explicitly.
with open("my_model", 'wb') as model_file:
    pickle.dump(pipe_model, model_file)

In [38]:
# Reload the saved pipeline, closing the file handle once it has been read.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load model files that you created yourself or otherwise trust.
with open("my_model", "rb") as model_file:
    car_predict = pickle.load(model_file)

In [43]:
# Sanity check: the reloaded pipeline should return the same prediction as pipe_model.
car_predict.predict(new_obs)

array([19366.01548998])