In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('car_price_prediction.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [4]:
df.drop(['ID', 'Levy', 'Leather interior', 'Doors', 'Wheel'], axis=1, inplace=True)

In [5]:
df

Unnamed: 0,Price,Manufacturer,Model,Prod. year,Category,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Color,Airbags
0,13328,LEXUS,RX 450,2010,Jeep,Hybrid,3.5,186005 km,6.0,Automatic,4x4,Silver,12
1,16621,CHEVROLET,Equinox,2011,Jeep,Petrol,3,192000 km,6.0,Tiptronic,4x4,Black,8
2,8467,HONDA,FIT,2006,Hatchback,Petrol,1.3,200000 km,4.0,Variator,Front,Black,2
3,3607,FORD,Escape,2011,Jeep,Hybrid,2.5,168966 km,4.0,Automatic,4x4,White,0
4,11726,HONDA,FIT,2014,Hatchback,Petrol,1.3,91901 km,4.0,Automatic,Front,Silver,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,MERCEDES-BENZ,CLK 200,1999,Coupe,CNG,2.0 Turbo,300000 km,4.0,Manual,Rear,Silver,5
19233,15681,HYUNDAI,Sonata,2011,Sedan,Petrol,2.4,161600 km,4.0,Tiptronic,Front,Red,8
19234,26108,HYUNDAI,Tucson,2010,Jeep,Diesel,2,116365 km,4.0,Automatic,Front,Grey,4
19235,5331,CHEVROLET,Captiva,2007,Jeep,Diesel,2,51258 km,4.0,Automatic,Front,Black,4


In [6]:
def eng_vol(value):
    if 'Turbo' in value:
        value = value.replace('Turbo', '')
        return  float(value)
    else:
        return  float(value)
df['Engine volume'] = df['Engine volume'].map(eng_vol) 

In [7]:
def conv_mil(value):
    if 'km' in value:
        value = value.replace('km', '')
        return float(value)
    else:
        return float(value)

df['Mileage'] = df['Mileage'].apply(conv_mil)

In [8]:
df

Unnamed: 0,Price,Manufacturer,Model,Prod. year,Category,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Color,Airbags
0,13328,LEXUS,RX 450,2010,Jeep,Hybrid,3.5,186005.0,6.0,Automatic,4x4,Silver,12
1,16621,CHEVROLET,Equinox,2011,Jeep,Petrol,3.0,192000.0,6.0,Tiptronic,4x4,Black,8
2,8467,HONDA,FIT,2006,Hatchback,Petrol,1.3,200000.0,4.0,Variator,Front,Black,2
3,3607,FORD,Escape,2011,Jeep,Hybrid,2.5,168966.0,4.0,Automatic,4x4,White,0
4,11726,HONDA,FIT,2014,Hatchback,Petrol,1.3,91901.0,4.0,Automatic,Front,Silver,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,MERCEDES-BENZ,CLK 200,1999,Coupe,CNG,2.0,300000.0,4.0,Manual,Rear,Silver,5
19233,15681,HYUNDAI,Sonata,2011,Sedan,Petrol,2.4,161600.0,4.0,Tiptronic,Front,Red,8
19234,26108,HYUNDAI,Tucson,2010,Jeep,Diesel,2.0,116365.0,4.0,Automatic,Front,Grey,4
19235,5331,CHEVROLET,Captiva,2007,Jeep,Diesel,2.0,51258.0,4.0,Automatic,Front,Black,4


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

In [10]:
x = df.drop('Price', axis=1)
y = df['Price']

In [11]:
x_train, x_test, y_train, y_test, = train_test_split(x,y, test_size=0.25, random_state=42)

In [12]:
sc = StandardScaler()

In [13]:
oe = OneHotEncoder(handle_unknown='ignore')

In [14]:
numeric_features = x_train.select_dtypes(include='int64').columns
print(numeric_features)

Index(['Prod. year', 'Airbags'], dtype='object')


In [15]:
categorical_features = x_train.select_dtypes(include='object').columns
print(categorical_features)

Index(['Manufacturer', 'Model', 'Category', 'Fuel type', 'Gear box type',
       'Drive wheels', 'Color'],
      dtype='object')


In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', sc, numeric_features),
        ('cat', oe, categorical_features)
    ]
)

In [17]:
model = make_pipeline(preprocessor, LinearRegression())
print(model)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  Index(['Prod. year', 'Airbags'], dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['Manufacturer', 'Model', 'Category', 'Fuel type', 'Gear box type',
       'Drive wheels', 'Color'],
      dtype='object'))])),
                ('linearregression', LinearRegression())])


In [18]:
model.fit(x_train, y_train)

In [19]:
preds = model.predict(x_test)

In [20]:
import numpy as np
mse = mean_squared_error(y_test, preds)
print(f"MSE:{np.sqrt(mse)}")

MSE:37892.081570141934


In [21]:
joblib.dump(model, 'cars_prices_predictions.joblib')

['cars_prices_predictions.joblib']

In [22]:
loaded_model = joblib.load('cars_prices_predictions.joblib')

In [35]:
df1 = pd.DataFrame({
    'Manufacturer':['CHEVROLET'],
    'Model':['Captiva'],
    'Prod. year':[2015],
    'Category':['Jeep'],
    'Fuel type':['Petrol'],
    'Engine volume':[4.0],
    'Mileage':[100000],
    'Cylinders':[4.0],
    'Gear box type':['Manual'],
    'Drive wheels':['Rear'],
    'Color':['Black'],
    'Airbags':[2],
})
new_pred = loaded_model.predict(df1)
print(new_pred[0])

2807.9210032553674


In [23]:
df

Unnamed: 0,Price,Manufacturer,Model,Prod. year,Category,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Color,Airbags
0,13328,LEXUS,RX 450,2010,Jeep,Hybrid,3.5,186005.0,6.0,Automatic,4x4,Silver,12
1,16621,CHEVROLET,Equinox,2011,Jeep,Petrol,3.0,192000.0,6.0,Tiptronic,4x4,Black,8
2,8467,HONDA,FIT,2006,Hatchback,Petrol,1.3,200000.0,4.0,Variator,Front,Black,2
3,3607,FORD,Escape,2011,Jeep,Hybrid,2.5,168966.0,4.0,Automatic,4x4,White,0
4,11726,HONDA,FIT,2014,Hatchback,Petrol,1.3,91901.0,4.0,Automatic,Front,Silver,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,8467,MERCEDES-BENZ,CLK 200,1999,Coupe,CNG,2.0,300000.0,4.0,Manual,Rear,Silver,5
19233,15681,HYUNDAI,Sonata,2011,Sedan,Petrol,2.4,161600.0,4.0,Tiptronic,Front,Red,8
19234,26108,HYUNDAI,Tucson,2010,Jeep,Diesel,2.0,116365.0,4.0,Automatic,Front,Grey,4
19235,5331,CHEVROLET,Captiva,2007,Jeep,Diesel,2.0,51258.0,4.0,Automatic,Front,Black,4
