In [18]:
import pandas as pd
# Read the Audi listings CSV into a DataFrame.
# Values in the 'model' column are stripped of surrounding whitespace while parsing.
path = r'data/audi.csv'
df = pd.read_csv(path, converters={'model': str.strip})

df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuel_type,tax,mpg,engine_size,brand
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi


In [None]:
# Count the distinct values in every column (axis=0 means one count per column)
df.nunique(axis=0)

model             26
year              21
price           3260
transmission       3
mileage         7725
fuel_type          3
tax               37
mpg              104
engine_size       19
brand              1
dtype: int64

In [19]:
# Let's delete the brand column because there's only one brand in the dataset.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after 'brand' is already gone is a no-op
# rather than a KeyError.
df = df.drop(columns=['brand'], errors='ignore')

In [20]:
# Count missing entries per column; every count should be zero
df.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuel_type       0
tax             0
mpg             0
engine_size     0
dtype: int64

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,year,price,mileage,tax,mpg,engine_size
count,10668.0,10668.0,10668.0,10668.0,10668.0,10668.0
mean,2017.100675,22896.685039,24827.244001,126.011436,50.770022,1.930709
std,2.167494,11714.841888,23505.257205,67.170294,12.949782,0.602957
min,1997.0,1490.0,1.0,0.0,18.9,0.0
25%,2016.0,15130.75,5968.75,125.0,40.9,1.5
50%,2017.0,20200.0,19000.0,145.0,49.6,2.0
75%,2019.0,27990.0,36464.5,145.0,58.9,2.0
max,2020.0,145000.0,323000.0,580.0,188.3,6.3


In [12]:
# Preview the first five rows again to spot the columns that still need encoding
df.head(n=5)

Unnamed: 0,model,year,price,transmission,mileage,fuel_type,tax,mpg,engine_size
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0


In [13]:
# Expand each categorical column into one indicator column per category
# (columns are named "<column>_<category>", e.g. "model_A1")
data_onehot = pd.get_dummies(
    data=df,
    columns=['model', 'transmission', 'fuel_type'],
)
data_onehot

Unnamed: 0,year,price,mileage,tax,mpg,engine_size,model_A1,model_A2,model_A3,model_A4,...,model_S8,model_SQ5,model_SQ7,model_TT,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuel_type_Diesel,fuel_type_Hybrid,fuel_type_Petrol
0,2017,12500,15735,150,55.4,1.4,True,False,False,False,...,False,False,False,False,False,True,False,False,False,True
1,2016,16500,36203,20,64.2,2.0,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
2,2016,11000,29946,30,55.4,1.4,True,False,False,False,...,False,False,False,False,False,True,False,False,False,True
3,2017,16800,25952,145,67.3,2.0,False,False,False,True,...,False,False,False,False,True,False,False,True,False,False
4,2019,17300,1998,145,49.6,1.0,False,False,True,False,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10663,2020,16999,4018,145,49.6,1.0,False,False,True,False,...,False,False,False,False,False,True,False,False,False,True
10664,2020,16999,1978,150,49.6,1.0,False,False,True,False,...,False,False,False,False,False,True,False,False,False,True
10665,2020,17199,609,150,49.6,1.0,False,False,True,False,...,False,False,False,False,False,True,False,False,False,True
10666,2017,19499,8646,150,47.9,1.4,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True


In [21]:
# Build the training and testing datasets from the one-hot encoded frame
from sklearn.model_selection import train_test_split

# y is the target variable (price); X is every remaining column (the feature set)
y = data_onehot['price']
X = data_onehot.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

In [22]:
# Train a linear regression model, evaluate it, and save it with pickle
from sklearn.linear_model import LinearRegression
import pickle

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Evaluate the model performance (R^2).
# The held-out test split is the honest estimate: scoring on the full X/y would
# mix in the rows the model was fitted on and overstate how well it generalizes.
# The scores are printed explicitly because only a cell's last expression is
# displayed automatically, and this cell ends with the pickle dump.
train_r2 = regressor.score(X_train, y_train)
test_r2 = regressor.score(X_test, y_test)
print(f"R^2 (train): {train_r2:.4f}")
print(f"R^2 (test):  {test_r2:.4f}")

# Save the trained model to a file
with open('used_car_price.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)


In [None]:
# Predicting the price of a new car
import pandas as pd

# Describe the car with its raw attributes. The one-hot columns are derived
# below from the training schema, so there is no hand-maintained list of
# model/transmission/fuel columns to keep in sync with the training data.
new_car = {
    'year': 2020,
    'mileage': 5000,
    'tax': 150,
    'mpg': 50.0,
    'engine_size': 1.4,
    'model': 'R8',
    'transmission': 'Manual',
    'fuel_type': 'Petrol',
}
categorical_features = ('model', 'transmission', 'fuel_type')

# Encode categorical values with the same "<column>_<value>" naming that
# pd.get_dummies produced for the training frame; numeric values pass through.
encoded = {}
for feature, value in new_car.items():
    if feature in categorical_features:
        encoded[f'{feature}_{value}'] = 1
    else:
        encoded[feature] = value

# Fail loudly on a typo or a category the model never saw during training,
# instead of silently predicting from a row with no matching indicator set.
unknown_columns = [column for column in encoded if column not in X_train.columns]
if unknown_columns:
    raise ValueError(f"Features not seen during training: {unknown_columns}")

# Build one single-row entry per training column, in training order.
# Indicator columns that were not selected default to 0; any other column
# (a numeric feature) must have been supplied explicitly.
one_hot_prefixes = tuple(f'{feature}_' for feature in categorical_features)
new_data = {}
for column in X_train.columns:
    if column in encoded:
        new_data[column] = [encoded[column]]
    elif column.startswith(one_hot_prefixes):
        new_data[column] = [0]
    else:
        raise ValueError(f"Missing value for feature: {column}")

# Convert the new input data to a DataFrame; its columns already match the
# training data because new_data was built by iterating over X_train.columns
new_input = pd.DataFrame(new_data)

# Make predictions using the trained model
predicted_price = regressor.predict(new_input)
print("Predicted Price:", predicted_price[0])

Predicted Price: 116514.53699627286
