In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
lab = LabelEncoder()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv("car_price_prediction.csv")


### Reading the data from dataset.

In [2]:
data = pd.read_csv("car_price_prediction.csv")

In [None]:
data.head()

In [None]:
print(data.shape)

data.info()

### Data cleaning, pre-processing and exploring process.

#### checking for duplicate values and removing them:

In [None]:
data.duplicated().sum()

In [4]:
data.drop_duplicates(inplace= True)

#### Checking for missing values and removing them:

In [None]:
data.isna().sum()

#### No missing values are present.

In [None]:
data.describe()

#### Removing ID and doors column which may affect the modal since it is not that significant:

In [6]:
data = data.drop(["ID", "Doors"], axis=1)

In [None]:
data.head()

#### Since Levy column has '-' which might be a useless data value,changle type of the column to float and fill it up with medium instead of 0:

In [None]:
data['Levy'] = data['Levy'].replace('-', np.nan).astype(float)
data['Levy'].fillna(data['Levy'].median(), inplace=True)

data["Levy"].unique()

#### Converting Prod. year to be age of the car instead since it will be easier to deal to with instead of the year of production since it also has a impact on a vehicle price:

In [8]:
current_date = dt.datetime.now()

data["Age"] = current_date.year - data["Prod. year"]

data = data.drop("Prod. year", axis=1)

#### Converting cylinder to int and not float 

In [None]:
data['Cylinders'] = data['Cylinders'].astype(int)
data['Cylinders']

#### Converting mileage to an integer value and removing "km":

In [None]:
data["Mileage"] = data["Mileage"].str.replace("km", "")

data.Mileage = data.Mileage.astype("Int64")

print(data["Mileage"])

In [None]:
print(data["Engine volume"].unique())

#### Removing "turbo" keyword in Engine volume and converting it into a float type:

In [None]:
data["Engine volume"] = data["Engine volume"].str.replace("Turbo", "")
data["Engine volume"] = data["Engine volume"].astype("float64")

data["Engine volume"].unique()

In [None]:
data.info()

### Visualizing and analyzing process


In [None]:
data.hist(bins=25,figsize=(15,10),color='green')
plt.show()

#### Observations:
1) Levy column is always between 0 and 2000.
2) Most cars are new because they have mileage of 0.
3) Most cars are 10 to 15 years old, majority being less than 20 years old.
4) Engine volumes is always in the range from 0 to 5.
5) Most cars have 4 cylinder engines.


#### Checking most frequent vehicle category:

In [None]:
plt.subplots(figsize=(20,5), dpi=120)
sns.countplot(data= data, x='Category', palette='crest')
plt.title("Category",fontsize=20)
plt.show()

#### Sedan is majority, followed by hatchbacks and jeeps.

#### Checking the vechicle colors:

In [None]:
plt.subplots(figsize=(15,5), dpi=120)
sns.countplot(data= data, x='Color',palette='crest')
plt.title("Of Colors ",fontsize=20)
plt.show()

#### Most sold color type are black, silver, white and grey.

#### Checking most sold gear box type and fuel type:

In [None]:
plt.subplots(figsize=(10,5), dpi=120)
sns.countplot(data= data, x='Gear box type',palette='crest')
plt.title("Gear box ",fontsize=20)
plt.show()

plt.subplots(figsize=(10,5), dpi=120)
sns.countplot(data= data, x='Fuel type',palette='crest')
plt.title("Fuel ",fontsize=20)
plt.show()

#### Most cars sold are automatic and uses petrol, while hybrid is closely with diesel engines.

#### Checking top 5 car manufactuers: 

In [None]:
manufacterers = data.Manufacturer.value_counts().sort_values(ascending=False)[:5]

print(manufacterers)

plt.figure(figsize=(10, 5))
sns.barplot(x=manufacterers.index, y=manufacterers,palette='crest',linewidth = 4)
plt.title('5 most frequent manufacterurs',loc='center',fontweight='bold',fontsize=20)
plt.xlabel('Brand name',fontsize=20)
plt.ylabel('Frequency',fontsize=20)
plt.tight_layout()
plt.show()


#### Checking average price for top 5 car manufacturers:

In [None]:
manufacterers_average_price = [data[data['Manufacturer']==i]['Price'].mean() for i in list(manufacterers.index)]

plt.figure(figsize=(10,5))
plt.plot(manufacterers.index, manufacterers_average_price,color='g',
         linewidth = 4, marker='o',markersize = 10)
plt.title('Top 5 Car brands by average price',loc='center',fontweight='bold',fontsize=18)
plt.ylabel('Average Price',fontsize=20)
plt.xlabel('Cars',fontsize=20)
plt.tight_layout()
plt.show()

#### Checking the relation between color and price:

In [None]:
plt.figure(figsize=(15, 5), dpi=120)
sns.scatterplot(data=data, x='Color', y='Price', palette="crest")

#### Color dosen't seem to make significant difference on a car's price so we can remove it.

In [13]:
data = data.drop("Color",axis=1)

#### Checking and visualizing correlation between numerical columns:

In [None]:
correlation_data = data.select_dtypes(exclude=object).corr()

correlation_data

sns.heatmap(correlation_data, annot= True, linewidths= 0.4,cmap='crest')
plt.title('Correlation Heatmap')
plt.show()

### Outlier detection and removal process

#### Visualizing in distribution plot to help us understand skewness and box plot to see median, IQR and outliers:

In [14]:
numeric_data = data.select_dtypes(exclude=object)

# for col in numeric_data:
#     fig, ax =plt.subplots(1,2, constrained_layout=True)
#     fig.set_size_inches(10, 6)
#     sns.distplot(data[col], ax=ax[0]).set(title="Distplot")
#     sns.boxplot(data[col], ax=ax[1]).set(title="Boxplot")
#     plt.suptitle(f'{col.title()} (Before handling outliers)',weight='bold')
#     fig.show()

#### Calculating outliers for each column:

In [None]:
for col in numeric_data:
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iq = q3 - q1
    
    low = q1 - 1.5 * iq
    high = q3 + 1.5 * iq
    outlier = ((numeric_data[col] > high) | (numeric_data[col] < low)).sum()

    total = numeric_data[col].shape[0]
    print(f"Total Outliers in {col}: {outlier}, {round(100*(outlier)/total,2)}%")


#### Removing outliers:

In [16]:
for col in numeric_data:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    data[col] = np.where(data[col] > upper_bound, upper_bound, data[col])
    data[col] = np.where(data[col] < lower_bound, lower_bound, data[col])

In [None]:
numeric_data = data.select_dtypes(exclude=object)

for col in numeric_data:
    fig, ax =plt.subplots(1,2, constrained_layout=True)
    fig.set_size_inches(10, 6)
    sns.distplot(data[col], ax=ax[0]).set(title="Distplot")
    sns.boxplot(data[col], ax=ax[1]).set(title="Boxplot")
    plt.suptitle(f'{col.title()} (After handling outliers)',weight='bold')
    fig.show()

In [None]:
data.head()

In [17]:
# Log Transformation for skewed features
data['Price'] = np.log1p(data['Price'])
data['Mileage'] = np.log1p(data['Mileage'])
data['Levy'] = np.log1p(data['Levy'])

In [18]:
# Creating interaction features
data['Price_per_KM'] = data['Price'] / (data['Mileage'] + 1)
data['Engine_per_Cylinder'] = data['Engine volume'] / data['Cylinders']
data['Age*Mileage'] = data['Age'] * data['Mileage']

In [None]:
data.head()

#### Preparing data for model by using hot encoding

In [20]:


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
lab = LabelEncoder()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
categorical_cols = ['Manufacturer', 'Category', 'Fuel type', 'Gear box type', 'Model', 'Leather interior', 'Wheel']

# Apply One-Hot Encoding
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
encoded_features = pd.DataFrame(ohe.fit_transform(data[categorical_cols]))

# Assign proper column names
encoded_features.columns = ohe.get_feature_names_out()

# Merge encoded features and drop original categorical columns
data = pd.concat([data, encoded_features], axis=1).drop(columns=categorical_cols)

# Ensure all columns are numeric
print("All columns are now numeric:", data.dtypes.unique())

In [23]:
X = data.drop(columns=['Price'])
y = data['Price']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure x_test has same columns as x_train (reindexing)
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

In [None]:
print(x_train.select_dtypes(include=['object']).columns)
data.head()

In [None]:
# Step 6: Feature Scaling
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

#### Splitting tests and train data:

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.20,random_state=42)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

#### Using different aglorithms:

In [None]:
algorithm = ['LinearRegression','DecisionTreeRegressor','RandomForestRegresosr']
R2 = []
RMSE = []
Mae = []

def models(model):
    model.fit(x_train,y_train)
    pre = model.predict(x_test)
    r2 = r2_score(y_test,pre)
    rmse = np.sqrt(mean_squared_error(y_test,pre))
    mae = mean_absolute_error(y_test, pre)
    R2.append(r2)
    RMSE.append(rmse)
    Mae.append(mae)
    score = model.score(x_test,y_test)
    print(f'The Score of Model is :{score}')
    
model1 = LinearRegression()
model2 = DecisionTreeRegressor()
model3 = RandomForestRegressor()

models(model1)
models(model2)
models(model3)

df = pd.DataFrame({'Algorithm':algorithm, 'R2_score': R2, 'RMSE':RMSE, 'MAE': Mae})
df

#### Random forest regressor has the best performance across all metrics with the highest R2 Score and the lowest RMSE and MAE.

In [None]:
x = np.arange(len(algorithm))
bar_width = 0.25  

fig, ax = plt.subplots(figsize=(10, 5))

ax.bar(x, RMSE, width=bar_width, label='RMSE', color='orange')
ax.bar(x + bar_width, Mae, width=bar_width, label='MAE', color='green')

ax.set_xticks(x)
ax.set_xticklabels(algorithm)
ax.set_title('Comparison of RMSE and MAE for different Models')
ax.set_xlabel('Models')
ax.set_ylabel('Metric Values')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
fig = plt.figure(figsize=(10,5))
plt.plot(df.Algorithm,df.R2_score ,label='R2_score',lw=5,color='black',marker='.',markersize = 15)
plt.legend(fontsize=15)
plt.show()

In [None]:
MAE_random_forest = Mae[2]  
mean_car_price = data['Price'].mean()

accuracy = (1 - (MAE_random_forest / mean_car_price)) * 100
print(accuracy, "% Accuracy")

### Random Forest Regressor is the most suitable model for predicting car prices with a seudo-accuracy of 79% using the MAE divided by mean of the actual prices.

In [None]:
data.head()

In [3]:
# preprocessing part 1 

data.drop_duplicates(inplace= True)

data = data.drop(["ID", "Doors"], axis=1)

data['Levy'] = data['Levy'].replace('-', np.nan).astype(float)

data['Levy'].fillna(data['Levy'].median(), inplace=True)

current_date = dt.datetime.now()

data["Age"] = current_date.year - data["Prod. year"]

data = data.drop("Prod. year", axis=1)

data['Cylinders'] = data['Cylinders'].astype(int)

data["Mileage"] = data["Mileage"].str.replace("km", "")

data.Mileage = data.Mileage.astype("Int64")

data["Engine volume"] = data["Engine volume"].str.replace("Turbo", "")

data["Engine volume"] = data["Engine volume"].astype("float64")

data = data.drop("Color",axis=1)
    
# outlier treament

numeric_data = data.select_dtypes(exclude=object)

for col in numeric_data:
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iq = q3 - q1
    
    low = q1 - 1.5 * iq
    high = q3 + 1.5 * iq
    outlier = ((numeric_data[col] > high) | (numeric_data[col] < low)).sum()

    total = numeric_data[col].shape[0]
    print(f"Total Outliers in {col}: {outlier}, {round(100*(outlier)/total,2)}%")


for col in numeric_data:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    data[col] = np.where(data[col] > upper_bound, upper_bound, data[col])
    data[col] = np.where(data[col] < lower_bound, lower_bound, data[col])
        
# preprocessing part 2

# Log Transformation for skewed features
data['Price'] = np.log1p(data['Price'])
data['Mileage'] = np.log1p(data['Mileage'])
data['Levy'] = np.log1p(data['Levy'])

# Creating interaction features
data['Price_per_KM'] = data['Price'] / (data['Mileage'] + 1)
data['Engine_per_Cylinder'] = data['Engine volume'] / data['Cylinders']
data['Age*Mileage'] = data['Age'] * data['Mileage']

#label encoding and trainig

categorical_cols = ['Manufacturer', 'Category', 'Fuel type', 'Gear box type', 'Model', 'Leather interior', 'Wheel', 'Drive wheels']

# Apply One-Hot Encoding
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
encoded_features = pd.DataFrame(ohe.fit_transform(data[categorical_cols]))

# Assign proper column names
encoded_features.columns = ohe.get_feature_names_out()

# Merge encoded features and drop original categorical columns
data = pd.concat([data, encoded_features], axis=1).drop(columns=categorical_cols)

# Ensure all columns are numeric
print("All columns are now numeric:", data.dtypes.unique())

data = data.dropna(subset=['Price'])

X = data.drop(columns=['Price'])
y = data['Price']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure x_test has same columns as x_train (reindexing)
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)


    

Total Outliers in Price: 1055, 5.57%
Total Outliers in Levy: 3103, 16.4%
Total Outliers in Engine volume: 1358, 7.18%
Total Outliers in Mileage: 635, 3.36%
Total Outliers in Cylinders: 4765, 25.18%
Total Outliers in Airbags: 0, 0.0%
Total Outliers in Age: 962, 5.08%
All columns are now numeric: [dtype('float64')]


In [4]:
print(x_train.select_dtypes(include=['object']).columns)

Index([], dtype='object')


In [5]:
# Step 6: Feature Scaling
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Step 7: Model Training & Hyperparameter Tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
rf = RandomForestRegressor()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(x_train, y_train)
best_model = grid_search.best_estimator_
