# Vehicle Price Prediction
Estimate the selling price of cars based on the features of a car.

In [None]:
# import all required libraries for reading, analysing and visualizing data
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data Analysis

In [None]:
# list every file found under the Kaggle input directory, one path per line
input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

In [None]:
# load the car listings from the csv file into a DataFrame
dataset_path = '/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv'
details = pd.read_csv(dataset_path)

In [None]:
# preview the first rows of the dataset
details.head()

In [None]:
# report the (rows, columns) size of the dataset
dataset_shape = details.shape
print('Dataset shape: ', dataset_shape)

In [None]:
# print a per-column summary of the DataFrame (pandas DataFrame.info)
details.info()

In [None]:
# descriptive statistics; include = 'all' covers non-numeric columns as well
details.describe(include = 'all')

In [None]:
# check if any of the columns has null values:
# count the null entries in each column
details.isnull().sum()

## Data Visualization

### Selling price vs Fuel type

In [None]:
# left: number of cars per fuel type; right: selling price per fuel type
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
sns.countplot(data=details, x='fuel', ax=axis1)
sns.barplot(data=details, x='fuel', y='selling_price', ax=axis2);

Thus we can see that most of the cars are of Diesel or Petrol type, and that the selling price is highest for Diesel cars, followed by Petrol, CNG and LPG.

### Selling price vs Transmission type

In [None]:
# left: number of cars per transmission type; right: selling price per transmission type
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
sns.countplot(data=details, x='transmission', ax=axis1)
sns.barplot(data=details, x='transmission', y='selling_price', ax=axis2);

Thus from the above plots we can infer that most cars have Manual transmission, and that the price of cars with Automatic transmission is much higher than that of Manual cars.

### Selling price vs Seller type

In [None]:
# left: number of cars per seller type; right: selling price per seller type
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
sns.countplot(data=details, x='seller_type', ax=axis1)
sns.barplot(data=details, x='seller_type', y='selling_price', ax=axis2);

Thus most of the cars are sold by Individual sellers, but Dealers charge the highest prices for the cars.

### Selling price vs Purchase year

In [None]:
# left: number of cars per purchase year; right: selling price across years
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
sns.countplot(data=details, x='year', ax=axis1)
sns.lineplot(data=details, x='year', y='selling_price', ax=axis2);

From the above plots we can see that most of the cars are 2-10 years old and that the selling price decreases as the car becomes older.

### Selling price vs Km Driven

In [None]:
# scatter of selling price against the distance driven
plt.figure(figsize=(8, 5))
sns.scatterplot(data=details, x='km_driven', y='selling_price');
plt.title('Selling Price vs Km Driven in lakhs');

The above plot shows that the less driven cars are usually sold at higher prices.

### Selling price vs Owner

In [None]:
# left: number of cars per owner category; right: selling price per owner category
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
sns.countplot(data=details, x='owner', ax=axis1)
sns.lineplot(data=details, x='owner', y='selling_price', ax=axis2);

From the above plots we can see that most of the cars are sold by their first owners.

### Selling price vs No of seats

In [None]:
# left: number of cars per seat count; right: selling price per seat count
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
sns.countplot(data=details, x='seats', ax=axis1)
sns.lineplot(data=details, x='seats', y='selling_price', ax=axis2);

Thus we can see that most of the cars sold have 5 seats, and their prices are usually between 6 and 7 lakhs.

## Preprocessing of Data

In [None]:
# drop the rows having null values in ANY column, not only in 'torque':
# 'torque' is removed before modelling, so using it as the sole null check
# could let NaNs in the remaining feature columns reach the regressors
details = details.dropna()
# drop the rows having 'Test Drive Car' as Owner type;
# .copy() gives an independent frame, so the column assignments in the
# following cells unambiguously write to this DataFrame
details = details[details['owner'] != 'Test Drive Car'].copy()

In [None]:
# convert the values of 'fuel' column to numerical format:
# Diesel -> 0, Petrol -> 1, CNG -> 2, any other value -> 3
fuel_codes = {'Diesel': 0, 'Petrol': 1, 'CNG': 2}
details['fuel'] = details['fuel'].apply(lambda fuel: fuel_codes.get(fuel, 3))

In [None]:
# convert the values of 'seller_type' column to numerical format:
# Individual -> 0, Dealer -> 1, any other value -> 2
seller_codes = {'Individual': 0, 'Dealer': 1}
details['seller_type'] = details['seller_type'].apply(
    lambda seller: seller_codes.get(seller, 2)
)

In [None]:
# convert the values of 'transmission' column to numerical format:
# Manual -> 0, any other value -> 1
transmission_codes = {'Manual': 0}
details['transmission'] = details['transmission'].apply(
    lambda gearbox: transmission_codes.get(gearbox, 1)
)

In [None]:
# convert the values of 'owner' column to numerical format:
# First Owner -> 0, Second Owner -> 1, Third Owner -> 2, any other value -> 3
owner_codes = {'First Owner': 0, 'Second Owner': 1, 'Third Owner': 2}
details['owner'] = details['owner'].apply(lambda owner: owner_codes.get(owner, 3))

In [None]:
# convert the values of 'mileage' column to numerical format: keep the text
# before the first space as a float and discard the rest (presumably a unit
# suffix - verify against the data)
details['mileage'] = details['mileage'].map(
    lambda raw: float(str(raw).partition(' ')[0])
)

In [None]:
# convert the values of 'engine' column to numerical format: keep the text
# before the first space as a float and discard the rest (presumably a unit
# suffix - verify against the data)
details['engine'] = details['engine'].map(
    lambda raw: float(str(raw).partition(' ')[0])
)

In [None]:
# convert the values of 'max_power' column to numerical format: keep the text
# before the first space as a float and discard the rest (presumably a unit
# suffix - verify against the data)
details['max_power'] = details['max_power'].map(
    lambda raw: float(str(raw).partition(' ')[0])
)

In [None]:
# remove the columns that are not used as model features
details = details.drop(columns=['name', 'torque'])

## Machine Learning

In [None]:
# import the required modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import r2_score, mean_absolute_error

In [None]:
# get the training and test data:
# X holds every column except the target, Y holds 'selling_price'
X = details.drop(columns=['selling_price'])
# keep the target as a column vector of shape (n_samples, 1)
Y = details['selling_price'].values.reshape((X.shape[0], 1))

# hold out 30% of the rows for testing; the fixed random_state puts the same
# rows in the train and test sets on every run, so results can be compared
(X_train, X_test, Y_train, Y_test) = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
print("X_train shape:" + str(X_train.shape))
print("Y_train shape:" + str(Y_train.shape))
print("X_test shape:" + str(X_test.shape))
print("Y_test shape:" + str(Y_test.shape))

In [None]:
def plotPred(Y_pred, Y_true=None):
    """Scatter predicted values against true values with a y = x reference line.

    Parameters
    ----------
    Y_pred : array-like
        Predicted values, drawn on the y-axis.
    Y_true : array-like, optional
        True values, drawn on the x-axis. When omitted, the module-level
        ``Y_test`` is used, so existing ``plotPred(Y_pred)`` calls behave
        as before.
    """
    if Y_true is None:
        Y_true = Y_test
    # flatten so (n, 1) column vectors and 1-D arrays are handled alike
    true_values = np.ravel(Y_true)
    pred_values = np.ravel(Y_pred)
    # span the reference line over the plotted data instead of a fixed
    # 0..7e6 range, so it is not cut short when a value exceeds 7e6 and also
    # covers negative predictions; `initial=0` keeps 0 in range and makes
    # empty inputs safe
    lower = min(np.min(true_values, initial=0), np.min(pred_values, initial=0))
    upper = max(np.max(true_values, initial=0), np.max(pred_values, initial=0))
    x_points = np.linspace(lower, upper)
    plt.figure(figsize=(12, 5))
    # points on the red line are predictions equal to the true value
    plt.plot(x_points, x_points, color='r')
    plt.scatter(true_values, pred_values)
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
    plt.title('True Values Vs Predicted Values')

### Using Linear Regression

In [None]:
# fit a linear regression model on the training split
lireg = LinearRegression().fit(X_train, Y_train)

# predicted output for the held-out rows
Y_pred_lr = lireg.predict(X_test)
# train and test scores, rounded to two decimals
lr_train_score = round(lireg.score(X_train, Y_train), 2)
lr_test_score = round(lireg.score(X_test, Y_test), 2)

print('Linear Regression train score: ', lr_train_score)
print('Linear Regression test score: ', lr_test_score)
print('Mean absolute error: ', mean_absolute_error(Y_test, Y_pred_lr))
print('Coefficient of determination: ', r2_score(Y_test, Y_pred_lr))

# compare the predictions with the true values visually
plotPred(Y_pred_lr)

### Using Decision Tree Regressor

In [None]:
# fit a decision tree regressor on the training split
dtreg = DecisionTreeRegressor().fit(X_train, Y_train)

# predicted output for the held-out rows
Y_pred_dt = dtreg.predict(X_test)
# train and test scores, rounded to two decimals
dt_train_score = round(dtreg.score(X_train, Y_train), 2)
dt_test_score = round(dtreg.score(X_test, Y_test), 2)

print('Decision Tree Regressor train score: ', dt_train_score)
print('Decision Tree Regressor test score: ', dt_test_score)
print('Mean absolute error: ', mean_absolute_error(Y_test, Y_pred_dt))
print('Coefficient of determination: ', r2_score(Y_test, Y_pred_dt))

# compare the predictions with the true values visually
plotPred(Y_pred_dt)

### Using Random Forest Regressor

In [None]:
rfreg = RandomForestRegressor()
# the forest expects a 1-D target: np.ravel flattens the (n, 1) column vector
# (a 1-D array passes through unchanged), which avoids scikit-learn's
# column-vector DataConversionWarning during fit
rfreg.fit(X_train, np.ravel(Y_train))

# train and test scores, rounded to two decimals
rf_train_score = round(rfreg.score(X_train, Y_train), 2)
rf_test_score = round(rfreg.score(X_test, Y_test), 2)
# predicted output for the held-out rows
Y_pred_rf = rfreg.predict(X_test)

# labels name the Random Forest model (they previously said
# 'Decision Tree Regressor', copied from the cell above)
print('Random Forest Regressor train score: ', rf_train_score)
print('Random Forest Regressor test score: ', rf_test_score)
print('Mean absolute error: ', mean_absolute_error(Y_test, Y_pred_rf))
print('Coefficient of determination: ', r2_score(Y_test, Y_pred_rf))

# plot predicted vs true values
plotPred(Y_pred_rf)