<a href="https://colab.research.google.com/github/rohankundu23/CarPricePrediction/blob/master/Car_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Basic Steps

#### 1.1 Importing the necessary libraries and the dataset

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

  import pandas.util.testing as tm


In [None]:
from google.colab import files
uploaded = files.upload()

#### 1.2 Loading the data in th dataframe

In [None]:
df = pd.read_csv("data.csv")
df.shape

In [None]:
#To display first five rows of the dataset

df.head

In [None]:
#To display the columns of the dataset

df.columns

#### 1.3 Statistical Summary and data type of the data

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.describe(include='all')

In [None]:
# null values

df.isnull().sum()

In [None]:
## target variable 

df['MSRP'].plot.hist()
plt.xlabel('MSRP', fontsize=12)

+ As the graph is right skewed graph, we will plot the graph without the outliers.

In [None]:
# Plotting without the outliers

(df['MSRP'].loc[df['MSRP']<4.223125e+04 ]).plot.hist()

In [None]:
## Car year (Marketing)

df['Year'].plot.hist()
plt.xlabel('Car Year', fontsize=12)

+ The graph is left skewed graph, therefore we will plot the graph without the outliers.

In [None]:
(df['Year'].loc[df['Year']<2015 ]).plot.hist()

In [None]:
## Popularity of the car

df['Popularity'].plot.hist()
plt.xlabel('Popularity of the Car', fontsize=12)

### 2. Dropping irrelevant columns

In [None]:
df.isnull().sum()

+ Market Category has many null values, therefore we will drop this column, as it can manipulate our final result.
+ Any other column will be dropped once we will manipulate the data.

In [None]:
# Dropping the column named Market Category

df = df.drop(['Market Category'], axis=1)
df.head(5)

### 3. Renaming the columns

In [None]:
df = df.rename(columns={"Engine HP": "HP", "Engine Cylinders": "Cylinders", "Transmission Type": "Transmission", "Driven_Wheels": "Drive Mode","highway MPG": "MPG-H", "city mpg": "MPG-C", "MSRP": "Price" })
df.head(5)

### 4. Data Manipulation

##### 4.1 Dropping the duplicate rows

In [None]:
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)

In [None]:
df = df.drop_duplicates()
df.head(5)

In [None]:
df.shape

##### 4.2 Missing or null values Manipulation

In [None]:
df.isnull().sum()

In [None]:
temp = pd.DataFrame(index=df.columns)
temp['data_type'] = df.dtypes
temp['null_count'] = df.isnull().sum()
temp['unique_count'] = df.nunique()
temp

In [None]:
#for Cylinders

df['Cylinders'].value_counts()

In [None]:
df['Cylinders'].mode()

In [None]:
# Replacing null values with the mode of Cylinders column

df['Cylinders'].fillna(value = (df['Cylinders'].mode()[0]), inplace=True)

In [None]:
# for Number of doors

df['Number of Doors'].value_counts()

In [None]:
df['Number of Doors'].mode()

In [None]:
# Replacing null values with the mode of Number of Doors column

df['Number of Doors'].fillna(value = (df['Number of Doors'].mode()[0]), inplace=True)
df.isnull().sum()

+ HP has 69 missing values and 356 unique values, therefore we will drop the rows with null values in this column.

In [None]:
df = df.dropna()    # Dropping the missing values.
df.isnull().sum()

In [None]:
df.count()

In [None]:
#changing the datatype

df = df.astype({'HP': 'int', 'Cylinders': 'int', 'Number of Doors': 'int'})

In [None]:
df.dtypes

### 5. Detecting Outliers

In [None]:
#Plotting Graphs of Data(Columns)
sns.boxplot(x=df['Price'])

In [None]:
sns.boxplot(x=df['HP'])

In [None]:
sns.boxplot(x=df['Cylinders'])

In [None]:
sns.boxplot(x=df['MPG-C'])

In [None]:
sns.boxplot(x=df['MPG-H'])

In [None]:
sns.boxplot(x=df['Popularity'])

In [None]:
#Finding IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

In [None]:
#Removing Outliners
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.shape

### 6. Most represented car brands

In [None]:
# Percentage of car per brand
counts = df['Make'].value_counts()*100/sum(df['Make'].value_counts())

# Top 10 car brands
popular_labels = counts.index[:10]
    
# Plot
plt.figure(figsize=(10,5))
plt.barh(popular_labels, width=counts[:10])
plt.title('Top 10 Car brands')
plt.show()

##### 6.1 Average price of these car brands

In [None]:
print('Average prices are: ')
prices = df[['Make','Price']].loc[(df['Make'] == 'Chevrolet')|
               (df['Make'] == 'Ford')|
               (df['Make'] == 'Volkswagen')|
               (df['Make'] == 'Toyota')|
               (df['Make'] == 'Dodge')|
               (df['Make'] == 'Nissan')|
               (df['Make'] == 'GMC')|
               (df['Make'] == 'Honda')|
               (df['Make'] == 'Mazda')].groupby('Make').mean()
print(prices)

### 7. Correlation Matrix

In [None]:
df.corr()

##### High correlation between
+ HP and Cylinders: The more cylinders there are, the more powerful the car is.
+ HP and Price: The more powerful the car is, the more costly it is.
+ MPG-H and MPG-C

##### High anticorrelation between
+ Cylinders and MPG-H: Highway mpg / Engine Cylinders have a strong negative correlation with highway and city MPG because lower MPG figures mean higher fuel consumption.

In [None]:
plt.figure(figsize=(12,10))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
#Correlation with target variable
cor_target = abs(cor["Price"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.5]
relevant_features

### 8. EDA and Graph plots

In [None]:
#Scatterplot between highly correlated features

fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['HP'], df['Price'])
ax.set_xlabel('HP')
ax.set_ylabel('Price')
plt.show()

In [None]:
#Scatterplot between highly correlated features

fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['Year'], df['Price'])
ax.set_xlabel('Year')
ax.set_ylabel('Price')
plt.show()

In [None]:
#Scatterplot between highly correlated features

fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['Cylinders'], df['HP'])
ax.set_xlabel('Cylinders')
ax.set_ylabel('HP')
plt.show()

In [None]:
#Bar plot for Vehicle Style vs frequency by which they are sold

df['Vehicle Style'].value_counts().plot.bar(figsize=(10,6))                      # Bar chart for car 'body' variable
plt.title("Vehicle style vs Frequency of vehicles sold")
plt.ylabel('Number of vehicles')
plt.xlabel('Vehicle Style');

In [None]:
# Vehicle Style type and Vehicle Size analysis

sns.countplot(y='Vehicle Style',data=df, hue='Vehicle Size')
plt.title("Vehicle Type v/s Vehicle Size")
plt.ylabel('Vehicle Type')
plt.xlabel('Count of vehicles')

In [None]:
# Vehicle Style type and Drive type analysis

sns.countplot(y='Vehicle Style',data=df, hue='Drive Mode')
plt.title("Vehicle Type v/s Drive mode Type")
plt.ylabel('Vehicle Type')
plt.xlabel('Count of vehicles')

In [None]:
# Create a new column 'Grouping through Price' and assign the value based on car price

df['price_group'] = pd.cut(df['Price'],[0,20000,30000,40000,50000,60000,70000,80000,90000,100000,600000], 
                                labels = ['<20K','20-29K','30-39K','40-49K','50-59K','60-69K','70-79K','80-89K','90-999K','>100K'], include_lowest = True)
df['price_group'] = df['price_group'].astype(object)

In [None]:
(df['price_group'].value_counts() / len(df) *100).plot.bar(figsize=(10,6))
plt.title("Grouping through Price bar diagram")
plt.ylabel('Percentage of vehicles')
plt.xlabel('Price Group');

##### With all the above graphs, EDA generated is
+ Price of the car is dependent mostly on the engine house power and the car year.
+ The car with more cylinders is more powerful.
+ Car price increased after year 2000.
+ Sedan cars were the most sold cars followed by 4dr SUV
+ More the no the cylinders, more is the price.
+ For MPG-H 13, price is maximum and for MPG-H 15, price is lowest.
+ For MPG-C 10, price is maximum and for MPG-C 31, price is minimum.

### 9. Splitting of the dataset

In [None]:
df = df.drop(['Make', 'Model'], axis=1)

In [None]:
# One-hot encode the data using pandas get_dummies

df = pd.get_dummies(df)

In [None]:
x = df.drop('Price', axis=1)
y= df['Price']

In [None]:
# train and test split of the data into 80 and 20 ratio

from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size = 0.2, random_state = 2)

train_x.shape, valid_x.shape, train_y.shape, valid_y.shape

### 10. Performance check of different algorithms over metrics like R- Squared, RMSE, MSE, and MAE.

##### 10.1 Linear Regression Model

In [None]:
# Importing the necessary libraries

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [None]:
# Fitting the model

lreg = LinearRegression()
lreg.fit(train_x, train_y)

In [None]:
pred_train = lreg.predict(train_x)

In [None]:
pred_test = lreg.predict(valid_x)

In [None]:
plt.scatter(valid_y,pred_test)

In [None]:
sns.distplot((valid_y-pred_test),bins=50)

##### Performance Check

In [None]:
mse=mean_squared_error(valid_y,pred_test)
rmse=np.sqrt(mean_squared_error(valid_y,pred_test))
r2score=r2_score(valid_y,pred_test)
mae=mean_absolute_error(valid_y,pred_test)

In [None]:
print('mse= ',mse)
print('rmse= ',rmse)
print('r2_score= ',r2score)
print('mae= ',mae )

+ This model performs good as its R- Squared Score is 0.9487 that is elucidation of 94.87%

##### 10.2 Decision Trees Regression

In [None]:
#import decision tree regressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
#creating the decision tree function
dt_model = DecisionTreeRegressor(random_state=10, max_depth=12)

In [None]:
#fitting the model
dt_model.fit(train_x, train_y)

In [None]:
# Training Score

dt_model.score(train_x, train_y)

In [None]:
#checking the validation score

dt_model.score(valid_x, valid_y)

In [None]:
pred_y=dt_model.predict(valid_x)

In [None]:
plt.scatter(valid_y,pred_y)

In [None]:
sns.distplot((valid_y-pred_y),bins=50)

In [None]:
train_accuracy = []
validation_accuracy = []
for depth in range(1,20):
    dt_model = DecisionTreeRegressor(max_depth=depth, random_state=10)
    dt_model.fit(train_x, train_y)
    train_accuracy.append(dt_model.score(train_x, train_y))
    validation_accuracy.append(dt_model.score(valid_x, valid_y))

In [None]:
frame = pd.DataFrame({'max_depth':range(1,20), 'train_acc':train_accuracy, 'valid_acc':validation_accuracy})
frame.head()

In [None]:
#Plotting graph for Depth of tree
plt.figure(figsize=(12,6))
plt.plot(frame['max_depth'], frame['train_acc'], marker='o', label='Train Accuracy')
plt.plot(frame['max_depth'], frame['valid_acc'], marker='o', label='Test Accuracy')
plt.xlabel('Depth of tree')
plt.ylabel('performance')
plt.legend(loc='upper left')

##### Performance Check

In [None]:
mse=mean_squared_error(valid_y,pred_y)
rmse=np.sqrt(mean_squared_error(valid_y,pred_y))
r2score=r2_score(valid_y,pred_y)
mae=mean_absolute_error(valid_y,pred_y)

In [None]:
print('mse= ',mse)
print('rmse= ',rmse)
print('r2_score= ',r2score)
print('mae= ',mae)

+ Decision tress model give the R- Squared Score of 0.9782, that is elucidation of 97.82%.

##### 10.3 Random Forests Regression

In [None]:
# Fitting Random Forest Regression to the dataset 
# import the regressor 
from sklearn.ensemble import RandomForestRegressor 
  
 # create regressor object 
regressor = RandomForestRegressor(n_estimators = 50, random_state = 5, max_depth=25) 
  
# fit the regressor with x and y data 
regressor.fit(valid_x, valid_y) 

In [None]:
y_pred=regressor.predict(valid_x)

In [None]:
# checking the training score

dt_model.score(train_x, train_y)

In [None]:
# checking the validity score

dt_model.score(valid_x, valid_y)

In [None]:
plt.scatter(valid_y,y_pred)

In [None]:
sns.distplot((valid_y-y_pred),bins=50)

##### Performance Check

In [None]:
mse=mean_squared_error(valid_y,y_pred)
rmse=np.sqrt(mean_squared_error(valid_y,y_pred))
r2score=r2_score(valid_y,y_pred)
mae=mean_absolute_error(valid_y,y_pred)

In [None]:
print('mse= ',mse)
print('rmse= ',rmse)
print('r2_score= ',r2score)
print('mae= ',mae)

+ This model give the R- Squared Score of 0.9953, elucidation of 99.53%.

## With the above comparisons, we find that:
+ Random Forests Regression model is giving the best results with the R- Squared Score of 99.53% when converted to percentage.
+ It is also giving the least RMSE value with 1037.41.


+ Decision Trees Regression Model also gives good results with the R- Squared Score of 97.82%, when converted to percentage.


+ Linear Regression model is showing the lowest R- Squared Score with 94.87%, when converted to percentage.