# Car Prediction

In [None]:
# All the imports reside here
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LinearRegression

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the car data CSV into a DataFrame with pandas' read_csv.
csv_path = "../input/vehicle-dataset-from-cardekho/car data.csv"
cars = pd.read_csv(csv_path)


**Apply head function on the dataframe and see what all columns it has**

**head() -> provides top 5 records from the dataset**

**tail() -> provides last 5 records from the dataset**


In [None]:
# Preview the first rows of the dataset to see which columns it contains.
cars.head()

# Understandings
* Selling_Price and Present_Price are in units of lakhs
* Fuel_Type, Seller_Type and Transmission hold categorical values

**It is always wise to check the datatypes of the columns instead of assuming them from the values shown.
Let's use the info() method to check the datatypes and look for missing values.**

In [None]:
# Print the dtype and non-null count of every column to verify types and spot missing values.
cars.info()

# Understandings
* We don't have any missing or null values in our dataframe

In [None]:
# Let's look at the central tendency values and percentiles.
# We can get those values by using the describe() method;
# by default, it only provides the measures for numerical columns.
cars.describe()

# Understandings 
* By comparing the mean with the 50th percentile (median), we can tell that there are some outliers

In [None]:
# Let's see the summary for the categorical (object-dtype) columns:
# include = 'O' restricts describe() to object columns.

cars.describe(include = 'O')

# Understandings

* Looks like there are 3/2/2 unique values for fuel/seller/transmission columns respectively
* We can also see the mode[most frequent value] for each column

In [None]:
# Visualization for categorical columns: one bar plot of Selling_Price per
# category, drawn side by side on a shared y-axis.

fig, axes = plt.subplots(1, 3, figsize=(16, 5), sharey=True)
fig.suptitle('Visuallization of categorical columns')

# One panel per categorical column, in the same left-to-right order as before:
# Fuel_Type, Seller_Type, Transmission.
categorical_columns = ['Fuel_Type', 'Seller_Type', 'Transmission']
for ax, column in zip(axes, categorical_columns):
    sns.barplot(x = column, y = 'Selling_Price', data = cars, ax = ax)

# Render the figure explicitly so the cell does not echo an Axes repr.
plt.show()

# Understandings 

* Diesel cars have a higher resale value than the other fuel types
* Cars sold by dealers fetch higher prices [marketing techniques, I guess :P]
* Automatic cars have a higher resale value than manual cars

In [None]:
# Feature Engineering

# Replace the model year with the car's age in years, measured from REFERENCE_YEAR.
# NOTE: this overwrites 'Year' in place, so running the cell twice would undo
# the conversion (2021 - (2021 - year) == year).
REFERENCE_YEAR = 2021
cars['Year'] = REFERENCE_YEAR - cars['Year']

# The current year could also be fetched from the datetime library:
# current_year = datetime.datetime.now().year

# visualization for Numerical Columns

There are multiple plot types for numerical values
* Scatter plot
* regplot
* lmplot

Here we will be using the regplot, as it plots the regression line along with points

numerical columns = [Year, Selling_Price, Present_Price, Kms_Driven, Owner]

In [None]:
# Regression plots of Selling_Price against each numerical feature,
# arranged on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize = (16,10))
fig.suptitle('Visuallization of Numerical columns')

# axes.flat walks the grid row by row, which matches the original panel
# order: [0,0] Year, [0,1] Present_Price, [1,0] Kms_Driven, [1,1] Owner.
numerical_features = ['Year', 'Present_Price', 'Kms_Driven', 'Owner']
for ax, feature in zip(axes.flat, numerical_features):
    sns.regplot(x = feature, y = 'Selling_Price', data = cars, ax = ax)

# Render the figure explicitly so the cell does not echo an Axes repr.
plt.show()

# Understandings 

* Year : as the age of the car (in years) increases, the selling price decreases
* Present_Price : selling price is strongly and positively related to the present price
* Kms_Driven : selling price tends to decrease as the kilometers driven increase
* Owner : resale price is higher for first-hand cars, compared to second- and third-hand cars

In [None]:
# Outliers

# Box plots of the numerical features on a 2x2 grid to expose outliers.
fig, axes = plt.subplots(2, 2, figsize = (16,10))
fig.suptitle('Outliers')

# axes.flat walks the grid row by row, which matches the original panel
# order: [0,0] Year, [0,1] Present_Price, [1,0] Kms_Driven, [1,1] Owner.
numerical_features = ['Year', 'Present_Price', 'Kms_Driven', 'Owner']
for ax, feature in zip(axes.flat, numerical_features):
    sns.boxplot(x = feature, data = cars, ax = ax)

# Render the figure explicitly so the cell does not echo an Axes repr.
plt.show()

# Understandings 
* we can see there are few outliers for the above columns.

# Converting the categorical values into Numerical values

There are 2 types
* Ordinal Encoding
* OneHot Encoding

in pandas there is a method called get_dummies() which does the operation of OneHotEncoding

we are using the drop_first parameter to avoid the dummy variable trap

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first
# dummy level of each column.
# NOTE: this replaces `cars` in place, so the cell cannot be re-run without
# reloading the data (the original columns no longer exist afterwards).
columns_to_encode = ['Fuel_Type', 'Seller_Type', 'Transmission']
cars = pd.get_dummies(cars, columns=columns_to_encode, drop_first=True)

In [None]:
# Inspect the frame again after encoding to see the new dummy columns.
cars.head()

In [None]:
# Let's see the correlation between the columns in the dataframe.
# `cars` still contains the string column 'Car_Name' at this point; recent
# pandas versions raise on non-numeric columns in corr() instead of silently
# skipping them, so drop it explicitly before computing correlations.

plt.figure(figsize = (16,7))
sns.heatmap(cars.drop(columns = ['Car_Name']).corr(), annot = True)
plt.title('Correlation between the columns in the dataframe')
plt.show()

In [None]:
# Hold out 20% of the rows as test data; the fixed random_state keeps the
# split reproducible across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    cars,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Confirm the split sizes: 20% of the rows went to test, the rest to train.
for split in (train_data, test_data):
    print(split.shape)

Consider the numerical columns ['Year','Selling_Price','Present_Price','Kms_Driven']

The values of these columns are on very different scales, so let's bring them onto a common scale
There are 2 common types of scaling
* StandardScaler  [standardization: zero mean, unit variance]
* MinMaxScaler  [min-max normalization]

Let's use StandardScaler

In [None]:
# Fit the scaler on the training data only and merely transform the test data,
# so that no statistics from the test set leak into training.
# Note that 'Selling_Price' (the target) is scaled together with the features,
# so everything derived from it later is in standardized units.

num_cols = ['Year','Selling_Price','Present_Price','Kms_Driven']

# train_test_split hands back frames derived from `cars`; take explicit copies
# so the column assignments below write to independent frames instead of
# triggering pandas' SettingWithCopyWarning (hidden by the global warnings filter).
train_data = train_data.copy()
test_data = test_data.copy()

scaler = StandardScaler()
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])
test_data[num_cols] = scaler.transform(test_data[num_cols])

In [None]:
# Separate features from the target: 'Car_Name' is excluded from the features
# and 'Selling_Price' is the value to predict.
non_feature_cols = ['Car_Name', 'Selling_Price']

X_train, y_train = train_data.drop(columns=non_feature_cols), train_data['Selling_Price']
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data['Selling_Price']

In [None]:
# Report the shape of every feature/target split.
named_splits = (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
)
for label, part in named_splits:
    print(f"{label} Shape: {part.shape}")

In [None]:
# Linear Regression Model

# Fit an ordinary least-squares regressor on the training split
# (fit() returns the estimator itself), then predict on the held-out features.
model = LinearRegression().fit(X_train, y_train)
predicted = model.predict(X_test)

In [None]:
# Predicted (x) vs. actual (y) selling price on the test set.
# x and y are passed as keywords: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x = predicted, y = y_test)

**We can evaluate the predictions with all 3 error metrics [MAE, MSE, RMSE]**

In [None]:


# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, but R2 is not, so passing the predictions first gives a wrong score.
# Selling_Price was standardized earlier in the notebook, so the error values
# below are in standardized units rather than lakhs.
mae = metrics.mean_absolute_error(y_test, predicted)
mse = metrics.mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = metrics.r2_score(y_test, predicted)

print("Mean absolute error : "+str(mae))
print("Mean sqaure error : "+str(mse))
print("Root Mean square error : "+str(rmse))

print("R2 Score : "+str(r2))