# Multiple Linear Regression - Car Dataset

## Importing the libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Importing the dataset

In [2]:
# Load the car listings from the CSV file that sits next to the notebook.
dataset = pd.read_csv(filepath_or_buffer='CAR.csv')

In [3]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


### Split the dataset into independent and dependent variables

In [4]:
# List the column labels of the loaded frame; the next cell selects the
# predictors and the target by these names.
dataset.columns

Index(['year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [5]:
# Predictors: every column of the dataset except the target.
X = dataset.loc[:, ['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]
# Target, kept as a one-column DataFrame (list selector) rather than a Series.
Y = dataset.loc[:, ['selling_price']]

### Work with the categorical data

In [6]:
# One-hot encode the categorical predictors (fuel, seller_type, transmission, owner);
# the numeric columns (year, km_driven) pass through unchanged.
# drop_first=True drops the first level of each category, which then acts as the
# baseline, so the dummy columns are not perfectly collinear with the intercept.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce booleans), so X.values stays purely numeric.
X=pd.get_dummies(dataset[['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']],drop_first=True,dtype=int)

In [7]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


In [8]:
# Count the rows per seller type.
dataset.loc[:, 'seller_type'].value_counts()

Individual          3242
Dealer               993
Trustmark Dealer     102
Name: seller_type, dtype: int64

In [9]:
# Count the rows per fuel type.
dataset.loc[:, 'fuel'].value_counts()

Diesel      2151
Petrol      2122
CNG           40
LPG           23
Electric       1
Name: fuel, dtype: int64

In [10]:
# Count the rows per ownership category.
dataset.loc[:, 'owner'].value_counts()

First Owner             2831
Second Owner            1104
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: owner, dtype: int64

In [11]:
# Count the rows per transmission type.
dataset.loc[:, 'transmission'].value_counts()

Manual       3889
Automatic     448
Name: transmission, dtype: int64

## Splitting the dataset into the Training set and Test set
- Random State 20

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=20,
)

## Training the Multiple Linear Regression model on the Training set

In [13]:
from sklearn.linear_model import LinearRegression

# Linear regression with scikit-learn's default settings.
regressor = LinearRegression()
# Train on the raw NumPy array (.values), matching the plain array/list inputs
# that are passed to regressor.predict later in the notebook.
regressor.fit(X=X_train.values, y=Y_train)

LinearRegression()

## Intercept and Coefficient

In [14]:
# Report the fitted parameters: one coefficient per feature column (in the
# column order of X_train), followed by the intercept.
for label, value in (
    ('Coefficients: ', regressor.coef_),
    ("Intercept: ", regressor.intercept_),
):
    print(label, value)

Coefficients:  [[ 3.62245575e+04 -8.31584195e-01  2.88702246e+05  5.82076609e-11
   4.65464254e+04  2.64057958e+03 -6.04980455e+04  1.71882689e+05
  -8.64323880e+05 -3.50851884e+03 -4.04890692e+04  1.83178786e+05
  -2.83903020e+04]]
Intercept:  [-71683645.58006924]


In [15]:
# List the feature names in the column order used for training; the entries of
# regressor.coef_ follow this same order.
X_train.columns

Index(['year', 'km_driven', 'fuel_Diesel', 'fuel_Electric', 'fuel_LPG',
       'fuel_Petrol', 'seller_type_Individual', 'seller_type_Trustmark Dealer',
       'transmission_Manual', 'owner_Fourth & Above Owner',
       'owner_Second Owner', 'owner_Test Drive Car', 'owner_Third Owner'],
      dtype='object')

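#### Fitted model

$$
\begin{aligned}
\text{selling price} ={} & -71{,}683{,}645.58
 + 3.62\times10^{4}\,(\text{year})
 - 8.32\times10^{-1}\,(\text{km driven}) \\
 & + 2.89\times10^{5}\,(\text{fuel diesel})
 + 5.82\times10^{-11}\,(\text{fuel electric})
 + 4.65\times10^{4}\,(\text{fuel LPG})
 + 2.64\times10^{3}\,(\text{fuel petrol}) \\
 & - 6.05\times10^{4}\,(\text{individual seller})
 + 1.72\times10^{5}\,(\text{trustmark dealer})
 - 8.64\times10^{5}\,(\text{manual transmission}) \\
 & - 3.51\times10^{3}\,(\text{fourth or above owner})
 - 4.05\times10^{4}\,(\text{second owner})
 + 1.83\times10^{5}\,(\text{test drive car})
 - 2.84\times10^{4}\,(\text{third owner})
\end{aligned}
$$

Each categorical term is a 0/1 indicator. The `fuel_Electric` coefficient is numerically zero; the dataset contains only one electric car.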

## Predicting the Test set results

In [16]:
# Predict selling prices for the held-out rows, passing the raw NumPy array
# just as was done when fitting.
y_pred = regressor.predict(X=X_test.values)

### Calculate RMSE, R-Square

In [18]:
import math
from sklearn.metrics import mean_squared_error, r2_score

# Goodness of fit on the test set: R^2 is the share of variance explained,
# RMSE is the root of the mean squared error, in selling-price units.
r_squared = r2_score(Y_test, y_pred)
rmse = math.sqrt(mean_squared_error(Y_test, y_pred))

print(f'R squared: {r_squared:.2f}')
print(f'RMSE: {rmse:.2f}')

R squared: 0.52
RMSE: 377182.99


## Validation case scenario:
#### 1. Predict the selling price of a car with:
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

**Actual selling price: 465000**


#### Fuel: 
- Diesel - 1000
- Electric - 0100
- LPG - 0010 
- Petrol - 0001 
- CNG - 0000

#### Seller: 
- Dealer - 00
- Individual - 10 
- Trustmark - 01

#### Transmission: 
- Manual - 1
- Automatic - 0 

#### Owner: 
- First - 0000
- Second - 0100
- Third - 0001
- 4+ - 1000
- Test Drive - 0010

In [19]:
# Show the feature column order; a manually built input row for
# regressor.predict has to list its values in exactly this order.
X_test.columns

Index(['year', 'km_driven', 'fuel_Diesel', 'fuel_Electric', 'fuel_LPG',
       'fuel_Petrol', 'seller_type_Individual', 'seller_type_Trustmark Dealer',
       'transmission_Manual', 'owner_Fourth & Above Owner',
       'owner_Second Owner', 'owner_Test Drive Car', 'owner_Third Owner'],
      dtype='object')

In [20]:
# Describe the validation car by column name instead of a hand-typed positional
# 0/1 vector, so a change in the dummy-column order cannot silently shift values.
validation_car = {
    'year': 2014,
    'km_driven': 70000,
    'fuel_Diesel': 1,          # fuel type Diesel
    'transmission_Manual': 1,  # manual transmission
    # Seller type "Dealer" and owner "First Owner" are the baseline levels that
    # drop_first=True removed, so all of their dummy columns stay at 0.
}

# reindex would silently discard a misspelt key, so fail loudly instead.
unknown_columns = set(validation_car) - set(X_train.columns)
assert not unknown_columns, f'unknown feature columns: {sorted(unknown_columns)}'

# Align to the training column order and fill every unspecified dummy with 0.
validation_row = pd.DataFrame([validation_car]).reindex(columns=X_train.columns, fill_value=0)

# Pass the raw array, as was done for fit and for the test-set prediction.
regressor.predict(validation_row.values)

array([[638780.60333151]])