## Multiple Linear Regression on Used Bikes to predict price

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("Used_Bikes.csv")
df.head()

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha


In [2]:
#Steps in Machine Learning 
#Step 1 - Data Collection or ingestion
#Step 2 - Data Pre-processing 
        #Rules in ML while data preprocessing->
            # 1. Missing values not present 
            # 2. All columns must be in numerical format
            # 3. Particular format      
            # 4. These should be no duplicate values
#Step 3 - Particular Type of Data

In [5]:
#Checking of missing values
df.isnull().sum()

bike_name     0
price         0
city          0
kms_driven    0
owner         0
age           0
power         0
brand         0
dtype: int64

In [6]:
#Handling duplicate Values
df.duplicated().sum()

25324

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.shape

(7324, 8)

In [9]:
#Handling different datatypes
df.dtypes

bike_name      object
price         float64
city           object
kms_driven    float64
owner          object
age           float64
power         float64
brand          object
dtype: object

In [10]:
cat_col = df.select_dtypes(include="O")
cat_col.head()

Unnamed: 0,bike_name,city,owner,brand
0,TVS Star City Plus Dual Tone 110cc,Ahmedabad,First Owner,TVS
1,Royal Enfield Classic 350cc,Delhi,First Owner,Royal Enfield
2,Triumph Daytona 675R,Delhi,First Owner,Triumph
3,TVS Apache RTR 180cc,Bangalore,First Owner,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,Bangalore,First Owner,Yamaha


In [11]:
num_col = df.select_dtypes(exclude="O")
num_col.head()

Unnamed: 0,price,kms_driven,age,power
0,35000.0,17654.0,3.0,110.0
1,119900.0,11000.0,4.0,350.0
2,600000.0,110.0,8.0,675.0
3,65000.0,16329.0,4.0,180.0
4,80000.0,10000.0,3.0,150.0


In [12]:
#Feature Reduction or Selections --> Select only those columns that provide valuable information and are used in training ML Algorithm

In [13]:
cat_col.nunique()

bike_name    471
city         443
owner          4
brand         23
dtype: int64

In [14]:
cat_col['bike_name'].nunique()

471

In [15]:
cat_col = cat_col.drop(['bike_name','city'], axis=1)

In [16]:
cat_col

Unnamed: 0,owner,brand
0,First Owner,TVS
1,First Owner,Royal Enfield
2,First Owner,Triumph
3,First Owner,TVS
4,First Owner,Yamaha
...,...,...
9362,First Owner,Hero
9369,First Owner,Bajaj
9370,First Owner,Harley-Davidson
9371,First Owner,Bajaj


In [17]:
cat_col['owner'].value_counts()

First Owner             6642
Second Owner             588
Third Owner               84
Fourth Owner Or More      10
Name: owner, dtype: int64

In [18]:
#Label Encoding - Encoding labels with numerical values by assigning them
dt = {'First Owner':1,'Second Owner':2,'Third Owner':3,'Fourth Owner Or More':4}
dt

{'First Owner': 1,
 'Second Owner': 2,
 'Third Owner': 3,
 'Fourth Owner Or More': 4}

In [19]:
cat_col['owner'] = cat_col['owner'].map(dt)

In [20]:
cat_col

Unnamed: 0,owner,brand
0,1,TVS
1,1,Royal Enfield
2,1,Triumph
3,1,TVS
4,1,Yamaha
...,...,...
9362,1,Hero
9369,1,Bajaj
9370,1,Harley-Davidson
9371,1,Bajaj


In [21]:
cat_col['brand'].nunique()

23

In [22]:
unique_brands = list(cat_col['brand'].unique())
unique_brands

['TVS',
 'Royal Enfield',
 'Triumph',
 'Yamaha',
 'Honda',
 'Hero',
 'Bajaj',
 'Suzuki',
 'Benelli',
 'KTM',
 'Mahindra',
 'Kawasaki',
 'Ducati',
 'Hyosung',
 'Harley-Davidson',
 'Jawa',
 'BMW',
 'Indian',
 'Rajdoot',
 'LML',
 'Yezdi',
 'MV',
 'Ideal']

In [23]:
num = 1
dt2 = {}
for i in cat_col['brand'].unique():
    dt2[i] = num
    num+=1

In [24]:
dt2

{'TVS': 1,
 'Royal Enfield': 2,
 'Triumph': 3,
 'Yamaha': 4,
 'Honda': 5,
 'Hero': 6,
 'Bajaj': 7,
 'Suzuki': 8,
 'Benelli': 9,
 'KTM': 10,
 'Mahindra': 11,
 'Kawasaki': 12,
 'Ducati': 13,
 'Hyosung': 14,
 'Harley-Davidson': 15,
 'Jawa': 16,
 'BMW': 17,
 'Indian': 18,
 'Rajdoot': 19,
 'LML': 20,
 'Yezdi': 21,
 'MV': 22,
 'Ideal': 23}

In [63]:
#By List Comprehensions - Assignment study list comprehesons 
{brand:i for i, brand in enumerate(unique_brands)}

{'TVS': 0,
 'Royal Enfield': 1,
 'Triumph': 2,
 'Yamaha': 3,
 'Honda': 4,
 'Hero': 5,
 'Bajaj': 6,
 'Suzuki': 7,
 'Benelli': 8,
 'KTM': 9,
 'Mahindra': 10,
 'Kawasaki': 11,
 'Ducati': 12,
 'Hyosung': 13,
 'Harley-Davidson': 14,
 'Jawa': 15,
 'BMW': 16,
 'Indian': 17,
 'Rajdoot': 18,
 'LML': 19,
 'Yezdi': 20,
 'MV': 21,
 'Ideal': 22}

In [26]:
cat_col['brand'] = cat_col['brand'].map(dt2)

In [27]:
cat_col

Unnamed: 0,owner,brand
0,1,1
1,1,2
2,1,3
3,1,1
4,1,4
...,...,...
9362,1,6
9369,1,7
9370,1,15
9371,1,7


In [28]:
pd.get_dummies(df).astype(int).shape

(7324, 945)

In [29]:
df2 = pd.concat([cat_col,num_col],axis=1)
df2.head(2)

Unnamed: 0,owner,brand,price,kms_driven,age,power
0,1,1,35000.0,17654.0,3.0,110.0
1,1,2,119900.0,11000.0,4.0,350.0


In [30]:
df2.isnull().sum()

owner         0
brand         0
price         0
kms_driven    0
age           0
power         0
dtype: int64

In [31]:
#Splitting Data into X and Y (Dependent & Independent)
x = df2.drop('price',axis='columns')
y = df2[['price']]

In [32]:
#Training & Testing Data Splitting
from sklearn.model_selection import train_test_split

In [33]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)  #80% - Training & #20% - Testing

In [34]:
### Model Training -- Linear Regression 
from sklearn.linear_model import LinearRegression

In [35]:
lr = LinearRegression()

In [36]:
lr.fit(x_train,y_train)

In [37]:
# Model Evaluation 
lr.score(x_train,y_train)   #Training Data Score

0.7241231619957185

In [38]:
lr.score(x_test,y_test)     #Testing Data Score

0.7047059625622413

In [39]:
#Model score should be above 90%
#Training and Testing Score Difference should be not more than 5%
#Model Overfit or Model Underfit

In [40]:
#Model Overfit 
# If the training score is high 
# If the testing score is low
# This above condition is overfit condition

In [41]:
#Model Underfit 
# If the training score is Low
# If the testing score is low/high 
# This is underfit condition

In [42]:
#Metrics Evaluation Method 
#MSE
#MAE
#RMSE - Root of mean squared error 
#SCORE
#ADJUSTED AND R SQUARED

In [43]:
prediction = lr.predict(x_test)

In [44]:
y_test['predicted_price'] = prediction 

In [45]:
y_test

Unnamed: 0,price,predicted_price
8841,40000.0,73814.334693
645,41500.0,38042.299966
4793,75000.0,84143.565114
1105,70000.0,50845.954232
340,47000.0,56224.408565
...,...,...
7261,24000.0,14761.870736
2550,130000.0,221274.610702
3750,90000.0,25413.690613
373,42000.0,58930.103500


In [None]:
# 6th June

In [47]:
y_test['difference'] = y_test['price'] - y_test['predicted_price']

In [48]:
y_test 

Unnamed: 0,price,predicted_price,difference
8841,40000.0,73814.334693,-33814.334693
645,41500.0,38042.299966,3457.700034
4793,75000.0,84143.565114,-9143.565114
1105,70000.0,50845.954232,19154.045768
340,47000.0,56224.408565,-9224.408565
...,...,...,...
7261,24000.0,14761.870736,9238.129264
2550,130000.0,221274.610702,-91274.610702
3750,90000.0,25413.690613,64586.309387
373,42000.0,58930.103500,-16930.103500


In [58]:
#Mean Squared Error is used to avoid negative values
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [59]:
mse = mean_squared_error(y_test['price'],y_test['predicted_price'])

In [60]:
mae = mean_absolute_error(y_test['price'],y_test['predicted_price'])

In [61]:
#MSE and #MAE should be low 

In [69]:
#RMSE 
import numpy as np
np.sqrt(mse)

65207.05469432481

In [64]:
#Assignment -> Make a function of Mean Sqaured Error, mean absolute error, RMSE

In [68]:
lr.score(x_train,y_train)

0.7241231619957185

In [70]:
#Interview Question - Difference b/w r_squared & adjusted_r_sqaured

In [71]:
from sklearn.metrics import r2_score

In [74]:
r2_score(y_test['price'],y_test['predicted_price'])

0.7047059625622413

In [77]:
y_test.shape

(1465, 3)

In [82]:
#Adjusted R Squared
loss = (1-0.7047059625622413)*(1465-1)/(1465-5-1)
print(1-loss)
print(loss)

0.7036939884791784
0.29630601152082164


In [83]:
lr

In [93]:
#To save and share a trained model - joblib
import joblib, pickle #Pickle can also be used
ls = ['latapranav@gmail.com','hello@123']

In [94]:
joblib.dump(ls,'email_id.lb')

['email_id.lb']

In [95]:
ls2 = joblib.load('email_id.lb')

In [96]:
ls2

['latapranav@gmail.com', 'hello@123']

In [97]:
joblib.dump(lr,'linear_regression_model.lb')

['linear_regression_model.lb']

In [98]:
lr2 = joblib.load('linear_regression_model.lb')

In [99]:
lr2