# <center>Linear Regression</center>

In [1]:
import pandas as pd

df = pd.read_csv("car_price.csv")
df.head(10)

Unnamed: 0,name,company,year,price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
5,Ford EcoSport Titanium 1.5L TDCi,Ford,Temp Data,Ask For Price,"59,000 kms",Diesel
6,Ford Figo,Ford,2012,175000,Petrol,Diesel
7,Hyundai Eon,Hyundai,2013,190000,"25,000 kms",Petrol
8,Ford EcoSport Ambiente 1.5L TDCi,Ford,2016,830000,"24,530 kms",Diesel
9,Maruti Suzuki Alto K10 VXi AMT,Maruti,2015,250000,"60,000 kms",Petrol


## step-1

- check years values
- delete the inappropriate values
- change year data type

In [2]:
df = df[df['year'].str.isnumeric()] #taking only the numeric value rows
df['year'].unique()


array(['2007', '2006', '2018', '2014', '2012', '2013', '2016', '2015',
       '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',
       '2003', '2004', '1995', '2002', '2001'], dtype=object)

In [3]:
df['year'] = df['year'].astype(int)
# df['year'].info()
df['year'].head(10)

0     2007
1     2006
2     2018
3     2014
4     2014
6     2012
7     2013
8     2016
9     2015
10    2010
Name: year, dtype: int32

## step-2

- check price values
- removing the comma
- change price data type

In [4]:
df = df[df['price'] != 'Ask For Price']
df['price'] = df['price'].str.replace(',', '')
# df['price'].info()
df['price'].head(10)

0      80000
1     425000
3     325000
4     575000
6     175000
7     190000
8     830000
9     250000
10    182000
11    315000
Name: price, dtype: object

## step-3

- check kms_driven values
- removing the comma 
- change kms_driven data type

In [5]:
df['kms_driven'].unique()

df['kms_driven'] = df['kms_driven'].str.split(' ').str.get(0).str.replace(',', '')
df['kms_driven'].head(10)

0      45000
1         40
3      28000
4      36000
6     Petrol
7      25000
8      24530
9      60000
10     60000
11     30000
Name: kms_driven, dtype: object

In [6]:
df = df[df['kms_driven'].str.isnumeric()]
df['kms_driven'] = df['kms_driven'].astype(int)
df.head(10)

Unnamed: 0,name,company,year,price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40,
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
7,Hyundai Eon,Hyundai,2013,190000,25000,Petrol
8,Ford EcoSport Ambiente 1.5L TDCi,Ford,2016,830000,24530,Diesel
9,Maruti Suzuki Alto K10 VXi AMT,Maruti,2015,250000,60000,Petrol
10,Skoda Fabia Classic 1.2 MPI,Skoda,2010,182000,60000,Petrol
11,Maruti Suzuki Stingray VXi,Maruti,2015,315000,30000,Petrol
12,Hyundai Elite i20 Magna 1.2,Hyundai,2014,415000,32000,Petrol


## step-4

- check fuel_type values

In [7]:
print(df['fuel_type'].unique())
df = df[~df['fuel_type'].isna()]
df

['Petrol' nan 'Diesel' 'LPG']


Unnamed: 0,name,company,year,price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
7,Hyundai Eon,Hyundai,2013,190000,25000,Petrol
8,Ford EcoSport Ambiente 1.5L TDCi,Ford,2016,830000,24530,Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz VXI ABS,Maruti,2011,270000,50000,Petrol
885,Tata Indica V2 DLE BS III,Tata,2009,110000,30000,Diesel
886,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
888,Tata Zest XM Diesel,Tata,2018,260000,27000,Diesel


## step-5

- reset the indexes
- save the values in new csv file

In [8]:
df = df.reset_index(drop=True)
df.to_csv('cleaned_car_price.csv')

## Implement with the new cleaned file

In [9]:
ndf = pd.read_csv("cleaned_car_price.csv")
#ndf.iloc[row_start:row_end , col_start, col_end]
ndf = ndf.iloc[:, 1:]
ndf.head(10)

Unnamed: 0,name,company,year,price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
2,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
3,Hyundai Eon,Hyundai,2013,190000,25000,Petrol
4,Ford EcoSport Ambiente 1.5L TDCi,Ford,2016,830000,24530,Diesel
5,Maruti Suzuki Alto K10 VXi AMT,Maruti,2015,250000,60000,Petrol
6,Skoda Fabia Classic 1.2 MPI,Skoda,2010,182000,60000,Petrol
7,Maruti Suzuki Stingray VXi,Maruti,2015,315000,30000,Petrol
8,Hyundai Elite i20 Magna 1.2,Hyundai,2014,415000,32000,Petrol
9,Mahindra Scorpio SLE BS IV,Mahindra,2015,320000,48660,Diesel


In [10]:
X = ndf[['name', 'company', 'year', 'kms_driven', 'fuel_type']]
X.head(10)

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,45000,Petrol
1,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,28000,Petrol
2,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,36000,Diesel
3,Hyundai Eon,Hyundai,2013,25000,Petrol
4,Ford EcoSport Ambiente 1.5L TDCi,Ford,2016,24530,Diesel
5,Maruti Suzuki Alto K10 VXi AMT,Maruti,2015,60000,Petrol
6,Skoda Fabia Classic 1.2 MPI,Skoda,2010,60000,Petrol
7,Maruti Suzuki Stingray VXi,Maruti,2015,30000,Petrol
8,Hyundai Elite i20 Magna 1.2,Hyundai,2014,32000,Petrol
9,Mahindra Scorpio SLE BS IV,Mahindra,2015,48660,Diesel


In [11]:
Y = ndf['price']
Y.head(10)

0     80000
1    325000
2    575000
3    190000
4    830000
5    250000
6    182000
7    315000
8    415000
9    320000
Name: price, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [13]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

ohe.fit(X[['name','company','fuel_type']]) #changing categorical data to numerical

In [14]:
from sklearn.compose import make_column_transformer

column_trans = make_column_transformer(
    (
        OneHotEncoder(categories=ohe.categories_),
        ['name', 'company', 'fuel_type']),
    remainder = 'passthrough',
)

column_trans

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe

In [16]:
pipe.fit(X_train, Y_train)

In [17]:
X_test

Unnamed: 0,name,company,year,kms_driven,fuel_type
291,Mahindra Scorpio S4,Mahindra,2015,30000,Diesel
205,Renault Duster 110PS Diesel RxZ,Renault,2012,35000,Diesel
260,Renault Duster 85 PS RxE Diesel,Renault,2013,27000,Diesel
289,Mahindra Scorpio VLX Special Edition BS III,Mahindra,2004,160000,Diesel
90,Toyota Etios GD,Toyota,2013,60000,Diesel
...,...,...,...,...,...
327,Renault Scala RxL Diesel Travelogue,Renault,2015,25000,Diesel
731,Maruti Suzuki Alto 800 Lxi,Maruti,2015,14000,Petrol
449,Honda City,Honda,2015,55000,Petrol
488,Hyundai Grand i10 Magna AT 1.2 Kappa VTVT,Hyundai,2017,20000,Petrol


In [18]:
Y_preds = pipe.predict(X_test)

Y_preds

array([ 7.61654734e+05,  3.55793346e+05,  3.81583221e+05,  2.67391742e+05,
        5.52040196e+05,  3.98376631e+05,  1.02785531e+05,  4.50618783e+05,
        4.51361406e+05,  3.13479974e+05,  5.96066821e+05,  3.76672757e+05,
        5.05156261e+05,  5.72949198e+05,  7.25740368e+02,  4.20568557e+05,
        2.61180753e+05,  2.45466354e+04,  7.07378181e+05,  4.09560416e+05,
       -5.98725453e+03,  4.55223595e+05,  5.90001982e+05,  3.04308543e+05,
        5.96728135e+05,  1.22847804e+05,  4.52899524e+05,  5.43436685e+05,
        4.47398311e+05,  3.56215041e+05,  3.72830734e+05,  3.90503759e+05,
        3.33458646e+05,  4.29032129e+05,  6.15312594e+05,  4.15788035e+05,
        4.53255069e+05,  4.16182804e+05,  2.91231491e+05,  1.63389061e+05,
        4.44643309e+05,  2.27001439e+05, -4.61320215e+04,  3.86979306e+05,
        3.32031764e+05,  3.61773499e+05,  7.90359320e+04,  4.08866386e+05,
       -1.09708787e+04,  1.36968269e+05,  4.18391896e+05,  4.65825428e+05,
        6.50872048e+05,  

In [19]:
import numpy as np

res = pipe.predict(
    pd.DataFrame(
        columns = X_test.columns, 
        data=np.array(['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol']).reshape(1,5)
    )
)

res

array([470128.34575368])

In [20]:
print('>>>>>>>> Your predicted car price ' + str(res[0]) + ' <<<<<<<')

>>>>>>>> Your predicted car price 470128.34575368464 <<<<<<<
