In [121]:
## Import required libs
import pandas as pd 
from sklearn.decomposition import PCA 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn import metrics 
from sklearn.naive_bayes import BernoulliNB 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
%matplotlib inline 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.plotly as py 
import plotly.graph_objs as go

In [122]:
## Read the CSV file in a dataframe. Remove the column headers from the data.
data = pd.read_csv("12)ToyotaCorolla.csv", header=0)

In [123]:
data.shape

(1436, 10)

In [124]:
cols = ['Price','Age','KM','FuelType','HP','MetColor','Automatic','CC','Doors','Weight'] 
data.columns = cols

In [125]:
data.columns

Index(['Price', 'Age', 'KM', 'FuelType', 'HP', 'MetColor', 'Automatic', 'CC',
       'Doors', 'Weight'],
      dtype='object')

In [126]:
## Check if data has missing values 
missing_vals=data.isnull().sum().sum() 
print('Number of missing values = {0}'.format(missing_vals))

Number of missing values = 0


In [127]:
## Verify that data is loaded 
data.head(1)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23,46986,Diesel,90,1,0,2000,3,1165


In [128]:
## Check various characteristics of data 
data.describe()

Unnamed: 0,Price,Age,KM,HP,MetColor,Automatic,CC,Doors,Weight
count,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0
mean,10730.824513,55.947075,68533.259749,101.502089,0.674791,0.05571,1566.827994,4.033426,1072.45961
std,3626.964585,18.599988,37506.448872,14.98108,0.468616,0.229441,187.182436,0.952677,52.64112
min,4350.0,1.0,1.0,69.0,0.0,0.0,1300.0,2.0,1000.0
25%,8450.0,44.0,43000.0,90.0,0.0,0.0,1400.0,3.0,1040.0
50%,9900.0,61.0,63389.5,110.0,1.0,0.0,1600.0,4.0,1070.0
75%,11950.0,70.0,87020.75,110.0,1.0,0.0,1600.0,5.0,1085.0
max,32500.0,80.0,243000.0,192.0,1.0,1.0,2000.0,5.0,1615.0


In [129]:
# It's observed that there are some rows which have 'KM' as 1 despite having number of years of age of the car. 
# These data points appear as noise.Hence we can remove those. 

data[data.KM==1]
data = data[data.KM!= 1]
data.shape


(1428, 10)

In [130]:
# First column 'Price' in the data shows the price of the car. Our task is to apply the linear regression model to this data and 
# predict the price of the car. Save the column 'Price' in another variable

original_data=data.copy() 
y = data.pop('Price')
print(data.shape)

(1428, 9)


In [131]:
# We can see that Fuel type column has three categories - Diesel, Petrol and CNG. 
# Since we cannot use the string data type we have to convert these to integer labels as below
# Diesel:1, Petrol:2, CNG: 3

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data.columns
data.FuelType=data[['FuelType']].apply(le.fit_transform)
data.head(10)

Unnamed: 0,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,23,46986,1,90,1,0,2000,3,1165
1,23,72937,1,90,1,0,2000,3,1165
2,24,41711,1,90,1,0,2000,3,1165
3,26,48000,1,90,0,0,2000,3,1165
4,30,38500,1,90,0,0,2000,3,1170
5,32,61000,1,90,0,0,2000,3,1170
6,27,94612,1,90,1,0,2000,3,1245
7,30,75889,1,90,1,0,2000,3,1245
8,27,19700,2,192,0,0,1800,3,1185
9,23,71138,1,69,0,0,1900,3,1105


In [132]:
# Now we will divide the data into 70% as training data and 30% as test data.

from sklearn.model_selection import train_test_split

x_tr, x_test, y_tr, y_test = train_test_split(data, y, test_size=0.3) 
x_tr.shape, y_tr.shape, x_test.shape, y_test.shape


((999, 9), (999,), (429, 9), (429,))

In [135]:
# Now we will apply linear regression

from sklearn.linear_model import LinearRegression

linreg=LinearRegression()
linreg.fit(x_tr,y_tr)
predicted=linreg.predict(x_test)
 
# Now calculate the accuracy score of this model on the test data
accuracy=linreg.score(x_test, y_test)*100
print("**** Accuracy with Linear Regression Model : {} % ****".format(accuracy))


**** Accuracy with Linear Regression Model : 87.8983919685639 % ****


In [None]:
######## THIS CONCLUDES LINEAR REGRESSION ########