# Model Development

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Remote location of automobileEDA.csv; adjacent literals concatenate to one URL.
filepath = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
    "IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/"
    "automobileEDA.csv"
)
# Load the CSV into the DataFrame consumed by the modelling cells below.
df = pd.read_csv(filepath)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create the linear-regression estimator with scikit-learn's default settings
# (no arguments are passed). It is fitted, and later re-fitted, in the cells below.

lm = LinearRegression()
lm  # bare last expression so the notebook displays the estimator

In [None]:
# Target and predictor for the single-feature model.
# Y is a Series (single brackets); X keeps the double brackets so it stays a
# one-column DataFrame, i.e. a 2-D input for the estimator.

Y = df["price"]
X = df[["highway-mpg"]]

In [None]:
# Train the single-feature model: highway-mpg (X) -> price (Y).
# fit() is the cell's last expression, so the fitted estimator is displayed.

lm.fit(X, Y)

In [None]:
# In-sample predictions from the model currently held by `lm`,
# followed by a peek at the first five values.

Yhat = lm.predict(X)
Yhat[:5]

In [None]:
# Intercept of whatever fit `lm` currently holds
# (the highway-mpg -> price fit when the cells are run in order).
lm.intercept_  # Intercept

In [None]:
# Slope coefficient(s) of whatever fit `lm` currently holds
# (one value for highway-mpg when the cells are run in order).
lm.coef_  # Coefficient

## Multiple-variable linear regression

In [None]:
# Predictor matrix for the multi-feature model: four columns taken from df.

Z = df[
    [
        "horsepower",
        "curb-weight",
        "engine-size",
        "highway-mpg",
    ]
]

In [None]:
# Train on the four predictors in Z against price.
# NOTE: this re-fits the same `lm` object, so the earlier single-feature fit
# is replaced from this cell onward.

lm.fit(Z, df["price"])

In [None]:
# Intercept of the fit currently held by `lm`
# (the four-feature fit on Z when the cells are run in order).
lm.intercept_

In [None]:
# Coefficients of the fit currently held by `lm`; when the cells are run in
# order there is one per column of Z, in Z's column order.
lm.coef_

## Model evaluation via visualisation

In [None]:
import seaborn as sns
%matplotlib inline 

In [None]:
# Regression plot: price against highway-mpg drawn with seaborn's regplot.
# `width` and `height` are also read by later plotting cells.

width = 12
height = 10
plt.figure(figsize=(width, height))
sns.regplot(x="highway-mpg", y="price", data=df)
# Anchor the y-axis at zero; the upper limit is left automatic.
plt.ylim(bottom=0)
# Render explicitly, as the other plotting cells do, so the cell does not end
# by echoing the tuple returned by plt.ylim().
plt.show()

In [None]:
# Residual plot of price against highway-mpg (seaborn residplot).
# The figure size is restated here, so this cell does not depend on the
# regression-plot cell for `width`/`height`.

width, height = 12, 10
plt.figure(figsize=(width, height))
sns.residplot(x=df["highway-mpg"], y=df["price"])
plt.show()

In [None]:
# Predict prices from the four predictors in Z.
# NOTE: relies on `lm` still holding the fit on Z from the multi-variable
# section; run the cells in order.

Y_hat = lm.predict(Z)

In [None]:
# A multi-feature fit cannot be shown with a single regression/residual plot,
# so overlay the density of the actual prices with that of the fitted values.
# `width` and `height` are taken from the earlier plotting cells.

fig, ax = plt.subplots(figsize=(width, height))

sns.kdeplot(df["price"], color="r", label="Actual Value", ax=ax)
sns.kdeplot(Y_hat, color="b", label="Fitted Values", ax=ax)

ax.set_title("Actual vs Fitted Values for Price")
ax.set_xlabel("Price (in dollars)")
ax.set_ylabel("Proportion of Cars")
ax.legend()

plt.show()
plt.close()

## Polynomial Regressions and Pipelines

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Second-degree polynomial feature generator; displayed as the cell output.
pr = PolynomialFeatures(degree=2)
pr

In [None]:
# Fit the generator on Z and expand Z into its degree-2 feature set.
# (Z_pr is not read again by the cells visible in this section.)
Z_pr = pr.fit_transform(Z)

In [None]:
# Pipeline can simplify the steps of processing data

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Ordered (name, estimator) steps for the pipeline:
# scaling, then polynomial expansion (include_bias=False), then linear regression.
Input = [
    ("scale", StandardScaler()),
    ("polynomial", PolynomialFeatures(include_bias=False)),
    ("model", LinearRegression()),
]

# Target passed to the pipeline below.
y = df["price"]

In [None]:
# Assemble the steps into a single estimator and display it.
pipe = Pipeline(Input)
pipe

In [None]:
# One call runs every pipeline step: scale, expand, then fit the regression.
# Z is rebound to a float copy of itself first; re-running the cell is
# harmless because a second cast leaves the values unchanged.

Z = Z.astype(float)
pipe.fit(Z, y)

In [None]:
# Predict through the whole fitted pipeline (same steps, applied to Z),
# then show the first four predictions.

ypipe = pipe.predict(Z)
ypipe[:4]

In [42]:
# Measures for in-sample evaluation include R-squared and MSE.
# `lm` is re-fitted on the single feature X here because it was last fitted on
# Z in the multi-variable section; score() then reports R^2 on the same data.

lm.fit(X, Y)
print('The R-square is: ', lm.score(X, Y))

The R-square is:  0.4965911884339176


In [43]:
# Recompute the in-sample predictions from the single-feature fit above
# and print the first four of them.
Yhat = lm.predict(X)
print('The output of the first four predicted value is: ', Yhat[:4])

The output of the first four predicted value is:  [16236.50464347 16236.50464347 17058.23802179 13771.3045085 ]


In [45]:
from sklearn.metrics import mean_squared_error

In [46]:
# Mean squared error between the actual prices and the predictions in Yhat.
mse = mean_squared_error(df["price"], Yhat)
print('The mean square error of price and predicted value is: ', mse)

The mean square error of price and predicted value is:  31635042.944639888
