In [None]:
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
#importing the data

filename = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv"

In [None]:
headers = ["symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors","body-style",
         "drive-wheels","engine-location","wheel-base", "length","width","height","curb-weight","engine-type",
         "num-of-cylinders", "engine-size","fuel-system","bore","stroke","compression-ratio","horsepower",
         "peak-rpm","city-mpg","highway-mpg","price"]

In [None]:
#dataframe
df = pd.read_csv(filename, names=headers)

In [None]:
#first few lines of df
df.head()

In [None]:
#missing or garbage values
import numpy as np

df.replace("?", np.nan, inplace=True)


In [None]:
df.head()
#? is replaced with Not a Number

In [None]:
df.info()

#price, peak-rpm, horsepower, bore, stroke, num-of-doors, normalized-losses

In [None]:
#replace missing value with mean/average


avg_normloss = df['normalized-losses'].astype('float').mean(axis='index')

In [None]:
avg_normloss

In [None]:
df['normalized-losses'].replace(np.nan, avg_normloss, inplace=True)

In [None]:
df.head(10)

In [None]:
#bore: replace missing values with the column mean

avg_bore = df['bore'].astype('float').mean(axis='index')

# Assign back instead of replace(..., inplace=True) on df['bore']: the inplace
# form modifies a temporary Series and does not update df under Copy-on-Write.
df['bore'] = df['bore'].replace(np.nan, avg_bore)

In [None]:
#stroke: replace missing values with the column mean

avg_stroke = df['stroke'].astype('float').mean(axis='index')

# Assign back instead of replace(..., inplace=True) on df['stroke']: the inplace
# form modifies a temporary Series and does not update df under Copy-on-Write.
df['stroke'] = df['stroke'].replace(np.nan, avg_stroke)

In [None]:
#horsepower: replace missing values with the column mean

avg_horsepower = df['horsepower'].astype('float').mean(axis='index')

# Assign back instead of replace(..., inplace=True) on df['horsepower']: the
# inplace form modifies a temporary Series and does not update df under
# Copy-on-Write.
df['horsepower'] = df['horsepower'].replace(np.nan, avg_horsepower)

In [None]:
#peak-rpm: replace missing values with the column mean

avg_peakrpm = df['peak-rpm'].astype('float').mean(axis='index')

# Assign back instead of replace(..., inplace=True) on df['peak-rpm']: the
# inplace form modifies a temporary Series and does not update df under
# Copy-on-Write.
df['peak-rpm'] = df['peak-rpm'].replace(np.nan, avg_peakrpm)

In [None]:
#num-of-doors
#Average might give a float and a car cannot have a decimal number of doors
#replacing missing value with mode - value that maximum times in the columns

df['num-of-doors'].value_counts()




In [None]:
df['num-of-doors'].replace(np.nan, "four", inplace=True)

In [None]:
df.info()

In [None]:
#price
#delete rows for missing values in target variable
#axis = 'index' (rows) and axis='columns' (columns)

df.dropna(subset=['price'], axis='index', inplace=True)

In [None]:
df

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df

In [None]:
df.info()

In [None]:
#summary
#drop
#replace with mean/average
#replace with mode/highest frequency

In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
df[['bore',"stroke","price","peak-rpm"]] = df[['bore',"stroke","price","peak-rpm"]].astype('float')

df[['normalized-losses']] = df[['normalized-losses']].astype('int')

In [None]:
df.dtypes

In [None]:
df[['horsepower']] = df[['horsepower']].astype('int')

In [None]:
#Convert mpg to Lt per 100km
#L/100km = 235/mpg

df['city-L/100km'] = 235/df['city-mpg']

df['highway-L/100km'] = 235/df['highway-mpg']

In [None]:
df.head()

In [None]:
#normalization
#average of 0 and variance of 1
#Column 1 -  100    200    300
#Column 2 -  1       2      3
#100/300   200/300   300/300
#1/3   2/3    3/3

#Column 1 -  0.33    0.66    1
#Column 2 -  0.33    0.66    1



df['length'] = df['length']/df['length'].max()
df['width'] = df['width']/df['width'].max()


In [None]:
df

In [None]:
#indicator variable (converting categorical variables to dummy/indicator variables)

results = pd.get_dummies(df['fuel-type'])

In [None]:
results

In [None]:
results.rename(columns={'gas':'fuel-type-gas', 'diesel':'fuel-type-diesel'}, inplace=True)
results

In [None]:
df = pd.concat([df, results], axis='columns')

In [None]:
df

In [None]:
#summary
#missing values
#changed datatypes
#normalized
#converted to same units
#convert categorical to numeric

In [None]:
#Data Visualization
!pip install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
#understand relationship between x and y

sns.regplot(x='engine-size',y='price',data=df)

In [None]:
sns.regplot(x='highway-mpg',y='price',data=df)

In [None]:
sns.regplot(x='peak-rpm',y='price',data=df)

In [None]:
df.corr()


#correlation - gives strength of relationship between x and y
#negative - 0 to -1 (inverse proportional)
#0 (no effect)
#positive - 0 to 1 (directly proportional)

In [None]:
plt.plot(df.corr())

In [None]:
#plot correlation matrix as a heatmap
# Restrict to numeric/boolean columns first: corr() on a frame that still
# contains text columns raises in recent pandas versions.
corr = df.select_dtypes(include=['number', 'bool']).corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(corr, cmap='coolwarm')

#heatmap colorbar
fig.colorbar(cax)

# One tick per column on both axes, labelled with the column name.
ticks = np.arange(0, len(corr.columns), 1)

ax.set_xticks(ticks)
ax.set_yticks(ticks)

# Rotate the x labels so long column names do not overlap.
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.columns)


plt.show()

In [None]:
#boxplot - min, first quartile, median, second quartile, max


sns.boxplot(x='fuel-type', y='price', data=df)

In [None]:
sns.boxplot(x='body-style', y='price', data=df)

In [None]:
sns.boxplot(x='engine-location', y='price', data=df)

In [None]:
#data visualization
#regplot
#corr
#boxplot
#identify outliers

#Model development


1. Regression - Where target can have continuous values
2. Classification - Where target can have discrete values





In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator created with default settings; it is fitted
# (and later refitted) by the cells below.
lm = LinearRegression()

lm

In [None]:
# Simple regression: one feature (highway-mpg) predicting price.
# Double brackets select a one-column DataFrame (2-D) rather than a Series.
X = df[['highway-mpg']]
Y = df[['price']]

In [None]:
# Fit the model on the full data set (no train/test split at this stage).
lm.fit(X, Y)

In [None]:
# Intercept of the fitted line.
lm.intercept_

In [None]:
# Slope of the fitted line (coefficient of highway-mpg).
lm.coef_

In [None]:
# Visual check of the same relationship.
sns.regplot(x='highway-mpg', y='price',data=df)

# Fitted equation as noted by the author (numbers come from an earlier run;
# compare with lm.coef_ / lm.intercept_ above):
#Price = -821.73 * highway-mpg + 38423.3

In [None]:
# Multiple regression: four features predicting price.
# Y is whatever an earlier cell bound it to, and lm is refitted in place,
# replacing the previously fitted single-feature model.
X = df[['horsepower','curb-weight','engine-size','highway-mpg']]

lm.fit(X, Y)

In [None]:
# One coefficient per feature, in the column order of X.
lm.coef_

In [None]:
lm.intercept_

# Fitted equation as noted by the author (numbers come from an earlier run;
# compare with lm.coef_ / lm.intercept_ above):
#Price = 53.53022809*horsepower +  4.70805253*curb-weight + 81.51280006*engine-size + 36.1593925*highway-mpg -15811.86

#Determine model performance

In [None]:
#Mean squared error

from sklearn.metrics import mean_squared_error, mean_absolute_error
X = df[['highway-mpg']]
Y = df[['price']]
lm.fit(X, Y)
yhat = lm.predict(X)

result1 = mean_squared_error(df['price'],yhat)
print(result1)

result2 = mean_absolute_error(df['price'],yhat)
print(result2)

In [None]:
#R-squared score - coeff of determination range -inf to 1
#best possible is 1
#R2score as 49% / 0.49 - 


lm.score(X, Y)

In [None]:
# Same three metrics for the four-feature model (Y is reused from the
# previous cells).
X = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]

lm.fit(X, Y)
yhat = lm.predict(X)

result1 = mean_squared_error(y_true=df['price'], y_pred=yhat)
print(result1)

result2 = mean_absolute_error(y_true=df['price'], y_pred=yhat)
print(result2)

lm.score(X, Y)

In [None]:
#univariate regression
#multivariate regression
#coeff, intercept
#predictions
#r2
#mean squared error and mean absolute error

#Model Refinement and evaluation

In [None]:
# Start again from an unfitted estimator for the refinement section.
lm=LinearRegression()

In [None]:
df

In [None]:
# Features: every column except the target and the two fuel-type indicator
# columns added earlier (the original 'fuel-type' column is still present and
# is encoded again by get_dummies below).
X = df.drop(['price','fuel-type-diesel','fuel-type-gas'], axis='columns')
# Target as a 1-D Series.
Y = df['price']

In [None]:
# Encode the remaining categorical columns as indicator columns.
X_dummies = pd.get_dummies(X)

In [None]:
X_dummies

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_dummies)

In [None]:
#ssplit my data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size = 0.2)

In [None]:
from sklearn.model_selection import GridSearchCV

fit_intercept = [True, False]



param_grid = dict(fit_intercept=fit_intercept)

resultscv = GridSearchCV(estimator=lm, param_grid=param_grid, scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1)


cvresults  = resultscv.fit(X_train, y_train)

In [None]:
cvresults.best_score_

In [None]:
cvresults.best_params_

In [None]:
# Evaluate the selected model on the held-out rows.
yhattest = resultscv.predict(X_test)

result1 = mean_squared_error(y_true=y_test, y_pred=yhattest)
print(result1)

result2 = mean_absolute_error(y_true=y_test, y_pred=yhattest)
print(result2)

# score() on the search object uses the scoring it was configured with.
test_score = resultscv.score(X_test, y_test)
print(test_score)

#Model building

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Toy data set: ten (Age, Weight) pairs in which Weight equals Age.
decades = list(range(10, 101, 10))
sample = {'Age': decades, 'Weight': list(decades)}

df = pd.DataFrame(sample, columns=['Age', 'Weight'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize both columns of the toy frame (Age and Weight).
# NOTE(review): X_scaled is only displayed in the next cell; the train/test
# split further down is built from df, not from X_scaled - confirm whether
# the scaled values were meant to be used.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

In [None]:
X_scaled

In [None]:
#split my data

from sklearn.model_selection import train_test_split

# df[['Age']] (double brackets) keeps the features as a 2-D one-column frame;
# scikit-learn estimators reject a 1-D Series as X ("Expected 2D array").
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[['Age']], df['Weight'], test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
import numpy as np


lm=LinearRegression()



fit_intercept = [True, False]


param_grid = dict(fit_intercept=fit_intercept)

resultscv = GridSearchCV(estimator=lm, param_grid=param_grid, scoring="neg_root_mean_squared_error", verbose=1, n_jobs=-1)


cvresults  = resultscv.fit(X_train, y_train)

In [None]:
resultscv.get_params(deep=True)

In [None]:
cvresults.best_score_

In [None]:
cvresults.best_params_

In [None]:
from sklearn.metrics import r2_score

# Predictions of the selected model for the held-out rows.
yhattest = resultscv.predict(X_test)

result1 = mean_squared_error(y_test,yhattest)
print(result1)

result2 = mean_absolute_error(y_test,yhattest)
print(result2)

# r2_score takes (y_true, y_pred): compare the held-out targets with the
# model's predictions, not the feature values with the targets.
r2_score(y_test, yhattest)