In [16]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

# UCI Auto MPG data file: whitespace-delimited, no header row, '?' marks a missing value.
path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated since pandas 2.2; the parsed frame is the same.
df = pd.read_csv(path, sep=r'\s+', header=None,
            names = ['mpg', 'cylinders', 'displacement','horsepower',
            'weight', 'acceleration', 'model_year', 'origin', 'name'],
            na_values='?')

In [17]:
# Preview the first five rows to check that the supplied column names line up with the data.
df.head()


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [18]:
# The free-text car name is not used as a model feature, so remove that column
# before the frame is turned into a numeric design matrix.
df = df.drop(labels='name', axis='columns')
df.head(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1


In [20]:
# Map the numeric origin codes to readable labels so the generated dummy
# columns get meaningful names (origin_america, origin_asia, origin_europe).
df['origin'] = df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})

# One-hot encode origin. The dtype is pinned because get_dummies defaults to
# uint8 on older pandas and bool on pandas >= 2.0; int64 keeps the 0/1 columns
# identical across versions.
# NOTE(review): all three indicator columns are kept, and the model below fits
# an intercept, so the indicators are perfectly collinear. Predictions are not
# affected, but the individual origin coefficients should be interpreted with
# care (consider drop_first=True if they need to be read on their own).
df = pd.get_dummies(df, columns=['origin'], dtype='int64')
df.head(1)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,0,0


In [21]:
import numpy as np

# Turn any remaining '?' placeholders into NaN, then discard every row that
# still has a missing value.
df = df.replace('?', np.nan).dropna()

In [22]:
# Summarise column dtypes and non-null counts to confirm no missing values remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 10 columns):
mpg               392 non-null float64
cylinders         392 non-null int64
displacement      392 non-null float64
horsepower        392 non-null float64
weight            392 non-null float64
acceleration      392 non-null float64
model_year        392 non-null int64
origin_america    392 non-null int64
origin_asia       392 non-null int64
origin_europe     392 non-null int64
dtypes: float64(5), int64(5)
memory usage: 33.7 KB


In [23]:
# Target is mpg, kept as a one-column DataFrame; features are every other column.
y = df[['mpg']]
X = df.drop(labels='mpg', axis='columns')

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [24]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. The fit call is left as the
# cell's final expression so the estimator repr is shown as the cell output.
regression_model = LinearRegression()
regression_model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [25]:
# coef_ has one row per target; the single mpg target means row 0 holds one
# weight per feature, in the same order as the training columns.
for col_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

The coefficient for cylinders is -0.24633755869961935
The coefficient for displacement is 0.02387033830714961
The coefficient for horsepower is -0.006017238617773211
The coefficient for weight is -0.007336432943899317
The coefficient for acceleration is 0.2189777810412489
The coefficient for model_year is 0.7851801072779486
The coefficient for origin_america is -1.7624934092199251
The coefficient for origin_asia is 0.8096269190858502
The coefficient for origin_europe is 0.9528664901340758


In [26]:
# The model was fitted against a single-column target, so intercept_ holds
# exactly one value; unpack it into a scalar for display.
(intercept,) = regression_model.intercept_

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -19.809183848815824


In [27]:
# Coefficient of determination (R^2) of the fitted model on the held-out test split.
regression_model.score(X_test, y_test)

0.8285231316459775