In [1]:
# Linear Regression Tutorial (2): sample regression model using real-world dataset
# Shomik Jain, USC CAIS++

In [2]:
# Real-world dataset involving automobile MPG data 
# Need to clean up/preprocess the data (missing values, categorical features)
# Then, apply sklearn's built-in linear regression on cleaned data 

In [3]:
# Dataset: https://archive.ics.uci.edu/ml/datasets/auto+mpg
# Go to "Data Folder", download auto-mpg.data, save to current working directory 

In [4]:
# 1. Reading in the data using Pandas 
import pandas as pd

In [5]:
headers = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name']

In [6]:
# file white-space delimited (not CSV, so manually set delimiter) 
mpg_df = pd.read_csv('auto-mpg.txt', names=headers, delim_whitespace=True)

In [7]:
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [8]:
# 2. Data Preprocessing: deal with missing values, categorical features, etc. 

In [9]:
# name is an irrelevant feature for prediction purposes
mpg_df = mpg_df.drop('name', axis=1)

In [10]:
# Origin is a categorical value => split origin into 3 separate variables (for each region)
# Each region of origin variable will have a 0 or 1 value (depending on where origin is)
# Allows each region of origin to be assigned its own weight 
mpg_df['origin'] = mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})

In [11]:
mpg_df = pd.get_dummies(mpg_df, columns=['origin'])

In [12]:
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,0,0
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,0,0
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,0,0
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,0,0
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,0,0


In [13]:
# dataset description says "Missing Attribute Values: horsepower has 6 missing values"
# remove entries with missing horespower values 
mpg_df['horsepower'].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [14]:
import numpy as np

mpg_df = mpg_df.replace('?', np.nan)
mpg_df = mpg_df.dropna()

In [15]:
mpg_df['horsepower'].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [16]:
# remove target feature (mpg) from the dataframe
X = mpg_df.drop('mpg', axis=1)
y = mpg_df[['mpg']]

In [17]:
# 3. Training the Model: use Sklearn built-in functionality

In [18]:
# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [19]:
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
regression_model.coef_

array([[-0.24633756,  0.02387034, -0.00601724, -0.00733643,  0.21897778,
         0.78518011, -1.76249341,  0.80962692,  0.95286649]])

In [21]:
regression_model.intercept_

array([-19.80918385])

In [22]:
# Evaluate Model
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, regression_model.predict(X_test))

12.230963834602667