In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series,DataFrame
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline


import sklearn


%precision 3

'%.3f'

## 多元線性迴歸

In [3]:
import requests,zipfile
import io

In [4]:
# Download the automobile price data set (imports-85) from the UCI repository.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content

# Read the downloaded bytes into a DataFrame; the file has no header row.
auto = pd.read_csv(io.StringIO(res.decode('utf-8')), header = None)

# Assign the column names (26 attributes, with 'price' last).
auto.columns =['symboling','normalized-losses','make','fuel-type' ,'aspiration','num-of-doors',
                            'body-style','drive-wheels','engine-location','wheel-base','length','width','height',
                            'curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore',
                            'stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']

print('汽車資料的形式:{}'.format(auto.shape))

汽車資料的形式:(205, 26)


In [5]:
# Preview the first five rows of the raw table.
auto.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [6]:
# Goal: predict price from the values other than price.
# Start by summarising the raw price column before any cleaning.
auto['price'].describe()

count     205
unique    187
top         ?
freq        4
Name: price, dtype: object

In [7]:
# Peek at the three columns that are used as explanatory variables below.
auto[['horsepower','width','height']].head()

Unnamed: 0,horsepower,width,height
0,111,64.1,48.8
1,111,64.1,48.8
2,154,65.5,52.4
3,102,66.2,54.3
4,115,66.4,54.3


In [8]:
# Keep only the target (price) and the three explanatory variables.
auto_temp = auto[['price','horsepower','width','height']]

In [9]:
# Count, per column, how many entries are the anomalous placeholder '?'.
auto_temp.isin(['?']).sum()

price         4
horsepower    2
width         0
height        0
dtype: int64

In [10]:
# Step 1: turn every '?' placeholder into NaN.
auto_temp = auto_temp.replace('?', np.nan)
# Step 2: discard the rows that now contain a missing value.
auto_temp = auto_temp.dropna()
print('汽車資料的形式:{}'.format(auto_temp.shape))

汽車資料的形式:(199, 4)


In [11]:
# Check the data type of each column.
auto_temp.dtypes

price          object
horsepower     object
width         float64
height        float64
dtype: object

In [12]:
# Convert the two string-typed columns to numeric dtypes in a single assign().
auto_temp = auto_temp.assign(
    price=pd.to_numeric(auto_temp['price']),
    horsepower=pd.to_numeric(auto_temp['horsepower']),
)

In [13]:
# Confirm the column dtypes after the numeric conversion.
auto_temp.dtypes

price           int64
horsepower      int64
width         float64
height        float64
dtype: object

### 相關性確認 : 注意多元共線性

In [14]:
# Pairwise correlations between all columns; strongly correlated
# explanatory variables are a warning sign of multicollinearity.
auto_temp.corr()

Unnamed: 0,price,horsepower,width,height
price,1.0,0.810533,0.753871,0.13499
horsepower,0.810533,1.0,0.615315,-0.087407
width,0.753871,0.615315,1.0,0.309223
height,0.13499,-0.087407,0.309223,1.0


### 模型建構＆評估

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [16]:
# Target vector is the price column; the feature matrix is every other column.
y = auto_temp['price']
X = auto_temp.drop(columns='price')

In [17]:
# Hold out half of the rows as a test set; random_state=0 fixes the split
# so that it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [18]:
# Fit a multiple linear regression on the training half.
# fit() is the cell's last expression, so the fitted estimator is displayed.
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [19]:
# Coefficient of determination (R^2) on the training and the test split
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print('決定係數(train):{:.3f}'.format(train_r2))
print('決定係數(test):{:.3f}'.format(test_r2))

決定係數(train):0.733
決定係數(test):0.737


In [20]:
# Regression coefficients (labelled with their feature names) and the intercept
coef_by_feature = pd.Series(model.coef_, index=X.columns)
print('\n迴歸係數\n{}'.format(coef_by_feature))
print('截距: {:.3f}'.format(model.intercept_))


迴歸係數
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
截距: -128409.046


## 邏輯迴歸

In [21]:
# Download the adult (census income) data set from the UCI repository.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content

# The file has no header row, so the column names are assigned explicitly.
adult = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)
adult.columns =['age','workclass','fnlwgt','education','education-num','marital-status',
                             'occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week',
                             'native-country','flg-50K']
print('資料的型式:{}'.format(adult.shape))
# NOTE(review): isnull() only counts real NaN values. If this file marks unknown
# entries with a placeholder string (the autos data uses '?'), they are not
# included in this count -- confirm against the data set description.
print('遺失的數量:{}'.format(adult.isnull().sum().sum()))

adult.head()

資料的型式:(32561, 15)
遺失的數量:0


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,flg-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


 ### 預測目標收入是否超過五萬元 (50K)

In [22]:
# Number of rows per raw income label.
adult.groupby('flg-50K').size()

flg-50K
 <=50K    24720
 >50K      7841
dtype: int64

In [23]:
# Encode the target as a binary flag: 1 when income is '>50K', otherwise 0.
# The raw labels carry a leading space (' >50K'), so surrounding whitespace is
# stripped before comparing. The flag then stays correct whether or not the
# file was loaded with whitespace trimming, instead of silently becoming all 0.
# The comparison is vectorised rather than a per-element Python lambda.
adult['fin_flg'] = adult['flg-50K'].str.strip().eq('>50K').astype('int64')
adult.groupby('fin_flg').size()

fin_flg
0    24720
1     7841
dtype: int64

### 模型建構與評估

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Numeric explanatory variables and the binary income flag as the target
feature_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss']
X = adult[feature_cols]
y = adult['fin_flg']

# 50/50 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# Fit on the raw (unscaled) features
model = LogisticRegression()
model.fit(X_train, y_train)

# Accuracy on each split
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print('正確率(train):{:.3f}'.format(train_acc))
print('正確率(test):{:.3f}'.format(test_acc))

正確率(train):0.797
正確率(test):0.798


In [25]:
# Fitted coefficients of the logistic regression, one per explanatory variable.
model.coef_

array([[-1.185e-02, -4.379e-06, -2.774e-03,  3.274e-04,  7.532e-04]])

In [26]:
# Exponentiating the coefficients gives the odds ratio for a one-unit
# increase in each explanatory variable.
np.exp(model.coef_)

array([[0.988, 1.   , 0.997, 1.   , 1.001]])

### 標準化變數

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Same explanatory variables and target as the unscaled model
feature_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss']
X = adult[feature_cols]
y = adult['fin_flg']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# Standardisation: the scaler learns its statistics from the training split
# only, and the same transformation is then applied to the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Refit the classifier on the standardised features
model = LogisticRegression()
model.fit(X_train_std, y_train)

train_acc = model.score(X_train_std, y_train)
test_acc = model.score(X_test_std, y_test)
print('正確率(train):{:.3f}'.format(train_acc))
print('正確率(test):{:.3f}'.format(test_acc))

正確率(train):0.811
正確率(test):0.810


## Lasso / Ridge (具有正規化的迴歸)

In [28]:
# Reminder of the cleaned autos table that the regularised models reuse.
auto_temp.head()

Unnamed: 0,price,horsepower,width,height
0,13495,111,64.1,48.8
1,16500,111,64.1,48.8
2,16500,154,65.5,52.4
3,13950,102,66.2,54.3
4,17450,115,66.4,54.3


In [29]:
# Ridge regression compared with plain linear regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

y = auto_temp['price']
X = auto_temp.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

linear = LinearRegression()
ridge = Ridge(random_state=0)

# Fit each estimator on the same split and report R^2 for both halves
for model in (linear, ridge):
    model.fit(X_train, y_train)
    estimator_name = type(model).__name__
    train_r2 = model.score(X_train, y_train)
    test_r2 = model.score(X_test, y_test)
    print('{}(train):{:.6f}'.format(estimator_name, train_r2))
    print('{}(test):{:.6f}'.format(estimator_name, test_r2))

LinearRegression(train):0.733358
LinearRegression(test):0.737069
Ridge(train):0.733355
Ridge(test):0.737768


In [30]:
# Lasso at two regularisation strengths compared with plain linear regression
from sklearn.linear_model import LinearRegression, Lasso

y = auto_temp['price']
X = auto_temp.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

models = {
    'linear': LinearRegression(),
    'lasso1': Lasso(alpha=1.0, random_state=0),
    'lasso2': Lasso(alpha=200.0, random_state=0),
}

# Data to score against, keyed by the label used in the result table
splits = {
    'train': (X_train, y_train),
    'test': (X_test, y_test),
}

# Fit every model on the training half, then record R^2 per (model, split)
scores = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    for split_name, (features, target) in splits.items():
        scores[(model_name, split_name)] = model.score(features, target)

# Pivot into a table: one row per model, one column per split
pd.Series(scores).unstack()

Unnamed: 0,test,train
lasso1,0.737107,0.733358
lasso2,0.743235,0.733082
linear,0.737069,0.733358
