# 1.2_準備python環境

In [1]:
import numpy as np
import pandas as pd

In [2]:
from matplotlib import pyplot as plt

# 讓圖形在jupyter notebook裡直接顯示
%matplotlib inline

In [3]:
# matplotlib的圖形比較單調，如果需要多一點的變化，需要import seaborn
import seaborn as sns

sns.set()

In [4]:
plt.rcParams["font.family"] = "STXIHEI"

In [5]:
# 如果需要使用statsmodels, 可以考慮用api接口簡化後再調用
import statsmodels.api as sm

# 1.3_sklearn的樣本資料集

In [6]:
from sklearn import datasets

boston = datasets.load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [7]:
# 顯示數據對象內容  (打boston.接著按tab鍵，就會有關於這個數據集的很多屬性可以選擇)
boston.data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [8]:
bostondf = pd.DataFrame(boston.data, columns = boston.feature_names)
bostondf.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


# 1.4_sklearn的基本操作

sklearn 可以做資料探勘的變量變換、變量處理、統計建模、模型優化、模型評估方法，為便於使用，這些操作都封裝成具有統一API的類，調用時遵循統一的操作規範

### 標準的類參數

class sklearn.大類名稱.Modelclass(類參數列表)

Modelclass中基本通用的類參數：
    
    fit_intercept = True ：模型是否包括常數項，使用此選項就不需要在DataFrame中設定cons
    n_jobs = 1 ：使用的例程數，為-1時使用全部CPU
    max_iter = 200 ：int,模型最大迭代次數
    tol = 0.0001 ：模型收斂標準 (一般不太需要改預設值)
    warm_start = False ：是否使用上一次的模型擬合結果作為本次初始值
    sample_weight = None ：案例權重
    random_state = None ：int/RandomState instance/None,隨機器的設定
    shuffle = True ：是否在拆分前對樣本做隨機排列
    
 )# 大多數類參數都會有默認值

In [9]:
# 生成實例

from sklearn import preprocessing

# 完整的類名稱為sklearn.preprocessing.StandardScaler()
std = preprocessing.StandardScaler()
std

StandardScaler()

In [10]:
# 生成實例

from sklearn import linear_model

# 完整的類名稱為sklearn.linear_model.LinearRegression()

reg = linear_model.LinearRegression()
reg

LinearRegression()

### Modelclass中基本通用的類方法：

    get_params([deep]) ：獲取模型的具體參數設定
    set_params(**params) ：重新設定模型參數
    fit(X, y[, sample_weight]) ：使用數據擬合模型/方法
    
    特徵處理class： Preprocessing、降維、Feature extraction/selection
        transform(X[, y]) ：使用擬合好的模型對指定數據進行轉換
        fit_transform(X[, y]) ： 對數據擬合相應的方法，並且進行轉換
        
    建模分析class： Classification、Regression、Clustering
        predict(X) ：使用擬合好的模型對數據計算預測值
        predict_proba(X) ：模型給出的每個案例(各個類別)的預測概率  (適用Classification)
        score(X, y[, sample_weight]) ：返回模型決定係數/模型準確度評價指標

In [11]:
std.get_params()

{'copy': True, 'with_mean': True, 'with_std': True}

In [12]:
# 使用fit方法，使std類獲取數據中相應的訊息
std.fit(boston.data)

StandardScaler()

In [25]:
std.mean_

array([3.61352356e+00, 1.13636364e+01, 1.11367787e+01, 6.91699605e-02,
       5.54695059e-01, 6.28463439e+00, 6.85749012e+01, 3.79504269e+00,
       9.54940711e+00, 4.08237154e+02, 1.84555336e+01, 3.56674032e+02,
       1.26530632e+01])

In [14]:
std.var_

array([7.38403597e+01, 5.42861840e+02, 4.69714297e+01, 6.43854770e-02,
       1.34010989e-02, 4.92695216e-01, 7.90792473e+02, 4.42525226e+00,
       7.56665313e+01, 2.83486236e+04, 4.67772630e+00, 8.31828042e+03,
       5.08939794e+01])

In [29]:
ZX = std.transform(boston.data)
ZX[:2]

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
         0.41367189, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
        -1.45900038,  0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, -0.27259857, -0.74026221,
         0.19427445,  0.36716642,  0.55715988, -0.8678825 , -0.98732948,
        -0.30309415,  0.44105193, -0.49243937]])

In [28]:
std.fit_transform(boston.data)[:2]

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
         0.41367189, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
        -1.45900038,  0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, -0.27259857, -0.74026221,
         0.19427445,  0.36716642,  0.55715988, -0.8678825 , -0.98732948,
        -0.30309415,  0.44105193, -0.49243937]])

In [17]:
# 使用fit方法，使reg類基於指定數據估計出回歸模型的相應參數

reg.fit(boston.data, boston.target)

LinearRegression()

In [18]:
reg.coef_

array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])

In [32]:
pred = reg.predict(boston.data)
pred[:10]

array([30.00384338, 25.02556238, 30.56759672, 28.60703649, 27.94352423,
       25.25628446, 23.00180827, 19.53598843, 11.52363685, 18.92026211])

In [20]:
reg.score(boston.data, boston.target)

0.7406426641094095

### Modelclass中基本通用的類屬性 (模型擬合前該屬性不存在)：

    coef_ : array,多因變量時為二維數組
    intercept_ : 常數項
    
    classes_ : 每個輸出的類標籤
    n_classses_ : int or list, 類別數
    n_features_ : int,特徵數
    
    loss_ : 損失函數計算出來的當前損失值
    n_iter_ : 迭代次數

In [21]:
std.mean_, std.scale_

(array([3.61352356e+00, 1.13636364e+01, 1.11367787e+01, 6.91699605e-02,
        5.54695059e-01, 6.28463439e+00, 6.85749012e+01, 3.79504269e+00,
        9.54940711e+00, 4.08237154e+02, 1.84555336e+01, 3.56674032e+02,
        1.26530632e+01]),
 array([8.59304135e+00, 2.32993957e+01, 6.85357058e+00, 2.53742935e-01,
        1.15763115e-01, 7.01922514e-01, 2.81210326e+01, 2.10362836e+00,
        8.69865112e+00, 1.68370495e+02, 2.16280519e+00, 9.12046075e+01,
        7.13400164e+00]))

In [22]:
reg.intercept_, reg.coef_

(36.459488385089855,
 array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
        -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
         3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
        -5.24758378e-01]))

### 簡化的調用函數

    特徵處理class往往會有簡化版本的函數可供調用，功能類似，但使用上更簡單
    
        class sklearn.preprocessing.StandardScaler()
        sklearn.preprocessing.scale()

In [23]:
preprocessing.scale(boston.data)[:2]

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
         0.41367189, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
        -1.45900038,  0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, -0.27259857, -0.74026221,
         0.19427445,  0.36716642,  0.55715988, -0.8678825 , -0.98732948,
        -0.30309415,  0.44105193, -0.49243937]])

### 模型的保存 (持久化)

    可以直接使用Python的pickle模塊將訓練好的模型保存為外部文件，但最好使用sklearn中的joblib模塊進行操作。

In [35]:
# 保存為外部文件
# sklearn在0.23版之後移除joblib，因此，環境需要安裝joblib包並import joblib才行使用
import joblib

joblib.dump(std,'C:/儲存的路徑/std.pk1')
joblib.dump(reg,'C:/儲存的路徑/reg.pk1')

FileNotFoundError: [Errno 2] No such file or directory: 'C:/儲存的路徑/std.pk1'

In [34]:
# 讀入外部保存的模型文件
reg2 = joblib.load('C:/儲存的路徑/reg.pk1')
reg2.coef_

array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])