In [1]:
from sklearn import linear_model
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# estimators below; consider narrowing the filter while debugging.
warnings.filterwarnings(action='ignore')

In [None]:
# Print the installed scikit-learn version so results can be tied to a specific release.
import sklearn
print(sklearn.__version__)

In [None]:
# scikit-learn expects X as a 2-D array and y as a 1-D array, so data held in a
# list has to be converted to an ndarray with the right number of dimensions first.
# NOTE(review): X and y are not created in this cell; they must already exist
# from an earlier cell.

from sklearn.model_selection import train_test_split

# Split the data set into training and test data (80/20).
# Reusing the same random_state yields the same train/test partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31,
)

## Linear Regression

In [None]:
# Fit an ordinary least-squares model of the form y = coefficient * X + intercept.
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
print('係數:', lr.coef_)
print('截距', lr.intercept_)
# For regressors, score() returns the coefficient of determination R^2:
# the closer to 1 the better (it can be negative for a very poor fit).
print(lr.score(X_train, y_train))
# Scoring on X_test / y_test measures how well the model predicts data it was
# never trained on (its generalization ability).
print(lr.score(X_test, y_test))
# Use the function learned by fit() to predict the training inputs.
pred_y = lr.predict(X_train)

In [None]:
# Residual plot: prediction minus observed target for each training sample,
# drawn against the input value, with a horizontal reference line at zero.
y_pred = lr.predict(X_train)
plt.scatter(x=X_train, y=y_pred - y_train, c='red')
plt.hlines(y=0, xmin=np.min(X_train), xmax=np.max(X_train))
# Fixed vertical window of +/-100 around the zero line.
plt.ylim((-100, 100))
plt.show()

### Ridge

In [None]:
# Ridge regression applies L2 regularization: a penalty on coefficient size is
# added to the least-squares objective. This does not improve the fit on the
# training data, but it may improve generalization.
# In multiple regression the coefficients of all predictors become smaller,
# which reduces how much each feature influences the final result.
# alpha sets the regularization strength: the larger, the stronger.
# (Original author's note: the current example has only one feature.)
Width_ridge = linear_model.Ridge(alpha=1.0).fit(X_train, y_train)
print(Width_ridge.score(X_train, y_train))

print("係數:", Width_ridge.coef_)
print("截距:", Width_ridge.intercept_)
print(Width_ridge.score(X_test, y_test))

### Lasso

In [None]:
# Lasso regression applies L1 regularization: in multiple regression some
# coefficients are driven to exactly 0, so it doubles as feature selection
# and helps avoid overfitting.
# With alpha=0 the penalty vanishes and the objective reduces to ordinary least squares.
Width_lasso = linear_model.Lasso(alpha=1.0).fit(X_train, y_train)
print(Width_lasso.score(X_train, y_train))
print("係數:", Width_lasso.coef_)
print("截距:", Width_lasso.intercept_)
print(Width_lasso.score(X_test, y_test))

## Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# make_pipeline(step1, step2): the data flows through step1 and then step2.
# Here: expand the input into degree-3 polynomial features, then fit a linear model.
model3 = make_pipeline(PolynomialFeatures(3), linear_model.LinearRegression())
model3.fit(X_train, y_train)

# Training points in red, test points in green, fitted curve on top.
plt.scatter(X_train, y_train, c='red')
plt.scatter(X_test, y_test, c='green')
# Sort the inputs so the fitted curve is drawn left to right as one continuous line.
X_train_sorted = np.sort(X_train, axis=0)
plt.plot(X_train_sorted, model3.predict(X_train_sorted))
plt.show()

# score() returns R^2 (higher is better), so the values are labelled as scores
# rather than errors.
print("訓練分數(R^2)", model3.score(X_train, y_train))
print("測試分數(R^2)", model3.score(X_test, y_test))

### DummyRegressor

In [None]:
from sklearn.dummy import DummyRegressor

# Baseline model: whatever the input, it returns the mean of the training targets.
clf_dummy = DummyRegressor(strategy='mean').fit(X_train, y_train)
# Left as the final bare expression so the notebook displays the predictions.
clf_dummy.predict(X_train)

## Logistic Regression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression

In [None]:
# Logistic regression without regularization, fitted and scored on the same data.
# The class is reached through the already-imported `linear_model` module.
# NOTE: penalty=None requires scikit-learn >= 1.2 (older releases used penalty='none').
model_p = linear_model.LogisticRegression(penalty=None)
model_p.fit(X, y)
# For classifiers, score() returns the mean accuracy. It is printed explicitly
# because a bare expression in the middle of a cell is not displayed.
print(model_p.score(X, y))

# Lower C (stronger regularization) in the hope of avoiding overfitting.
# This estimator is only constructed here; it is not fitted in this cell.
model = linear_model.LogisticRegression(C=0.1)

In [None]:
# Polynomial feature expansion, for data that is not linearly separable.
from sklearn import preprocessing

# Degree-5 expansion of two inputs:
# 1, x1, x2, x1^2, x1*x2, x2^2, x1^3, ..., x1*x2^4, x2^5
poly5 = preprocessing.PolynomialFeatures(5)

# np.c_[a, b] joins the two arrays column-wise (np.r_[...] joins row-wise).
# NOTE(review): this rebinds X to the expanded feature matrix, so any cell that
# uses X afterwards sees the polynomial features rather than the original input.
X = poly5.fit_transform(
    np.c_[data['x1'], data['x2']]
)

# penalty='l2' is the default: Ridge-style shrinkage of the feature coefficients.
LR_poly5 = linear_model.LogisticRegression(penalty='l2', C=1.0)
LR_poly5.fit(X, y)