# 回帰まとめ

## 回帰とは
- 目的変数が連続値の時にデータに Y = f(X) というモデル(「定量的な関係の構造」)を当てはめる事。
- この時Xを説明変数とよぶ

## 単回帰

In [8]:
# 単一の特徴量のみでの予測をする
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score

In [9]:
# Load the Boston housing data into a DataFrame.
# Explanatory variables: 13 columns.
# Target variable: Boston house prices, in units of 1,000 USD.
#
# pandas and load_boston are imported here because, in file order, this cell
# comes before the cell that imports them; without these imports the cell
# raises NameError when the file is run top to bottom.
# NOTE(review): load_boston is deprecated/removed in newer scikit-learn
# releases — confirm the scikit-learn version this notebook is pinned to.
import pandas as pd
from sklearn.datasets import load_boston

boston = load_boston()

# Explanatory variables, one column per feature name.
df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Append the target variable as column "y".
df["y"] = boston.target

In [12]:
# Split the data into a training frame and an evaluation frame.
# random_state is pinned (the same seed as the other train_test_split call in
# this file) so the scores computed from this split are reproducible.
# NOTE(review): the recorded outputs came from an unseeded split, so they need
# to be refreshed by re-running the cells.
train_df, test_df = train_test_split(df, random_state=123)

In [24]:
# Fit one single-feature linear regression per explanatory variable and print
# its coefficient of determination (R^2) on the evaluation data.
target_col = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT"]

# The target frames do not depend on the feature being tested, so build them
# once instead of on every iteration. They stay one-column DataFrames.
Y_train = train_df.loc[:, ["y"]]
Y_test = test_df.loc[:, ["y"]]

for target in target_col:
    # The list selector keeps X as a one-column DataFrame (two-dimensional).
    X_train = train_df.loc[:, [target]]
    X_test = test_df.loc[:, [target]]

    model = linear_model.LinearRegression()
    model.fit(X_train, Y_train)

    pred_y = model.predict(X_test)

    print("{}_決定係数:".format(target), r2_score(Y_test, pred_y))

CRIM_決定係数: 0.12028843089245389
ZN_決定係数: 0.192680450654928
INDUS_決定係数: 0.24770059890566976
CHAS_決定係数: 0.04391448044120616
NOX_決定係数: 0.21571011400145212
RM_決定係数: 0.4030082780556503
AGE_決定係数: 0.1787368701304276
DIS_決定係数: 0.08992506943448964
RAD_決定係数: 0.16491986088282828
TAX_決定係数: 0.23190802147728107
PTRATIO_決定係数: 0.35363157159299363
B_決定係数: 0.07469980254307773
LSTAT_決定係数: 0.5397355832460002


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

0.12028843089245389

## Ridge回帰とは?
- 過学習を防ぐため線形回帰に正則化項(ペナルティ項)としてL2ノルムを導入したモデル
- 最小二乗法の式に正則化項を加え、その最小を求めることでモデル関数を発見する
- L2ノルム : ユークリッドノルムとも呼ばれ、ベクトルの各成分の二乗和の平方根。2点間のユークリッド距離は、差ベクトルのL2ノルム(成分同士の差の二乗和の平方根)に相当する。Ridge回帰のペナルティ項には係数ベクトルのL2ノルムの二乗を用いる

- 係数は0に近づくが、完全に0にはならない(0に収束することが出来ない)ことに留意
- 係数の絶対値の大きさを可能な限り小さくする

In [3]:
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [38]:
# Preview the first five rows of df (head() defaults to n=5).
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [57]:
# Names of the 13 explanatory-variable columns used as model inputs.
target_col = [
    "CRIM",
    "ZN",
    "INDUS",
    "CHAS",
    "NOX",
    "RM",
    "AGE",
    "DIS",
    "RAD",
    "TAX",
    "PTRATIO",
    "B",
    "LSTAT",
]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [59]:
# Split the explanatory columns and the target column "y" into training and
# evaluation sets, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    df[target_col],
    df["y"],
    random_state=123,
)

# Build the model and fit it on the training set.
# NOTE(review): the surrounding section text describes Ridge (L2) regression,
# but the model fitted here is Lasso — confirm which one is intended.
model = Lasso()
model.fit(x_train, y_train)

0.6554460628881653

In [60]:
# Predict on the evaluation features, then score the predictions against the
# evaluation targets with the coefficient of determination (R^2).
pred = model.predict(x_test)
r2_score(y_true=y_test, y_pred=pred)

0.6554460628881653

In [74]:
# Feeding one input row of explanatory variables to the fitted model yields a
# single predicted y.
# The row is taken from df: x_df is not defined anywhere in this file, and df
# holds the same explanatory columns.
# NOTE(review): this rebinds test_df, which earlier held the evaluation split.
test_df = df.head(1)[target_col]
display(test_df)

predict = model.predict(test_df)
print(predict)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98


[31.09073857]
