In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn import preprocessing
import warnings
warnings.simplefilter("ignore")

In [None]:
# Fetch the California housing data set through scikit-learn and print its description.
california = fetch_california_housing()
print(california.DESCR)

# Put the predictors in a DataFrame and attach the target as an extra "MEDV" column.
california_df = pd.DataFrame(
    california.data,
    columns=california.feature_names,
).assign(MEDV=california.target)
california_df

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MEDV
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [None]:
# The target variable is the house value stored in the "MEDV" column.
# The predictors are selected by name instead of "every column except MEDV":
# a later cell adds a column "W" to california_df in place, and re-running this
# cell afterwards would otherwise yield 9 coefficients for the 8 names in
# california.feature_names and make the DataFrame construction below fail.
X = np.array(california_df[california.feature_names])
y = np.array(california_df["MEDV"])
y = y.reshape(-1, 1)  # column vector, so model.coef_ comes back with shape (1, n_features)

# Split into training data and test data (the test part is used to evaluate the fitted model).
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, random_state = 0)

# Create a model instance from scikit-learn's LinearRegression class.
model = LinearRegression()

# Training (fitting = finding the parameters).
model.fit(X_train, y_train)

# Report the scores on both splits, then the fitted coefficients and intercept.
print(f'訓練データ精度={model.score(X_train, y_train)}')
print(f'テストデータ精度={model.score(X_test, y_test)}')
print(f'偏回帰係数={model.coef_}')
print(f'定数項={model.intercept_}')

# One row of coefficients labelled by feature name; transposed for a vertical display.
result_df = pd.DataFrame(data=model.coef_, columns=california.feature_names)
result_df.T

訓練データ精度=0.6112583475310978
テストデータ精度=0.5926096960745018
偏回帰係数=[[ 4.46776785e-01  9.18543842e-03 -1.18103991e-01  6.42251430e-01
  -9.35852708e-06 -4.08558745e-03 -4.09044501e-01 -4.23454488e-01]]
定数項=[-36.01575598]


Unnamed: 0,0
MedInc,0.446777
HouseAge,0.009185
AveRooms,-0.118104
AveBedrms,0.642251
Population,-9e-06
AveOccup,-0.004086
Latitude,-0.409045
Longitude,-0.423454


In [None]:
class Human:
    """Minimal example class with two small methods and a printable representation."""

    def walk(self, x):
        """Return ``x + 1``."""
        return x + 1

    def speak(self, x):
        """Print ``x`` and return ``0``."""
        print(x)
        return 0

    def __repr__(self):
        """Return a string representation such as ``Human()``.

        ``__repr__`` has to return a ``str``; returning any other object
        (for example the instance itself) makes ``repr()`` raise ``TypeError``.
        """
        return f"{type(self).__name__}()"


In [None]:
# Standardization
from sklearn import preprocessing

sscaler = preprocessing.StandardScaler()
print(sscaler)

# Select the predictors by name instead of "every column except MEDV".
# A later cell adds a column "W" to california_df in place; with the drop-based
# selection, re-running this cell afterwards fits 9 coefficients and the
# DataFrame construction below raises ValueError (8 names for 9 columns).
X = np.array(california_df[california.feature_names])
y = np.array(california_df["MEDV"])
y = y.reshape(-1,1)  # column vector, so model_scaled.coef_ has shape (1, n_features)

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows contribute to the scaling statistics. Fit on the training
# rows only if this pipeline is used for a real evaluation.
sscaler.fit(X)

scaled_X = sscaler.transform(X)
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, train_size = 0.7, test_size = 0.3, random_state = 0)

model_scaled = LinearRegression()
model_scaled.fit(X_train, y_train)


# Scores on both splits, fitted parameters, and the mean of the training target.
print(f'訓練データ精度={model_scaled.score(X_train, y_train)}')
print(f'テストデータ精度={model_scaled.score(X_test, y_test)}')
print(f'偏回帰係数={model_scaled.coef_}')
print(f'定数項={model_scaled.intercept_}')
print(f'目的変数の平均値{np.average(y_train)}')
result_df_scaled = pd.DataFrame(data=model_scaled.coef_, columns= california.feature_names)

# Show the standardized coefficients next to result_df. Note that result_df is
# whatever was assigned last under that name; later cells rebind it.
result_df_scaled.T, result_df.T

StandardScaler()
訓練データ精度=0.6112584575654898
テストデータ精度=0.5925987798223493
偏回帰係数=[[-2.76378842e+10  1.15592957e-01 -2.92230606e-01  3.04380655e-01
  -1.05995834e-02 -4.24253903e-02 -8.73669624e-01 -8.48380417e-01
   2.76378842e+10]]
定数項=[2.0707549]
目的変数の平均値2.0683413684501972


ValueError: ignored

In [None]:
# Multicollinearity demonstration

# Create a column that is strongly correlated with 'MedInc' (a linear function of it).
W = california_df['MedInc'] / 1000 + 0.63

# Add it to the frame in place; later cells read the feature names from california_df.
california_df["W"] = W

# Predictors = every column except the target; target as a column vector.
X_multi = np.array(california_df.drop(columns="MEDV"))
Y = np.array(california_df["MEDV"]).reshape(-1, 1)

# Standardize (fit the scaler and transform in one call).
sscaler_multi = preprocessing.StandardScaler()
scaled_X_multi = sscaler_multi.fit_transform(X_multi)

# Split the data 70/30 with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    scaled_X_multi,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

# Create the model and train it (fit returns the estimator itself).
model_multi = LinearRegression().fit(X_train, Y_train)

# Look at the scores, the fitted parameters and the mean of the training target.
print(f'訓練データ精度={model_multi.score(X_train, Y_train)}')
print(f'テストデータ精度={model_multi.score(X_test, Y_test)}')
print(f'偏回帰係数={model_multi.coef_}')
print(f'定数項={model_multi.intercept_}')
print(f'目的変数の平均値{np.average(Y_train)}')

# Coefficients labelled with the predictor names (now including "W"), shown vertically.
result_df = pd.DataFrame(model_multi.coef_, columns=california_df.columns.drop("MEDV"))
result_df.T

訓練データ精度=0.6112584575654898
テストデータ精度=0.5925987798223493
偏回帰係数=[[-2.76378842e+10  1.15592957e-01 -2.92230606e-01  3.04380655e-01
  -1.05995834e-02 -4.24253903e-02 -8.73669624e-01 -8.48380417e-01
   2.76378842e+10]]
定数項=[2.0707549]
目的変数の平均値2.0683413684501972


Unnamed: 0,0
MedInc,-27637880000.0
HouseAge,0.115593
AveRooms,-0.2922306
AveBedrms,0.3043807
Population,-0.01059958
AveOccup,-0.04242539
Latitude,-0.8736696
Longitude,-0.8483804
W,27637880000.0


In [None]:
# Predict the first sample with both models. Each input is a copy of row 0
# kept two-dimensional (shape (1, n_features)), as predict expects a 2-D array.
predict_X = scaled_X[:1].copy()
predict_X_multi = scaled_X_multi[:1].copy()

for fitted_model, sample in ((model_scaled, predict_X), (model_multi, predict_X_multi)):
    print(fitted_model.predict(sample))

[[4.16382499]]
[[4.16434468]]


In [None]:
# Ridge regression on the standardized predictors that include the extra "W" column.
from sklearn.linear_model import Ridge

# fit returns the estimator itself, so construction and training can be chained.
model_ridge = Ridge(alpha=1).fit(scaled_X_multi, Y)

# Label the coefficient row with every column name except the target and show it vertically.
result_df = pd.DataFrame(model_ridge.coef_, columns=california_df.columns.drop("MEDV"))
result_df.T

Unnamed: 0,0
MedInc,0.414821
HouseAge,0.11882
AveRooms,-0.265465
AveBedrms,0.305583
Population,-0.004479
AveOccup,-0.039331
Latitude,-0.899226
Longitude,-0.869881
W,0.414821


In [None]:
# Lasso regression on the standardized predictors that include the extra "W" column.
from sklearn.linear_model import Lasso

# fit returns the estimator itself, so construction and training can be chained.
model_lasso = Lasso(alpha=0.1).fit(scaled_X_multi, Y)

# Force the coefficients into a single row so they can be labelled column-wise.
coef = np.reshape(model_lasso.coef_, (1, -1))
result_df = pd.DataFrame(coef, columns=california_df.columns.drop("MEDV"))
result_df.T

Unnamed: 0,0
MedInc,0.693989
HouseAge,0.106011
AveRooms,-0.0
AveBedrms,-0.0
Population,-0.0
AveOccup,-0.0
Latitude,-0.011213
Longitude,-0.0
W,0.011724
