# データの読み込み

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

In [2]:
# Load the housing dataset from the CSV file into a DataFrame.
df = pd.read_csv('housing.csv')

In [18]:
# df.head(3)

# 分布の確認

In [4]:
import seaborn as sns

In [11]:
# sns.distplot(df['x6'])

# 外れ値除去（３σ法）

In [7]:
# Column inspected by the single-column 3-sigma walkthrough in the cells below.
col = "x6"

In [12]:
# Column-wise mean of df; the result is indexed by column name.
mean = df.mean(axis=0)

In [9]:
# Mean of column x6, looked up by its label.
mean['x6']

6.284634387351788

In [10]:
# Mean of the column named by `col`; bracket lookup works with a variable.
mean[col]

6.284634387351788

In [17]:
# Column-wise standard deviation of df; the result is indexed by column name.
sigma = df.std(axis=0)

In [16]:
# Standard deviation of the column named by `col`.
sigma[col]

0.7026171434153234

In [20]:
# Lower bound of the 3-sigma band for `col`: mean minus three standard deviations.
low = mean[col] - 3 * sigma[col]
low

4.176782957105817

In [21]:
# Upper bound of the 3-sigma band for `col`: mean plus three standard deviations.
high = mean[col] + 3 * sigma[col]
high

8.392485817597759

In [24]:
# Keep only the rows whose `col` value lies strictly between low and high,
# then preview the first five surviving rows.
df2 = df[df[col].gt(low) & df[col].lt(high)]
df2.head(5)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [25]:
# Row count before outlier removal.
df.shape[0]

506

In [26]:
# Row count after filtering the single column `col`.
df2.shape[0]

498

In [77]:
# 分布の確認
# sns.distplot(df2[col])

In [78]:
# sns.distplot(df[col])

# 全変数への適用

In [32]:
# Column labels of df, used to apply the 3-sigma filter to every column.
cols = df.columns
cols

Index(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11',
       'x12', 'x13', 'y'],
      dtype='object')

In [34]:
# Apply the 3-sigma rule to every column at once: a row is kept only if, for
# all columns, its value lies strictly inside (mean - 3*sigma, mean + 3*sigma).
# `mean` and `sigma` are the per-column statistics of the original `df`, so the
# bounds do not move as rows are removed.
#
# Local names are used on purpose. The earlier single-column cells read the
# globals `col`, `low` and `high`; a loop that reuses those names leaves them
# holding the last column's values, which silently changes the results of
# those cells when they are re-run.
lower = mean[cols] - 3 * sigma[cols]
upper = mean[cols] + 3 * sigma[cols]
# Comparisons align the bounds with the frame's columns by label.
inlier_mask = (df[cols].gt(lower) & df[cols].lt(upper)).all(axis=1)
_df = df[inlier_mask]

In [35]:
# Row count of the original data.
df.shape[0]

506

In [37]:
# Row count after applying the 3-sigma rule to every column.
# The more variables are filtered, the more rows are lost; if losing too many
# rows becomes a bottleneck, the filtering should be adjusted.
_df.shape[0]

415

### 対処法

- 今回は外れ値を取り除く
- 外れ値を平均もしくは中央値などで埋める
- 主成分分析等を使って、潜在変数に変換した後に3σ法を適用 ← 高度

# 入力変数と出力変数に分割

In [38]:
# Preview one row of the filtered frame to check the column layout before splitting.
_df.head(1)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0


In [39]:
# iloc is indexed as [rows, columns]: every column except the last becomes the
# input matrix X, and the last column becomes the target y.
X, y = _df.iloc[:, :-1], _df.iloc[:, -1]

In [42]:
# Preview one row of the input variables.
X.head(1)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98


In [43]:
# Preview the first value of the target variable.
y.head(1)

0    24.0
Name: y, dtype: float64

In [44]:
from sklearn.model_selection import train_test_split

In [46]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

# 重回帰分析

In [47]:
from sklearn.linear_model import LinearRegression

In [48]:
# Declare the model.
model = LinearRegression()

In [51]:
# Train the model (fit its parameters) on the training split.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [52]:
# Evaluate the fitted model on the training data.
model.score(X_train, y_train)


0.7972109224535133

In [53]:
# Evaluate the fitted model on the held-out data.
model.score(X_test, y_test)

0.6253776004329592

In [54]:
# 過学習（オーバーフィッティング）
# 訓練データに対して、検証データの数値が下がること

# スケーリング

In [55]:
from sklearn.preprocessing import StandardScaler

In [56]:
# Declare the scaler.
scaler = StandardScaler()

In [57]:
# Fit the scaler: it computes and stores the mean and standard deviation,
# here taken from the training split only.
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [59]:
# Scale both splits with the same fitted scaler, training split first.
X_train2, X_test2 = map(scaler.transform, (X_train, X_test))

In [67]:
# X_train2

In [68]:
# X_test2

In [62]:
# Declare a fresh model for the scaled inputs (rebinds the name `model`).
model = LinearRegression()

In [63]:
# Train the model on the scaled training inputs.
model.fit(X_train2, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [64]:
# Evaluate on the scaled training data.
model.score(X_train2, y_train)

0.7972109224535133

In [65]:
# Evaluate on the scaled held-out data.
model.score(X_test2, y_test)

0.6253776004329598

In [70]:
# Inspect the learned weights (one coefficient per input column).
# The print options are set here, before the array is displayed, so that the
# rounded, non-scientific output is reproduced on a top-to-bottom run instead
# of depending on a later cell having been executed first.
np.set_printoptions(precision=2, suppress=True)
model.coef_

array([-0.2 ,  0.21,  0.51, -0.  , -1.21,  3.92, -0.37, -1.86,  1.17,
       -1.53, -1.87,  0.24, -2.76])

In [69]:
# Show NumPy arrays with two decimals and without scientific notation.
# NOTE(review): this cell's execution count (In [69]) is lower than that of the
# coefficient display placed before it (In [70]), i.e. it was run out of
# document order; consider moving it to the setup cell at the top.
np.set_printoptions(suppress=True, precision=2)

In [76]:
# sns.distplot(_df['x13'])
# 分布がきれいなものが重みとして効きやすい