In [63]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import load_boston

df_train = pd.read_csv('gs://sample_machine_learning_input/HousePrices/train.csv')
df_test = pd.read_csv('gs://sample_machine_learning_input/HousePrices/test.csv')



df_train = df_train.drop(df_train[(df_train['LotArea']>100000)].index)
df_train = df_train.drop(df_train[(df_train['BsmtFinSF1']>5000)].index)
df_train = df_train.drop(df_train[(df_train['LowQualFinSF']>560)].index)
df_train = df_train.drop(df_train[(df_train['GrLivArea']>4500)].index)
df_train = df_train.drop(df_train[(df_train['BsmtFullBath']>2.5)].index)
df_train = df_train.drop(df_train[(df_train['BsmtHalfBath']>1.75)].index)
df_train = df_train.drop(df_train[(df_train['BedroomAbvGr']>7)].index)
df_train = df_train.drop(df_train[(df_train['KitchenAbvGr']>2.75)].index)
df_train = df_train.drop(df_train[(df_train['OpenPorchSF']>500)].index)
df_train = df_train.drop(df_train[(df_train['EnclosedPorch']>500)].index)



df_train_y = df_train['SalePrice']
df_train_x = df_train.drop(['Id','SalePrice'],axis=1)
df_test_id = df_train['Id']
df_test = df_test.drop(['Id'],axis=1)

# データタイプがobjectの列の値をラベル化した数値に変換
for i in range(df_train_x.shape[1]):
    if df_train_x.iloc[:,i].dtypes == object:
        lbl = LabelEncoder()
        lbl.fit(list(df_train_x.iloc[:,i].values) + list(df_test.iloc[:,i].values))
        df_train_x.iloc[:,i] = lbl.transform(list(df_train_x.iloc[:,i].values))
        df_test.iloc[:,i] = lbl.transform(list(df_test.iloc[:,i].values))

Xmat = pd.concat([df_train_x, df_test])
# 欠損が多いカラムを削除
Xmat = Xmat.drop(['LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
Xmat = Xmat.drop(['LotShape','LotConfig','BsmtFinSF1','3SsnPorch','ScreenPorch','PoolArea','PoolQC','MoSold','YrSold'], axis=1)
# 欠損値の少ないカラムのNaNは中央値(median)で埋める
Xmat = Xmat.fillna(Xmat.median())


#y_ch_train = np.log(df_train_y)
y_ch_train = df_train_y

# trainデータとtestデータを含んでいるXmatを、再度trainデータとtestデータに分割
X_ch_train = Xmat.iloc[:df_train_x.shape[0],:]
X_ch_test = Xmat.iloc[df_train_x.shape[0]:,:]


        
X_data = np.array(X_ch_train)
y_data = np.array(y_ch_train)

X_testdata = np.array(X_ch_test)


# 正規化
def norm(data):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    return (data - mean) / std
# データを変更
X_data = norm(X_data)
X_testdata = norm(X_testdata)


# 1を作成
ones = np.ones((X_data.shape[0], 1))
# 1を追加
X_data = np.c_[ones, X_data]

ones = np.ones((1459, 1))
# 1を追加
X_testdata = np.c_[ones, X_testdata]




# 訓練データとテストデータへ切り分け
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)
y_train = y_train.reshape(y_train.shape[0],1)
y_test = y_test.reshape(y_test.shape[0],1)

# 学習率とエポック（反復処理回数）
learning_rate = 0.00001
training_epochs = 1000000
# 特徴量の数
n_dim = X_data.shape[1]
# 特徴量（X)とターゲット（y）のプレースホルダー
X = tf.placeholder(tf.float32,[None,n_dim])
Y = tf.placeholder(tf.float32,[None,1])
# 係数（W）と定数項（b）の変数
W = tf.Variable(tf.ones([n_dim,1]))
b = tf.Variable(0.0)

# 線形モデル
y = tf.add(b, tf.matmul(X, W))
# コスト関数
cost = tf.reduce_mean(tf.square(y - Y))
# 最適化
training_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# 初期化
init = tf.global_variables_initializer()
# モデル訓練開始
sess = tf.Session()
sess.run(init)
cost_history = []
for epoch in range(training_epochs):
    sess.run(training_step, feed_dict={X:X_train, Y:y_train})
    cost_history = np.append(cost_history, sess.run(cost, feed_dict={X:X_train, Y:y_train}))
    if epoch % 100 == 0:
        W_val = sess.run(W)
        b_val = sess.run(b)

# 誤差（cost）を確認
print(cost_history[1])
print(cost_history[50])
print(cost_history[100])
print(cost_history[1000])
print(cost_history[10000])
print(cost_history[100000])
print(cost_history[999999])


# テストデータを使って予測
pred_test = sess.run(y, feed_dict={X: X_test})

pred = pd.DataFrame({"実際":y_test[:,0], "予測":pred_test[:,0]})
print(pred.head())

pred_test2 = sess.run(y, feed_dict={X: X_testdata})

output_y = pred_test2[:,0]

#y_test_pred = np.exp(output_y)

print(output_y)

submission = pd.DataFrame({
    "Id": test_id,
    "SalePrice": output_y
})
submission.to_csv('gs://sample_machine_learning_output/HousePrices/hp_submission9.csv', index=False)



38940430336.0
38715600896.0
38488559616.0
34773118976.0
15798285312.0
711867072.0
603353344.0
              予測      実際
0  127876.992188  142000
1  328054.656250  354000
2   85962.648438   84900
3  253691.437500  180000
4   60736.589844   80000
[127977.625 170504.95  188340.19  ... 182424.06  131349.62  240035.38 ]
