In [45]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.preprocessing import LabelEncoder
from google.cloud import bigquery
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Temporarily suppress warnings
import warnings
warnings.filterwarnings('ignore')
import time

# Announce the start of the run and start the wall-clock timer.
print("--Start--")
start = time.time()

# Load the data: the training file first, then the test file.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'gs://sample_machine_learning_input/HousePrices/train.csv',
        'gs://sample_machine_learning_input/HousePrices/test.csv',
    )
)

# Keep the test Ids aside for the submission file, because the 'Id' column is
# removed from the feature frames before modelling.
test_id = test['Id']

# Convert every object-dtype column into integer labels.
# Each encoder is fitted on the train and test values together, so a category
# that appears in only one of the two files still receives a code.
object_columns = [name for name in train.columns if train[name].dtype == object]
for name in object_columns:
    lbl = LabelEncoder()
    lbl.fit(list(train[name].values) + list(test[name].values))
    # Assign by column name instead of `.iloc[:, i] = ...`:
    #  * a name-based assignment replaces the whole column, so it takes the
    #    integer dtype of the codes; recent pandas versions try to write
    #    `.iloc[:, i] = ...` in place, which can leave the column as `object`
    #    and break later numeric steps;
    #  * train and test are matched by name rather than by position, so the
    #    code no longer relies on both files having identical column order.
    train[name] = lbl.transform(list(train[name].values))
    test[name] = lbl.transform(list(test[name].values))


# Remove outlier rows from the training data in one pass.
# Each (column, limit) pair discards the rows whose value exceeds the limit.
upper_limits = [
    ('LotArea', 100000),
    ('Utilities', 0.9),
    ('SalePrice', 700000),
    ('BsmtFinSF1', 5000),
    ('Electrical', 4.5),
    ('LowQualFinSF', 560),
    ('GrLivArea', 4500),
    ('BsmtFullBath', 2.5),
    ('BsmtHalfBath', 1.75),
    ('BedroomAbvGr', 7),
    ('KitchenAbvGr', 2.75),
    ('OpenPorchSF', 500),
    ('EnclosedPorch', 500),
]

# 'Street' is the only rule that uses a lower bound.
is_outlier = train['Street'] < 0.1
for column, limit in upper_limits:
    is_outlier = is_outlier | (train[column] > limit)

# Combined rule kept from the original sequence of filters; it can only match
# rows that the SalePrice limit above has already flagged.
is_outlier = is_outlier | ((train['SaleCondition'] > -1) & (train['SalePrice'] > 700000))

train = train.drop(train[is_outlier].index)


# Columns excluded from the feature set; the same list is removed from both
# the training and the test frame.
unused_columns = [
    'Street', 'Utilities', 'Condition2', 'CentralAir', 'LowQualFinSF',
    'KitchenAbvGr', 'PavedDrive', 'PoolArea', 'PoolQC', 'MiscVal',
]
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)


# Summarise missing values per training column: absolute count ('Total') and
# fraction of rows ('Percent'), both sorted from most to least missing.
null_counts = train.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Drop every training column with more than one missing value, then the rows
# where 'Electrical' is missing.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# recent pandas versions no longer accept.
sparse_columns = missing_data[missing_data['Total'] > 1].index
train = train.drop(columns=sparse_columns)
train = train.drop(train.loc[train['Electrical'].isnull()].index)

# Remove the same columns from the test frame. If they stayed in `test`, the
# later pd.concat([train, test]) would bring them back with NaN in every
# training row, and the median fill would turn them into constant features
# during fitting while they still vary at prediction time.
# Only columns are removed here: every test row must be kept because each one
# needs a prediction in the submission.
test = test.drop(columns=sparse_columns, errors='ignore')


# Standardise SalePrice and take the 10 smallest / 10 largest scaled values.
# The double-bracket selection yields an (n_rows, 1) array directly; indexing a
# Series with `[:, np.newaxis]` is not supported by recent pandas versions.
saleprice_scaled = StandardScaler().fit_transform(train[['SalePrice']].values)
# Sort once and slice both ends instead of computing the argsort twice.
sorted_scaled = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = sorted_scaled[:10]
high_range = sorted_scaled[-10:]


# Inspect the two rows with the largest living area (the result is not stored).
train.sort_values(by='GrLivArea', ascending=False).head(2)
# Remove the training rows whose Id is 1299 or 524.
train = train.drop(train[train['Id'].isin([1299, 524])].index)

# Apply a natural-log transform to the target and to the living-area feature.
train = train.assign(
    SalePrice=np.log(train['SalePrice']),
    GrLivArea=np.log(train['GrLivArea']),
)
# The test frame gets the same living-area transform.
test = test.assign(GrLivArea=np.log(test['GrLivArea']))


# Add a 0/1 'HasBsmt' flag and log-transform 'TotalBsmtSF' for the rows that
# have a basement (TotalBsmtSF > 0). Rows with 0 or NaN keep their value.
for frame in (train, test):
    has_basement = frame['TotalBsmtSF'] > 0
    frame['HasBsmt'] = has_basement.astype('int64')
    # Cast to float first so the log values are written into a float column
    # instead of relying on an implicit upcast of an integer column.
    frame['TotalBsmtSF'] = frame['TotalBsmtSF'].astype(float)
    # Take the log of the selected rows only; this avoids evaluating log(0)
    # for the rows without a basement.
    frame.loc[has_basement, 'TotalBsmtSF'] = np.log(frame.loc[has_basement, 'TotalBsmtSF'])

# Stack train on top of test so both are imputed with the same medians.
n_train = train.shape[0]
Xmat = pd.concat([train, test])
# Fill the remaining NaNs with each column's median.
Xmat = Xmat.fillna(Xmat.median())
# Split the combined frame back into train and test by position.
train, test = Xmat.iloc[:n_train, :], Xmat.iloc[n_train:, :]

# Separate the target, then remove the non-feature columns from both frames.
y = train['SalePrice']
non_feature_columns = ['Id', 'SalePrice']
train = train.drop(columns=non_feature_columns)
test = test.drop(columns=non_feature_columns)




#---
# Linear regression; prints its score on the training data.
skl = LinearRegression().fit(train, y)
print(skl.score(train, y))


#---
# Lasso: grid-search the regularisation strength alpha.
las_model = Lasso()
lasso_grid = {'alpha': [10, 1, 0.1, 0.01, 0.001, 0.0001]}
las_cv = GridSearchCV(las_model, param_grid=lasso_grid, verbose=1)
las_cv.fit(train, y)
print(las_cv.best_params_, las_cv.best_score_)
# Train again with the best parameters found.
las_reg = Lasso(**las_cv.best_params_).fit(train, y)

#---
# Random forest: grid-search the number of trees.
# A fixed random_state makes the cross-validation scores and the final model
# reproducible from run to run; without it every run grows different trees.
forest_model = RandomForestRegressor(random_state=0)
forest_cv = GridSearchCV(forest_model, {'n_estimators'  : [3, 10, 100, 1000, 10000], 'n_jobs': [-1]}, verbose=1)
forest_cv.fit(train, y)
print(forest_cv.best_params_, forest_cv.best_score_)

# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so reuse that model instead of training the winning
# forest a second time. It also carries the random_state set above.
forest_reg = forest_cv.best_estimator_




#---
# Build the XGBoost model and grid-search tree depth and number of trees.
xgb_model = xgb.XGBRegressor()
xgb_grid = {'max_depth': [2,4,6], 'n_estimators': [50,100,200]}
xgb_cv = GridSearchCV(xgb_model, param_grid=xgb_grid, verbose=1)
xgb_cv.fit(train, y)
print(xgb_cv.best_params_, xgb_cv.best_score_)

# Train again with the best parameters found.
best_xgb_params = xgb_cv.best_params_
xgb_reg = xgb.XGBRegressor(**best_xgb_params)
xgb_reg.fit(train, y)
#---


# Equal-weight blend of the four fitted models. The members are listed in the
# order in which their predictions are added together.
blend_members = (skl, las_reg, xgb_reg, forest_reg)
blend_models_predict = sum(0.25 * member.predict(test) for member in blend_members)

# The target was log-transformed before fitting, so map predictions back.
result = np.exp(blend_models_predict)

# Pair each prediction with the Id saved from the test file and write the CSV.
submission = pd.DataFrame({"Id": test_id}).assign(SalePrice=result)
submission.to_csv('gs://sample_machine_learning_output/HousePrices/hp_submission8.csv', index=False)
########
# Report the elapsed wall-clock time since `start`.
elapsed_time = time.time() - start
print("経過時間:{0}".format(elapsed_time) + "[sec]")
print("--End--")

--Start--
0.9211249370095845
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.9s finished


{'alpha': 0.0001} 0.9087271005633317
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  5.8min finished


{'n_jobs': -1, 'n_estimators': 1000} 0.8757505302628772
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    8.6s finished


{'max_depth': 2, 'n_estimators': 200} 0.9057338944783676
経過時間:390.771213054657[sec]
--End--
