In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Import data

In [None]:
# Load every file shipped with the competition. The directory is named once
# so the four paths cannot drift apart.
INPUT_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'

sample_submission = pd.read_csv(INPUT_DIR + '/sample_submission.csv')

# Read the free-text column documentation inside a context manager so the
# file handle is closed as soon as the text has been read (or if reading fails).
with open(INPUT_DIR + '/data_description.txt', 'r') as f:
    data_description = f.read()

train = pd.read_csv(INPUT_DIR + '/train.csv')
test = pd.read_csv(INPUT_DIR + '/test.csv')

In [None]:
# Re-wrap the loaded frames under the names used by the preprocessing steps.
df_train, df_test = pd.DataFrame(train), pd.DataFrame(test)

In [None]:
# Show up to 100 columns when a DataFrame is rendered.
pd.options.display.max_columns = 100

# 2. Preprocessing

In [None]:
# Stack the training features (target column removed) on top of the test rows
# so both sets go through the same preprocessing; the index is renumbered.
train_features = df_train.drop(columns='SalePrice')
df_all = pd.concat([train_features, df_test], ignore_index=True)

In [None]:
df_all

In [None]:
# Search for missing data
# Plot missingno's matrix view of the combined train+test frame; the figure
# size and RGB colour tuple only affect how the plot is drawn.
import missingno as msno
msno.matrix(df=df_all, figsize=(20,14), color=(0,.3,.3))


In [None]:
# Convert every string (object-dtype) column to integer category codes
from sklearn.preprocessing import LabelEncoder

for column in df_all.select_dtypes(include='object').columns:
    lbl = LabelEncoder()
    # The values are handed over as a plain list, exactly one label per row;
    # missing entries are not dropped, so they end up with a code of their own.
    encoded = lbl.fit_transform(list(df_all[column].values))
    # Assign by column name rather than through .iloc[:, i]: positional
    # setitem writes into the existing object-dtype column in place on
    # pandas >= 2.0, which would leave the integer codes stored as objects.
    df_all[column] = encoded

In [None]:
# Replace every remaining missing value (int or float columns) with 0,
# handling the whole frame in a single call.
df_all = df_all.fillna(0)

In [None]:
# Add new columns
def add_new_columns(df):
    """Add engineered features to ``df`` in place.

    New columns written onto the frame, in this order:
    ``TotalSF``, ``AreaPerRoom``, ``YearBuiltPlusRemod``, ``TotalBathrooms``,
    ``TotalPorchSF``, ``HasPool``, ``Has2ndFloor``, ``HasGarage``, ``HasBsmt``,
    ``HasFireplace``.

    Args:
        df: pandas DataFrame holding the source columns read below
            (``1stFlrSF``, ``2ndFlrSF``, ``TotalBsmtSF``, ``TotRmsAbvGrd``,
            ``YearBuilt``, ``YearRemodAdd``, the four bath-count columns, the
            porch/deck area columns, ``PoolArea``, ``GarageArea``,
            ``Fireplaces``). It is modified in place.

    Returns:
        None. Nothing is returned so that calling this as the last statement
        of a notebook cell does not render the frame.
    """
    # Total floor area = 1st floor + 2nd floor + basement.
    df["TotalSF"] = df["1stFlrSF"] + df["2ndFlrSF"] + df["TotalBsmtSF"]

    # Average area per room = total floor area / number of rooms.
    # A room count of 0 would produce inf (or NaN for 0/0); such rows get 0
    # so that no infinite value reaches the model. Rows with a non-zero or
    # missing room count are computed exactly as a plain division.
    rooms = df['TotRmsAbvGrd']
    area_per_room = df['TotalSF'] / rooms.where(rooms != 0)
    df['AreaPerRoom'] = area_per_room.mask(rooms == 0, 0.0)

    # Year built + year of the latest remodel: the larger this sum, the
    # higher the price is expected to be.
    df['YearBuiltPlusRemod'] = df['YearBuilt'] + df['YearRemodAdd']

    # Total number of bathrooms.
    # Full bath: tub, shower, sink and toilet. Half bath: sink and toilet
    # (sometimes a shower). Half baths are weighted 0.5 to allow for the
    # missing shower.
    df['TotalBathrooms'] = (
        df['FullBath']
        + 0.5 * df['HalfBath']
        + df['BsmtFullBath']
        + 0.5 * df['BsmtHalfBath']
    )

    # Combined porch area (open, three-season, enclosed, screen) plus wood deck.
    df['TotalPorchSF'] = (
        df['OpenPorchSF']
        + df['3SsnPorch']
        + df['EnclosedPorch']
        + df['ScreenPorch']
        + df['WoodDeckSF']
    )

    # 0/1 presence flags: 1 when the source column is strictly positive.
    # The mapping order is the order in which the columns are appended.
    presence_flags = {
        'HasPool': 'PoolArea',          # has a pool
        'Has2ndFloor': '2ndFlrSF',      # has a second floor
        'HasGarage': 'GarageArea',      # has a garage
        'HasBsmt': 'TotalBsmtSF',       # has a basement
        'HasFireplace': 'Fireplaces',   # has a fireplace
    }
    for flag_name, source_name in presence_flags.items():
        df[flag_name] = (df[source_name] > 0).astype('int64')

add_new_columns(df_all)

In [None]:
msno.matrix(df=df_all, figsize=(20,14), color=(0,.3,.3))

In [None]:
df_all.info()

In [None]:
# Split the preprocessed frame back into its train and test parts.
# df_all was built as [train rows, test rows] with a renumbered index, so the
# first len(train) positions are the training rows. The boundary and the
# target column are taken from the raw `train` frame, which no cell modifies,
# so re-running this cell after df_train has been filtered gives the same split.
n_train_rows = len(train)
df_train = df_all.iloc[:n_train_rows].assign(SalePrice=train['SalePrice'].to_numpy())
df_test = df_all.iloc[n_train_rows:]

# 3. Histogram

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.histplot(df_train['SalePrice'])

In [None]:
# Drop outliers: keep only the rows whose sale price is below 510000,
# then plot the distribution of what is left.
is_below_cap = df_train['SalePrice'] < 510000
df_train = df_train.loc[is_below_cap]
sns.histplot(df_train['SalePrice'])

# 4. Check the correlation for each item

In [None]:
df_train_corr = df_train.corr()
df_train_corr

In [None]:
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(df_train_corr, vmax=.8, square=True, cmap='Blues');

# 5. Extract items with high correlation coefficient

In [None]:
# Keep every column whose correlation with SalePrice is above 0.05 or below
# -0.05 (the target itself excluded), in the column order of the matrix.
saleprice_corr = df_train_corr.loc['SalePrice']
is_relevant = (saleprice_corr > 0.05) | (saleprice_corr < -0.05)
predictor_cols = [
    name for name in saleprice_corr[is_relevant].index if name != 'SalePrice'
]
predictor_cols

# 6. Modeling

In [None]:
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import GridSearchCV

train_x = df_train[predictor_cols]

In [None]:
train_x.info()

In [None]:
# Scaling (currently disabled)
#scale_train_x = preprocessing.minmax_scale(train_x[:, :])

# Regression target: the sale price column of the training frame
train_y = df_train['SalePrice']

# Parameter grid for the search. Every entry lists a single candidate value,
# so exactly one configuration is evaluated.
search_params = dict(
    max_depth=[17],
    max_features=[24],
    min_samples_split=[3],
    n_estimators=[600],
    n_jobs=[-1],
    random_state=[0],
)

# 3-fold cross-validated search over a random forest regressor
model = GridSearchCV(estimator=RFR(), param_grid=search_params, cv=3, verbose=True)

model.fit(train_x, train_y)

In [None]:
print(model.best_estimator_)

In [None]:
from sklearn import preprocessing
# Imported under an alias so the function is not overwritten by the float
# result that is stored under the name `r2_score` below.
from sklearn.metrics import r2_score as compute_r2_score

# Check score
# NOTE(review): this R^2 is measured on the same rows the model was fitted on,
# so it is an optimistic figure; the cross-validated score of the search is
# available separately as model.best_score_.
t_true = np.array(df_train['SalePrice'])
# Predict from the DataFrame itself so the input carries the same column names
# the model was fitted with (a bare NumPy array has no feature names).
x_forScore = df_train[predictor_cols]
# Scaling
#scale_float_x_forScore = preprocessing.minmax_scale(float_x_forScore[:, :])

predict = model.predict(x_forScore)
print(t_true)
print(predict)
# Stored as `r2_score` because the submission cell reads the score by that name.
r2_score = compute_r2_score(t_true, predict)
r2_score

# 7. Predict

In [None]:
test_x = df_test[predictor_cols]

In [None]:
# Scaling
#scale_test_x = preprocessing.minmax_scale(test_x[:, :])

In [None]:
test_y = model.predict(test_x)
test_y

# 8. Prepare upload data

In [None]:
# Write submission.csv only when the stored R^2 value is above 0.95;
# otherwise just report the score and write nothing.
score_is_good_enough = r2_score > 0.95
if not score_is_good_enough:
    print('Low Score : r2 = {:.5f}'.format(r2_score))
else:
    my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': test_y})
    my_submission.to_csv('submission.csv', index=False)
    print('Succeeded : r2 = {:.5f}'.format(r2_score))