In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

# **1. Read the Data**

In [None]:
# Load the competition's training table from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
# Print a per-column summary of the training frame (dtypes and non-null counts).
train_data.info()

# **2. Data preprocessing**

## 2-1. Split the columns into 'object' and 'numeric' groups

In [None]:
# Partition the column names by dtype: 'object' columns are handled as
# categorical features, every other column as numeric.
df_object = [col for col in train_data.columns if train_data[col].dtype == 'object']
df_numberic = [col for col in train_data.columns if train_data[col].dtype != 'object']

# Missing categorical values become the literal label 'NO'; missing numbers become 0.
train_data_object = train_data[df_object].fillna('NO')
train_data_numberic = train_data[df_numberic].fillna(0)

# Rebuild train_data with the object columns first, followed by the numeric ones.
train_data = pd.concat([train_data_object, train_data_numberic], axis = 1)

#  **3. EDA**

In [None]:
sns.set()
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('The SalePrice analysis')

# sns.distplot is deprecated; histplot with stat='density' and kde=True draws the
# same normalised histogram with a KDE curve on top.
sns.histplot(train_data['SalePrice'], ax = axes[0], stat = 'density', kde = True)
axes[0].set_title('Distribution')
axes[0].set_xlabel('SalePrice')
# The histogram is normalised, so the y-axis is a density, not a count.
axes[0].set_ylabel('Density')

sns.boxplot(x = train_data['SalePrice'], ax = axes[1])
axes[1].set_title('Boxplot')
axes[1].set_xlabel('SalePrice')

# SalePrice against row position, to spot ordering effects and outliers.
sns.scatterplot(y = train_data['SalePrice'], x = train_data.index, ax = axes[2])
axes[2].set_title('Scatter')
axes[2].set_ylabel('SalePrice')

plt.show()

## 3-1. Numeric feature columns analysis

In [None]:
# One regression panel (feature vs. SalePrice) per numeric column, two per row.
# The grid is sized from the actual column count instead of a hard-coded 19 x 2,
# so the cell does not raise IndexError when the number of numeric columns changes.
numeric_cols = list(train_data_numberic.columns)
panels_per_row = 2
n_rows = max(1, -(-len(numeric_cols) // panels_per_row))  # ceiling division
# Keeps the original panel height: 80 inches for 19 rows.
fig, axes = plt.subplots(n_rows, panels_per_row, figsize=(15, n_rows * 80 / 19), squeeze=False)
fig.suptitle('The numeric feature')
panels = axes.ravel()
for ax, col in zip(panels, numeric_cols):
    sns.regplot(x = train_data_numberic[col], y = train_data_numberic['SalePrice'], ax = ax)
# Hide the trailing panel(s) when the column count does not fill the last row.
for ax in panels[len(numeric_cols):]:
    ax.set_visible(False)

## 3-2. Heatmap of the numeric columns with |correlation| >= 0.5

In [None]:
# Compute the correlation matrix once and reuse it (it was recomputed three times).
corr_matrix = train_data_numberic.corr()
# Keep only strong correlations; weaker entries become NaN.
strong_corr = corr_matrix[(corr_matrix >= 0.5) | (corr_matrix <= -0.5)]

plt.figure(figsize = (20 , 20))
sns.heatmap(strong_corr, annot = True, center = 0)

In [None]:
# Columns whose correlation with SalePrice is at least 0.5, in the frame's column
# order. SalePrice itself (correlation 1.0) is part of the list.
corr_with_price = train_data_numberic.corr()['SalePrice']
highcor_columns = list(corr_with_price[corr_with_price >= 0.5].index)

# Pair-plot the data of those columns. The previous version passed the correlation
# matrix itself to pairplot, so it plotted correlation coefficients, not houses.
sns.pairplot(train_data_numberic[highcor_columns])

In [None]:
sns.set()
# Two figures of five panels each: high-correlation columns 0-4 get a red
# regression line, columns 5-9 a green one.
for offset, line_color in ((0, 'r'), (5, 'g')):
    fig, axes = plt.subplots(1, 5, figsize=(25, 5))
    fig.suptitle('The highcor_columns')
    for k, ax in enumerate(axes):
        col = highcor_columns[offset + k]
        sns.scatterplot(x = train_data[col], y = train_data['SalePrice'], ax = ax)
        sns.regplot(x = train_data[col], y = train_data['SalePrice'], ax = ax, color = line_color)
        ax.set_title(f'{col}')
        ax.set_xlabel(f'{col}')
        ax.set_ylabel('SalePrice')

## 3.3 Object feature columns analysis

In [None]:
# Temporarily attach the target so every categorical column can be box-plotted
# against it. MSSubClass is added as strings so it is treated as a category.
train_data_object['SalePrice'] = train_data_numberic['SalePrice']
train_data_object['MSSubClass'] = train_data_numberic['MSSubClass'].map(str)

fig, axes = plt.subplots(9, 5, figsize=(30, 80))
fig.suptitle('The object feature')
# Walk the 9 x 5 grid row by row; panel k shows column k of train_data_object.
for k, ax in enumerate(axes.flat):
    col = train_data_object.columns[k]
    sns.boxplot(x = train_data_object[col], y = train_data_object['SalePrice'], ax = ax)

# Remove the target again; the MSSubClass string column stays in the frame.
train_data_object = train_data_object.drop('SalePrice', axis = 1)

# **4. Build combined DataFrames for train.csv and test.csv (important: both must end up with the same columns)**

In [None]:
# Drop the row identifier. errors='ignore' keeps the cell re-runnable: on a second
# run 'Id' is already gone and a plain drop would raise KeyError.
train_data_numberic = train_data_numberic.drop(columns = ['Id'], errors = 'ignore')
# Combined floor area of the first and second floors.
train_data_numberic['TotalFlrSF'] = train_data_numberic['1stFlrSF'] + train_data_numberic['2ndFlrSF']
# String copy of MSSubClass so get_dummies one-hot encodes it with the other
# object columns; the numeric MSSubClass column is kept as well.
train_data_object['MSSubClass'] = train_data_numberic['MSSubClass'].apply(str)
train_data_object_dum = pd.get_dummies(train_data_object)
# Final training frame: one-hot columns first, then the numeric columns.
train_data = pd.concat([train_data_object_dum, train_data_numberic], axis = 1)

## 4-1. Test_data

In [None]:
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Same dtype partition as for the training set: 'object' columns vs. everything else.
df_object = [col for col in test_data.columns if test_data[col].dtype == 'object']
df_numberic = [col for col in test_data.columns if test_data[col].dtype != 'object']

# Missing categorical values become the label 'NO'; missing numbers become 0.
test_data_object = test_data[df_object].fillna('NO')
test_data_numberic = test_data[df_numberic].fillna(0)
test_data = pd.concat([test_data_object, test_data_numberic], axis = 1)

# Mirror the training-set feature engineering: MSSubClass as a string category,
# no Id column, and the combined floor-area feature.
test_data_object['MSSubClass'] = test_data_numberic['MSSubClass'].map(str)
test_data_numberic = test_data_numberic.drop('Id', axis = 1)
test_data_numberic['TotalFlrSF'] = test_data_numberic['1stFlrSF'] + test_data_numberic['2ndFlrSF']

# One-hot encode the categorical columns and put them in front of the numeric ones.
test_data_object_dum = pd.get_dummies(test_data_object)
test_data = pd.concat([test_data_object_dum, test_data_numberic], axis = 1)

## 4-2. Combine the train and test columns

In [None]:
# Every column name from the training frame followed by every one from the test frame.
train_columns = list(train_data.columns) + list(test_data.columns)

# De-duplicate while keeping first-seen order. A set was used here before, which
# made the column order arbitrary from run to run and is not accepted as
# `columns=` by recent pandas DataFrame constructors; an ordered list avoids both.
combine_columns = list(dict.fromkeys(train_columns))

In [None]:
# Re-project the training frame onto the combined column list; columns that only
# exist in the test set come back as NaN and are filled with 0.
# list(...) is used because the DataFrame constructor needs an ordered sequence
# for `columns` (recent pandas rejects a set); it is a no-op copy for a list.
df_train = pd.DataFrame(train_data, columns = list(combine_columns))
df_train = df_train.fillna(0)

In [None]:
# Re-project the test frame onto the combined column list; columns that only
# exist in the training set come back as NaN and are filled with 0.
# list(...) is used because the DataFrame constructor needs an ordered sequence
# for `columns` (recent pandas rejects a set); it is a no-op copy for a list.
df_test = pd.DataFrame(test_data, columns = list(combine_columns))
# SalePrice is the target and does not belong in the test features.
df_test = df_test.fillna(0).drop(columns = ['SalePrice'])

In [None]:
# Preview the first rows of the column-aligned training frame.
df_train.head()

In [None]:
# Preview the first rows of the column-aligned test frame.
df_test.head()

# **5. Data Preprocessing**

## 5-1. Split the train_data as x_train , x_test, y_train, y_test 

### -> The target is log-transformed: target = np.log(df_train['SalePrice'])

In [None]:
from sklearn.model_selection import train_test_split

# The target is modelled on a log scale, so predictions have to go through
# np.exp to be read as prices. The features are every column except the target.
target = np.log(df_train['SalePrice'])
data = df_train.drop(columns = 'SalePrice')

# 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size = 0.8,
    random_state = 5,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

# Despite its name, `Normalize` is a StandardScaler. It is fitted on the training
# split only and then applied to both splits.
Normalize = StandardScaler()
Normalize.fit(x_train)
x_train = Normalize.transform(x_train)
x_test = Normalize.transform(x_test)

# **6. Six Machine Learning Models**

## 6-1. DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# random_state pins the random feature sub-sampling done at each split
# (max_features = 250), so the fitted tree is the same on every run.
DTR = DecisionTreeRegressor(max_depth = 10, min_samples_leaf = 2, max_features = 250,
                            random_state = 5).fit(x_train, y_train)
y_pred_DTR = DTR.predict(x_test)

# The model works on log prices; np.exp converts both axes back to prices.
fig, ax = plt.subplots()
ax.scatter(np.exp(y_test), np.exp(y_pred_DTR))
# Red y = x reference line from 0 to 700000: perfect predictions fall on it.
ax.plot([100000*x for x in range(0, 8)], [100000*x for x in range(0, 8)], color = 'r')
ax.set_xlabel("Reality Prices")
ax.set_ylabel("Predicted prices")
ax.set_title('DecisionTreeRegressor')
# No plt.clf() after show(): in a notebook it only emits an extra blank figure.
plt.show()

## 6-2. RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# random_state pins the forest's bootstrap sampling so results are reproducible.
RFR = RandomForestRegressor(max_depth = 3, n_estimators = 1500, random_state = 5).fit(x_train, y_train)
y_pred_RFR = RFR.predict(x_test)

# The model works on log prices; np.exp converts both axes back to prices.
fig, ax = plt.subplots()
ax.scatter(np.exp(y_test), np.exp(y_pred_RFR))
# Red y = x reference line from 0 to 700000: perfect predictions fall on it.
ax.plot([100000*x for x in range(0, 8)], [100000*x for x in range(0, 8)], color = 'r')
ax.set_xlabel("Reality Prices")
ax.set_ylabel("Predicted prices")
# Title typo fixed (was 'RandomFroestRegressor').
ax.set_title('RandomForestRegressor')
# No plt.clf() after show(): in a notebook it only emits an extra blank figure.
plt.show()

## 6-3. GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Stochastic gradient boosting: each tree is fitted on 20% of the rows
# (subsample = 0.2); random_state makes that sampling repeatable.
GBR = GradientBoostingRegressor(learning_rate=0.015, max_depth= 5,
                                min_samples_leaf=1,n_estimators=300, random_state=2,subsample = 0.2).fit(x_train, y_train)
y_pred_GBR = GBR.predict(x_test)

# The model works on log prices; np.exp converts both axes back to prices.
fig, ax = plt.subplots()
ax.scatter(np.exp(y_test), np.exp(y_pred_GBR))
# Red y = x reference line from 0 to 700000: perfect predictions fall on it.
ax.plot([100000*x for x in range(0, 8)], [100000*x for x in range(0, 8)], color = 'r')
ax.set_xlabel("Reality Prices")
ax.set_ylabel("Predicted prices")
ax.set_title('GradientBoostingRegressor')
# No plt.clf() after show(): in a notebook it only emits an extra blank figure.
plt.show()

## 6-4. SVR-rbf 

In [None]:
from sklearn.svm import SVR
# NOTE(review): per the scikit-learn SVR documentation, `degree` applies to the
# 'poly' kernel and `coef0` to 'poly'/'sigmoid' only, so with kernel = 'rbf' they
# should have no effect here. They are kept so the call stays unchanged.
svr = SVR(kernel = 'rbf', gamma = 'auto', C = 0.7, degree = 3, epsilon=0.05, coef0=20).fit(x_train, y_train)
y_pred_svr = svr.predict(x_test)

# The model works on log prices; np.exp converts both axes back to prices.
fig, ax = plt.subplots()
ax.scatter(np.exp(y_test), np.exp(y_pred_svr))
# Red y = x reference line from 0 to 700000: perfect predictions fall on it.
ax.plot([100000*x for x in range(0, 8)], [100000*x for x in range(0, 8)], color = 'r')
ax.set_xlabel("Reality Prices")
ax.set_ylabel("Predicted prices")
ax.set_title('SVM')
# No plt.clf() after show(): in a notebook it only emits an extra blank figure.
plt.show()


## 6-5. KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor
sns.set()
# 5 nearest neighbours, with closer neighbours weighted more (weights = 'distance').
KNN = KNeighborsRegressor(n_neighbors = 5, weights = 'distance').fit(x_train, y_train)
y_pred_KNN = KNN.predict(x_test)

# The model works on log prices; np.exp converts both axes back to prices.
fig, ax = plt.subplots()
ax.scatter(np.exp(y_test), np.exp(y_pred_KNN))
# Red y = x reference line from 0 to 700000: perfect predictions fall on it.
ax.plot([100000*x for x in range(0, 8)], [100000*x for x in range(0, 8)], color = 'r')
ax.set_xlabel("Reality")
ax.set_ylabel("Predicted")
ax.set_title('KNeighborsRegressor')
# No plt.clf() after show(): in a notebook it only emits an extra blank figure.
plt.show()

## 6-6. Deep Learning

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Fully connected regressor: 512 -> 256 -> 64 -> 8 -> 1. The last layer has no
# activation, so the network outputs the log price directly.
model = Sequential()
model.add(Dense(512, input_shape = (x_train.shape[1], ), activation = 'sigmoid'))
model.add(Dense(256, activation = 'sigmoid'))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(8, activation = 'sigmoid'))
model.add(Dense(1))
# `metrics` must be a list (or tuple/dict); a bare string is rejected by Keras 3.
model.compile(loss = 'mse', optimizer = 'adam', metrics = ['mse'])
# 20% of the training split is held out for validation; verbose = 0 silences the
# per-epoch log for the 300 epochs.
history = model.fit(x_train, y_train, batch_size = 16, epochs = 300 , validation_split= 0.2, verbose = 0)

In [None]:
import seaborn as sns
sns.set()
# history.history holds the per-epoch values recorded by fit; the frame's index
# is therefore the epoch number.
df_history = pd.DataFrame(history.history)
# Training loss per epoch.
sns.lineplot(x = df_history.index, y = df_history['loss'])

In [None]:
sns.set()
y_pred_DL = model.predict(x_test)

# The network predicts log prices; np.exp converts both axes back to prices.
fig, ax = plt.subplots()
ax.scatter(np.exp(y_test), np.exp(y_pred_DL))
# Red y = x reference line from 0 to 700000: perfect predictions fall on it.
ax.plot([100000*x for x in range(0, 8)], [100000*x for x in range(0, 8)], color = 'r')
ax.set_xlabel("Reality")
ax.set_ylabel("Predicted")
ax.set_title('Deep Learning')
# No plt.clf() after show(): in a notebook it only emits an extra blank figure.
plt.show()

In [None]:
from numpy.ma.core import shape
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
def model_fit(x_train, x_test, y_train, y_test):
  """Fit the six regressors compared in this notebook on the training split.

  Parameters
  ----------
  x_train : feature matrix used to fit every model.
  x_test : unused; kept so existing calls keep working.
  y_train : training target (the notebook passes log SalePrice).
  y_test : unused; kept so existing calls keep working.

  Returns
  -------
  tuple
      (RFR, DTR, svr, GBR, KNN, model): the fitted RandomForestRegressor,
      DecisionTreeRegressor, SVR, GradientBoostingRegressor,
      KNeighborsRegressor and Keras Sequential model, in that order.
  """
  from sklearn.ensemble import RandomForestRegressor
  RFR = RandomForestRegressor(max_depth = 3, n_estimators = 1500).fit(x_train, y_train)

  from sklearn.tree import DecisionTreeRegressor
  DTR = DecisionTreeRegressor(max_depth = 10, min_samples_leaf = 2, max_features = 250).fit(x_train, y_train)

  from sklearn.svm import SVR
  svr = SVR(kernel = 'rbf', gamma = 'auto', C = 0.7, degree=4, epsilon=0.002, coef0=20).fit(x_train, y_train)

  from sklearn.ensemble import GradientBoostingRegressor
  GBR = GradientBoostingRegressor(learning_rate=0.015, max_depth=3,
                                min_samples_leaf=1,n_estimators=1500, random_state=2,subsample = 0.2).fit(x_train, y_train)

  # Imported locally like the other estimators, so the function does not depend
  # on another cell having imported KNeighborsRegressor first.
  from sklearn.neighbors import KNeighborsRegressor
  KNN = KNeighborsRegressor(n_neighbors = 5, weights = 'distance').fit(x_train, y_train)

  from tensorflow.keras.models import Sequential
  from tensorflow.keras.layers import Dense, Dropout
  model = Sequential()
  model.add(Dense(512, input_shape = (x_train.shape[1], ), activation = 'sigmoid'))
  model.add(Dense(256, activation = 'sigmoid'))
  model.add(Dense(64, activation = 'relu'))
  model.add(Dense(8, activation = 'sigmoid'))
  model.add(Dense(1))
  # `metrics` must be a list (or tuple/dict); a bare string is rejected by Keras 3.
  model.compile(loss = 'mse', optimizer = 'adam', metrics = ['mse'])
  # The training history is not returned, so it is not kept in a local.
  model.fit(x_train, y_train, batch_size = 16, epochs = 300 , validation_split= 0.2, verbose = 0)

  return RFR, DTR, svr, GBR, KNN, model


In [None]:
# Fit all six models. Model is the tuple returned by model_fit, in the order
# (RFR, DTR, svr, GBR, KNN, Keras model).
Model = model_fit(x_train, x_test, y_train, y_test)

# **7. Model Comparison**

## 7-1. Actual vs. predicted prices

In [None]:
# Display names, in the same order as the tuple returned by model_fit.
ML_model = ['RandomForestRegressor', 'DecisionTreeRegressor', 'SVR-rbf', 'GradientBoostingRegressor','KNeighborsRegressor', 'DeepLearning']
# Pair each name with its fitted model instead of indexing with a hard-coded range(6).
for model_name, fitted_model in zip(ML_model, Model):
  # The models predict log prices; np.exp converts both axes back to prices.
  fig, ax = plt.subplots()
  ax.scatter(np.exp(y_test), np.exp(fitted_model.predict(x_test)))
  # Red y = x reference line from 0 to 700000: perfect predictions fall on it.
  ax.plot([100000*x for x in range(0, 8)], [100000*x for x in range(0, 8)], color = 'r')
  ax.set_xlabel("Reality Prices")
  ax.set_ylabel("Predicted prices")
  ax.set_title(model_name)
  # No plt.clf() after show(): in a notebook it only emits an extra blank figure.
  plt.show()

## 7-2. Score comparison

In [None]:
sns.set()
from sklearn.metrics import r2_score

# R^2 of every fitted model on the hold-out split (targets are log prices).
R_square_num = [r2_score(y_test, fitted_model.predict(x_test)) for fitted_model in Model]

# Horizontal bar chart: one bar per model.
plt.figure(figsize = (10, 10))
plt.xlabel('R Square Score')
plt.ylabel('Model Type')
plt.title('The R Square Score Comparsion')
sns.barplot(x = R_square_num, y = ML_model)

## 7-3. Mean squared error comparison

In [None]:
sns.set()
from sklearn.metrics import mean_squared_error

# Mean squared error of every fitted model on the hold-out split (log-price scale).
mse_num = [mean_squared_error(y_test, fitted_model.predict(x_test)) for fitted_model in Model]

# Horizontal bar chart: one bar per model.
plt.figure(figsize = (10, 10))
plt.xlabel('mean_square_error')
plt.ylabel('Model Type')
plt.title('The mean_square_error Comparsion')
sns.barplot(x = mse_num, y = ML_model)

# 8. Predict on df_test -> use Model[3] (GradientBoostingRegressor)

In [None]:
# Hold-out score of Model[3], the GradientBoostingRegressor (fourth item returned by model_fit).
Model[3].score(x_test, y_test)

In [None]:
# Scale the test features with the scaler fitted on the training split. The result
# goes into a new variable instead of overwriting df_test, so re-running this cell
# cannot scale the data a second time.
x_submission = Normalize.transform(df_test)
# The model predicts log prices; np.exp converts them back to prices.
prediction_price = np.exp(Model[3].predict(x_submission))

In [None]:
# Start from the sample submission file and overwrite its SalePrice column with
# the model's predictions.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same
# order as test.csv; confirm before relying on it.
submi = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv'
).assign(SalePrice = prediction_price)
submi.to_csv('submission.csv', index=False)

# 9. Show the distribution of the predictions

In [None]:
sns.set()
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('The SalePrice analysis')

# sns.distplot is deprecated; histplot with stat='density' and kde=True draws the
# same normalised histogram with a KDE curve on top.
sns.histplot(submi['SalePrice'], ax = axes[0], stat = 'density', kde = True)
axes[0].set_title('Distribution')
axes[0].set_xlabel('SalePrice')
# The histogram is normalised, so the y-axis is a density, not a count.
axes[0].set_ylabel('Density')

sns.boxplot(x = submi['SalePrice'], ax = axes[1])
axes[1].set_title('Boxplot')
axes[1].set_xlabel('SalePrice')

# Predicted price against row position in the submission.
sns.scatterplot(y = submi['SalePrice'], x = submi.index, ax = axes[2])
axes[2].set_title('Scatter')
axes[2].set_ylabel('SalePrice')

plt.show()

In [None]:
# Display the submission frame that was written to submission.csv.
submi