<a href="https://colab.research.google.com/github/rhdtka21/DeepLearningProject/blob/master/Final_%EC%84%9D%EC%A0%95%EC%9A%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Load the training and test tables from CSV files in the working directory
# (training file first, then the test file).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Preview the first rows of the training table.
train.head()

In [0]:
# Use the 'Id' column as the row index of both tables.
train = train.set_index('Id')
test = test.set_index('Id')
# Record how many rows each table has.
len_train, len_test = len(train), len(test)

In [0]:
# Pairwise correlations between the numeric columns of the training table.
# The columns are filtered to numeric dtypes first: since pandas 2.0,
# DataFrame.corr() no longer silently drops non-numeric columns and raises
# when it meets string data.
corrmat = train.select_dtypes(include='number').corr()
# Keep the features whose absolute correlation with SalePrice is at least 0.2
# (SalePrice itself is included, since it correlates perfectly with itself).
top_corr_features = corrmat.index[corrmat["SalePrice"].abs() >= 0.2]
top_corr_features

In [0]:
# Annotated heatmap of the pairwise correlations among the selected features.
fig, ax = plt.subplots(figsize=(13, 10))
selected_corr = train[top_corr_features].corr()
g = sns.heatmap(selected_corr, annot=True, cmap="RdYlGn", ax=ax)

In [0]:
# Detach the target: pop() removes the SalePrice column from `train`
# and returns it, so the labels are kept separately from the features.
train_y_label = train.pop('SalePrice')

In [0]:
# Stack the training rows on top of the test rows in a single frame.
house = pd.concat([train, test])
# Remember the combined row index.
house_index = house.index
print('Length of House Dataset : ',len(house))
house.head()

In [0]:
# Fraction of missing values in each column of the combined frame.
missing_counts = house.isnull().sum()
check_null = missing_counts.div(len(house))
# Show the columns in which at least half of the values are missing.
check_null.loc[check_null >= 0.5]

In [0]:
# Names of the columns in which at least half of the values are missing.
remove_cols = check_null.index[check_null >= 0.5]
# Drop those columns from the combined frame.
house = house.drop(columns=remove_cols)
house.head()

In [0]:
# Partition the columns by dtype: object-typed columns on one side,
# every other dtype on the other.
house_obj = house.select_dtypes(include=['object'])
house_num = house.select_dtypes(exclude=['object'])

# Report which columns landed in each group.
print('Object type columns:\n',house_obj.columns)
print('--------------------------------')
print('Numeric type columns:\n',house_num.columns)

In [0]:
# One-hot encode the object columns; drop_first=True omits the first level
# of each column so the indicator columns are not redundant.
house_dummy = pd.get_dummies(data=house_obj, drop_first=True)
# Re-attach the combined row index to the encoded frame.
house_dummy.index = house_index
house_dummy.head()

In [0]:
from sklearn.impute import SimpleImputer

# Fill missing numeric values with the per-column mean.
# NOTE(review): the means are computed over the combined train + test rows;
# confirm that letting test rows influence the imputation is intended.
imputer = SimpleImputer(strategy='mean')
house_num_ = imputer.fit_transform(house_num)
# The imputer returns a bare array, so rebuild a frame with the original
# column names and row index.
house_num = pd.DataFrame(house_num_, index=house_index, columns=house_num.columns)

house_num.head()

In [0]:
# Join the one-hot encoded columns with the imputed numeric columns,
# matching rows on the shared index.
house = house_dummy.merge(house_num, left_index=True, right_index=True)

house.head()

In [0]:
# Split the preprocessed frame back by position: the first len_train rows
# are the training rows, the rest are the test rows.
# .copy() makes each part an independent frame, so adding the SalePrice
# column below does not assign into a slice of `house` (which triggers
# pandas' SettingWithCopyWarning).
train = house.iloc[:len_train].copy()
test = house.iloc[len_train:].copy()
# Re-attach the target to the training rows (aligned on the row index).
train['SalePrice'] = train_y_label

print('train set length: ',len(train))
print('test set length: ',len(test))

In [0]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the SalePrice target.
features = train.drop(columns=['SalePrice'])
target = train['SalePrice']
# Hold out 20% of the training rows for validation. The split is shuffled,
# so random_state is fixed to make the split (and every score derived from
# it) reproducible across kernel restarts; 13 is the seed this notebook
# already uses for its logistic-regression model.
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, shuffle=True, random_state=13
)

# The test features are the preprocessed test rows, unchanged.
X_test = test
test_id_idx = test.index

print('X_train : ',len(X_train))
print('X_val : ',len(X_val))
print('X_test :',len(X_test))

In [0]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Hyper-parameter grid; every combination is evaluated by the search.
param = {
  'max_depth':[2,3,4],
  'n_estimators':list(range(550,700,50)),
  'colsample_bytree':[0.5,0.7,1],
  'colsample_bylevel':[0.5,0.7,1],
}

model = xgb.XGBRegressor()

# 5-fold cross-validated grid search scored by negative mean squared error,
# using all available cores.
# The former `iid=False` argument is omitted: scikit-learn deprecated it in
# 0.22 and removed it in 0.24, so passing it raises a TypeError there. Plain
# averaging of the fold scores (what iid=False requested) is the only
# behaviour in current releases.
grid_search = GridSearchCV(
    estimator = model, param_grid = param, cv=5,
    scoring='neg_mean_squared_error', n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Show the best parameter combination found by the search.
grid_search.best_params_

In [0]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): LogisticRegression is a classifier, so every distinct value
# of the SalePrice target is treated as a separate class and predictions can
# only be prices seen during training. Confirm this is the intended
# comparison model rather than a regressor.
#
# `multi_class='multinomial'` is no longer passed: the parameter is
# deprecated since scikit-learn 1.5, and the multinomial formulation is
# already what the lbfgs solver uses for multiclass targets by default.
log_reg = LogisticRegression(random_state=13, solver='lbfgs', C=1000.)

log_reg.fit(X_train, y_train)

In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the training and validation splits with the fitted grid search.
pred_train = grid_search.predict(X_train)
pred_val = grid_search.predict(X_val)

# Mean absolute error of those predictions on each split.
train_mae = mean_absolute_error(y_train, pred_train)
val_mae = mean_absolute_error(y_val, pred_val)
print('train mae score: ', train_mae)
print('val mae score:', val_mae)

In [0]:
# Overlay the actual validation prices and the current `pred_val`
# predictions, both plotted against row position.
fig, ax = plt.subplots(figsize=(17, 7))

ax.plot(range(len(y_val)), y_val, 'o-', label='Validation Actual')
ax.plot(range(len(pred_val)), pred_val, '-', label='Validation Predict')

ax.set_title('Prediction of House Prices')
ax.set_ylabel('Prices')

ax.legend()

In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on the training and validation splits with the fitted
# logistic-regression model (this overwrites pred_train / pred_val).
pred_train = log_reg.predict(X_train)
pred_val = log_reg.predict(X_val)

# Mean absolute error of those predictions on each split.
train_mae = mean_absolute_error(y_train, pred_train)
val_mae = mean_absolute_error(y_val, pred_val)
print('train mae score: ', train_mae)
print('val mae score:', val_mae)

In [0]:
# Overlay the actual validation prices and the current `pred_val`
# predictions, both plotted against row position.
fig, ax = plt.subplots(figsize=(17, 7))

ax.plot(range(len(y_val)), y_val, 'o-', label='Validation Actual')
ax.plot(range(len(pred_val)), pred_val, '-', label='Validation Predict')

ax.set_title('Prediction of House Prices')

ax.set_ylabel('Prices')
ax.legend()