# Kaggle House Price 데이터 실습
> ### <진행 순서>
> #### 1. Data Loading
> #### 2. EDA
> #### 3. Data Preprocessing
> #### 4. Model Fitting 
> #### 5. Evaluation



In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### 1. Data Loading


In [5]:
from google.colab import files
uploaded = files.upload() # 파일 업로드 기능 실행
for fn in uploaded.keys(): # 업로드된 파일 정보 출력
	print('User uploaded file "{name}" with length {length} bytes'.format(name=fn , length =len(uploaded[fn])))


ImportError: No module named 'google.colab'

In [None]:
housetrn = pd.read_csv('train.csv')

In [None]:
# 가져온 자료의 shape 확인 -> 81개의 칼럼과 1460개의 행이 보인다.
housetrn.shape

In [None]:
# 카테고리컬에 해당하는 변수 확인해보기 -> 더미변수화 필요
housetrn.select_dtypes(include=['object']).columns

In [None]:
# 칼럼 확인 많은 변수가 보인다. 
# let us check the columsn that are part of the input file
print (housetrn.columns)
# Because the number of columns are large, we can set to display all columns
pd.options.display.max_columns = 999
print (pd.options.display.max_columns)

In [None]:
# 데이터셋의 상단만을 가져와 확인해보자.
# 카테고리 및 연속형 칼럼들이 보인다.
# 불필요한 index , NaN등도 삭제할 필요성이 있겠다.
print (housetrn.head())

### 2. EDA



In [None]:
# Seaborn을 사용한 데이터 분포 시각화
# https://datascienceschool.net/view-notebook/4c2d5ff1caab4b21a708cc662137bc65/

### 다음은 모든 피쳐중 하나와 y와의 스캐터플롯을 각각 그려본 결과

In [None]:
# 설명변수와 예측변수간의 stripplot을 그려보자.
# Let us do a scatterplot for the variable selected as HIGH
sns.set_style("whitegrid")
sns.set_style('ticks')
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
sns.stripplot(x="Neighborhood", y="SalePrice", data=housetrn, jitter=True)
plt.xticks(rotation=45)
plt.show()

Neighborhood에 따라 y축의 Price 분포가 상이
집 값을 떨어뜨리는 혹은 높이는 변수가 존재함을 유추할 수 있다. 

In [None]:
housetrn['BldgType']

In [None]:
## OverallQual와 BldgType , price 간의 factorplot
# factorplot은 색상(hue)과 행(row) 등을 동시에 사용하여 3 개 이상의 카테고리 값에 의한 분포 변화를 보여준다.
sns.set_style('ticks')
sns.factorplot(x="OverallQual", y="SalePrice", col="BldgType", data=housetrn, kind="swarm", col_wrap=3)
fig.set_size_inches(5, 4)
plt.show()

In [None]:
## 년도와 price간의 barplot
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
sns.barplot(x="YearBuilt", y="SalePrice", data=housetrn)
plt.xticks(rotation=90)
sns.set_style('ticks')
plt.show()

In [None]:
## TotalBsmtSF와 price간의 regplot
## GrLivArea와 price간의 regplot
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
sns.regplot(x="TotalBsmtSF", y="SalePrice", data=housetrn)
sns.regplot(x="GrLivArea", y="SalePrice", data=housetrn)
plt.show()

In [None]:
## SaleCondition와 price 간의 factorplot
sns.factorplot(x="SaleCondition", y="SalePrice", col="Functional", data=housetrn, kind="swarm", col_wrap=3)
plt.show()

In [None]:
# 변수간의 correlation 확인
# Plotting the Pearson correlation of the different features
corr_matrix = housetrn.corr()
colormap = plt.cm.viridis
plt.figure(figsize=(12,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(corr_matrix,linewidths=0.1,vmax=0.8, square=True, cmap=colormap, linecolor='white')
plt.show()

- 노란색일수록 양의 상관 남색일수록 음의 상관이 강하다.
- 몇몇 변수간의 뚜렷한 양/음의 상관이 보인다.


### 3. Data preprocessing


In [None]:
# 의미없다고 판단되는 변수는 빼고 분석하자. 
#newhousetrn = housetrn
newhousetrn = housetrn[['Id', 'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'OverallQual', 'TotalBsmtSF', '1stFlrSF',  'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars','GarageArea', 'Neighborhood', 'YearBuilt', 'YearRemodAdd','Functional', 'SalePrice']]


In [None]:
print (newhousetrn.head())

In [None]:
# missing values 확인 
total = newhousetrn.isnull().sum().sort_values(ascending=False)
percent = (newhousetrn.isnull().sum()/newhousetrn.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print (missing_data)

LotFrontage, GarageYrBlt, MasVnrArea 세 변수에 대해 missing values처리가 필요!

In [None]:
# LotFrontage 변수 미싱밸류 처리
# mean +- std 를 랜덤으로 집어넣기
lot_av = newhousetrn.LotFrontage.mean()
lot_sd = newhousetrn.LotFrontage.std()
tot_mislot = newhousetrn.LotFrontage.isnull().sum()
rand_lot= np.random.randint(lot_av - lot_sd, lot_av + lot_sd, size=tot_mislot)
newhousetrn['LotFrontage'][np.isnan(newhousetrn['LotFrontage'])] = rand_lot
newhousetrn['LotFrontage'] = newhousetrn['LotFrontage'].astype(int)
newhousetrn.LotFrontage.hist()
plt.show()

In [None]:
# MasVnrArea에 대해 같은 작업 반복
mva_av = newhousetrn.MasVnrArea.mean()
mva_sd = newhousetrn.MasVnrArea.std()
tot_mismva = newhousetrn.MasVnrArea.isnull().sum()
rand_mva= np.random.randint(mva_av - mva_sd, mva_av + mva_sd, size=tot_mismva)
newhousetrn['MasVnrArea'][np.isnan(newhousetrn['MasVnrArea'])] = rand_mva
newhousetrn['MasVnrArea'] = newhousetrn['MasVnrArea'].astype(int)
newhousetrn.MasVnrArea.hist()
plt.show()

In [None]:
# GarageYrBlt에도 같은 작업 반복 
gyr_av = newhousetrn.GarageYrBlt.mean()
gyr_sd = newhousetrn.GarageYrBlt.std()
tot_misgyr = newhousetrn.GarageYrBlt.isnull().sum()
rand_gyr= np.random.randint(gyr_av - gyr_sd, gyr_av + gyr_sd, size=tot_misgyr)
newhousetrn['GarageYrBlt'][np.isnan(newhousetrn['GarageYrBlt'])] = rand_gyr
newhousetrn['GarageYrBlt'] = newhousetrn['GarageYrBlt'].astype(int)
newhousetrn.GarageYrBlt.hist()
plt.show()

In [None]:
# missing values처리 확인
total = newhousetrn.isnull().sum().sort_values(ascending=False)
percent = (newhousetrn.isnull().sum()/newhousetrn.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print (missing_data)

In [None]:
# 데이터타입 중 카테고리형(object)을 바꿀 필요성이 있다. 
newhousetrn.dtypes

In [None]:
# Encoding the Categorical variables
cols_to_transform = newhousetrn[['Id', 'Neighborhood', 'Functional']]
newcols = pd.get_dummies(cols_to_transform)

In [None]:
newcols.head()

In [None]:
# 기존 칼럼 제거
del newhousetrn['Neighborhood'] # 또는 newhousetrn = newhousetrn.drop(labels=["Neighborhood"], axis=1)

del newhousetrn['Functional']

In [None]:
# 더미변수화 한 열을 join
fhoustrn = newhousetrn.merge(newcols, how='inner', on='Id' )
fhoustrn.head()

In [None]:
fhoustrn.shape

### train_test_split

In [None]:
from sklearn.model_selection import train_test_split
y = housetrn.SalePrice
#del fhoustrn['SalePrice']
X = fhoustrn[:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)

### Normalize

In [None]:
from sklearn import preprocessing
std_scale = preprocessing.StandardScaler().fit(y_train.reshape(-1,1))
y_train = std_scale.transform(y_train.reshape(-1,1))

In [None]:
# Scaler이용 
# Standardizing for the unscaled 15 variables in X_train
from sklearn import preprocessing
std_scale1 = preprocessing.StandardScaler().fit(X_train[['OverallQual']])
X_train[['OverallQual']] = std_scale1.transform(X_train[['OverallQual']])
std_scale2 = preprocessing.StandardScaler().fit(X_train[['TotalBsmtSF']])
X_train[['TotalBsmtSF']] = std_scale2.transform(X_train[['TotalBsmtSF']])
std_scale3 = preprocessing.StandardScaler().fit(X_train[['1stFlrSF']])
X_train[['1stFlrSF']] = std_scale3.transform(X_train[['1stFlrSF']])
std_scale4 = preprocessing.StandardScaler().fit(X_train[['GrLivArea']])
X_train[['GrLivArea']] = std_scale4.transform(X_train[['GrLivArea']])
std_scale5 = preprocessing.StandardScaler().fit(X_train[['FullBath']])
X_train[['FullBath']] = std_scale5.transform(X_train[['FullBath']])
std_scale6 = preprocessing.StandardScaler().fit(X_train[['GarageArea']])
X_train[['GarageArea']] = std_scale6.transform(X_train[['GarageArea']])
std_scale7 = preprocessing.StandardScaler().fit(X_train[['YearBuilt']])
X_train[['YearBuilt']] = std_scale7.transform(X_train[['YearBuilt']])
std_scale8 = preprocessing.StandardScaler().fit(X_train[['LotFrontage']])
X_train[['LotFrontage']] = std_scale8.transform(X_train[['LotFrontage']])
std_scale9 = preprocessing.StandardScaler().fit(X_train[['MasVnrArea']])
X_train[['MasVnrArea']] = std_scale9.transform(X_train[['MasVnrArea']])
std_scale10 = preprocessing.StandardScaler().fit(X_train[['BsmtFinSF1']])
X_train[['BsmtFinSF1']] = std_scale10.transform(X_train[['BsmtFinSF1']])
std_scale11 = preprocessing.StandardScaler().fit(X_train[['TotRmsAbvGrd']])
X_train[['TotRmsAbvGrd']] = std_scale11.transform(X_train[['TotRmsAbvGrd']])
std_scale12 = preprocessing.StandardScaler().fit(X_train[['Fireplaces']])
X_train[['TotRmsAbvGrd']] = std_scale12.transform(X_train[['Fireplaces']])
std_scale13 = preprocessing.StandardScaler().fit(X_train[['GarageYrBlt']])
X_train[['GarageYrBlt']] = std_scale13.transform(X_train[['GarageYrBlt']])
std_scale14 = preprocessing.StandardScaler().fit(X_train[['GarageCars']])
X_train[['GarageCars']] = std_scale14.transform(X_train[['GarageCars']])
std_scale15 = preprocessing.StandardScaler().fit(X_train[['YearRemodAdd']])
X_train[['YearRemodAdd']] = std_scale15.transform(X_train[['YearRemodAdd']])

In [None]:
X_train.head()

In [None]:
# Let us apply the same technique of rescaling for the Test data set too
std_scale0 = preprocessing.StandardScaler().fit(y_test.reshape(-1,1))
y_test = std_scale0.transform(y_test.reshape(-1,1))

std_scale16 = preprocessing.StandardScaler().fit(X_test[['OverallQual']])
X_test[['OverallQual']] = std_scale16.transform(X_test[['OverallQual']])
std_scale17 = preprocessing.StandardScaler().fit(X_test[['TotalBsmtSF']])
X_test[['TotalBsmtSF']] = std_scale17.transform(X_test[['TotalBsmtSF']])
std_scale18 = preprocessing.StandardScaler().fit(X_test[['1stFlrSF']])
X_test[['1stFlrSF']] = std_scale18.transform(X_test[['1stFlrSF']])
std_scale19 = preprocessing.StandardScaler().fit(X_test[['GrLivArea']])
X_test[['GrLivArea']] = std_scale19.transform(X_test[['GrLivArea']])
std_scale20 = preprocessing.StandardScaler().fit(X_test[['FullBath']])
X_test[['FullBath']] = std_scale20.transform(X_test[['FullBath']])
std_scale21 = preprocessing.StandardScaler().fit(X_test[['GarageArea']])
X_test[['GarageArea']] = std_scale21.transform(X_test[['GarageArea']])
std_scale22 = preprocessing.StandardScaler().fit(X_test[['YearBuilt']])
X_test[['YearBuilt']] = std_scale22.transform(X_test[['YearBuilt']])
std_scale23 = preprocessing.StandardScaler().fit(X_test[['LotFrontage']])
X_test[['LotFrontage']] = std_scale23.transform(X_test[['LotFrontage']])
std_scale24 = preprocessing.StandardScaler().fit(X_test[['MasVnrArea']])
X_test[['MasVnrArea']] = std_scale24.transform(X_test[['MasVnrArea']])
std_scale25 = preprocessing.StandardScaler().fit(X_test[['BsmtFinSF1']])
X_test[['BsmtFinSF1']] = std_scale25.transform(X_test[['BsmtFinSF1']])
std_scale26 = preprocessing.StandardScaler().fit(X_test[['TotRmsAbvGrd']])
X_test[['TotRmsAbvGrd']] = std_scale26.transform(X_test[['TotRmsAbvGrd']])
std_scale27 = preprocessing.StandardScaler().fit(X_test[['Fireplaces']])
X_test[['TotRmsAbvGrd']] = std_scale27.transform(X_test[['Fireplaces']])
std_scale28 = preprocessing.StandardScaler().fit(X_test[['GarageYrBlt']])
X_test[['GarageYrBlt']] = std_scale28.transform(X_test[['GarageYrBlt']])
std_scale29 = preprocessing.StandardScaler().fit(X_test[['GarageCars']])
X_test[['GarageCars']] = std_scale29.transform(X_test[['GarageCars']])
std_scale30 = preprocessing.StandardScaler().fit(X_test[['YearRemodAdd']])
X_test[['YearRemodAdd']] = std_scale30.transform(X_test[['YearRemodAdd']])

In [None]:
X_test.head()


### 4. Model Fitting 

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV

#### 1) 일반회귀

In [None]:
model1 = LinearRegression()
rst = model1.fit(X_train, y_train)
print(rst)

#### 2) 릿지 

In [None]:
# Performing Grid Search with specific alpha values for Ridge
#alphas = np.array([1,0.1,0.01,0.001,0.0001])
alphas = np.logspace(-6, 6, 100)
model2 = Ridge()
grid_Ridge = GridSearchCV(model2, cv = 10,  param_grid = dict(alpha=alphas), scoring = 'r2')
grid_Ridge.fit(X_train, y_train)
print(grid_Ridge)
# summarize the results of the grid search
print(grid_Ridge.best_score_)
print(grid_Ridge.best_estimator_.alpha)

#### 3) 라쏘 해보기(직접)

In [None]:
# Performing Grid Search with specific of alpha values for Lasso
alphas =



model3 =
grid_Lasso = GridSearchCV(estimator=, cv = , param_grid=)
grid_Lasso.fit(, )
print(grid_Lasso)

# summarize the results of the grid search
print(grid_Lasso.best_score_)
print(grid_Lasso.best_estimator_.alpha)

#### 4) Elastic net 

In [None]:
# Performing Grid Search specific alpha values for ElasticNet
#alphas = np.array([1,0.1,0.01,0.001,0.0001])
alphas = np.logspace(-6, 6, 100)

# param_grid = {'alpha': sp_rand(), 'l1_ratio': sp_rand()}
model4 = ElasticNet()
grid_ELN = GridSearchCV(estimator=model4,cv = 10, param_grid=dict(alpha=alphas))
# rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
grid_ELN.fit(X_train, y_train)
print(grid_ELN)
# summarize the results of the grid search
print(grid_ELN.best_score_)
print(grid_ELN.best_estimator_.alpha)

#### 5. Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error
#mean_squared_error(y_true, y_pred)


In [None]:
np.sqrt(mean_squared_error(y_test,grid_Ridge.predict(X_test)))

In [None]:
np.sqrt(mean_squared_error(y_test,grid_Lasso.predict(X_test)))

In [None]:
np.sqrt(mean_squared_error(y_test,grid_ELN.predict(X_test)))