In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm;
import sklearn
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
train =  pd.read_csv('data/EDA/train_EDA_0910.csv')
test =  pd.read_csv('data/EDA/test_EDA_0910.csv')
feat_class = pd.read_csv('data/EDA/feat_class_0910.csv')

In [3]:
feature_raw = feat_class['raw'].dropna().values.tolist()
feature_dis = feat_class['dis'].dropna().values.tolist()
feature_map = feat_class['map'].dropna().values.tolist()
feature_onehot = feat_class['onehot'].dropna().values.tolist()
feature_extra = feat_class['extra'].dropna().values.tolist()
feature_del = feat_class['del'].dropna().values.tolist()

# delete

In [4]:
feature_del

['Id',
 'LotFrontage',
 'Utilities',
 'Condition2',
 'RoofMatl',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'Heating',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'MiscFeature',
 'PoolArea',
 'PoolQC',
 'MiscVal']

In [5]:
train = train.drop(columns = feature_del)
test = test.drop(columns = feature_del)

# Extra

In [10]:
feature_extra

['Neighborhood',
 'YearBuilt',
 'YearRemodAdd',
 'Exterior1st',
 'Exterior2nd',
 'GarageYrBlt',
 'MoSold',
 'YrSold']

In [82]:
train.groupby('Neighborhood').mean().sort_values(by='SalePrice')['SalePrice']

Neighborhood
MeadowV     98576.470588
IDOTRR     100123.783784
BrDale     104493.750000
BrkSide    124834.051724
Edwards    128219.700000
OldTown    128225.300885
Sawyer     136793.135135
Blueste    137500.000000
SWISU      142591.360000
NPkVill    142694.444444
NAmes      145847.080000
Mitchel    156270.122449
SawyerW    186555.796610
NWAmes     189050.068493
Gilbert    192854.506329
Blmngtn    194870.882353
CollgCr    197965.773333
Crawfor    210624.725490
ClearCr    212565.428571
Somerst    225379.837209
Veenker    238772.727273
Timber     242247.447368
StoneBr    310499.000000
NridgHt    316270.623377
NoRidge    335295.317073
Name: SalePrice, dtype: float64

In [83]:
map_Nei={'MeadowV' :0,
'IDOTRR'  :1,
'BrDale'  :2,
'BrkSide' :3,
'Edwards' :4,
'OldTown' :5,
'Sawyer'  :6,
'Blueste' :7,
'SWISU'   :8,
'NPkVill' :9,
'NAmes'   :10,
'Mitchel' :11,
'SawyerW' :12,
'NWAmes'  :13,
'Gilbert' :14,
'Blmngtn' :15,
'CollgCr' :16,
'Crawfor' :17,
'ClearCr' :18,
'Somerst' :19,
'Veenker' :20,
'Timber'  :21,
'StoneBr' :22,
'NridgHt' :23,
'NoRidge' :24}

In [84]:
train['Neighborhood']=train['Neighborhood'].map(map_Nei)
test['Neighborhood']=test['Neighborhood'].map(map_Nei)

In [31]:
train['YearOld'] = (train['YearBuilt'] + train['YearRemodAdd'] )/2
train['YearOld'] -= train['YrSold']
test['YearOld'] = (test['YearBuilt'] + test['YearRemodAdd'] )/2
test['YearOld'] -= test['YrSold']

In [32]:
#Extra에서 따로 계산하고 지워줘야 할 컬럼 추가
train = train.drop(columns = ['YearBuilt', 'YearRemodAdd',
                    'Exterior1st', 'Exterior2nd', 'YrSold', 'GarageYrBlt'])
test = test.drop(columns = ['YearBuilt', 'YearRemodAdd',
                    'Exterior1st', 'Exterior2nd', 'YrSold', 'GarageYrBlt'])

In [33]:
feature_dis.extend(['Neighborhood', 'MoSold'])
feature_extra=[]
feature_raw.append('YearOld')

# onehot

In [53]:
feature_onehot

['MSSubClass',
 'MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'MasVnrType',
 'ExterCond',
 'Foundation',
 'Electrical',
 'Functional',
 'GarageType',
 'PavedDrive',
 'Fence',
 'SaleType',
 'SaleCondition']

In [54]:
from sklearn.preprocessing import LabelEncoder
LE1=LabelEncoder()
for i in feature_onehot:
    temp=LE1.fit_transform(train[i])
    train[i]=temp
    
LE2=LabelEncoder()
for i in feature_onehot:
    temp=LE2.fit_transform(test[i])
    test[i]=temp

train.column

# rename feature

In [61]:
# 모델링 할 때 컬럼명이 숫자로 시작해서 오류가 생겼음
train.rename(columns = {'3SsnPorch' : 'SsnPorch'}, inplace = True)
test.rename(columns = {'3SsnPorch' : 'SsnPorch'}, inplace = True)

feature_raw.remove('3SsnPorch')
feature_raw.append('SsnPorch')

In [63]:
train['SsnPorch'].value_counts()

0      1436
168       3
216       2
144       2
180       2
407       1
320       1
304       1
290       1
245       1
238       1
182       1
196       1
23        1
162       1
153       1
140       1
130       1
96        1
508       1
Name: SsnPorch, dtype: int64

# Scaling

In [74]:
MMS_list=[]
MMS_list.extend(feature_raw)
MMS_list.extend(feature_dis)
MMS_list.extend(feature_map)
MMS_list.extend(feature_onehot)

In [73]:
MMS_list

['LotArea',
 'MasVnrArea',
 'TotalBsmtSF',
 'GrLivArea',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'ScreenPorch',
 'YearOld',
 'SsnPorch',
 'OverallQual',
 'OverallCond',
 'BsmtFullBath',
 'BsmtHalfBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'Neighborhood',
 'MoSold',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'MSSubClass',
 'MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'MasVnrType',
 'ExterCond',
 'Foundation',
 'Electrical',
 'Functional',
 'GarageType',
 'PavedDrive',
 'Fence',
 'SaleType',
 'SaleCondition']

In [90]:
from sklearn.preprocessing import MinMaxScaler
MMS1=MinMaxScaler()
train[list_MMS]=MMS1.fit_transform(train[list_MMS])

MMS2=MinMaxScaler()
test[list_MMS]=MMS2.fit_transform(test[list_MMS])

In [93]:
train['SalePrice'] = np.log1p(train['SalePrice'])

# Save

In [96]:
train.to_csv('data/preprocess/train_0918.csv', index = False)
test.to_csv('data/preprocess/test_0918.csv', index = False)

In [97]:
feature_dict = {'raw':feature_raw, 'dis':feature_dis, 
               'map':feature_map, 'onehot':feature_onehot,
                'extra':feature_extra, 'del':feature_del
               }

#features = pd.DataFrame(feature_dict)
features = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in feature_dict.items() ]))
features.to_csv('data/EDA/feat_class_0918.csv', index = False)

  features = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in feature_dict.items() ]))
