In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm;
import sklearn
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the EDA-stage train/test tables and the feature-classification sheet
# (one column of feature names per preprocessing group).
train, test, feat_class = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/EDA/train_EDA_0910.csv',
        'data/EDA/test_EDA_0910.csv',
        'data/EDA/feat_class_0918.csv',
    )
)

In [3]:
# Turn each column of the classification sheet into a plain Python list of
# feature names, dropping the missing entries of that column.
(feature_raw, feature_dis, feature_map,
 feature_onehot, feature_extra, feature_del) = (
    feat_class[group].dropna().values.tolist()
    for group in ('raw', 'dis', 'map', 'onehot', 'extra', 'del')
)

# Delete features

In [4]:
# Show the columns listed in the 'del' group; the next cell drops them.
feature_del

['Id',
 'LotFrontage',
 'Utilities',
 'Condition2',
 'RoofMatl',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'Heating',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'MiscFeature',
 'PoolArea',
 'PoolQC',
 'MiscVal']

In [5]:
# Remove every column in the 'del' group from both data sets.
train, test = (frame.drop(columns=feature_del) for frame in (train, test))

# Extra

In [6]:
# Show the 'extra' group from the classification sheet.
feature_extra

[]

In [7]:
# Mean SalePrice per neighborhood, sorted ascending; the hand-written ordinal
# map in the following cell follows this ranking.
# The target column is selected *before* aggregating so that only SalePrice is
# averaged: calling .mean() on the whole grouped frame also tries to average the
# still-unencoded string columns, which raises TypeError on pandas >= 2.0.
train.groupby('Neighborhood')['SalePrice'].mean().sort_values()

Neighborhood
MeadowV     98576.470588
IDOTRR     100123.783784
BrDale     104493.750000
BrkSide    124834.051724
Edwards    128219.700000
OldTown    128225.300885
Sawyer     136793.135135
Blueste    137500.000000
SWISU      142591.360000
NPkVill    142694.444444
NAmes      145847.080000
Mitchel    156270.122449
SawyerW    186555.796610
NWAmes     189050.068493
Gilbert    192854.506329
Blmngtn    194870.882353
CollgCr    197965.773333
Crawfor    210624.725490
ClearCr    212565.428571
Somerst    225379.837209
Veenker    238772.727273
Timber     242247.447368
StoneBr    310499.000000
NridgHt    316270.623377
NoRidge    335295.317073
Name: SalePrice, dtype: float64

In [8]:
# Ordinal code for each neighborhood: names are listed from the lowest to the
# highest mean SalePrice observed in the training data, and each name is
# assigned its position in that ranking (0 = cheapest, 24 = most expensive).
map_Nei = {
    name: rank
    for rank, name in enumerate([
        'MeadowV', 'IDOTRR', 'BrDale', 'BrkSide', 'Edwards',
        'OldTown', 'Sawyer', 'Blueste', 'SWISU', 'NPkVill',
        'NAmes', 'Mitchel', 'SawyerW', 'NWAmes', 'Gilbert',
        'Blmngtn', 'CollgCr', 'Crawfor', 'ClearCr', 'Somerst',
        'Veenker', 'Timber', 'StoneBr', 'NridgHt', 'NoRidge',
    ])
}

In [9]:
# Replace neighborhood names with their ordinal rank in both data sets.
# Series.map yields NaN for any name that is not a key of map_Nei.
for frame in (train, test):
    frame['Neighborhood'] = frame['Neighborhood'].map(map_Nei)

In [10]:
# YearOld = mean(YearBuilt, YearRemodAdd) - YrSold, computed identically for
# train and test and appended as a new column.
for frame in (train, test):
    mean_year = (frame['YearBuilt'] + frame['YearRemodAdd']) / 2
    frame['YearOld'] = mean_year - frame['YrSold']

In [11]:
# Columns that the Extra step handles separately and that must then be removed
# from the feature set (same list for train and test).
extra_step_cols = ['YearBuilt', 'YearRemodAdd',
                   'Exterior1st', 'Exterior2nd', 'YrSold', 'GarageYrBlt']
train, test = (frame.drop(columns=extra_step_cols) for frame in (train, test))

# Label Encoder

In [12]:
# Show the 'onehot' group; these are the columns the label-encoding loop iterates over.
feature_onehot

['MSSubClass',
 'MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'MasVnrType',
 'ExterCond',
 'Foundation',
 'Electrical',
 'Functional',
 'GarageType',
 'PavedDrive',
 'Fence',
 'SaleType',
 'SaleCondition']

In [13]:
# MSSubClass is already an integer column and is min-max scaled directly, so it
# is taken out of the list of columns to label-encode.
# Guarded so that re-running this cell does not raise ValueError once the
# entry has already been removed.
if 'MSSubClass' in feature_onehot:
    feature_onehot.remove('MSSubClass')

In [14]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every column listed in feature_onehot, in train and test alike.
# The encoder is fitted on the labels of train and test combined. LabelEncoder
# numbers the sorted unique labels, so when test holds no label beyond those in
# train the codes are exactly what fitting on train alone would give; a label
# that occurs only in test no longer makes transform() raise ValueError.
LE = LabelEncoder()
for col in feature_onehot:
    LE.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = LE.transform(train[col])
    test[col] = LE.transform(test[col])

##  MSSubClass

In [17]:
# Inspect MSSubClass, the column that was excluded from label encoding.
train['MSSubClass']

0       60
1       20
2       60
3       70
4       60
        ..
1455    60
1456    20
1457    70
1458    20
1459    20
Name: MSSubClass, Length: 1460, dtype: int64

In [18]:
# MSSubClass goes straight to min-max scaling (no encoding step first);
# it seeds the list of columns to scale, which later cells extend.
MMS_list=['MSSubClass']

# Rename feature

In [19]:
# A column name that starts with a digit caused an error during modeling,
# so '3SsnPorch' is renamed in both data sets.
porch_rename = {'3SsnPorch' : 'SsnPorch'}
for frame in (train, test):
    frame.rename(columns = porch_rename, inplace = True)

In [20]:
# Check that the renamed column is accessible and look at its value distribution.
train['SsnPorch'].value_counts()

0      1436
168       3
216       2
144       2
180       2
407       1
320       1
304       1
290       1
245       1
238       1
182       1
196       1
23        1
162       1
153       1
140       1
130       1
96        1
508       1
Name: SsnPorch, dtype: int64

# Scaling

In [21]:
# Append the raw, discrete, mapped and label-encoded feature groups (in that
# order) to the list of columns to min-max scale.
# Only names not yet in the list are added, so re-running this cell leaves the
# list unchanged instead of appending every column a second time, which would
# make the scaling cell select duplicate columns.
for group in (feature_raw, feature_dis, feature_map, feature_onehot):
    for col in group:
        if col not in MMS_list:
            MMS_list.append(col)

In [22]:
# Show the complete list of columns that the next cell min-max scales.
MMS_list

['MSSubClass',
 'LotArea',
 'MasVnrArea',
 'TotalBsmtSF',
 'GrLivArea',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'ScreenPorch',
 'YearOld',
 'SsnPorch',
 'OverallQual',
 'OverallCond',
 'BsmtFullBath',
 'BsmtHalfBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'Neighborhood',
 'MoSold',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'MasVnrType',
 'ExterCond',
 'Foundation',
 'Electrical',
 'Functional',
 'GarageType',
 'PavedDrive',
 'Fence',
 'SaleType',
 'SaleCondition']

In [23]:
from sklearn.preprocessing import MinMaxScaler

# Learn per-column min/max from the training data only, then apply that same
# scaling to both train and test.
MMS = MinMaxScaler().fit(train[MMS_list])
train[MMS_list] = MMS.transform(train[MMS_list])
test[MMS_list] = MMS.transform(test[MMS_list])

In [24]:
# Replace the target with log(1 + SalePrice); np.expm1 is the inverse transform.
# NOTE: the column is overwritten in place, so running this cell a second time
# applies log1p again to the already-transformed values.
train['SalePrice'] = np.log1p(train['SalePrice'])

In [25]:
# Display the processed training frame (scaled features, log-transformed SalePrice).
train

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,SsnPorch,ScreenPorch,Fence,MoSold,SaleType,SaleCondition,SalePrice,YearOld
0,0.235294,0.75,0.033420,1.0,1.0,1.0,1.0,1.0,0.0,0.666667,...,0.111517,0.000000,0.0,0.0,0.00,0.090909,1.0,0.8,12.247699,0.940860
1,0.000000,0.75,0.038795,1.0,1.0,1.0,1.0,0.5,0.0,0.833333,...,0.000000,0.000000,0.0,0.0,0.00,0.363636,1.0,0.8,12.109016,0.661290
2,0.235294,0.75,0.046507,1.0,1.0,0.0,1.0,1.0,0.0,0.666667,...,0.076782,0.000000,0.0,0.0,0.00,0.727273,1.0,0.8,12.317171,0.924731
3,0.294118,0.75,0.038561,1.0,1.0,0.0,1.0,0.0,0.0,0.708333,...,0.063985,0.492754,0.0,0.0,0.00,0.090909,1.0,0.0,11.849405,0.311828
4,0.235294,0.75,0.060576,1.0,1.0,0.0,1.0,0.5,0.0,1.000000,...,0.153565,0.000000,0.0,0.0,0.00,1.000000,1.0,0.8,12.429220,0.908602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.235294,0.75,0.030929,1.0,1.0,1.0,1.0,1.0,0.0,0.583333,...,0.073126,0.000000,0.0,0.0,0.00,0.636364,1.0,0.8,12.072547,0.913978
1456,0.000000,0.75,0.055505,1.0,1.0,1.0,1.0,1.0,0.0,0.541667,...,0.000000,0.000000,0.0,0.0,0.75,0.090909,1.0,0.8,12.254868,0.704301
1457,0.294118,0.75,0.036187,1.0,1.0,1.0,1.0,1.0,0.0,0.708333,...,0.109689,0.000000,0.0,0.0,1.00,0.363636,1.0,0.8,12.493133,0.602151
1458,0.000000,0.75,0.039342,1.0,1.0,1.0,1.0,1.0,0.0,0.416667,...,0.000000,0.202899,0.0,0.0,0.00,0.272727,1.0,0.8,11.864469,0.596774


# Save

In [26]:
# Write the preprocessed train/test frames to disk without the index column.
for frame, out_path in (
    (train, 'data/preprocess/train_0926.csv'),
    (test, 'data/preprocess/test_0926.csv'),
):
    frame.to_csv(out_path, index = False)

# The End