In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
house_df=pd.read_csv("house_price.csv")
print(house_df.head(3))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD         Normal     208500  
1   2007        WD         Normal     181500  
2   2008        WD         Normal     223500  

[3 rows x 81 columns]


In [3]:
print(house_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
print("\n",house_df.shape)
print("\n",house_df.dtypes.value_counts())


 (1460, 81)

 object     43
int64      35
float64     3
dtype: int64


### numeric/object feature 분리

In [5]:
# object_feature = house_df.dtypes[house_df.dtypes == 'object'].index # 범주형
# numeric_feature = house_df.dtypes[house_df.dtypes != 'object'].index # 수치형

### corr

In [6]:
# saleprice_corr_feature  = house_df[numeric_feature].corr().nlargest(15, 'SalePrice').index
# # print(saleprice_corr_feature)
# print(house_df[numeric_feature].corr().nlargest(15, 'SalePrice'))

### 정규성 검정(왜도), 이상값(IQR), scaling, log+1 ...

In [7]:
object_feature = house_df.dtypes[house_df.dtypes == 'object'].index # 범주형
numeric_feature = house_df.dtypes[house_df.dtypes != 'object'].index # 수치형

In [8]:
# |왜도|>3 인 피쳐
from scipy.stats import skew
sk_features = house_df[numeric_feature].apply(lambda x: skew(x))
# sk_features = sk_features[ abs(sk_features)> 3]
print(sk_features)
# house_df[sk_features.index] = np.log1p(house_df[sk_features.index])
# SalePrice 로그 변환
original_SalePrice = house_df['SalePrice']
house_df['SalePrice'] = np.log1p(house_df['SalePrice'])

Id                0.000000
MSSubClass        1.406210
LotFrontage            NaN
LotArea          12.195142
OverallQual       0.216721
OverallCond       0.692355
YearBuilt        -0.612831
YearRemodAdd     -0.503044
MasVnrArea             NaN
BsmtFinSF1        1.683771
BsmtFinSF2        4.250888
BsmtUnfSF         0.919323
TotalBsmtSF       1.522688
1stFlrSF          1.375342
2ndFlrSF          0.812194
LowQualFinSF      9.002080
GrLivArea         1.365156
BsmtFullBath      0.595454
BsmtHalfBath      4.099186
FullBath          0.036524
HalfBath          0.675203
BedroomAbvGr      0.211572
KitchenAbvGr      4.483784
TotRmsAbvGrd      0.675646
Fireplaces        0.648898
GarageYrBlt            NaN
GarageCars       -0.342197
GarageArea        0.179796
WoodDeckSF        1.539792
OpenPorchSF       2.361912
EnclosedPorch     3.086696
3SsnPorch        10.293752
ScreenPorch       4.117977
PoolArea         14.813135
MiscVal          24.451640
MoSold            0.211835
YrSold            0.096170
S

In [9]:
print(house_df['SalePrice'].head(3))

0    12.247699
1    12.109016
2    12.317171
Name: SalePrice, dtype: float64


### isnull, drop, fillna

##### 수치형

In [10]:
null_series=house_df.isnull().sum()
print('\n', null_series[null_series>0].sort_values(ascending=False))
#  PoolQC          1453
# MiscFeature     1406
# Alley           1369
# Fence           1179
# FireplaceQu      690
null_col= ["Id",'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
house_df.drop(null_col, axis=1, inplace=True)


 PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64


In [11]:
house_df.fillna(house_df.mean(), inplace=True)

##### 범주형

In [12]:
null_column_count = house_df.isnull().sum()[house_df.isnull().sum() > 0]
print(house_df.dtypes[null_column_count.index])
#원핫 인코딩 --> 자동으로 결측 None

MasVnrType      object
BsmtQual        object
BsmtCond        object
BsmtExposure    object
BsmtFinType1    object
BsmtFinType2    object
Electrical      object
GarageType      object
GarageFinish    object
GarageQual      object
GarageCond      object
dtype: object


### encoding

In [13]:
house_df_ohe = pd.get_dummies(house_df)
house_df_ohe.shape

(1460, 271)

##### 정규성검정

#### 이상값

In [14]:
# house_df['SalePrice'].describe() 
# q1= house_df['SalePrice'].describe()[4]
# q3=house_df['SalePrice'].describe()[6]
# IQR=q3-q1
# outlier= house_df.SalePrice[(house_df.SalePrice>= q3+IQR*1.5)|(house_df.SalePrice <= q1-IQR*1.5)].values
# print(outlier)
#_____________________
# numeric_feature = house_df.dtypes[house_df.dtypes != 'object'].index # 수치형
# for col in numeric_feature:
#     q1, q3 = house_df[col].quantile([0.25, 0.75])
#     iqr = q3-q1
#     upper_bound = 1.5 * iqr
#     print(col, upper_bound)
#     house_df[col] = house_df[col].map(lambda x: upper_bound if x > upper_bound else x)
    
#______________________
# for i in range(len(numeric_features)):
#   print("----------------------------",numeric_features[i])
#   # print(np.percentile(house_df[numeric_features[i]],[25,75]))
#   # print(house_df[numeric_features[i]].min(), house_df[numeric_features[i]].max())
#   q1, q3 = np.percentile(house_df[numeric_features[i]], [25,75])
#   outlier = house_df[(house_df[numeric_features[i]]< q1)|(house_df[numeric_features[i]]>q3)]
#   if outlier.shape[0] < 20:
#     house_df.drop(outlier.index, axis=0, inplace= True)
#     print("drop")

#### 이상값

### train test split

In [15]:
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error

In [16]:
y_target = house_df_ohe['SalePrice']
X_features = house_df_ohe.drop('SalePrice',axis=1, inplace=False)

X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=610)

### def rmse

In [17]:
# def get_rmse(model):
#     pred = model.predict(X_test)
#     mse = mean_squared_error(y_test , pred)
#     rmse = np.sqrt(mse)
#     print('{0} 로그 변환된 RMSE: {1}'.format(model.__class__.__name__, np.round(rmse, 4)))
#     return rmse

# def get_rmses(models):
#     rmses = [ ]
#     for model in models:
#         rmse = get_rmse(model)
#         rmses.append(rmse)
#     return rmses

In [18]:
### 모델 학습 

In [19]:
# # LinearRegression, Ridge, Lasso 학습, 예측, 평가
# lr_reg = LinearRegression()
# lr_reg.fit(X_train, y_train)

# ridge_reg = Ridge()
# ridge_reg.fit(X_train, y_train)

# 
# lasso_reg.fit(X_train, y_train)

# models = [lr_reg, ridge_reg, lasso_reg]
# get_rmses(models)

lr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
xgb = XGBRegressor()

models = [lr, ridge, lasso, xgb]

In [20]:
for model in models:
    model.fit(X_train,y_train)
    pred= model.predict(X_test)
    mse = mean_squared_error(y_test,pred)
    rmse=np.sqrt(mse)
    print(model, np.round(rmse,3))

# pred= model.predict(test)
# pred = np.expm1(pred)

LinearRegression() 0.133
Ridge() 0.136
Lasso() 0.191
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None) 0.146


In [33]:
model = LinearRegression()
model.fit(X_train,y_train)
pred= model.predict(X_test)
print(pred.shape)
pred=np.expm1(pred)
print(pred[:5])

(292,)
[207461.80610433 214203.37034805 188307.64026171 208628.51172228
 239532.90495417]


In [34]:
sub_df=pd.read_csv("sample_submission.csv")
sub_df2 = sub_df.iloc[:292].copy()
sub_df2['SalePrice'] = pred
print(sub_df2.shape)
print(sub_df2.head())

# sub_df['SalePrice'] = pred
# print(sub_df.shape)
# print(sub_df.head())


sub_df2.to_csv("dddd.csv", index=False)

(292, 2)
     Id      SalePrice
0  1461  207461.806104
1  1462  214203.370348
2  1463  188307.640262
3  1464  208628.511722
4  1465  239532.904954
