### Install libs

In [34]:
%pip install feature_engine


Note: you may need to restart the kernel to use updated packages.


In [35]:
%pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [36]:
import xgboost as xgb
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from feature_engine.encoding import CountFrequencyEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_squared_error, r2_score,make_scorer
import os


### Data Clean

In [37]:
#import data

df_train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

df_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [38]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [39]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [40]:
df_explore=pd.DataFrame(((df_train.isnull().sum())/df_train.shape[0])*100)


In [41]:
df_explore.head(10)

Unnamed: 0,0
Id,0.0
MSSubClass,0.0
MSZoning,0.0
LotFrontage,17.739726
LotArea,0.0
Street,0.0
Alley,93.767123
LotShape,0.0
LandContour,0.0
Utilities,0.0


In [42]:
df_explore[df_explore[0]>0]

Unnamed: 0,0
LotFrontage,17.739726
Alley,93.767123
MasVnrType,59.726027
MasVnrArea,0.547945
BsmtQual,2.534247
BsmtCond,2.534247
BsmtExposure,2.60274
BsmtFinType1,2.534247
BsmtFinType2,2.60274
Electrical,0.068493


In [43]:
df_train[list(df_explore[df_explore[0]>0].index)].dtypes

LotFrontage     float64
Alley            object
MasVnrType       object
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
FireplaceQu      object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
dtype: object

In [44]:
df_train.drop(list(df_explore[df_explore[0]>6].index),axis=1,inplace=True)

In [45]:
df_explore[(df_explore[0]<6) & (df_explore[0]>0)]

Unnamed: 0,0
MasVnrArea,0.547945
BsmtQual,2.534247
BsmtCond,2.534247
BsmtExposure,2.60274
BsmtFinType1,2.534247
BsmtFinType2,2.60274
Electrical,0.068493
GarageType,5.547945
GarageYrBlt,5.547945
GarageFinish,5.547945


In [46]:
df_train[list(df_explore[(df_explore[0]<6) & (df_explore[0]>0)].index)].dtypes

MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
dtype: object

### Data Transformation

In [47]:
specials_categories = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Electrical", "GarageType", "GarageFinish", "GarageQual", "GarageCond"]
for column in specials_categories:
    df_train[column].fillna("Unknown", inplace=True)

for column in specials_categories:
    df_test[column].fillna("Unknown", inplace=True)

In [48]:
specials_num = ['MasVnrArea','GarageYrBlt']
for column in specials_num :
    median = df_train[column].median()
    df_train[column].fillna(median, inplace=True)

for column in specials_num :
    median = df_train[column].median()
    df_test[column].fillna(median, inplace=True)

In [49]:
df_train[list(df_explore[(df_explore[0]<6) & (df_explore[0]>0)].index)].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MasVnrArea    1460 non-null   float64
 1   BsmtQual      1460 non-null   object 
 2   BsmtCond      1460 non-null   object 
 3   BsmtExposure  1460 non-null   object 
 4   BsmtFinType1  1460 non-null   object 
 5   BsmtFinType2  1460 non-null   object 
 6   Electrical    1460 non-null   object 
 7   GarageType    1460 non-null   object 
 8   GarageYrBlt   1460 non-null   float64
 9   GarageFinish  1460 non-null   object 
 10  GarageQual    1460 non-null   object 
 11  GarageCond    1460 non-null   object 
dtypes: float64(2), object(10)
memory usage: 137.0+ KB


In [50]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [51]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

### Removing Outliers

In [52]:
def remove_outliers_by_column(data, column_name, lower_bound, upper_bound):
    
    iqr = data[column_name].quantile(0.75) - data[column_name].quantile(0.25)
    
    lower_limit = data[column_name].quantile(0.25) - (iqr * lower_bound)
    upper_limit = data[column_name].quantile(0.75) + (iqr * upper_bound)
    
    data = data[(data[column_name] >= lower_limit) & (data[column_name] <= upper_limit)]
    return data

In [53]:
lower_bound = 1.5
upper_bound = 1.5

colunas_numericas = df_train.select_dtypes(include=['int64','float64']).columns.tolist()

for col in colunas_numericas:
    df_train = remove_outliers_by_column(df_train, 'SalePrice', lower_bound, upper_bound)
    

In [54]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1344 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1344 non-null   int64  
 1   MSSubClass     1344 non-null   int64  
 2   MSZoning       1344 non-null   object 
 3   LotArea        1344 non-null   int64  
 4   Street         1344 non-null   object 
 5   LotShape       1344 non-null   object 
 6   LandContour    1344 non-null   object 
 7   Utilities      1344 non-null   object 
 8   LotConfig      1344 non-null   object 
 9   LandSlope      1344 non-null   object 
 10  Neighborhood   1344 non-null   object 
 11  Condition1     1344 non-null   object 
 12  Condition2     1344 non-null   object 
 13  BldgType       1344 non-null   object 
 14  HouseStyle     1344 non-null   object 
 15  OverallQual    1344 non-null   int64  
 16  OverallCond    1344 non-null   int64  
 17  YearBuilt      1344 non-null   int64  
 18  YearRemodAdd 

In [55]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

### Moment to define Features and Target

In [56]:
y_train=df_train.SalePrice
X_train=df_train.drop(['Id','SalePrice'],axis=1)

### Scaling

In [57]:
scaler = MinMaxScaler()

colunas_num = X_train.select_dtypes(include=['int64', 'float64']).columns
X_data_num = df_train[colunas_num]

X_data_scaled_num = scaler.fit_transform(X_data_num)

X_train_scaled = pd.DataFrame(X_data_scaled_num, columns=colunas_num)
X_train_scaled.reset_index(drop=True,inplace=True)

X_train_categories = X_train.drop(colunas_num, axis=1)
X_train_categories.reset_index(drop=True,inplace=True)

X_train_escaled = pd.concat([X_train_categories, X_train_scaled], axis=1)

### Transforming labels into frequencies (ENCODING)

In [58]:
encoder = CountFrequencyEncoder(encoding_method='frequency',
                         variables=X_train_escaled.select_dtypes(include='object').columns.tolist())
encoder.fit(X_train_escaled)
X_train_es_encoded= encoder.transform(X_train_escaled)

In [59]:
X_train = X_train_es_encoded


### Feature Selection

In [60]:
def custom_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(custom_rmse, greater_is_better=False)

In [61]:
lr = LinearRegression()

lr.fit(X_train, y_train)

In [62]:

sfs = SequentialFeatureSelector(
    lr,
    k_features='best',
    forward=True,
    verbose=2,
    scoring=rmse_scorer,  # Use the RMSE scoring function
    cv=10
)

sfs = sfs.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.7s

[2023-10-16 00:25:10] Features: 1/72 -- score: -33822.73131268404[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s

[2023-10-16 00:25:11] Features: 2/72 -- score: -30333.29436169698[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s

[2023-10-16 00:25:13] Features: 3/72 -- score: -27472.85385610132[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    1.0s

[2023-10-16 00:25:14] Features: 4/72 -- score: -26387.00538204859[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s

[2023-10-16 00:25:16] Features: 5/72 -- score: -25448.667994678624[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s

[2023-10-16 00:25:17] Features: 6/72 -- score: -24616.225507700794[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s

[2023-10-16 00:25:19] Features: 7/72 -- score: -23970.02972852462[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    1.0s

[2023-10-16 00:25:21] Features: 8/72 -- score: 

In [63]:
result_features=pd.DataFrame.from_dict(sfs.get_metric_dict()).T
best_features = list(result_features['feature_names'][15])
print(best_features)

['Condition1', 'ExterQual', 'KitchenQual', 'Functional', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'GrLivArea', 'BsmtFullBath', 'Fireplaces', 'GarageCars', 'ScreenPorch']


In [64]:
X_train_selected = X_train[best_features]

### Adjusts at df_test (like empty fields, minmaxscaler and encoding)

In [65]:
X_test = df_test[best_features]

In [66]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Condition1    1459 non-null   object 
 1   ExterQual     1459 non-null   object 
 2   KitchenQual   1458 non-null   object 
 3   Functional    1457 non-null   object 
 4   MSSubClass    1459 non-null   int64  
 5   LotArea       1459 non-null   int64  
 6   OverallQual   1459 non-null   int64  
 7   OverallCond   1459 non-null   int64  
 8   YearBuilt     1459 non-null   int64  
 9   YearRemodAdd  1459 non-null   int64  
 10  GrLivArea     1459 non-null   int64  
 11  BsmtFullBath  1457 non-null   float64
 12  Fireplaces    1459 non-null   int64  
 13  GarageCars    1458 non-null   float64
 14  ScreenPorch   1459 non-null   int64  
dtypes: float64(2), int64(9), object(4)
memory usage: 171.1+ KB


In [67]:
column_num = ['BsmtFullBath','GarageCars']
for column in column_num :
    median = X_test[column].median()
    X_test[column].fillna(median, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[column].fillna(median, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[column].fillna(median, inplace=True)


In [68]:
column_categories = ['Functional','KitchenQual']
for column in column_categories:
    X_test[column].fillna("Unknown", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[column].fillna("Unknown", inplace=True)


In [69]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Condition1    1459 non-null   object 
 1   ExterQual     1459 non-null   object 
 2   KitchenQual   1459 non-null   object 
 3   Functional    1459 non-null   object 
 4   MSSubClass    1459 non-null   int64  
 5   LotArea       1459 non-null   int64  
 6   OverallQual   1459 non-null   int64  
 7   OverallCond   1459 non-null   int64  
 8   YearBuilt     1459 non-null   int64  
 9   YearRemodAdd  1459 non-null   int64  
 10  GrLivArea     1459 non-null   int64  
 11  BsmtFullBath  1459 non-null   float64
 12  Fireplaces    1459 non-null   int64  
 13  GarageCars    1459 non-null   float64
 14  ScreenPorch   1459 non-null   int64  
dtypes: float64(2), int64(9), object(4)
memory usage: 171.1+ KB


In [70]:
encoder = CountFrequencyEncoder(encoding_method='frequency',
                         variables=X_test.select_dtypes(include='object').columns.tolist())
encoder.fit(X_test)
X_test_es_encoded= encoder.transform(X_test)

In [71]:
scaler = MinMaxScaler()

colunas_num = X_test_es_encoded.select_dtypes(include=['int64', 'float64']).columns
X_data_num = X_test_es_encoded[colunas_num]

X_data_scaled_num = scaler.fit_transform(X_data_num)

X_test_scaled = pd.DataFrame(X_data_scaled_num, columns=colunas_num)
X_test_scaled.reset_index(drop=True,inplace=True)

X_test_categories = X_test_es_encoded.drop(colunas_num, axis=1)
X_test_categories.reset_index(drop=True,inplace=True)

X_train_escaled = pd.concat([X_test_categories, X_test_scaled], axis=1)

In [72]:
X_train_escaled.head()

Unnamed: 0,Condition1,ExterQual,KitchenQual,Functional,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,GrLivArea,BsmtFullBath,Fireplaces,GarageCars,ScreenPorch
0,0.063352,1.0,1.0,1.0,0.0,0.184147,0.444444,0.625,0.625954,0.183333,0.104309,0.0,0.0,0.2,0.208333
1,1.0,1.0,0.746032,1.0,0.0,0.232124,0.555556,0.625,0.603053,0.133333,0.196672,0.0,0.0,0.2,0.0
2,1.0,1.0,1.0,1.0,0.235294,0.224197,0.444444,0.5,0.900763,0.8,0.260666,0.0,0.25,0.4,0.0
3,1.0,1.0,0.746032,1.0,0.235294,0.154326,0.555556,0.625,0.908397,0.8,0.255333,0.0,0.25,0.4,0.0
4,1.0,0.53961,0.746032,1.0,0.588235,0.064121,0.777778,0.5,0.862595,0.7,0.18622,0.0,0.0,0.4,0.25


In [73]:
X_test = X_train_escaled

## Testing with Xgb Boost MASTER RACE

In [74]:
params = {
    'objective': 'reg:squarederror',  
    'max_depth': 3,                   
    'eta': 0.1,                       
    'subsample': 0.8,                 
    'colsample_bytree': 0.8,         
    'eval_metric': 'rmse'        
}

In [75]:
num_round = 1000

In [76]:
# Treine o modelo XGBoost
bst = xgb.XGBRegressor(**params,n_estimators=num_round)
bst.fit(X_train_selected, y_train)

In [77]:
# Faça previsões
y_pred = bst.predict(X_test)

In [78]:
submission = df_test[['Id']]

In [79]:
submission['SalePrice'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['SalePrice'] = y_pred


In [80]:
submission.to_csv('submission.csv',index=False)