<a href="https://colab.research.google.com/github/rybinsky/House-Price-Prediction/blob/main/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings 
warnings.filterwarnings('ignore')

# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import copy

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization
import matplotlib.pyplot as plt

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns
#sns.set(font_scale = 2)
# Scikit-learn: model selection, linear models, scaling, metrics and ensembles
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

def train_and_evaluate(train_df, target_col, test_df=None, is_test=False, random_state=42):
    """Fit four tree-ensemble regressors and either score them or predict with them.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data holding the feature columns and ``target_col``.
    target_col : str
        Name of the target column inside ``train_df``.
    test_df : pandas.DataFrame, optional
        Feature matrix to predict on. Required when ``is_test`` is True.
        NOTE(review): presumably has the same columns as ``train_df`` minus
        ``target_col`` -- confirm against the preprocessing notebook.
    is_test : bool, default False
        * False -- hold out 20% of ``train_df``, fit on the rest and print
          error metrics for every model.
        * True -- fit on all of ``train_df`` and return predictions for
          ``test_df``.
    random_state : int, default 42
        Seed used for the hold-out split and for every model, so that repeated
        runs print the same numbers.

    Returns
    -------
    dict or None
        ``{model name: array of predictions}`` when ``is_test`` is True,
        otherwise ``None`` (the metrics are only printed).

    Raises
    ------
    ValueError
        If ``is_test`` is True but ``test_df`` was not supplied.

    Notes
    -----
    The "... log" lines apply ``np.log`` to both ``y_test`` and ``y_pred``.
    When the caller has already log-transformed the target column, those lines
    are a log of a log; the plain "Root Mean Squared Error" line is then the
    RMSLE of the untransformed target.
    """
    if is_test and test_df is None:
        raise ValueError("test_df must be provided when is_test=True")

    # Seed every estimator: without it each run of the cell reports different scores.
    models = {
        "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "XGBoost": XGBRegressor(random_state=random_state),
        "LightGBM": LGBMRegressor(random_state=random_state),
    }

    if is_test:
        # Use every training row and predict on the supplied test features.
        X_train = train_df.drop(columns=target_col)
        y_train = train_df[target_col]
        predictions = {}
        for name, model in models.items():
            model.fit(X_train, y_train)
            predictions[name] = model.predict(test_df)
        return predictions

    # Split into training and hold-out sets.
    X_train, X_test, y_train, y_test = train_test_split(
        train_df.drop(columns=target_col),
        train_df[target_col],
        test_size=0.2,
        random_state=random_state,
    )

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = metrics.mean_squared_error(y_test, y_pred)
        print(f'Model: <{name}>')
        print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
        print('Mean Squared Error:', mse)
        print('Root Mean Squared Error:', np.sqrt(mse))

        # np.log of a non-positive value yields NaN/-inf, which makes the sklearn
        # metrics raise and aborts the remaining models; skip the block instead.
        if np.all(y_test > 0) and np.all(y_pred > 0):
            log_true = np.log(y_test)
            log_pred = np.log(y_pred)
            mse_log = metrics.mean_squared_error(log_true, log_pred)
            print('Mean Absolute Error log:', metrics.mean_absolute_error(log_true, log_pred))
            print('Mean Squared Error log:', mse_log)
            print('Root Mean Squared Error log:', np.sqrt(mse_log))
        else:
            print('Log-scale errors skipped: non-positive target or prediction')

        print('R2:', r2_score(y_test, y_pred))
        print('#============================================#')
  

In [None]:
# Load the three CSV files in one pass. The *_clear2 files are presumably the
# preprocessed feature tables; test.csv supplies the 'Id' column used later
# when the submission files are written.
train_df, test_df, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_clear2.csv', 'test_clear2.csv', 'test.csv')
)
train_df.head(5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,2ndFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,Fireplaces,GarageFinish,GarageCars,...,BsmtFinType2_Unf,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,4,3,4,3,706,0,150,856,5,1,854,1710,1,0,2,1,3,1,4,0,1,2,...,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,3,3,4,3,978,0,284,1262,5,1,0,1262,0,1,2,0,3,1,3,1,1,2,...,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,4,3,4,3,486,0,434,920,5,1,866,1786,1,0,2,1,3,1,4,1,1,2,...,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,3,3,3,4,216,0,540,756,4,1,756,1717,1,0,1,0,3,1,4,1,0,3,...,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,4,3,4,3,655,0,490,1145,5,1,1053,2198,1,0,2,1,4,1,4,1,1,3,...,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0


In [None]:
# Baseline: fit every model on the untransformed SalePrice and print hold-out metrics.
train_and_evaluate(train_df, 'SalePrice')

Model: <Gradient Boosting>
Mean Absolute Error: 14611.466111597441
Mean Squared Error: 480979135.92590076
Root Mean Squared Error: 21931.23653435667
Mean Absolute Error log: 0.08159725161858616
Mean Squared Error log: 0.014577555799182743
Root Mean Squared Error log: 0.12073754925118674
R2: 0.9287294772975501
Model: <Random Forest>
Mean Absolute Error: 15843.638927335642
Mean Squared Error: 637860484.5101436
Root Mean Squared Error: 25255.89999406364
Mean Absolute Error log: 0.0854866917666976
Mean Squared Error log: 0.01624256347558922
Root Mean Squared Error log: 0.1274463160534239
R2: 0.9054831140341199
Model: <XGBoost>
Mean Absolute Error: 15473.98205017301
Mean Squared Error: 574232952.4515731
Root Mean Squared Error: 23963.15823199382
Mean Absolute Error log: 0.08678195665921266
Mean Squared Error log: 0.017370545393118882
Root Mean Squared Error log: 0.13179736489444271
R2: 0.914911313989928
Model: <LightGBM>
Mean Absolute Error: 15240.90358427978
Mean Squared Error: 524257551.4

Model: <Gradient Boosting>
Mean Absolute Error: 14743.634427373527
Mean Squared Error: 488520779.18277234
Root Mean Squared Error: 22102.506174250295
Mean Absolute Error log: 0.0822560766613617
Mean Squared Error log: 0.01513706767393491
Root Mean Squared Error log: 0.12303279105155222
R2: 0.9276119717410607
#============================================#
Model: <Random Forest>
Mean Absolute Error: 15800.543737024223
Mean Squared Error: 604200318.8044575
Root Mean Squared Error: 24580.486545315933
Mean Absolute Error log: 0.08666151808474287
Mean Squared Error log: 0.017047068348276587
Root Mean Squared Error log: 0.13056442221476947
R2: 0.9104708098090669
#============================================#
Model: <XGBoost>
Mean Absolute Error: 14995.80687716263
Mean Squared Error: 560971105.7326969
Root Mean Squared Error: 23684.828598338998
Mean Absolute Error log: 0.08464102561911709
Mean Squared Error log: 0.016676921838263227
Root Mean Squared Error log: 0.12913915687452518
R2: 0.9168764278109975
#============================================#
Model: <LightGBM>
Mean Absolute Error: 15192.098393956367
Mean Squared Error: 520102903.79353654
Root Mean Squared Error: 22805.764705300644
Mean Absolute Error log: 0.08442183298650267
Mean Squared Error log: 0.015549559949697311
Root Mean Squared Error log: 0.12469787467995319
R2: 0.9229321959234879
#============================================#

Попробуем предсказывать логарифм таргета (**в конце не забудем применить `np.exp` к предсказаниям**).

In [None]:
# Same features, with the target replaced by log(SalePrice) and placed as the last column.
train_df_log = train_df.drop(columns='SalePrice').assign(SalePrice=np.log(train_df['SalePrice']))

train_and_evaluate(train_df_log, 'SalePrice')

Model: <Gradient Boosting>
Mean Absolute Error: 0.08137487274432452
Mean Squared Error: 0.01498018733915115
Root Mean Squared Error: 0.12239357556322615
Mean Absolute Error log: 0.006800167040263461
Mean Squared Error log: 0.00010813661923217434
Root Mean Squared Error log: 0.010398875863869821
R2: 0.9114459407544597
Model: <Random Forest>
Mean Absolute Error: 0.08646022832041447
Mean Squared Error: 0.016545691864568966
Root Mean Squared Error: 0.12863005816903358
Mean Absolute Error log: 0.007201542363018887
Mean Squared Error log: 0.00011840627803491836
Root Mean Squared Error log: 0.010881464884606225
R2: 0.902191598511977
Model: <XGBoost>
Mean Absolute Error: 0.08812787252607981
Mean Squared Error: 0.01684016801676667
Root Mean Squared Error: 0.129769672946982
Mean Absolute Error log: 0.007395863951886211
Mean Squared Error log: 0.00012279813722266013
Root Mean Squared Error log: 0.011081432092588942
R2: 0.9004508286512449
Model: <LightGBM>
Mean Absolute Error: 0.08265561715475987


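Видим, что с логарифмом стало чуть лучше: RMSE на лог-таргете нужно сравнивать с «Root Mean Squared Error log» из предыдущего запуска (строки «… log» в этом запуске — логарифм от логарифма, их сравнивать нельзя). Проверим на тестовой выборке.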

In [None]:
# Fit on the full log-target training frame and predict the test features with every model.
preds_test = train_and_evaluate(
    train_df_log,
    target_col='SalePrice',
    test_df=test_df,
    is_test=True,
)

In [None]:
# Per-model test predictions; still on the log scale because the models were fit on train_df_log.
preds_test

{'Gradient Boosting': array([11.69048512, 11.96116631, 12.1314355 , ..., 11.91150902,
        11.69593283, 12.29277096]),
 'Random Forest': array([11.73833137, 11.9319521 , 12.07243883, ..., 11.97789234,
        11.67488098, 12.33266736]),
 'XGBoost': array([11.583313, 11.978967, 12.181624, ..., 11.917714, 11.666415,
        12.274489], dtype=float32),
 'LightGBM': array([11.71547436, 12.00158981, 12.11783057, ..., 11.96148587,
        11.6193358 , 12.32534879])}

In [None]:
def make_submissions(test, predictions):
    """Write one submission CSV per model into the current working directory.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame that provides the ``Id`` column; it is not modified.
    predictions : dict
        Maps a model name to an array of predictions, one per row of ``test``.
        ``np.exp`` is applied to every array before saving, i.e. the values
        are expected to be natural-log prices.

    Returns
    -------
    None
        Each model produces a file named ``standart_<model name>.csv`` with
        the columns ``Id`` and ``SalePrice``.
    """
    for model, preds in predictions.items():
        # Build a fresh two-column frame rather than assigning into `test`,
        # so the caller's DataFrame does not silently gain a SalePrice column.
        submission = test[['Id']].assign(SalePrice=np.exp(preds))
        filename = f'standart_{model}.csv'
        submission.to_csv(filename, index=False)

In [17]:
# Candidate LightGBM hyper-parameters for the grid search: 2 * 4 * 3 = 24 combinations.
# Not searched for now: min_child_samples in [10, 20, 50, 100], subsample in [0.5, 0.8, 1.0].
param_grid = dict(
    num_leaves=[16, 31],
    learning_rate=[0.01, 0.1, 0.2, 0.5],
    n_estimators=[100, 150, 200],
)

def grid_search(model, param_grid, X_train, y_train):
    """Run a 5-fold grid search over ``param_grid`` and return the fitted search object.

    Candidates are scored with ``neg_mean_squared_error``; the search uses all
    available cores (``n_jobs=-1``) and prints progress (``verbose=1``).

    Parameters
    ----------
    model : estimator
        Unfitted estimator to tune.
    param_grid : dict
        Maps parameter names to the lists of values to try.
    X_train, y_train
        Features and target passed to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` has been called on it.
    """
    search_settings = dict(cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
    searcher = GridSearchCV(model, param_grid, **search_settings)
    searcher.fit(X_train, y_train)
    return searcher

# Tune LightGBM on the untransformed SalePrice target using every training row.
model_grid_search = grid_search(
    model=LGBMRegressor(),
    param_grid=param_grid,
    X_train=train_df.drop(columns='SalePrice'),
    y_train=train_df['SalePrice'],
)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [18]:
# Hyper-parameter combination selected by the grid search.
model_grid_search.best_params_

{'learning_rate': 0.1, 'n_estimators': 150, 'num_leaves': 16}

In [19]:
# Predict the test features with the grid-search object. The search was fit on the
# untransformed SalePrice column of train_df, so these values are used without np.exp.
preds_grid_lgbm = model_grid_search.predict(test_df)

In [20]:
# Save the grid-searched LightGBM predictions as a submission file.
filename = 'grid_lgbm_1.csv'
test['SalePrice'] = preds_grid_lgbm
submission = test.loc[:, ['Id', 'SalePrice']]
submission.to_csv(filename, index=False)

In [None]:
# Write one submission file per model from the log-scale test predictions.
make_submissions(test=test, predictions=preds_test)

На тесте LightGBM показал RMSLE 0.13005