<a href="https://colab.research.google.com/github/rybinsky/House-Price-Prediction/blob/main/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings 
# NOTE(review): this silences every warning for the whole kernel session,
# including NumPy runtime warnings such as log of a non-positive number.
warnings.filterwarnings('ignore')

# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import copy

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization
import matplotlib.pyplot as plt

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns
#sns.set(font_scale = 2)
# scikit-learn: model selection utilities, linear models, feature scaling,
# metrics and tree ensembles.
# NOTE(review): the *Classifier ensembles are not used in the cells shown here;
# the regression variants are imported in the next cell.
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

def _report_regression_metrics(name, y_true, y_pred):
    """Print MAE / MSE / RMSE (raw and log scale) and R2 for one fitted model."""
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)

    print(f'Model: <{name}>')
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))

    # np.log is undefined for non-positive values (it yields NaN / -inf), so the
    # log-scale metrics are only reported when both vectors are strictly positive.
    if np.all(np.asarray(y_true) > 0) and np.all(np.asarray(y_pred) > 0):
        log_true, log_pred = np.log(y_true), np.log(y_pred)
        log_mse = metrics.mean_squared_error(log_true, log_pred)
        print('Mean Absolute Error log:', metrics.mean_absolute_error(log_true, log_pred))
        print('Mean Squared Error log:', log_mse)
        print('Root Mean Squared Error log:', np.sqrt(log_mse))
    else:
        print('Log-scale metrics skipped: target or predictions contain non-positive values')

    print('R2:', r2_score(y_true, y_pred))
    print('#============================================#')


def train_and_evaluate(train_df, target_col, test_df = None, is_test = False, random_state = 42):
    """Fit four tree-ensemble regressors and either evaluate them or predict.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data containing the feature columns and ``target_col``.
    target_col : str
        Name of the target column in ``train_df``.
    test_df : pandas.DataFrame, optional
        Features to predict on. Required when ``is_test`` is True. When it is a
        DataFrame it is reordered to the training feature columns before
        predicting.
    is_test : bool, default False
        If False, hold out 20% of ``train_df`` and print metrics for every model.
        If True, fit on all of ``train_df`` and return predictions for ``test_df``.
    random_state : int, default 42
        Seed used for the hold-out split and passed to every model so that
        repeated runs give the same numbers.

    Returns
    -------
    dict or None
        ``{model name: predictions}`` when ``is_test`` is True, otherwise None
        (metrics are only printed).

    Raises
    ------
    ValueError
        If ``is_test`` is True and ``test_df`` is missing, or if ``test_df``
        lacks some of the training feature columns.

    Notes
    -----
    The "log" metrics take ``np.log`` of the target and predictions as given, so
    they are only meaningful when the target is on its original scale.
    """
    models = {
        "Gradient Boosting": GradientBoostingRegressor(random_state = random_state),
        "Random Forest": RandomForestRegressor(random_state = random_state),
        "XGBoost": XGBRegressor(random_state = random_state),
        "LightGBM": LGBMRegressor(random_state = random_state)
    }

    features = train_df.drop(columns = target_col)
    target = train_df[target_col]

    if is_test:
        if test_df is None:
            raise ValueError('test_df must be provided when is_test = True')

        X_test = test_df
        if isinstance(test_df, pd.DataFrame):
            missing = [col for col in features.columns if col not in test_df.columns]
            if missing:
                raise ValueError(f'test_df is missing feature columns: {missing}')
            # Same columns in the same order as the models were fitted on.
            X_test = test_df[features.columns]

        predictions = {}
        for name, model in models.items():
            model.fit(features, target)
            predictions[name] = model.predict(X_test)

        return predictions

    # Split into training and hold-out (validation) sets.
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size = 0.2,
                                                        random_state = random_state)

    for name, model in models.items():
        model.fit(X_train, y_train)
        _report_regression_metrics(name, y_test, model.predict(X_test))
  

In [None]:
# Load the train/test feature tables and the original test file in one pass;
# `test` is the frame later used for the Id column of the submissions.
train_df, test_df, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_clear1.csv', 'test_clear1.csv', 'test.csv')
)
train_df.head(5)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,2ndFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,Fireplaces,GarageCars,GarageQual,GarageCond,...,Electrical_FuseP,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,4,3,4,3,706,0,150,856,5,854,1710,1,0,2,1,3,1,4,0,2,3,3,...,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,3,3,4,3,978,0,284,1262,5,0,1262,0,1,2,0,3,1,3,1,2,3,3,...,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,4,3,4,3,486,0,434,920,5,866,1786,1,0,2,1,3,1,4,1,2,3,3,...,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,3,3,3,4,216,0,540,756,4,756,1717,1,0,1,0,3,1,4,1,3,3,3,...,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,4,3,4,3,655,0,490,1145,5,1053,2198,1,0,2,1,4,1,4,1,3,3,3,...,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0


In [None]:
# Baseline: hold-out metrics for every model with SalePrice as loaded from the CSV.
train_and_evaluate(train_df, 'SalePrice')

Model: <Gradient Boosting>
Mean Absolute Error: 14743.634427373527
Mean Squared Error: 488520779.18277234
Root Mean Squared Error: 22102.506174250295
Mean Absolute Error log: 0.0822560766613617
Mean Squared Error log: 0.01513706767393491
Root Mean Squared Error log: 0.12303279105155222
R2: 0.9276119717410607
Model: <Random Forest>
Mean Absolute Error: 15800.543737024223
Mean Squared Error: 604200318.8044575
Root Mean Squared Error: 24580.486545315933
Mean Absolute Error log: 0.08666151808474287
Mean Squared Error log: 0.017047068348276587
Root Mean Squared Error log: 0.13056442221476947
R2: 0.9104708098090669
Model: <XGBoost>
Mean Absolute Error: 14995.80687716263
Mean Squared Error: 560971105.7326969
Root Mean Squared Error: 23684.828598338998
Mean Absolute Error log: 0.08464102561911709
Mean Squared Error log: 0.016676921838263227
Root Mean Squared Error log: 0.12913915687452518
R2: 0.9168764278109975
Model: <LightGBM>
Mean Absolute Error: 15192.098393956367
Mean Squared Error: 52010

Попробуем предсказывать логарифм таргета (**в конце не забудем применить `np.exp` к предсказаниям**).

In [None]:
# Copy of the training frame whose SalePrice column holds log(SalePrice);
# dropping first and re-adding keeps the target as the last column.
train_df_log = train_df.drop(columns = ['SalePrice']).assign(SalePrice = np.log(train_df['SalePrice']))

train_and_evaluate(train_df_log, 'SalePrice')

Model: <Gradient Boosting>
Mean Absolute Error: 0.08037711454007568
Mean Squared Error: 0.0146492003936501
Root Mean Squared Error: 0.12103388118064339
Mean Absolute Error log: 0.006714621812282096
Mean Squared Error log: 0.0001056136718554302
Root Mean Squared Error log: 0.010276851261715828
R2: 0.9134025409569684
Model: <Random Forest>
Mean Absolute Error: 0.08630264710768547
Mean Squared Error: 0.01686478542169295
Root Mean Squared Error: 0.1298644886860644
Mean Absolute Error log: 0.007193665529346412
Mean Squared Error log: 0.00012094311371950403
Root Mean Squared Error log: 0.010997413955994564
R2: 0.9003053050282782
Model: <XGBoost>
Mean Absolute Error: 0.08948181836411885
Mean Squared Error: 0.01670343553941477
Root Mean Squared Error: 0.12924177165071193
Mean Absolute Error log: 0.0075029580735046526
Mean Squared Error log: 0.00012124986141455615
Root Mean Squared Error log: 0.011011351479929979
R2: 0.9012591106590784
Model: <LightGBM>
Mean Absolute Error: 0.08172299627635964


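Видим, что с логарифмом стало чуть лучше: сравнивать нужно RMSE на лог-шкале, и, например, у Gradient Boosting он снизился с 0.1230 до 0.1210 (метрики с пометкой «log» во втором запуске — это логарифм от логарифма, их смотреть не нужно). Проверим на тестовой выборке.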

In [None]:
# Fit on the full log-target training frame and predict the test features.
preds_test = train_and_evaluate(train_df = train_df_log,
                                target_col = 'SalePrice',
                                test_df = test_df,
                                is_test = True)

In [None]:
# Per-model predictions of log(SalePrice); they are exponentiated when the
# submission files are built.
preds_test

{'Gradient Boosting': array([11.69093536, 11.96160988, 12.12980038, ..., 11.92794229,
        11.69653615, 12.2922257 ]),
 'Random Forest': array([11.72534595, 11.93761746, 12.10518957, ..., 11.93839233,
        11.66759822, 12.3163467 ]),
 'XGBoost': array([11.601184 , 11.964397 , 12.1297655, ..., 11.903719 , 11.606575 ,
        12.282977 ], dtype=float32),
 'LightGBM': array([11.70325269, 11.99745749, 12.11980546, ..., 12.01450988,
        11.64648839, 12.28837466])}

In [35]:
def make_submissions(test, predictions, prefix = 'standart'):
    """Write one submission CSV per model from log-scale predictions.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with an ``Id`` column, one row per prediction. It is not modified.
    predictions : dict
        Maps a model name to a 1-D array-like of log-scale predictions.
    prefix : str, default 'standart'
        File name prefix; each file is written as ``{prefix}_{model}.csv`` in the
        current working directory.

    Raises
    ------
    ValueError
        If a prediction vector does not have one value per row of ``test``.
    """
    ids = test['Id'].to_numpy()

    for model, preds in predictions.items():
        # Predictions are on the log scale, so map them back with exp.
        prices = np.exp(np.asarray(preds))
        if len(prices) != len(ids):
            raise ValueError(
                f'Predictions for {model!r} have {len(prices)} values, expected {len(ids)}'
            )

        # Build a fresh frame instead of adding a SalePrice column to `test`,
        # so the caller's DataFrame is left untouched.
        submission = pd.DataFrame({'Id': ids, 'SalePrice': prices})
        filename = f'{prefix}_{model}.csv'
        submission.to_csv(filename, index = False)

In [36]:
# Write one submission file per model from the test-set predictions.
make_submissions(test = test, predictions = preds_test)

На тесте LightGBM показал RMSLE 0.1304