### Import Modules/Packages

In [44]:
import pandas as pd

### Import Data (Train and Test)

In [45]:
df_house_train = pd.read_csv(
    r"train.csv",
    low_memory=False,
)

# drop columns with all NaN values
df_house_train.dropna(axis=1, how="all")


# limit to only residential properties
residential = ["FV", "RH", "RL", "RP", "RM"]
df_house_train = df_house_train[df_house_train["MSZoning"].isin(residential)]
print("Train Data Shape: ", df_house_train.shape)


Train Data Shape:  (1450, 82)


In [46]:
# Fill missing data with zero

def fill_missing_data(df: pd.DataFrame):
    for col_ in df.columns:
        if df[col_].dtype == "object":
            # fill 'NONE' for categorical features
            df[col_].fillna("NONE", inplace=True)
        else:
            # fill zero for numerical features
            df[col_].fillna(0, inplace=True)


fill_missing_data(df_house_train)


In [47]:
correlation = df_house_train.corr()
top_10_corr = correlation.nlargest(12, "2023AdjSalePrice")["2023AdjSalePrice"].index

#### Building the Models

In [48]:
from sklearn.model_selection import train_test_split


In [49]:
# Define features to use in model 
Features = top_10_corr.copy().delete([0,1])


In [50]:
# Create copy of data set
train_df = df_house_train.copy()
# Filter to rows included in df_house_train set for train 
train_df = train_df[train_df.Id.isin(df_house_train.Id)]

# Set up Train and Validation Sets
    # test size = 20%
    # random state = 42

X_train, X_valid, y_train, y_valid = train_test_split(
    train_df[Features], train_df["2023AdjSalePrice"], test_size=0.2, random_state=42
)
_, X_valid_with_id, _, _ = train_test_split(
    train_df.drop("2023AdjSalePrice", axis=1),
    train_df["2023AdjSalePrice"],
    test_size=0.2,
    random_state=42,
)
# Create copy of comb_df
test_df = df_house_train.copy()

X_test = test_df[Features]


#### XGBoost Model

In [51]:
from xgboost import XGBRegressor

In [52]:
XGmodel = XGBRegressor(n_estimators=3000, learning_rate=0.005)

In [53]:
XGmodel.fit(X_train,y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.005, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=3000, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [54]:
print(f"Train Accuracy : {XGmodel.score(X_train,y_train)}")
print(f"Validation Accuracy : {XGmodel.score(X_valid,y_valid)}")

Train Accuracy : 0.9896360004096899
Validation Accuracy : 0.8444806748002602


In [55]:
XGdata = test_df
XGdata["SalePrice"] = XGmodel.predict(X_test)
XGdata = XGdata[["Id", "2023AdjSalePrice"]]
XGdata


Unnamed: 0,Id,2023AdjSalePrice
0,1,333600.0
1,2,290400.0
2,3,357600.0
3,4,224000.0
4,5,400000.0
...,...,...
1455,1456,280000.0
1456,1457,336000.0
1457,1458,426400.0
1458,1459,227400.0


In [56]:
# Used for UI
import pickle
# Save the XGmodel to a file
with open('xgboost_model.pkl', 'wb') as file:
    pickle.dump(XGmodel, file)