In [741]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge

In [742]:
# Shared random seed: passed as random_state to the train/validation split
# and to the estimators created later in the notebook.
seed = 42

# Data preparation

In [743]:
def clean_df(df):
    """Cleaning hook applied to both the train and the test frame.

    Currently a no-op: the frame passed in is returned as-is (same object,
    no copy).
    """
    return df

def prep_features(df: pd.DataFrame):
    """Return a new frame containing only the columns fed to the models.

    The identifier and target columns are removed together with four columns
    the author excluded as not useful. Columns that are absent from ``df``
    are skipped silently, and ``df`` itself is left untouched.
    """
    non_feature_cols = ["ID", "Price"]
    excluded_cols = [
        "House_Orientation_Angle",
        "Street_Alignment_Offset",
        "Magnetic_Field_Strength",
        "Vibration_Level",
    ]
    return df.drop(columns=non_feature_cols + excluded_cols, errors="ignore")

In [744]:
# Load the training set and run it through the cleaning hook. `df` keeps every
# column (including Price, used later as the target); `df_train` is the
# feature-only view derived from it.
df = clean_df(pd.read_csv("train_data.csv"))
df_train = prep_features(df)

In [745]:
# Preview the first rows of the feature frame (identifier, target and the
# excluded columns have already been removed by prep_features).
df_train.head()

Unnamed: 0,Square_Footage,Num_Bedrooms,Num_Bathrooms,Year_Built,Lot_Size,Garage_Size,Neighborhood_Quality,Footage_to_Lot_Ratio,Total_Rooms,Age_of_House,Garage_to_Footage_Ratio,Avg_Room_Size,Solar_Exposure_Index
0,2028,2,3,1967,1.78479,2,2,1136.268444,5,58,0.000986,405.6,235.502857
1,3519,5,3,1966,4.009947,0,10,877.567605,8,59,0.0,439.875,300.292055
2,4507,2,3,2014,4.122337,0,7,1093.311933,5,11,0.0,901.4,186.851621
3,3371,4,2,2000,1.580318,0,1,2133.114532,6,25,0.0,561.833333,107.843644
4,2871,5,1,1974,3.426914,2,6,837.78009,6,51,0.000697,478.5,357.571806


# Exploratory data analysis

In [746]:
# any missing values?
# Per-column NaN counts summed into one grand total; 0 means the feature frame
# is complete. Sorting the per-column counts first would not change the total,
# so the counts are summed directly.
df_train.isna().sum().sum()

0

In [747]:
# Correlation of every feature column with the target, weakest/most negative
# first. Price is read from `df` because prep_features removed it from df_train.
# NOTE(review): in the recorded output Age_of_House and Year_Built have
# correlations of equal magnitude and opposite sign, which suggests one is a
# linear function of the other (redundant feature) — confirm.
df_train.corrwith(df["Price"]).sort_values(ascending=True)

Lot_Size                  -0.418189
Garage_to_Footage_Ratio   -0.356300
Age_of_House              -0.077660
Num_Bathrooms             -0.070893
Total_Rooms               -0.060942
Neighborhood_Quality      -0.035206
Num_Bedrooms              -0.030935
Solar_Exposure_Index       0.002168
Garage_Size                0.044464
Year_Built                 0.077660
Avg_Room_Size              0.543145
Square_Footage             0.750451
Footage_to_Lot_Ratio       0.920408
dtype: float64

# Models

In [748]:
# Hold out 20% of the rows as a validation set. Features come from df_train,
# the target is the Price column of the full frame; the shared seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    df_train, df["Price"], test_size=0.2, random_state=seed
)

In [749]:
def evaluate(model):
    """Score ``model`` by cross-validation and on the hold-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Returns a pair ``(cv_mae_plus_std, holdout_mae)``:

    * ``cv_mae_plus_std`` -- mean 3-fold MAE on the training split plus one
      standard deviation of the fold scores (a pessimistic estimate).
    * ``holdout_mae`` -- MAE on the validation split after fitting on the
      whole training split.

    Side effect: ``model`` is fitted in place on ``X_train`` / ``y_train``.
    """
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=3,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
    )
    # Fold scores are negated MAEs, so subtracting the spread and flipping the
    # sign yields "mean MAE + std".
    penalised_score = np.mean(fold_scores) - np.std(fold_scores)

    model.fit(X_train, y_train)
    holdout_mae = mean_absolute_error(y_val, model.predict(X_val))

    return -penalised_score, holdout_mae

In [750]:
# Baseline 1: random forest with default hyper-parameters and the shared seed.
rf = RandomForestRegressor(random_state=seed)

# Displays the (CV MAE + std, hold-out MAE) pair returned by evaluate().
evaluate(rf)

(650.1482355337307, 530.2564962041262)

In [751]:
# Baseline 2: gradient boosting with default hyper-parameters and the shared seed.
gb = GradientBoostingRegressor(random_state=seed)

# Displays the (CV MAE + std, hold-out MAE) pair returned by evaluate().
evaluate(gb)

(481.78325206579393, 327.270678937147)

In [752]:
# Baseline 3: ridge regression with a fixed regularisation strength of 0.2.
# NOTE(review): the features are passed unscaled; ridge penalties depend on
# feature scale — confirm this is intended.
ridge = Ridge(alpha=0.2, random_state=seed)

# Displays the (CV MAE + std, hold-out MAE) pair returned by evaluate().
evaluate(ridge)

(284.7990823460446, 256.29211100534974)

In [753]:
# Stack the gradient-boosting and ridge models defined above. No
# final_estimator is passed, so scikit-learn's default meta-learner is used.
st = StackingRegressor(estimators=[("gb", gb), ("lr", ridge)])

evaluate(st)

(278.5312226449407, 238.73555730751121)

In [754]:
# Use the stacked ensemble as the final model.
model = st

# NOTE(review): evaluate(st) has already fitted this same estimator object on
# X_train / y_train, so this call repeats that fit. The 20% validation rows are
# still excluded from the model used for the submission — consider fitting on
# df_train / df["Price"] instead; confirm intent before changing.
model.fit(X_train, y_train)

# Submission

In [755]:
# Load the test set through the same cleaning hook as the training data.
# `df_test` keeps every column (the subtasks below need some that the models
# do not use); `features` is the model input derived from it.
df_test = clean_df(pd.read_csv("test_data.csv"))
features = prep_features(df_test)

In [756]:
# subtask 1
# Row-wise sum of three raw test columns.
subtask1 = df_test["Square_Footage"] + df_test["Garage_Size"] + df_test["Lot_Size"]

# subtask 2
# Garage size per room.
# NOTE(review): a zero in Total_Rooms would produce inf/NaN here rather than an
# error — confirm the test data never contains one.
subtask2 = df_test["Garage_Size"] / df_test["Total_Rooms"]

# subtask 3
# Uses the raw df_test (not `features`) because Vibration_Level and
# Magnetic_Field_Strength are removed by prep_features.
# NOTE(review): same division-by-zero caveat for Magnetic_Field_Strength.
subtask3 = (df_test["Solar_Exposure_Index"] - df_test["Vibration_Level"] ) / df_test["Magnetic_Field_Strength"]

# subtask 4
# Absolute deviation of each test row's square footage from the mean square
# footage of the TRAINING frame (`df`).
# NOTE(review): confirm the subtask asks for the training-set mean rather than
# the test-set mean.
ftg_mean = df["Square_Footage"].mean()

subtask4 = np.abs(ftg_mean - df_test["Square_Footage"])

# subtask 5
# Price predictions from the fitted final model on the prepared test features.
subtask5 = model.predict(features)

In [757]:
def build_subtask_df(subtask_id, answers):
    """Build the long-format submission rows for one subtask.

    ``subtask_id`` is repeated on every row, ``datapointID`` is taken from the
    notebook-level ``df_test["ID"]`` column, and ``answers`` supplies the
    ``answer`` column (expected to hold one value per test row).
    """
    columns = {
        "subtaskID": subtask_id,
        "datapointID": df_test["ID"],
        "answer": answers,
    }
    return pd.DataFrame(columns)

# Pair every answer vector with its 1-based subtask number.
subtasks = list(
    enumerate([subtask1, subtask2, subtask3, subtask4, subtask5], start=1)
)

# One frame per subtask, stacked vertically with a fresh 0..n-1 index.
submission = pd.concat(
    [build_subtask_df(sid, subtask) for sid, subtask in subtasks],
    ignore_index=True,
)

submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,801,4015.098092
1,1,802,2312.369622
2,1,803,4710.79297
3,1,804,4937.479598
4,1,805,3649.980987


In [758]:
# Write the stacked answers to disk; index=False keeps the positional index
# out of the file so only subtaskID, datapointID and answer are written.
submission.to_csv("submission.csv", index=False)