In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Load the raw house sales records; the path is relative to the notebook's
# working directory.
house_sales_data = pd.read_csv("../own-project/house_sales.csv")

# Show the first rows as the cell output for a quick look at the columns.
house_sales_data.head()

In [None]:
# Task 1: Count the missing 'city' entries, where the "--" placeholder also
# counts as missing, then label every missing entry "Unknown".
# Mapping "--" to NA first lets isna() count placeholders and real gaps together.
missing_city = house_sales_data["city"].replace("--", pd.NA).isna().sum()

# Overwrite the column with placeholders and gaps both relabelled "Unknown"
house_sales_data["city"] = (
    house_sales_data["city"].replace("--", pd.NA).fillna("Unknown")
)

print("Missing values in city: ", missing_city)

In [None]:
# Task 2: Data Cleaning


def clean_data(df):
    """Return a cleaned copy of the house sales data.

    The input frame is not modified. Cleaning steps, in order:

    * rows with a missing 'sale_price' are dropped;
    * missing 'sale_date' values become "2023-01-01";
    * missing 'months_listed' values become the column mean (1 decimal place);
    * missing 'bedrooms' values become the column mean (nearest integer);
    * missing 'house_type' values become the most common house type;
    * 'area' loses its " sq.m." suffix and is converted to float; missing or
      unparseable values (for example a "--" placeholder) become the column
      mean (1 decimal place).

    A column with no usable values has nothing to impute from, so its missing
    entries are left untouched instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sale_price', 'sale_date', 'months_listed',
        'bedrooms', 'house_type' and 'area'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied.
    """
    # Work on a copy so the caller's frame is never mutated
    df = df.dropna(subset=["sale_price"]).copy()

    df["sale_date"] = df["sale_date"].fillna("2023-01-01")

    # Mean imputation, rounded to one decimal place
    mean_months_listed = df["months_listed"].mean()
    if pd.notna(mean_months_listed):
        df["months_listed"] = df["months_listed"].fillna(round(mean_months_listed, 1))

    # Mean imputation, rounded to a whole number of bedrooms.
    # int(round(nan)) raises ValueError, so skip when there is no mean.
    mean_bedrooms = df["bedrooms"].mean()
    if pd.notna(mean_bedrooms):
        df["bedrooms"] = df["bedrooms"].fillna(int(round(mean_bedrooms)))

    # mode() is empty when every value is missing; indexing it would raise
    house_type_modes = df["house_type"].mode()
    if not house_type_modes.empty:
        df["house_type"] = df["house_type"].fillna(house_type_modes.iloc[0])

    # Going through str keeps this working when 'area' is already numeric;
    # errors="coerce" turns placeholders and other junk into NaN so they are
    # imputed below rather than aborting the whole cleaning run.
    area_text = (
        df["area"].astype(str).str.replace(" sq.m.", "", regex=False).str.strip()
    )
    df["area"] = pd.to_numeric(area_text, errors="coerce").astype(float)

    mean_area = df["area"].mean()
    if pd.notna(mean_area):
        df["area"] = df["area"].fillna(round(mean_area, 1))

    return df


# Run the cleaning pipeline on the loaded sales data
clean_data_df = clean_data(df=house_sales_data)

# Preview the first five cleaned rows as the cell output
clean_data_df.head(n=5)

In [None]:
# Task 3: Calculate average sale price and variance by number of bedrooms


def calculate_price_by_rooms(df):
    """Summarise 'sale_price' for each distinct 'bedrooms' value.

    Returns a frame with one row per bedroom count and the columns
    'bedrooms', 'avg_price' (mean sale price) and 'var_price' (variance of
    the sale price), both statistics rounded to one decimal place.
    """
    # Output column name -> (source column, aggregation)
    statistics = {
        "avg_price": ("sale_price", "mean"),
        "var_price": ("sale_price", "var"),
    }
    # 'bedrooms' is still the index here, so round(1) touches only the statistics
    summary = df.groupby("bedrooms").agg(**statistics).round(1)
    return summary.reset_index()


# Summarise sale prices per bedroom count for the cleaned data
price_by_rooms_df = calculate_price_by_rooms(df=clean_data_df)

# Show the summary table as the cell output
price_by_rooms_df

In [None]:
# Load the training and validation sets consumed by the Task 4 and Task 5
# model functions below (paths are relative to the working directory).
train_data = pd.read_csv("../own-project/train.csv")
val_data = pd.read_csv("../own-project/validation.csv")

In [None]:
# Task 4: Fit a Baseline Model (Linear Regression)


def baseline_model(train_df, validation_df):
    """Predict validation sale prices with an ordinary linear regression.

    The model is fitted on the 'bedrooms', 'area' and 'months_listed' columns
    of ``train_df`` against its 'sale_price' column. The result is a frame
    with the validation 'house_id' values and a 'price' column holding the
    predictions rounded to one decimal place.
    """
    predictors = ["bedrooms", "area", "months_listed"]
    train_features, train_target = train_df[predictors], train_df["sale_price"]
    validation_features = validation_df[predictors]

    # fit() returns the fitted estimator, so construction and fitting chain
    regressor = LinearRegression().fit(train_features, train_target)
    rounded_prices = regressor.predict(validation_features).round(1)

    return pd.DataFrame(
        {"house_id": validation_df["house_id"], "price": rounded_prices}
    )


# Fit the baseline model on the training set and predict the validation set
base_result_df = baseline_model(train_df=train_data, validation_df=val_data)

# Show the baseline predictions as the cell output
base_result_df

In [None]:
# Task 5: Fit a Comparison Model (Random Forest)


def comparison_model(train_df, validation_df):
    """Predict validation sale prices with a random forest regressor.

    The forest (100 trees, fixed ``random_state`` of 42) is fitted on the
    'bedrooms', 'area' and 'months_listed' columns of ``train_df`` against its
    'sale_price' column. The result is a frame with the validation 'house_id'
    values and a 'price' column holding the predictions rounded to one
    decimal place.
    """
    predictors = ["bedrooms", "area", "months_listed"]
    train_features, train_target = train_df[predictors], train_df["sale_price"]
    validation_features = validation_df[predictors]

    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(train_features, train_target)
    rounded_prices = forest.predict(validation_features).round(1)

    return pd.DataFrame(
        {"house_id": validation_df["house_id"], "price": rounded_prices}
    )


# Fit the random forest on the training set and predict the validation set
compare_result_df = comparison_model(train_df=train_data, validation_df=val_data)

# Show the comparison predictions as the cell output
compare_result_df