In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Load the data
house_sales_data = pd.read_csv("../own-project/house_sales.csv")

house_sales_data.head()

Unnamed: 0,house_id,city,sale_price,sale_date,months_listed,bedrooms,house_type,area
0,1217792,Silvertown,55943,2021-09-12,5.4,2,Semi-detached,107.8 sq.m.
1,1900913,Silvertown,384677,2021-01-17,6.3,5,Detached,498.8 sq.m.
2,1174927,Riverford,281707,2021-11-10,6.9,6,Detached,542.5 sq.m.
3,1773666,Silvertown,373251,2020-04-13,6.1,6,Det.,528.4 sq.m.
4,1258487,Silvertown,328885,2020-09-24,8.7,5,Detached,477.1 sq.m.


In [2]:
# Task 1: Treat '--' as missing values in 'city' and replace with "Unknown"
# Replace '--' with NaN and count missing values
house_sales_data["city"] = house_sales_data["city"].replace("--", pd.NA)
missing_city = house_sales_data["city"].isna().sum()  # Count missing values in 'city'

# Replace missing values in 'city' with "Unknown"
house_sales_data["city"] = house_sales_data["city"].fillna("Unknown")

print("Missing values in city: ", missing_city)

Missing values in city:  73


In [3]:
# Task 2: Data Cleaning


def clean_data(df):
    """Clean the dataset according to specified criteria."""
    # Remove rows where 'sale_price' is missing
    df = df.dropna(subset=["sale_price"]).copy()

    # Replace missing 'sale_date' values with '2023-01-01'
    df["sale_date"] = df["sale_date"].fillna("2023-01-01")

    # Replace missing 'months_listed' values with the mean, rounded to one decimal place
    mean_months_listed = round(df["months_listed"].mean(), 1)
    df["months_listed"] = df["months_listed"].fillna(mean_months_listed)

    # Replace missing 'bedrooms' values with the mean, rounded to the nearest integer
    mean_bedrooms = int(round(df["bedrooms"].mean()))
    df["bedrooms"] = df["bedrooms"].fillna(mean_bedrooms)

    # Replace missing 'house_type' values with the most common house type
    most_common_house_type = df["house_type"].mode()[0]
    df["house_type"] = df["house_type"].fillna(most_common_house_type)

    # Remove " sq.m." suffix from 'area' column and convert to numeric
    df["area"] = df["area"].str.replace(" sq.m.", "", regex=False).astype(float)

    # Replace any remaining NA values in 'area' with the mean, rounded to one decimal place
    mean_area = round(df["area"].mean(), 1)
    df["area"] = df["area"].fillna(mean_area)

    return df


# Apply the data cleaning function
clean_data_df = clean_data(house_sales_data)

# Display the first few rows of the cleaned dataset
clean_data_df.head()

Unnamed: 0,house_id,city,sale_price,sale_date,months_listed,bedrooms,house_type,area
0,1217792,Silvertown,55943,2021-09-12,5.4,2,Semi-detached,107.8
1,1900913,Silvertown,384677,2021-01-17,6.3,5,Detached,498.8
2,1174927,Riverford,281707,2021-11-10,6.9,6,Detached,542.5
3,1773666,Silvertown,373251,2020-04-13,6.1,6,Det.,528.4
4,1258487,Silvertown,328885,2020-09-24,8.7,5,Detached,477.1


In [4]:
# Task 3: Calculate average sale price and variance by number of bedrooms


def calculate_price_by_rooms(df):
    """Calculate average sale price and variance by number of bedrooms."""
    price_by_rooms = (
        df.groupby("bedrooms")
        .agg(avg_price=("sale_price", "mean"), var_price=("sale_price", "var"))
        .reset_index()
    )
    price_by_rooms["avg_price"] = price_by_rooms["avg_price"].round(1)
    price_by_rooms["var_price"] = price_by_rooms["var_price"].round(1)
    return price_by_rooms


# Calculate price by rooms
price_by_rooms_df = calculate_price_by_rooms(clean_data_df)

# Display the result
price_by_rooms_df

Unnamed: 0,bedrooms,avg_price,var_price
0,2,67076.4,565289600.0
1,3,154665.1,2378289000.0
2,4,234704.6,1725211000.0
3,5,301515.9,2484328000.0
4,6,375741.3,3924432000.0


In [5]:
# Load train and validation datasets
train_data = pd.read_csv("../own-project/train.csv")
val_data = pd.read_csv("../own-project/validation.csv")

In [6]:
# Task 4: Fit a Baseline Model (Linear Regression)


def baseline_model(train_df, validation_df):
    """Fit a linear regression model to predict sale price and return predictions."""
    features = ["bedrooms", "area", "months_listed"]
    X_train = train_df[features]
    y_train = train_df["sale_price"]
    X_validation = validation_df[features]

    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_validation)

    return pd.DataFrame(
        {"house_id": validation_df["house_id"], "price": predictions.round(1)}
    )


# Generate predictions using the baseline model
base_result_df = baseline_model(train_data, val_data)

# Display the baseline model results
base_result_df

Unnamed: 0,house_id,price
0,1331375,118876.1
1,1630115,257484.2
2,1645745,383988.5
3,1336775,117832.4
4,1888274,256378.7
...,...,...
295,1986255,346927.9
296,1896276,365824.1
297,1758223,239461.9
298,1752010,181628.6


In [7]:
# Task 5: Fit a Comparison Model (Random Forest)


def comparison_model(train_df, validation_df):
    """Fit a random forest model to predict sale price and return predictions."""
    features = ["bedrooms", "area", "months_listed"]
    X_train = train_df[features]
    y_train = train_df["sale_price"]
    X_validation = validation_df[features]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_validation)

    return pd.DataFrame(
        {"house_id": validation_df["house_id"], "price": predictions.round(1)}
    )


# Generate predictions using the comparison model
compare_result_df = comparison_model(train_data, val_data)

# Display the comparison model results
compare_result_df

Unnamed: 0,house_id,price
0,1331375,119426.8
1,1630115,274531.0
2,1645745,371279.0
3,1336775,101516.4
4,1888274,265922.0
...,...,...
295,1986255,306840.2
296,1896276,404949.6
297,1758223,240180.7
298,1752010,217162.5
