In [15]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load dataset
df = pd.read_csv('property.csv')

# Clean column names
df.columns = df.columns.str.lower()

# Parse date column
df['date'] = pd.to_datetime(df['date'], dayfirst=True, errors='coerce')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month


In [16]:
df.head()

Unnamed: 0,suburb,address,rooms,type,price,method,sellerg,date,distance,postcode,...,landsize,buildingarea,yearbuilt,councilarea,lattitude,longtitude,regionname,propertycount,year,month
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,2016-12-03,2.5,3067.0,...,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0,2016,12
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,2016-02-04,2.5,3067.0,...,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0,2016,2
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,2017-03-04,2.5,3067.0,...,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0,2017,3
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,2017-03-04,2.5,3067.0,...,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0,2017,3
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,2016-06-04,2.5,3067.0,...,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0,2016,6


Q1


In [17]:
altona_prices = df[df['suburb'] == 'Altona']['price'].dropna()

t_stat, p_value = stats.ttest_1samp(altona_prices, 800000)

print("Mean price:", altona_prices.mean())
print("t-statistic:", t_stat)
print("p-value:", p_value)


Mean price: 834830.4054054054
t-statistic: 1.0277020770199676
p-value: 0.307483271305555


Decision:
Failed to reject the null hypothesis (p > 0.05)

Conclusion:
While the average price is higher than $800,000, the increase is not statistically significant. Given that the assumption-- that a typical Altona property sells for $800,000 still holds

Q2

In [18]:
df_2016 = df[df['year'] == 2016]

winter = df_2016[df_2016['month'].isin([10,11,12,1,2,3])]['price'].dropna()
summer = df_2016[~df_2016['month'].isin([10,11,12,1,2,3])]['price'].dropna()

t_stat, p_value = stats.ttest_ind(winter, summer, equal_var=False)

print("Winter mean:", winter.mean())
print("Summer mean:", summer.mean())
print("p-value:", p_value)


Winter mean: 1116647.5917391304
Summer mean: 1048054.7286917741
p-value: 8.950129747463378e-05


Decision: Reject the null hypothesis.

Conclusion: Season impacts prices, it was higher in winter than in summer.

Q3

In [19]:
abb = df[df['suburb'] == 'Abbotsford']
p_no_car = (abb['car'] == 0).mean()
probability = stats.binom.pmf(3, 10, p_no_car)
print("Probability:", round(probability, 3))


Probability: 0.26


Q4

In [20]:
abb = df[df['suburb'] == 'Abbotsford']
prob_3_rooms = (abb['rooms'] == 3).mean()
print("Probability:", round(prob_3_rooms, 3))


Probability: 0.357


Q5

In [21]:
abb = df[df['suburb'] == 'Abbotsford']
prob_2_bath = (abb['bathroom'] == 2).mean()
print("Probability:", round(prob_2_bath, 3))


Probability: 0.339


Q6

In [22]:
richmond_prices = df[df['suburb'] == 'Richmond']['price'].dropna()

t_stat, p_value = stats.ttest_1samp(richmond_prices, 1_000_000)

print("Mean price:", richmond_prices.mean())
print("t-statistic:", t_stat)
print("p-value:", p_value)

Mean price: 1083564.423076923
t-statistic: 2.579547704074923
p-value: 0.01044499066415202


Decision: Reject the null hypothesis

Conclusion: Richmond property prices are significantly higher than $1M

Q7

In [23]:
price_with_car = df[df['car'] > 0]['price'].dropna()
price_no_car = df[df['car'] == 0]['price'].dropna()

t_stat, p_value = stats.ttest_ind(price_with_car, price_no_car, equal_var=False)

print("With car mean:", price_with_car.mean())
print("No car mean:", price_no_car.mean())
print("p-value:", p_value)


With car mean: 1074443.9241114312
No car mean: 1079088.0107212476
p-value: 0.7854749985398599


Car parking does not guarantee a higher sale price on its own.

Q8

In [24]:
anova_df = df[['price', 'suburb', 'type']].dropna()

model = ols('price ~ C(suburb) + C(type) + C(suburb):C(type)', data=anova_df).fit()
anova_results = sm.stats.anova_lm(model, typ=2)

print(anova_results)


  F /= J


                         sum_sq       df          F         PR(>F)
C(suburb)          3.559152e+15    313.0  70.854970   0.000000e+00
C(type)                     NaN      2.0        NaN            NaN
C(suburb):C(type)  6.470034e+14    626.0   6.440216  2.505093e-263
Residual           2.068639e+15  12890.0        NaN            NaN




While Suburb and type of property has a significant impact, Interaction does not.

Q9

In [25]:
p_value = 0.032

if p_value < 0.05:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")


Reject the null hypothesis


The p value indicates a 3.2% chance of observing the data if no real price difference exists

Reject hypothesis at α = 0.05

Conclusion: The price difference is statistically meaningful and not random

Q10

In [32]:
#Independent two tail test (≤2 bathrooms, 2 bathrooms )
# Hypothesis is that no price difference
more_than_2 = df[df['bathroom'] > 2]['price'].dropna()
two_or_less = df[df['bathroom'] <= 2]['price'].dropna()

t_stat, p_value = stats.ttest_ind(more_than_2, two_or_less, equal_var=False)

print(">2 bathrooms mean:", more_than_2.mean())
print("≤2 bathrooms mean:", two_or_less.mean())
print("p-value:", p_value)


>2 bathrooms mean: 1882824.2
≤2 bathrooms mean: 1007347.9350638977
p-value: 7.38042835669062e-137


Reject the null hypothesis. Houses with more bathrooms cost premium price.