In [55]:

import pandas as pd
import numpy as np
from scipy import stats


df = pd.read_csv('/content/property.csv')

print("Start")
print("=" *50)
print()
print("Dataset shape:", df.shape)
print()
print('=' *10)
print()
print("Basic info:")
print(df.info())
print()
print('=' *10)
print()
print("Data types:")
print(df.dtypes)
print()
print('=' *10)
print()
print("Missing values in each column:")
print(df.isnull().sum())
print()
print('=' *10)
print()


car_mode = df['Car'].mode()
if len(car_mode) > 0:
    car_mode_value = car_mode[0]
    print()
    print(f"Most common Car value: {car_mode_value}")
    missing_before = df['Car'].isnull().sum()
    df['Car'] = df['Car'].fillna(car_mode_value)
    missing_after = df['Car'].isnull().sum()
    print(f"Successfully filled {missing_before - missing_after} missing values")
else:
    print("No mode found for Car column")

print()
print('=' *50)
print('='*50)
print()

Start

Dataset shape: (13580, 21)


Basic info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  La

In [56]:

print()
print("QUESTION 1: Testing Altona property prices")
print('='*50)
print()

altona_data = df[df['Suburb'] == 'Altona']

if len(altona_data) > 0:

    prices = altona_data['Price'].dropna()

    test_stat, p_value = stats.ttest_1samp(prices, 800000, alternative='greater')


    if p_value < 0.05:
        print("Conclusion: Reject H0. The typical property price in Altona has increased from $800,000.")
    else:
        print("Conclusion: Fail to reject H0. No evidence that typical property price has increased from $800,000.")
else:
    print("No data found for Altona suburb")

print()
print('=' *50)
print('='*50)
print()


QUESTION 1: Testing Altona property prices

Conclusion: Fail to reject H0. No evidence that typical property price has increased from $800,000.




In [57]:
print()
print("QUESTION 2: Difference in the 2016 prices in summer months vs winter months")
print('='*50)
print()


df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

df_2016 = df[df['Date'].dt.year == 2016].copy()


def get_season(month):
    if month >= 10 or month <= 2:
        return 'winter'
    else:
        return 'summer'

df_2016['season'] = df_2016['Date'].dt.month.apply(get_season)



price_column = None
for col in df.columns:
    if 'price' in col.lower():
        price_column = col
        break


winter_prices = df_2016[df_2016['season'] == 'winter'][price_column].dropna()
summer_prices = df_2016[df_2016['season'] == 'summer'][price_column].dropna()



if len(winter_prices) > 0 and len(summer_prices) > 0:
        print()
        print(f"Price statistics:")
        print(f"Winter mean price: ${winter_prices.mean():,.2f}")
        print(f"Summer mean price: ${summer_prices.mean():,.2f}")
        print(f"Winter median price: ${winter_prices.median():,.2f}")
        print(f"Summer median price: ${summer_prices.median():,.2f}")

        t_stat, p_value = stats.ttest_ind(winter_prices, summer_prices,
                                          equal_var=False,
                                          nan_policy='omit')


        print()
        print(f"Conclusion:")
        if p_value < 0.05:
            print("Reject H0. There is a statistically significant difference")

            if winter_prices.mean() > summer_prices.mean():
                print("Winter prices are higher than summer prices.")
            else:
                print("Summer prices are higher than winter prices.")
        else:
            print("Fail to reject H0. No significant difference found")
print()
print('=' *50)
print('='*50)
print()




QUESTION 2: Difference in the 2016 prices in summer months vs winter months


Price statistics:
Winter mean price: $1,116,647.59
Summer mean price: $1,048,054.73
Winter median price: $931,499.50
Summer median price: $881,000.00

Conclusion:
Reject H0. There is a statistically significant difference
Winter prices are higher than summer prices.




In [73]:
print()
print("QUESTION 3: Car space probability in  Abbotsford")
print('='*50)
print()

abbotsford_data = df[df['Suburb'] == 'Abbotsford']


valid_car_data = abbotsford_data[abbotsford_data['Car'].notnull()]

if len(valid_car_data) > 0:

        no_parking_count = len(valid_car_data[valid_car_data['Car'] == 0])
        total_count = len(valid_car_data)
        p = no_parking_count / total_count


        from scipy.stats import binom

        n = 10
        k = 3
        probability = binom.pmf(k, n, p)

        print(f"Binomial probability(3 out of 10 have no parking)")
        print(f"P = binom.pmf(3, 10, {p:.4f})={probability:.3f}")
        print()
        print("(based upon assumption that no car means no parking)")

else:
        print("No valid car data available")




QUESTION 3: Car space probability in  Abbotsford

Binomial probability(3 out of 10 have no parking)
P = binom.pmf(3, 10, 0.2679)=0.260

(based upon assumption that no car means no parking)


In [84]:
#Q3 (alternate approach, used help from AI)

abbotsford_data = df[df['Suburb'] == 'Abbotsford']



valid_data = abbotsford_data[
        abbotsford_data['Car'].notnull() &
        abbotsford_data['Landsize'].notnull()
    ].copy()

print()
print(f"Properties with both car and land size data: {len(valid_data)}")

if len(valid_data) > 0:


        MIN_SPACE_PER_CAR = 12

        print()
        print(f"Using minimum {MIN_SPACE_PER_CAR} sqm per car as parking requirement")

        def has_parking_space(land_size, cars):

            if cars == 0:

                return "no_cars"
            elif land_size >= (cars * MIN_SPACE_PER_CAR):
                return "has_parking"
            else:
                return "no_parking"


        parking_status = []
        for i, row in valid_data.iterrows():
            status = has_parking_space(row['Landsize'], row['Car'])
            parking_status.append(status)

        valid_data['parking_status'] = parking_status

        status_counts = valid_data['parking_status'].value_counts()



        definite_no_parking = status_counts.get('no_parking', 0)
        no_cars_properties = status_counts.get('no_cars', 0)

        # For properties with no cars, estimate that some have parking, some don't
        # Let's check the land size of properties with no cars
        no_cars_data = valid_data[valid_data['parking_status'] == 'no_cars']

        if len(no_cars_data) > 0:
            # Properties with very small land (< 12 sqm) likely have no parking
            no_cars_no_parking = len(no_cars_data[no_cars_data['Landsize'] < small_land_threshold])

            # For the rest, assume 50% have parking, 50% don't
            remaining_no_cars = len(no_cars_data) - no_cars_no_parking
            estimated_no_parking_from_remaining = remaining_no_cars * 0.5

            total_estimated_no_parking = (definite_no_parking +
                                         no_cars_no_parking +
                                         estimated_no_parking_from_remaining)
        else:
            total_estimated_no_parking = definite_no_parking

        p_no_parking = total_estimated_no_parking / len(valid_data)

        print(f"\nProbability calculation:")
        print(f"Properties definitely without parking: {definite_no_parking}")
        if len(no_cars_data) > 0:
            print(f"Properties with 0 cars and very small land (<12 sqm): {no_cars_no_parking}")
            print(f"Other properties with 0 cars (50% estimated no parking): {estimated_no_parking_from_remaining:.1f}")
        print(f"Total estimated without parking: {total_estimated_no_parking:.1f}")
        print(f"Total properties analyzed: {len(valid_data)}")
        print(f"P(no parking) = {total_estimated_no_parking:.1f}/{len(valid_data)} = {p_no_parking:.4f}")

        # Binomial probability
        from scipy.stats import binom

        n = 10  # 10 properties
        k = 3   # 3 without parking
        p = p_no_parking

        probability = binom.pmf(k, n, p)

        print(f"\nBinomial probability calculation:")
        print(f"n = {n} (10 properties sold)")
        print(f"k = {k} (3 without parking)")
        print(f"p = {p:.4f} (probability of no parking)")
        print(f"P(X = 3) = C(10,3) * ({p:.4f})^3 * (1-{p:.4f})^7")
        print(f"Result: {probability:.6f}")
        print(f"Rounded to 3 decimal places: {probability:.3f}")

else:
        print("Insufficient data for analysis")




Properties with both car and land size data: 56

Using minimum 12 sqm per car as parking requirement

Probability calculation:
Properties definitely without parking: 12
Properties with 0 cars and very small land (<12 sqm): 0
Other properties with 0 cars (50% estimated no parking): 7.5
Total estimated without parking: 19.5
Total properties analyzed: 56
P(no parking) = 19.5/56 = 0.3482

Binomial probability calculation:
n = 10 (10 properties sold)
k = 3 (3 without parking)
p = 0.3482 (probability of no parking)
P(X = 3) = C(10,3) * (0.3482)^3 * (1-0.3482)^7
Result: 0.253195
Rounded to 3 decimal places: 0.253


In [96]:
print()
print("QUESTION 4: Chances of finding 3 room property in Abbotsford")
print('='*50)
print()


abbotsford_data = df[df['Suburb'] == 'Abbotsford']
rooms_data = abbotsford_data['Rooms'].dropna()

properties_with_3_rooms = abbotsford_data[abbotsford_data['Rooms'] == 3].shape[0]
total_properties = len(rooms_data)

probability = properties_with_3_rooms / total_properties

print()
print(f"Total properties with room data: {total_properties}")
print(f"Properties with 3 rooms: {properties_with_3_rooms}")
print(f"Probability of finding property with 3 rooms: {probability:.3f}")

print()
print('=' *50)
print('='*50)
print()


QUESTION 4: Chances of finding 3 room proprty in Abbotsford


Total properties with room data: 56
Properties with 3 rooms: 20
Probability of finding property with 3 rooms: 0.357




In [97]:
print()
print("QUESTION 5: Chances of finding 2 bathroom proprty in Abbotsford")
print('='*50)
print()


abbotsford_data = df[df['Suburb'] == 'Abbotsford']
rooms_data = abbotsford_data['Bathroom'].dropna()

properties_with_3_rooms = abbotsford_data[abbotsford_data['Bathroom'] == 3].shape[0]
total_properties = len(rooms_data)

probability = properties_with_3_rooms / total_properties

print()
print(f"Total properties with bathroom data: {total_properties}")
print(f"Properties with 2 bathrooms: {properties_with_3_rooms}")
print(f"Probability of finding property with 2 bathrooms: {probability:.3f}")

print()
print('=' *50)
print('='*50)
print()


QUESTION 5: Chances of finding 2 bathroom proprty in Abbotsford


Total properties with bathroom data: 56
Properties with 2 bathrooms: 2
Probability of finding property with 2 bathrooms: 0.036




In [102]:
print()
print("QUESTION 6: One Sample Hypothesis testing whether property price in Richmond is $1000,000")
print('='*50)
print()


richmond_data = df[df['Suburb'] == 'Richmond']
prices = richmond_data['Price'].dropna()

print(f"Number of properties in Richmond: {len(prices)}")
print(f"Sample mean price: ${prices.mean():,.2f}")
print(f"Sample standard deviation: ${prices.std():,.2f}")


test_stat, p_value = stats.ttest_1samp(prices, 1000000)


if p_value < 0.05:
    print()
    print("Conclusion:")
    print("Reject H0. There is statistically significant difference.")
    if prices.mean() > 1000000:
        print("Properties in Richmond are more expensive than claimed.")
    else:
        print("Properties in Richmond are less expensive than claimed.")
else:
    print("Conclusion:")
    print("Fail to reject H0. No significant difference found.")

print()
print('=' *50)
print('='*50)
print()


QUESTION 6: One Sample Hypothesis testing whether property price in Richmond is $1000,000

Number of properties in Richmond: 260
Sample mean price: $1,083,564.42
Sample standard deviation: $522,353.52

Conclusion:
Reject H0. There is statistically significant difference.
Properties in Richmond are more expensive than claimed.




In [111]:
#alt method with better logic but built using AI avoided SIMPLER but LESS ACCURATE method used
print()
print("QUESTION 7: Independent Two-Sample T-Test on whether properties with car parking sell higher than with no parking")
print('='*50)
print()


valid_data = df[df['Car'].notnull() & df['Price'].notnull()]
with_parking = valid_data[valid_data['Car'] == 1]['Price']
without_parking = valid_data[valid_data['Car'] == 0]['Price']


t_stat, p_value = stats.ttest_ind(with_parking, without_parking, equal_var=False, alternative='greater')


print("Conclusion:")
if p_value < 0.05:
    print("Since p < 0.05, reject H0.")
    print("Properties with car parking sell at higher average prices.")

else:
    print("Since p > 0.05, cannot reject H0.")
    print("No strong evidence that parking increases price.")

print()
print('=' *50)
print('='*50)
print()



QUESTION 7: Independent Two-Sample T-Test on whether properties with car parking sell higher than with no parking

Conclusion:
Since p > 0.05, cannot reject H0.
No strong evidence that parking increases price.




In [127]:
#was getting error hence with help of AI picked top 5 subsets
print()
print("QUESTION 8: Two-Way ANOVA to explain which factors significantly affect price")
print('='*50)
print()


from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

top_suburbs = df['Suburb'].value_counts().head(5).index.tolist()
df_anova = df[df['Suburb'].isin(top_suburbs)]

df_anova = df_anova.dropna(subset=['Price', 'Suburb', 'Type'])

print(f"Sample size for ANOVA: {len(df_anova)}")
print(f"Suburbs included: {top_suburbs}")
print(f"Property types: {df_anova['Type'].unique()}")


model = ols('Price ~ C(Suburb) + C(Type) + C(Suburb):C(Type)', data=df_anova).fit()
anova_results = anova_lm(model, typ=2)

print()
print(f"Conclusion:")
print(anova_results)

print()
print('=' *50)
print('='*50)
print()



QUESTION 8: Two-Way ANOVA to explain which factors significantly affect price

Sample size for ANOVA: 1329
Suburbs included: ['Reservoir', 'Richmond', 'Bentleigh East', 'Preston', 'Brunswick']
Property types: ['h' 't' 'u']

Conclusion:
                         sum_sq      df           F         PR(>F)
C(Suburb)          4.160952e+13     4.0  139.142810   3.337329e-99
C(Type)            6.480908e+13     2.0  433.444923  2.721411e-145
C(Suburb):C(Type)  6.129412e+12     8.0   10.248420   5.587511e-14
Residual           9.823523e+13  1314.0         NaN            NaN




In [140]:
print()
print("QUESTION 9: A hypothesis test comparing prices across two suburbs")
print('='*50)
print()


p_value = 0.032
alpha = 0.05

print()
print("1. What does this p-value indicate?")
print(f"The p-value of {p_value} indicates that if the null hypothesis were true,there is a {p_value*100:.1f}%\nchance of observing the difference in prices (or more extreme) between the two suburbs.")

print()
print("2. Should the null hypothesis be rejected at α = 0.05?")
if p_value < alpha:
    print(f"Yes, reject the null hypothesis because p-value ({p_value}) < α ({alpha}).")
else:
    print(f"No, do not reject the null hypothesis because p-value ({p_value}) > α ({alpha}).")

print()
print("3. How should a business stakeholder interpret this result?")
print("Business interpretation: There is statistically significant evidence that property prices\ndiffer between the two suburbs.")
print("This information can be used for: Pricing, Investment decisions and Market analysis.")

print()
print('=' *50)
print('='*50)
print()


QUESTION 9: A hypothesis test comparing prices across two suburbs


1. What does this p-value indicate?
The p-value of 0.032 indicates that if the null hypothesis were true,there is a 3.2%
chance of observing the difference in prices (or more extreme) between the two suburbs.

2. Should the null hypothesis be rejected at α = 0.05?
Yes, reject the null hypothesis because p-value (0.032) < α (0.05).

3. How should a business stakeholder interpret this result?
Business interpretation: There is statistically significant evidence that property prices
differ between the two suburbs.
This information can be used for: Pricing, Investment decisions and Market analysis.




In [150]:
print()
print("QUESTION 10: Style Hypothesis Validation")
print('='*50)
print()



premium_group = df[df['Bathroom'] > 2]['Price'].dropna()
standard_group = df[df['Bathroom'] <= 2]['Price'].dropna()

print(f"Properties with >2 bathrooms: {len(premium_group)}")
print(f"Properties with ≤2 bathrooms: {len(standard_group)}")
print(f"Mean price (>2 bathrooms): ${premium_group.mean():,.2f}")
print(f"Mean price (≤2 bathrooms): ${standard_group.mean():,.2f}")


t_stat, p_value = stats.ttest_ind(premium_group, standard_group,
                                      equal_var=False, alternative='greater')

print()
print(f"Results:")
print(f"Test statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print()
print("Recommendation to policymakers:")
if p_value < 0.05:
     print("REJECT the null hypothesis. There is statistical evidence that properties with \nmore than 2 bathrooms command a premium price.")
     print("Policy implications: Consider incentives for developers to include more bathrooms")

else:
     print("FAIL TO REJECT the null hypothesis. No statistical evidence that properties with \nmore than 2 bathrooms command a premium price.")
     print("Policy implications: Focus on other features that may add more value")


print()
print('=' *50)
print('='*50)
print()


QUESTION 10: Style Hypothesis Validation

Properties with >2 bathrooms: 1060
Properties with ≤2 bathrooms: 12520
Mean price (>2 bathrooms): $1,882,824.20
Mean price (≤2 bathrooms): $1,007,347.94

Results:
Test statistic: 28.8002
P-value: 0.0000

Recommendation to policymakers:
REJECT the null hypothesis. There is statistical evidence that properties with 
more than 2 bathrooms command a premium price.
Policy implications: Consider incentives for developers to include more bathrooms


