In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
import scipy.stats as stats

# Load the cleaned dataset
df = pd.read_csv('cleaned_house_prices.csv')

print("--- Step 1: Statistical Analysis (ANOVA) ---")
# Hypothesis: Does the Location significantly affect the property price?
groups = [df[df['Location'] == loc]['Price'] for loc in df['Location'].unique()]
f_stat, p_val = stats.f_oneway(*groups)

print(f"ANOVA F-Statistic: {f_stat:.4f}")
print(f"ANOVA P-Value: {p_val:.4e}")

if p_val < 0.05:
    print("Conclusion: Statistically Significant. Location has a major impact on property prices.")
else:
    print("Conclusion: Not Significant. Location does not seem to affect prices much in this dataset.")

print("\n--- Step 2: Machine Learning (Price Prediction) ---")
# Preparing features and target
# Converting categorical variables into dummy/indicator variables
X = pd.get_dummies(df[['Area', 'Bedrooms', 'Bathrooms', 'Age', 'Location', 'Property_Type']], drop_first=True)
y = df['Price']

# Splitting data into Training (80%) and Testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing and training the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions
predictions = model.predict(X_test)

# Evaluating the model
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(f"Model Performance:")
print(f"R-Squared Score: {r2:.4f} (This shows how well the model explains the price variation)")
print(f"Mean Absolute Error: ${mae:,.2f}")

print("\n--- Step 3: Feature Importance Analysis ---")
# Identifying which factors drive the price the most
importance = pd.Series(model.coef_, index=X.columns).sort_values(ascending=False)

print("Impact of different features on House Price:")
print(importance)

# Quick Insight for the report
top_feature = importance.index[0]
print(f"\nKey Business Insight: '{top_feature}' is the strongest positive driver of property value.")

--- Step 1: Statistical Analysis (ANOVA) ---
ANOVA F-Statistic: 58.6818
ANOVA P-Value: 3.3355e-22
Conclusion: Statistically Significant. Location has a major impact on property prices.

--- Step 2: Machine Learning (Price Prediction) ---
Model Performance:
R-Squared Score: 0.9406 (This shows how well the model explains the price variation)
Mean Absolute Error: $2,188,736.34

--- Step 3: Feature Importance Analysis ---
Impact of different features on House Price:
Bedrooms               1.585709e+06
Bathrooms              4.545940e+05
Property_Type_Villa    6.372497e+04
Area                   7.558956e+03
Age                   -8.244368e+04
Property_Type_House   -6.079037e+05
Location_Suburb       -8.633012e+06
Location_Rural        -1.672813e+07
dtype: float64

Key Business Insight: 'Bedrooms' is the strongest positive driver of property value.
