# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, pearsonr
from statsmodels.formula.api import ols
import statsmodels.api as sm

# Load the dataset from a CSV file (path is relative to the working directory).
# The analyses below read its CHAS, MEDV, AGE, INDUS, NOX and DIS columns.
data = pd.read_csv(filepath_or_buffer='housing_dataset.csv')

# Question 1: T-test for median values of houses bounded by the Charles river or not
# Split MEDV on the CHAS flag (1 vs. 0) and compare the two samples with
# scipy's independent two-sample t-test, called with its default settings.
river_houses = data.loc[data['CHAS'].eq(1), 'MEDV']
non_river_houses = data.loc[data['CHAS'].eq(0), 'MEDV']
t_stat, p_value = ttest_ind(river_houses, non_river_houses)
# One print call; sep="\n" reproduces the three-line report and the trailing
# blank line that follows the p-value.
print(
    "Question 1: T-Test Results",
    f"T-statistic: {t_stat}",
    f"P-value: {p_value}\n",
    sep="\n",
)

# Question 2: Box plot comparing median values based on the proportion of owner-occupied units built before 1940
# Bin AGE into four quantile-based intervals, store the bins in a new
# 'proportion' column on `data`, and draw one MEDV box per interval.
data['proportion'] = pd.qcut(data['AGE'], 4)
plt.figure(figsize=(8, 6))
# seaborn draws on the current axes and returns them, so the labels and the
# title can be applied to that Axes object directly.
ax = sns.boxplot(data=data, x='proportion', y='MEDV')
ax.set(
    xlabel='Proportion of Owner-occupied Units Built Before 1940',
    ylabel='Median Value',
    title='Median Values Based on Proportion of Owner-occupied Units Built Before 1940',
)
plt.show()

# Question 3: Scatter plot of Nitric oxide concentrations vs. proportion of non-retail business acres per town
# INDUS goes on the x-axis and NOX on the y-axis; the figure and its axes are
# created explicitly and passed to seaborn.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='INDUS', y='NOX', ax=ax)
ax.set_xlabel('Proportion of Non-retail Business Acres per Town')
ax.set_ylabel('Nitric Oxide Concentrations')
ax.set_title('Nitric Oxide Concentrations vs. Proportion of Non-retail Business Acres per Town')
plt.show()

# Question 4: Regression analysis for weighted distance to employment centers vs. median value of owner-occupied homes
# Fit an ordinary least squares model with MEDV as the response and DIS as the
# single predictor named in the formula, then print the heading and the
# fitted model's summary table.
model = ols(formula='MEDV ~ DIS', data=data).fit()
print("Question 4: Regression Analysis Results", model.summary(), sep="\n")

# Plot regression line and data points
# seaborn's regplot draws the DIS/MEDV scatter together with a fitted
# regression line on the axes created here.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(data=data, x='DIS', y='MEDV', ax=ax)
ax.set_xlabel('Weighted Distance to Employment Centers')
ax.set_ylabel('Median Value of Owner-occupied Homes')
ax.set_title('Weighted Distance to Employment Centers vs. Median Value of Owner-occupied Homes')
plt.show()
