Task 1: Familiarize Yourself with the Dataset

In [None]:
import pandas as pd

# Read the Boston housing CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer="boston_housing.csv")

# Preview the first five records to confirm the columns parsed as expected.
data.head(n=5)


Task 2: Generate Descriptive Statistics and Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of MEDV as a single horizontal box, drawn on an explicit Axes
# so the title and label are attached to a known object rather than to
# pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=data['MEDV'], ax=ax)
ax.set(
    title='Boxplot of Median Value of Owner-Occupied Homes (MEDV)',
    xlabel="Median Value ($1000's)",
)
plt.show()


In [None]:
# Bar plot for CHAS: number of rows in each Charles River category.
fig, ax = plt.subplots(figsize=(8, 6))

# seaborn 0.13 deprecates `palette` without `hue` (removal planned for 0.14).
# Mapping the x variable to `hue` keeps one colour per bar; `dodge=False`
# keeps each bar centred on its tick.
sns.countplot(
    data=data,
    x='CHAS',
    hue='CHAS',
    palette='viridis',
    dodge=False,
    ax=ax,
)

# The hue legend only repeats the x tick labels. Remove it after the fact
# instead of passing `legend=False`, which seaborn < 0.13 does not accept.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title('Bar Plot of Homes Bounded by the Charles River')
ax.set_xlabel('Charles River (1 = Yes, 0 = No)')
ax.set_ylabel('Number of Homes')
plt.show()


In [None]:
# Discretize AGE into three groups whose boundaries match their labels:
#   AGE <= 35       -> '35 years and younger'
#   35 < AGE < 70   -> 'Between 35 and 70 years'
#   AGE >= 70       -> '70 years and older'
# pd.cut with its default right-closed bins would place AGE == 70 in the
# middle group and turn AGE == 0 into NaN, so the groups are built from
# explicit comparisons instead.
bins = [0, 35, 70, 100]
labels = ['35 years and younger', 'Between 35 and 70 years', '70 years and older']

age = data['AGE']
lower_cut, upper_cut = bins[1], bins[2]

# Start from an all-missing column so rows with a missing AGE stay missing.
age_group = pd.Series(index=data.index, dtype='object')
age_group[age <= lower_cut] = labels[0]
age_group[(age > lower_cut) & (age < upper_cut)] = labels[1]
age_group[age >= upper_cut] = labels[2]

# Ordered categorical (same dtype kind pd.cut produced) so plots and
# group-bys keep the youngest-to-oldest order.
data['AGE_group'] = pd.Categorical(age_group, categories=labels, ordered=True)

# Boxplot of MEDV vs. AGE_group: one box per age category.
fig, ax = plt.subplots(figsize=(10, 6))

# seaborn 0.13 deprecates `palette` without `hue` (removal planned for 0.14).
# Mapping the x variable to `hue` keeps one colour per box; `dodge=False`
# keeps each box centred on its tick.
sns.boxplot(
    data=data,
    x='AGE_group',
    y='MEDV',
    hue='AGE_group',
    palette='coolwarm',
    dodge=False,
    ax=ax,
)

# The hue legend only repeats the x tick labels. Remove it after the fact
# instead of passing `legend=False`, which seaborn < 0.13 does not accept.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title('Boxplot of Median Value (MEDV) vs. Age Group of Homes')
ax.set_xlabel('Age Group of Homes')
ax.set_ylabel("Median Value ($1000's)")
plt.show()

In [None]:
# NOX (y-axis) against INDUS (x-axis), one point per row of the dataset,
# drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='INDUS', y='NOX', color='darkred', ax=ax)
ax.set(
    title='Scatter Plot of NOX vs. INDUS',
    xlabel='Proportion of Non-Retail Business Acres per Town (INDUS)',
    ylabel='Nitric Oxides Concentration (NOX)',
)
plt.show()

In [None]:
# PTRATIO distribution: 15-bin histogram with a kernel density estimate
# overlaid, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x='PTRATIO', bins=15, kde=True, color='skyblue', ax=ax)
ax.set(
    title='Histogram of Pupil-Teacher Ratio (PTRATIO)',
    xlabel='Pupil-Teacher Ratio by Town',
    ylabel='Frequency',
)
plt.show()

Task 3: Use the Appropriate Tests to Answer the Questions

In [None]:
from scipy.stats import ttest_ind

# Split MEDV into the two CHAS groups (0 = not bounded by the river,
# 1 = bounded by the river).
chas_0 = data.loc[data['CHAS'] == 0, 'MEDV']
chas_1 = data.loc[data['CHAS'] == 1, 'MEDV']

# Welch's t-test (equal_var=False): the default pooled-variance test assumes
# both groups share the same variance, which this notebook never checks.
# Welch's variant stays valid when the variances or group sizes differ.
t_stat, p_value = ttest_ind(chas_0, chas_1, equal_var=False)

# Results, judged at the 5% significance level.
print(f"T-statistic: {t_stat}, P-value: {p_value}")
if p_value < 0.05:
    print("There is a significant difference in median values of houses bounded by the Charles River.")
else:
    print("There is no significant difference in median values of houses bounded by the Charles River.")


In [None]:
from scipy.stats import f_oneway

# Pull the MEDV values for each AGE_group label, in youngest-to-oldest order.
group1, group2, group3 = (
    data.loc[data['AGE_group'] == age_label, 'MEDV']
    for age_label in (
        '35 years and younger',
        'Between 35 and 70 years',
        '70 years and older',
    )
)

# One-way ANOVA across the three age groups.
f_stat, p_value = f_oneway(group1, group2, group3)

# Report the statistic, then the verdict at the 5% significance level.
print(f"F-statistic: {f_stat}, P-value: {p_value}")
print(
    "There is a significant difference in median values of houses across different age groups."
    if p_value < 0.05
    else "There is no significant difference in median values of houses across different age groups."
)


In [None]:
from scipy.stats import pearsonr

# Linear association between the NOX and INDUS columns.
nox_values = data['NOX']
indus_values = data['INDUS']
corr, p_value = pearsonr(nox_values, indus_values)

# Report the coefficient, then the verdict at the 5% significance level.
print(f"Pearson Correlation Coefficient: {corr}, P-value: {p_value}")
print(
    "There is a significant correlation between NOX and INDUS."
    if p_value < 0.05
    else "There is no significant correlation between NOX and INDUS."
)


In [None]:
import statsmodels.api as sm

# Response: MEDV. Predictor: DIS, plus an explicit intercept column, because
# statsmodels' OLS does not add one on its own.
y = data['MEDV']
X = sm.add_constant(data['DIS'])

# Fit ordinary least squares of MEDV on [const, DIS].
model = sm.OLS(endog=y, exog=X).fit()

# Print the full regression table (coefficients, p-values, R-squared, ...).
print(model.summary())
