# Boston Housing Project
Generated automatically.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
import numpy as np

# Load dataset
# Silence every warning before loading (presumably to hide load_boston's
# deprecation notice -- confirm against the installed scikit-learn).
import warnings
warnings.filterwarnings('ignore')
boston = load_boston()
# Feature matrix becomes the frame; the target is appended as the last column.
df = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(MEDV=boston.target)
df.head()

In [None]:
# Boxplot MEDV
# Quick look at the spread of the target on a default-sized figure.
fig, ax = plt.subplots()
ax.boxplot(df['MEDV'])
ax.set_title('Boxplot of MEDV')
plt.show()

In [None]:
# Task 1 — Summary Statistics & Missing Values
# Reloads the dataset so this cell works from a clean frame, then reports
# structure, descriptive statistics and per-column null counts.

# Load dataset
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df["MEDV"] = boston.target

# 1. Dataset Information
print("### Dataset Info ###")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" line after the report.
df.info()

# 2. Summary Statistics
print("\n### Summary Statistics ###")
print(df.describe())

# 3. Check Missing Values
print("\n### Missing Values ###")
# Number of null entries in each column.
print(df.isnull().sum())


In [None]:
# Task 2 — Visualizations with Matplotlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston

# Load Data
# Fresh copy of the dataset for the plots below; MEDV goes in as the last column.
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df.insert(len(df.columns), "MEDV", boston.target)

# -------------------------------
# 1. Histogram for MEDV
# -------------------------------
# Distribution of the target over 20 bins, drawn through the Axes interface.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df['MEDV'], bins=20)
ax.set_title('Histogram of MEDV')
ax.set_xlabel('Median Value (MEDV)')
ax.set_ylabel('Frequency')
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

# -------------------------------
# 2. Scatter Plot — RM vs MEDV
# -------------------------------
# One point per row: rooms per dwelling on x, target value on y.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(df['RM'], df['MEDV'])
ax.set_title('Scatter Plot: RM vs MEDV')
ax.set_xlabel('Average Rooms per Dwelling (RM)')
ax.set_ylabel('MEDV')
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

# -------------------------------
# 3. Boxplot of MEDV
# -------------------------------
# Same boxplot as the quick look earlier, now with a fixed size, axis label and grid.
fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(df['MEDV'])
ax.set_title('Boxplot of MEDV')
ax.set_ylabel('MEDV')
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

# -------------------------------
# 4. Bar Plot — CHAS (0 = no river, 1 = river)
# -------------------------------
# CHAS is stored as float, so without the int cast the tick labels would read
# "0.0" / "1.0" instead of the "0" / "1" documented on the x-axis label.
chas_counts = df['CHAS'].astype(int).value_counts().sort_index()

plt.figure(figsize=(6,4))
plt.bar(chas_counts.index.astype(str), chas_counts.values)
plt.title('Bar Plot of CHAS')
plt.xlabel('CHAS (0 = No River, 1 = River)')
plt.ylabel('Count')

# Annotate each bar with its count, nudged one unit above the bar top.
for i, v in enumerate(chas_counts.values):
    plt.text(i, v + 1, str(v), ha='center')

plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

# -------------------------------
# 5. Correlation Heatmap (Matplotlib only)
# -------------------------------
# Pairwise correlation matrix of every column in df.
corr = df.corr()
tick_positions = list(range(len(corr.columns)))

fig, ax = plt.subplots(figsize=(10, 8))
heatmap = ax.imshow(corr, cmap='viridis')
fig.colorbar(heatmap, ax=ax)
# One tick per column on both axes; x labels are rotated so they do not overlap.
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Task 3 — Hypothesis Testing

import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from scipy import stats
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

# Load dataset
# Rebuilt from scratch so the tests below do not depend on earlier cells' edits to df.
boston = load_boston()
features, target = boston.data, boston.target
df = pd.DataFrame(features, columns=boston.feature_names)
df['MEDV'] = target

# Significance level shared by all three tests in this cell.
alpha = 0.05

# ------------------------------------------------------------
# 1. T-Test: Is MEDV significantly different for CHAS = 0 vs 1
# ------------------------------------------------------------
# equal_var=False selects Welch's test: the two groups are not assumed to
# share a common variance.

medv_chas0 = df.loc[df['CHAS'] == 0, 'MEDV']
medv_chas1 = df.loc[df['CHAS'] == 1, 'MEDV']

t_stat, p_val = stats.ttest_ind(medv_chas0, medv_chas1, equal_var=False)

print("### T-Test (MEDV ~ CHAS) ###")
print("T-statistic:", t_stat)
print("P-value:", p_val)

# Pick the verdict first, then print it once.
ttest_conclusion = (
    "Conclusion: Reject H0 — Significant difference in MEDV between CHAS groups.\n"
    if p_val < alpha
    else "Conclusion: Fail to reject H0 — No significant difference.\n"
)
print(ttest_conclusion)

# ------------------------------------------------------------
# 2. ANOVA: MEDV across AGE groups
# ------------------------------------------------------------

# Create AGE groups
# pd.cut closes intervals on the right by default, so the three groups are
# (-1, 35], (35, 70] and (70, 200].
bins = [-1, 35, 70, 200]
labels = ['<=35', '36-70', '>70']
df['AGE_Group'] = pd.cut(df['AGE'], bins=bins, labels=labels)

# MEDV values of each AGE group, in the same order as `labels`.
group1, group2, group3 = (
    df.loc[df['AGE_Group'] == label, 'MEDV'] for label in labels
)

f_stat, p_val_anova = stats.f_oneway(group1, group2, group3)

print("### ANOVA (MEDV ~ AGE Groups) ###")
print("F-statistic:", f_stat)
print("P-value:", p_val_anova)

anova_conclusion = (
    "Conclusion: Reject H0 — At least one AGE group mean is different.\n"
    if p_val_anova < alpha
    else "Conclusion: Fail to reject H0 — No significant difference.\n"
)
print(anova_conclusion)

# ------------------------------------------------------------
# 3. Pearson Correlation — NOX vs INDUS
# ------------------------------------------------------------
# pearsonr returns the correlation coefficient together with its p-value.

nox_indus_result = stats.pearsonr(df['NOX'], df['INDUS'])
r, p_corr = nox_indus_result

print("### Pearson Correlation (NOX vs INDUS) ###")
print("Correlation coefficient (r):", r)
print("P-value:", p_corr)

pearson_conclusion = (
    "Conclusion: Significant linear relationship between NOX and INDUS.\n"
    if p_corr < alpha
    else "Conclusion: No significant linear correlation.\n"
)
print(pearson_conclusion)


In [None]:
# ------------------------------
# TASK 4: FEATURE SCALING
# MinMaxScaler, StandardScaler, RobustScaler
# ------------------------------

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import pandas as pd

# Select only numerical columns for scaling
# Only int64/float64 columns are kept, so any column of another dtype is left
# out of the scalers.
numeric_part = df.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_part.columns
num_cols


In [None]:
# MinMax Scaling
# Fit and transform in one step, then rebuild a frame whose columns carry
# a "_minmax" suffix.
minmax_scaler = MinMaxScaler()
minmax_values = minmax_scaler.fit_transform(df[num_cols])
minmax_names = [f"{col}_minmax" for col in num_cols]
df_minmax = pd.DataFrame(minmax_values, columns=minmax_names)

print("MinMax Scaled Data:")
df_minmax.head()


In [None]:
# Standardization
# Same pattern as the other scalers: fit + transform, then label the
# resulting columns with a "_standard" suffix.
standard_scaler = StandardScaler()
standard_values = standard_scaler.fit_transform(df[num_cols])
standard_names = [f"{col}_standard" for col in num_cols]
df_standard = pd.DataFrame(standard_values, columns=standard_names)

print("Standard Scaled Data:")
df_standard.head()


In [None]:
# Robust Scaling (good when dataset has outliers)
# Fit + transform, then label the resulting columns with a "_robust" suffix.
robust_scaler = RobustScaler()
robust_values = robust_scaler.fit_transform(df[num_cols])
robust_names = [f"{col}_robust" for col in num_cols]
df_robust = pd.DataFrame(robust_values, columns=robust_names)

print("Robust Scaled Data:")
df_robust.head()


In [None]:
# Put the three scaled variants side by side; the per-scaler suffixes keep
# the column names distinct.
scaled_frames = [df_minmax, df_standard, df_robust]
df_scaled_combined = pd.concat(scaled_frames, axis="columns")
df_scaled_combined.head()


In [None]:
# Write the combined scaled table to disk without the index column.
df_scaled_combined.to_csv(path_or_buf="scaled_output.csv", index=False)
print("Scaling completed successfully!")


In [None]:
# ---------------------------------------
# TASK 5: CATEGORICAL ENCODING
# OHE, LabelEncoder, OrdinalEncoder
# ---------------------------------------

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# Identify categorical columns
# 'category' must be listed next to 'object': columns produced by pd.cut
# (such as AGE_Group from the hypothesis-testing cell) have the pandas
# categorical dtype and would be missed by an object-only selection.
cat_cols = df.select_dtypes(include=['object', 'category']).columns
cat_cols


In [None]:
# One-Hot Encoding for categorical columns
# drop='first' removes one level per feature to avoid the dummy-variable trap.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    # scikit-learn releases before 1.2 call this parameter `sparse`; those are
    # also the releases that still provide load_boston, which this notebook uses.
    ohe = OneHotEncoder(sparse=False, drop='first')

if len(cat_cols) > 0:
    df_ohe = pd.DataFrame(
        ohe.fit_transform(df[cat_cols]),
        columns=ohe.get_feature_names_out(cat_cols),
        index=df.index,  # keep rows aligned with df for the later concat
    )
else:
    # Nothing to encode: fitting the encoder on zero columns raises, so fall
    # back to an empty frame that still lines up with df row-for-row.
    df_ohe = pd.DataFrame(index=df.index)

df_ohe.head()


In [None]:
# Label Encoding
# One fresh encoder per categorical column, kept in a dict so the integer
# codes can be mapped back to the original labels later.
df_label = df.copy()
label_encoders = {col: LabelEncoder() for col in cat_cols}

for col, encoder in label_encoders.items():
    # Codes go into a new "<column>_label" column; the source column is untouched.
    df_label[f"{col}_label"] = encoder.fit_transform(df[col])

df_label.head()


In [None]:
# Example of ordinal mapping
# Each candidate column is paired with its category order (lowest to highest)
# so that filtering the columns filters the category lists with them.
# Change the mapping based on your dataset.
ordinal_mapping = {
    'Education': ['Low', 'Medium', 'High'],
    'Size': ['Small', 'Medium', 'Large'],
    'Priority': ['Low', 'Medium', 'High'],
}

# Keep only the columns that actually exist in df, with their matching orders.
ordinal_cols = [col for col in ordinal_mapping if col in df.columns]
ordinal_categories = [ordinal_mapping[col] for col in ordinal_cols]

df_ordinal = df.copy()
if ordinal_cols:
    ord_enc = OrdinalEncoder(categories=ordinal_categories)
    df_ordinal[ordinal_cols] = ord_enc.fit_transform(df[ordinal_cols])
else:
    # None of the example columns are present: the encoder cannot be fitted on
    # zero columns, so df_ordinal stays a plain copy of df.
    ord_enc = None
    print("No ordinal columns found in df; skipping ordinal encoding.")

df_ordinal.head()


In [None]:
# Assemble the final table: the non-categorical columns of df, the one-hot
# columns, and the "<column>_label" integer codes.
label_columns = [f"{col}_label" for col in cat_cols]
encoded_parts = [
    df.drop(columns=cat_cols),
    df_ohe,
    df_label[label_columns],
]
df_encoded_combined = pd.concat(encoded_parts, axis="columns")

df_encoded_combined.head()


In [None]:
# Write the combined encoded table to disk without the index column.
df_encoded_combined.to_csv(path_or_buf="encoded_output.csv", index=False)
print("Categorical Encoding Completed Successfully!")
