# Assignment 8: EDA & Hypothesis Testing  
**Name:** Sanskruti Odhe  
**Dataset:** Adult Income Dataset  


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn below.
sns.set_style("whitegrid")


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the UCI Adult data file. No header row is read from the file
# (names=cols), so the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
cols = ["age","workclass","fnlwgt","education","education_num",
        "marital_status","occupation","relationship","race","sex",
        "capital_gain","capital_loss","hours_per_week","native_country","income"]
# skipinitialspace=True strips the blank that follows each comma, so the
# missing-value marker reaches NA matching as "?" rather than " ?".
# Passing " ?" here would leave every missing value as a literal "?" string.
df = pd.read_csv(url, names=cols, na_values="?", skipinitialspace=True)


In [None]:
# Structural overview: dimensions, column dtypes, and per-column null counts
frame_dims = df.shape
print(frame_dims)
df.info()
null_counts = df.isna().sum()
print(null_counts)

# Preview the first rows (rendered as the cell's output)
df.head()


In [None]:
# Summary statistics for both column families.
# A notebook cell only renders its final expression, so a bare
# `df.describe()` on an earlier line would be computed and thrown away.
# display() shows both tables explicitly.
display(
    df.describe(),                                  # numeric
    df.select_dtypes(include="object").describe(),  # categorical
)


In [None]:
# 5.1 Key numeric features: histograms on the top row, box plots split by
# income bracket on the bottom row
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

hist_specs = [(df.age, "Age"), (df.hours_per_week, "Hours/Week")]
for col_idx, (values, panel_title) in enumerate(hist_specs):
    sns.histplot(values, bins=30, ax=axes[0, col_idx]).set_title(panel_title)

box_specs = [("capital_gain", "Cap Gain by Income"),
             ("capital_loss", "Cap Loss by Income")]
for col_idx, (feature, panel_title) in enumerate(box_specs):
    sns.boxplot(x="income", y=feature, data=df, ax=axes[1, col_idx]).set_title(panel_title)

plt.tight_layout()

# 5.2 Education level counts, coloured by income bracket
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x="education", hue="income")
plt.title("Income by Education")
plt.xticks(rotation=45)

# 5.3 Pairwise correlation between the numeric features
# (`num` is reused by a later cell, so the name is kept)
num = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
corr_matrix = df[num].corr()
plt.figure(figsize=(5, 4))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Numeric Corr")


# 6. Hypotheses
H₀₁: Mean hours_per_week is equal for income ≤50K vs >50K.
H₁₁: Mean hours_per_week is higher for income >50K.

H₀₂: Education level and income bracket are independent.
H₁₂: Education level and income bracket are not independent (the share earning >50K varies with education level).

H₀₃: Age distributions are the same across income groups.
H₁₃: Age distribution differs between income groups.

In [None]:
# 7 Hypothesis testing
# p-values are printed with 3 significant digits (scientific notation when
# tiny). Rounding to 3 decimals would show very small p-values as "0.0".

# Test 1: t-test on hours_per_week, without assuming equal variances
# (equal_var=False). The alternative hypothesis is directional (the >50K
# group works more hours), so the test is one-sided: alternative="greater"
# tests mean(high) > mean(low).
high = df[df.income==">50K"].hours_per_week
low  = df[df.income=="<=50K"].hours_per_week
t, p = stats.ttest_ind(high, low, equal_var=False, alternative="greater")
print(f"T-test hours/week: {t:.3f} p= {p:.3g}")

# Test 2: chi-square test of independence on the education x income
# contingency table.
ct = pd.crosstab(df.education, df.income)
chi2, p2, *_ = stats.chi2_contingency(ct)
print(f"Chi2 educ vs income: {chi2:.1f} p= {p2:.3g}")

# Test 3: two-sample Kolmogorov-Smirnov test comparing the age distributions
# of the two income groups.
age_high = df[df.income==">50K"].age
age_low  = df[df.income=="<=50K"].age
ks, p3    = stats.ks_2samp(age_high, age_low)
print(f"KS-test age: {ks:.3f} p= {p3:.3g}")


In [None]:
# Extra credit

# --- Logistic regression & ROC ---
# Drop incomplete rows once and derive both X and y from the same frame, so
# features and target are guaranteed to stay row-aligned.
model_df = df.dropna()
X = pd.get_dummies(model_df.drop("income", axis=1), drop_first=True)
y = model_df.income.map({">50K":1,"<=50K":0})

# Evaluate on a held-out split: an AUC computed on the rows the model was
# fitted to is optimistically biased. The split is stratified on the target
# and seeded so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
probs = model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, probs)
print("AUC =", auc(fpr,tpr).round(3))

# Each chart gets its own figure. Series.plot reuses the current axes when a
# figure is already open, which would draw the bars on top of the ROC curve.
fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr, tpr)
ax_roc.plot([0,1],[0,1],"--")  # chance-level diagonal
ax_roc.set(title="ROC Curve", xlabel="False positive rate", ylabel="True positive rate")

# --- Missing-value analysis ---
# isna().mean() is a fraction; scale by 100 so the values match the "%" title.
missing = df.isna().mean().mul(100).sort_values(ascending=False)
fig_missing, ax_missing = plt.subplots()
missing.plot.barh(ax=ax_missing, title="Miss % by Feature")

# --- Pairplot on subsample ---
# Seeded sample so the figure is the same on every run. `num` is the list of
# numeric columns defined in the EDA cell.
sns.pairplot(df[num+["income"]].sample(500, random_state=42), hue="income")

