

# Load Data


In [None]:
import pandas as pd

# Read the CSV into a DataFrame; the path is relative to the notebook's
# working directory.
data_file = 'adult.csv'
df = pd.read_csv(data_file)

# Preview the first five rows to confirm the file parsed as expected.
display(df.head(5))

# Explore Data
This section explores the dataset loaded from `adult.csv` and visualizes the key findings of the initial data exploration.

In [None]:
# Structural overview: column names, non-null counts and dtypes.
# df.info() prints its report directly and returns None, so it must not be
# wrapped in display() -- doing so renders a stray "None" under the report.
df.info()

# Summary statistics for every column (numeric and non-numeric alike).
display(df.describe(include='all'))

In [None]:
# Split the columns by dtype to decide which kind of plot suits each one.
# 'number' matches every numeric dtype (integers of any width and floats);
# filtering on 'int64' alone would silently leave float columns out of the
# "numerical" list.
numerical_cols = df.select_dtypes(include='number').columns
# Object-dtype columns are the ones treated as categorical here.
categorical_cols = df.select_dtypes(include=['object']).columns

print("Numerical variables suitable for visualization:")
for col in numerical_cols:
    print(f"- {col}")

print("\nCategorical variables suitable for visualization:")
for col in categorical_cols:
    print(f"- {col}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the `age` column with a kernel-density curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['age'], kde=True, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

# Hand-picked categorical columns to plot. Note that this rebinds
# `categorical_cols` to a plain list of column names.
categorical_cols = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
]

# One horizontal count plot per column, bars sorted from most to least frequent.
for col in categorical_cols:
    categories_by_frequency = df[col].value_counts().index
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df, y=col, order=categories_by_frequency, ax=ax)
    ax.set(title=f'Distribution of {col}', xlabel='Frequency', ylabel=col)
    plt.show()

In [None]:
# Rank countries by frequency and keep the ten most common.
# '?' is this dataset's placeholder for an unknown value rather than a country,
# so it is dropped before ranking; otherwise it can occupy one of the ten
# slots in a chart that is meant to show countries only.
known_countries = df.loc[df['native.country'] != '?', 'native.country']
top_countries = known_countries.value_counts().nlargest(10).index

plt.figure(figsize=(14, 7))
# Restrict the rows to the selected countries and keep the bars in ranked order.
sns.countplot(data=df[df['native.country'].isin(top_countries)], y='native.country', order=top_countries)
plt.title('Distribution of Top 10 Native Countries')
plt.xlabel('Frequency')
plt.ylabel('Native Country')
plt.show()

### Missing Values Analysis

In [None]:
# Count, per column, how many cells hold the "?" placeholder, and express
# that count as a percentage of all rows.
missing_counts = df.eq("?").sum()
missing_percent = missing_counts / len(df) * 100

missing_summary = pd.DataFrame(
    {
        "Missing_Count": missing_counts,
        "Missing_Percent": missing_percent.round(2),
    }
)

# Only columns that actually contain placeholders are worth plotting.
has_missing = missing_summary["Missing_Count"] > 0
nonzero_missing = missing_summary.loc[has_missing]

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(nonzero_missing.index, nonzero_missing["Missing_Count"])
plt.xticks(rotation=45, ha="right")
ax.set_ylabel("Count of '?' values")
ax.set_title("Missing Value Distribution Across Columns")
fig.tight_layout()
plt.show()

# Last expression of the cell: render the full per-column summary table.
missing_summary

## Summary

### Data Analysis Key Findings

*   The dataset exhibits an imbalanced income distribution, with more individuals earning $\le$50K than >50K.
*   Higher education levels and being in a 'Married-civ-spouse' marital status are strongly associated with a higher likelihood of earning >50K.
*   Certain workclasses ('Self-emp-inc', 'Federal-gov') and occupations ('Exec-managerial', 'Prof-specialty') show a higher proportion of individuals earning >50K.
*   There is a notable gender income disparity, with a significantly higher proportion of males earning >50K compared to females.
*   Working more hours per week generally correlates with a higher chance of earning >50K.
*   Non-zero capital gains are strongly associated with having an income >50K.
*   Missing values (encoded as `?` in this dataset) appear only in three categorical columns: workclass (5.64%), occupation (5.66%), and native.country (1.79%). All other columns contain no missing entries.



In [None]:
# Tally how often each distinct value occurs in the `income` column.
income_counts = df['income'].value_counts()
display(income_counts)