

# Load Data


In [None]:
import pandas as pd

# Read the Adult dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='adult.csv')
display(df.head(n=5))

# Explore Data
Visualize the key findings from the initial data exploration of the dataset at "/content/adult.csv".

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" output.
df.info()

# Summary statistics for every column, numeric and non-numeric alike.
display(df.describe(include='all'))

In [None]:
# Split the columns by dtype: int64 columns are treated as numerical,
# object columns as categorical.
numerical_cols = df.select_dtypes(include=['int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Print each group under its own heading, one column name per line.
for heading, columns in (
    ("Numerical variables suitable for visualization:", numerical_cols),
    ("\nCategorical variables suitable for visualization:", categorical_cols),
):
    print(heading)
    for name in columns:
        print(f"- {name}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Age histogram with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['age'], kde=True, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

# Categorical columns to chart; 'native.country' is plotted separately.
categorical_cols = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex']

# One horizontal count plot per column, bars ordered from most to least frequent.
for column in categorical_cols:
    ordered_levels = df[column].value_counts().index
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df, y=column, order=ordered_levels, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel('Frequency')
    ax.set_ylabel(column)
    plt.show()

In [None]:
# Rank countries by frequency, leaving out the "?" placeholder that marks
# unknown values in this dataset so it cannot take a slot in the top 10.
known_country = df['native.country'] != '?'
top_countries = df.loc[known_country, 'native.country'].value_counts().nlargest(10).index

plt.figure(figsize=(14, 7))
sns.countplot(
    data=df[df['native.country'].isin(top_countries)],
    y='native.country',
    order=top_countries,
)
plt.title('Distribution of Top 10 Native Countries')
plt.xlabel('Frequency')
plt.ylabel('Native Country')
plt.show()

### Missing Values Analysis

In [None]:
# Unknown entries are stored as the literal string "?", so count those per column
# and express the counts as a percentage of all rows.
missing_counts = df.eq("?").sum()
missing_percent = missing_counts.div(len(df)).mul(100)

missing_summary = pd.DataFrame({
    "Missing_Count": missing_counts,
    "Missing_Percent": missing_percent.round(2),
})

# Only columns that actually contain "?" are plotted.
nonzero_missing = missing_summary.loc[missing_summary["Missing_Count"].gt(0)]

plt.figure(figsize=(8, 4))
plt.bar(x=nonzero_missing.index, height=nonzero_missing["Missing_Count"])
plt.xticks(rotation=45, ha="right")
plt.ylabel("Count of '?' values")
plt.title("Missing Value Distribution Across Columns")
plt.tight_layout()
plt.show()

# Leave the full table as the cell's last expression so the notebook renders it.
missing_summary

## Summary

### Data Analysis Key Findings

*   The dataset exhibits an imbalanced income distribution, with more individuals earning `<=50K` than `>50K`.
*   Higher education levels and being in a 'Married-civ-spouse' marital status are strongly associated with a higher likelihood of earning >50K.
*   Certain workclasses ('Self-emp-inc', 'Federal-gov') and occupations ('Exec-managerial', 'Prof-specialty') show a higher proportion of individuals earning >50K.
*   There is a notable gender income disparity, with a significantly higher proportion of males earning >50K compared to females.
*   Working more hours per week generally correlates with a higher chance of earning >50K.
*   Non-zero capital gains are strongly associated with having an income >50K.
*   Missing values (encoded as `?`) appear in only three categorical columns: workclass (5.64%), occupation (5.66%), and native.country (1.79%). All other columns contain no missing entries.



In [None]:
# Class balance of the target column.
display(df.loc[:, 'income'].value_counts())

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    f1_score
)
import matplotlib.pyplot as plt

# Turn the "?" placeholder into a real NaN so the imputers below can fill it.
df_svm = df.replace("?", np.nan).copy()
X = df_svm.drop("income", axis=1)
y = df_svm["income"]

# Encode the target as integers; label_encoder.classes_ keeps the original
# strings for reporting.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns
print("Numeric features:", list(numeric_features))
print("Categorical features:", list(categorical_features))

# Numeric columns: median imputation, then standardisation. A linear SVM is
# sensitive to feature scale, so unscaled columns slow convergence and let
# large-magnitude features dominate the margin.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode; levels unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Stratified split keeps the class ratio the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

# Preprocessing lives inside the pipeline so it is fitted on the training
# split only. random_state makes the solver's result reproducible.
linear_svm_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("svm", LinearSVC(C=1.0, class_weight="balanced", random_state=42)),
])

linear_svm_pipeline.fit(X_train, y_train)
y_pred_linear = linear_svm_pipeline.predict(X_test)

print("\n Linear SVM (LinearSVC) Classification Report:")
print(classification_report(y_test, y_pred_linear, target_names=label_encoder.classes_))

# Label the matrix axes with the original class names instead of 0/1.
cm_linear = confusion_matrix(y_test, y_pred_linear)
ConfusionMatrixDisplay(cm_linear, display_labels=label_encoder.classes_).plot(cmap="Blues")
plt.title("Confusion Matrix LinearSVC (Linear SVM):")
plt.show()

# f1_score defaults to pos_label=1, i.e. the second entry of label_encoder.classes_.
print("\nF1 Score Linear SVM:", f1_score(y_test, y_pred_linear))
