In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import \
LabelEncoder, OneHotEncoder, StandardScaler

In [3]:
# Read adult.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='adult.csv')
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
# Partition the column labels by dtype: object-typed columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Report both groups, one line per group.
for heading, column_index in (
    ("Categorical Columns:", categorical_cols),
    ("Numerical Columns:", numerical_cols),
):
    print(heading, column_index)

Categorical Columns: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country', 'income'],
      dtype='object')
Numerical Columns: Index(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')


In [6]:
# Label-encode the target column: learn the distinct income labels first,
# then overwrite the column with their integer codes.
le = LabelEncoder()
le.fit(df['income'])
df['income'] = le.transform(df['income'])

In [7]:
# One-hot encode the non-numeric columns of df. drop_first=True omits the
# first level of each encoded column, so k levels yield k-1 indicator columns.
df_encoded = pd.get_dummies(data=df, drop_first=True)
df_encoded.head(n=5)

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,38,89814,9,0,0,50,0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,28,336951,12,0,0,40,1,False,True,False,...,False,False,False,False,False,False,False,True,False,False
3,44,160323,10,7688,0,40,1,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,18,103497,10,0,0,30,0,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [8]:
# Standardize the numeric columns: fit the scaler on them, then replace the
# raw values in df_encoded with the scaled ones.
scaler = StandardScaler()
scaler.fit(df_encoded[numerical_cols])
df_encoded[numerical_cols] = scaler.transform(df_encoded[numerical_cols])

df_encoded.head(n=5)

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,-0.995129,0.351675,-1.197259,-0.144804,-0.217127,-0.034087,0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,-0.046942,-0.945524,-0.419335,-0.144804,-0.217127,0.77293,0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,-0.776316,1.394723,0.74755,-0.144804,-0.217127,-0.034087,1,False,True,False,...,False,False,False,False,False,False,False,True,False,False
3,0.390683,-0.277844,-0.030373,0.886874,-0.217127,-0.034087,1,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,-1.505691,-0.815954,-0.030373,-0.144804,-0.217127,-0.841104,0,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [9]:
# Summary statistics of the numeric columns in df (not the scaled df_encoded).
df.loc[:, numerical_cols].describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [10]:
# Summary statistics of the same numeric columns in df_encoded.
df_encoded.loc[:, numerical_cols].describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,2.281092e-16,-5.848208000000001e-17,-9.208746000000001e-17,1.04744e-17,-1.0183450000000001e-17,4.4661690000000005e-17
std,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001
min,-1.578629,-1.67968,-3.53103,-0.1448035,-0.2171271,-3.181452
25%,-0.7763164,-0.6828752,-0.4193353,-0.1448035,-0.2171271,-0.03408696
50%,-0.119879,-0.1090844,-0.03037346,-0.1448035,-0.2171271,-0.03408696
75%,0.6824334,0.4543232,0.7475502,-0.1448035,-0.2171271,0.3694214
max,3.745808,12.31723,2.303397,13.27438,10.59179,4.727312


Scaling ensures all features contribute equally to the model. Algorithms like KNN and SVM, and models trained with gradient descent, perform better when features are on the same scale.

In [11]:
# Write df_encoded to disk; index=False leaves the row index out of the file.
df_encoded.to_csv(path_or_buf="adult_processed.csv", index=False)

In [None]:
# NOTE(review): bare literal with no effect in an unexecuted cell — looks like a leftover scratch cell; confirm and delete.
4