In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Load the raw Adult census extract from the working directory and preview
# the first five rows.
raw_csv_path = "adult.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Report the (rows, columns) tuple, then the per-column dtype / non-null summary.
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
df.info()


Dataset shape: (48842, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
# The file uses a literal "?" as its missing-value marker; convert those
# cells to NaN so pandas' null-aware tooling can see them.
df = df.replace(to_replace="?", value=np.nan)

print("Missing values per column:")
# Count of NaN cells in each column.
df.isna().sum()


Missing values per column:


Unnamed: 0,0
age,0
workclass,2799
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,2809
relationship,0
race,0
gender,0


In [5]:
# Discard every row that has at least one NaN (standard practice for the
# Adult dataset). The original index labels are kept, so gaps appear where
# rows were removed.
df = df.dropna(axis=0, how="any")

rows_cols_after_drop = df.shape
print("Shape after dropping missing values:", rows_cols_after_drop)


Shape after dropping missing values: (45222, 15)


In [6]:
# Partition the column labels by dtype: object (string) columns versus
# 64-bit integer/float columns.
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Print each group under its heading as a plain Python list.
for heading, col_index in (
    ("Categorical Features:", categorical_cols),
    ("\nNumerical Features:", numerical_cols),
):
    print(heading)
    print(col_index.tolist())


Categorical Features:
['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

Numerical Features:
['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [9]:
# Encode the target BEFORE building df_encoded. Previously LabelEncoder ran
# after pd.get_dummies and only modified df, so df_encoded['income'] (and the
# CSV written from df_encoded) still held the raw string labels.
# LabelEncoder assigns integers following the sorted order of the classes.
le = LabelEncoder()
df['income'] = le.fit_transform(df['income'])

# Nominal features to one-hot encode. 'income' is deliberately excluded: it is
# the target and is already label-encoded above.
nominal_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
]

# drop_first=True drops the first level of each feature so the dummy columns
# of one feature are not perfectly collinear.
df_encoded = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

print("Shape after encoding:", df_encoded.shape)

# Class balance of the encoded target.
df['income'].value_counts()


Shape after encoding: (45222, 97)


Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
0,34014
1,11208


In [10]:
# Standardize the numeric columns of the encoded frame; the scaler is fitted
# on the same rows it transforms.
scaler = StandardScaler()

standardized_values = scaler.fit_transform(df_encoded[numerical_cols])
# Whole-column assignment replaces the original integer columns with the
# scaled floats.
df_encoded[numerical_cols] = standardized_values

df_encoded[numerical_cols].head()


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
0,-1.024983,0.350889,-1.221559,-0.146733,-0.21878,-0.07812
1,-0.041455,-0.945878,-0.438122,-0.146733,-0.21878,0.754701
2,-0.798015,1.393592,0.737034,-0.146733,-0.21878,-0.07812
3,0.412481,-0.27842,-0.046403,0.877467,-0.21878,-0.07812
5,-0.344079,0.084802,-1.613277,-0.146733,-0.21878,-0.910942


In [11]:
# Summary statistics of the numeric columns in df, which was never scaled.
print("Before Scaling:")
unscaled_summary = df[numerical_cols].describe()
unscaled_summary


Before Scaling:


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0
mean,38.547941,189734.7,10.11846,1101.430344,88.595418,40.938017
std,13.21787,105639.2,2.552881,7506.430084,404.956092,12.007508
min,17.0,13492.0,1.0,0.0,0.0,1.0
25%,28.0,117388.2,9.0,0.0,0.0,40.0
50%,37.0,178316.0,10.0,0.0,0.0,40.0
75%,47.0,237926.0,13.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [12]:
# Summary statistics of the same numeric columns in the scaled, encoded frame.
print("\nAfter Scaling:")
scaled_summary = df_encoded[numerical_cols].describe()
scaled_summary



After Scaling:


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0
mean,-2.5453970000000003e-17,5.357903e-17,1.693789e-16,-1.4769580000000002e-17,2.6475270000000002e-17,2.165158e-16
std,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011
min,-1.630231,-1.668365,-3.57187,-0.1467332,-0.2187803,-3.326124
25%,-0.7980149,-0.6848527,-0.4381216,-0.1467332,-0.2187803,-0.07812006
50%,-0.117111,-0.108093,-0.046403,-0.1467332,-0.2187803,-0.07812006
75%,0.6394489,0.4561924,1.128753,-0.1467332,-0.2187803,0.3382907
max,3.892656,12.31247,2.303909,13.17519,10.53806,4.835527


In [13]:
# Short narrative note on why the numeric features were standardized.
scaling_notes = """
Scaling Impact:
- Prevents large-value features from dominating.
- Improves convergence of gradient-based models.
- Required for KNN, SVM, K-Means, Linear & Logistic Regression.
"""
print(scaling_notes)



Scaling Impact:
- Prevents large-value features from dominating.
- Improves convergence of gradient-based models.
- Required for KNN, SVM, K-Means, Linear & Logistic Regression.



In [14]:
# Write the encoded, scaled frame to disk without the index column.
output_file = "Adult_Preprocessed.csv"
df_encoded.to_csv(path_or_buf=output_file, index=False)

print("✅ Saved:", output_file)


✅ Saved: Adult_Preprocessed.csv


In [18]:
# Offer the preprocessed CSV as a browser download when running on Google Colab.
# The google.colab package is not importable on other kernels, so the import is
# guarded: elsewhere the file written to disk is the deliverable and the
# notebook finishes without an ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; skipping download of", output_file)
else:
    files.download(output_file)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>