In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest

import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
# Load dataset
# The CSV location can be overridden through the ADULT_CSV_PATH environment
# variable so the notebook is not tied to a single machine. When the variable
# is not set, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "ADULT_CSV_PATH",
    r"D:\data science\Assignments\Basic stats - 1\Data Trasformation\adult_with_headers (1).csv",
)
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
 # Basic Data Exploration

# Structural overview: column dtypes and non-null counts (printed by info()).
df.info()

# Only the last expression of a cell is rendered automatically, so the
# intermediate summaries are passed to display() (provided by the
# IPython/Jupyter kernel); as bare expressions their results were discarded.
display(df.describe(include='all'))
display(df.isnull().sum())

# Some versions of the Adult dataset mark missing values with a question mark,
# with or without surrounding whitespace (" ?", "?", "? ").
# Convert every such cell to NaN so pandas treats it as missing.
df = df.replace(r"^\s*\?\s*$", np.nan, regex=True)

# Missing-value count per column after the conversion.
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64

In [5]:
# Handle Missing Values
# Imputation strategy:
#   numeric columns     -> median of the column
#   every other column  -> most frequent value (mode)
# The branch is chosen with is_numeric_dtype rather than `dtype == "object"`,
# so categorical / string-dtype columns are never sent to .median().

for col in df.columns:
    if not df[col].isnull().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())
    else:
        modes = df[col].mode()
        # mode() is empty when the whole column is NaN; there is no value to
        # impute with in that case, so the column is left untouched.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

In [6]:
 # Scaling Numerical Features
# Columns with int64 / float64 dtype are the ones that get rescaled.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Both scalers are fitted on the same numeric slice of the imputed frame.
numeric_values = df[num_cols]
scaler_standard, scaler_minmax = StandardScaler(), MinMaxScaler()

# Each scaled variant lives in its own copy, so `df` keeps the original values.
df_standard, df_minmax = df.copy(), df.copy()
df_standard[num_cols] = scaler_standard.fit_transform(numeric_values)
df_minmax[num_cols] = scaler_minmax.fit_transform(numeric_values)


In [8]:
# Separate categorical columns:
cat_cols = df.select_dtypes(include='object').columns

# Number of distinct values per categorical column, computed once and reused
# for both the low- and high-cardinality selections below.
cardinality = df[cat_cols].nunique()

# One-hot encoding for low-cardinality columns (fewer than 5 categories);
# drop_first=True drops one dummy per column.
few_cat_cols = [col for col in cat_cols if cardinality[col] < 5]
df_onehot = pd.get_dummies(df, columns=few_cat_cols, drop_first=True)

# Label Encoding for high cardinality columns (5 or more categories)
high_cat_cols = [col for col in cat_cols if cardinality[col] >= 5]

# A separate encoder is fitted and kept for every column. Re-fitting one shared
# encoder overwrites its learned classes each time, which made it impossible
# to map the codes of earlier columns back to their labels.
label_encoders = {}
label_enc = LabelEncoder()  # stays defined even when no column qualifies
for col in high_cat_cols:
    label_enc = LabelEncoder()
    df_onehot[col] = label_enc.fit_transform(df_onehot[col])
    label_encoders[col] = label_enc



In [None]:
# Feature Engineering

# Feature 1: age_group -- age bucketed into four labelled ranges.
age_edges = [0, 25, 45, 65, 100]
age_labels = ['Youth', 'Adult', 'Middle_Age', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)

# Feature 2: work_intensity -- weekly working hours bucketed into four levels.
hour_edges = [0, 20, 40, 60, 100]
hour_labels = ['Low', 'Medium', 'High', 'Very_High']
df['work_intensity'] = pd.cut(df['hours_per_week'], bins=hour_edges, labels=hour_labels)

# log(1 + x) transform of the highly skewed capital_gain column.
df['capital_gain_log'] = np.log1p(df['capital_gain'])

# Show each source column next to the feature derived from it.
preview_cols = [
    'age', 'age_group',
    'hours_per_week', 'work_intensity',
    'capital_gain', 'capital_gain_log',
]
df[preview_cols].head()

In [20]:
# Log Transformation for Skewed Feature

# Skewness of the raw column, reported before transforming.
skew_before = df['capital_gain'].skew()
print("Skewness Before:", skew_before)

# log(1 + x) maps zero gains to zero and compresses the long right tail.
raw_gain = df['capital_gain']
df['capital_gain_log'] = np.log1p(raw_gain)

# Skewness of the transformed column, for comparison.
skew_after = df['capital_gain_log'].skew()
print("Skewness After:", skew_after)

# Raw and transformed values side by side.
df[['capital_gain', 'capital_gain_log']].head()


Skewness Before: 11.953847687699799
Skewness After: 3.096143524467517


Unnamed: 0,capital_gain,capital_gain_log
0,2174,7.684784
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


In [21]:
# Outlier Removal with Isolation Forest
# (this step filters rows; no columns/features are selected or dropped)

# contamination=0.02 is the share of rows the model should flag as outliers;
# random_state=42 makes the result repeatable.
iso = IsolationForest(contamination=0.02, random_state=42)
pred = iso.fit_predict(df_onehot[num_cols])

# Keep only rows predicted as 1 (IsolationForest labels inliers 1, outliers -1).
# .copy() makes df_clean an independent frame, so later column assignments on it
# do not raise SettingWithCopy warnings or depend on df_onehot.
df_clean = df_onehot.loc[pred == 1].copy()
df_clean.shape


(31909, 15)