In [8]:
import pandas as pd

# Load the dataset
file_path = 'adult_with_headers.csv'
df = pd.read_csv(file_path)

In [9]:
# Display the first few rows of the dataset
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
# Generate summary statistics
df.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [11]:
# Check for missing values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [12]:
# Check data types
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [13]:
# Check the proportion of missing values
missing_values = df.isnull().mean()
print(missing_values)

age               0.0
workclass         0.0
fnlwgt            0.0
education         0.0
education_num     0.0
marital_status    0.0
occupation        0.0
relationship      0.0
race              0.0
sex               0.0
capital_gain      0.0
capital_loss      0.0
hours_per_week    0.0
native_country    0.0
income            0.0
dtype: float64


In [14]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [15]:
# Remove rows with missing values
df_clean = df.dropna()

In [19]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [23]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Select numerical features
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Apply Standard Scaling
scaler_standard = StandardScaler()
df_standard_scaled = df_clean.copy()
df_standard_scaled[numerical_features] = scaler_standard.fit_transform(df_clean[numerical_features])

# Apply Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df_clean.copy()
df_minmax_scaled[numerical_features] = scaler_minmax.fit_transform(df_clean[numerical_features])


In [24]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Identify categorical features
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

# Apply One-Hot Encoding to categorical variables with less than 5 categories
one_hot_features = [col for col in categorical_features if df_clean[col].nunique() < 5]
df_one_hot_encoded = pd.get_dummies(df_clean, columns=one_hot_features)

# Apply Label Encoding to categorical variables with more than 5 categories
label_encoder = LabelEncoder()
label_features = [col for col in categorical_features if df_clean[col].nunique() >= 5]

for col in label_features:
    df_clean[col] = label_encoder.fit_transform(df_clean[col])


In [16]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Select numerical features
numerical_features = df_clean.select_dtypes(include=['int64', 'float64']).columns

# Apply Standard Scaling
scaler_standard = StandardScaler()
df_standard_scaled = df_clean.copy()
df_standard_scaled[numerical_features] = scaler_standard.fit_transform(df_clean[numerical_features])

# Apply Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df_clean.copy()
df_minmax_scaled[numerical_features] = scaler_minmax.fit_transform(df_clean[numerical_features])


In [25]:
# Create new features
df_clean['age_bin'] = pd.cut(df_clean['age'], bins=[0, 30, 60, 90], labels=['Young', 'Middle-aged', 'Old'])
df_clean['capital_diff'] = df_clean['capital_gain'] - df_clean['capital_loss']


In [26]:
import numpy as np

# Check skewness
print(df_clean[numerical_features].skew())

# Apply log transformation to a skewed feature (example: capital_gain)
df_clean['capital_gain_log'] = np.log1p(df_clean['capital_gain'])


age                0.558743
fnlwgt             1.446980
education_num     -0.311676
capital_gain      11.953848
capital_loss       4.594629
hours_per_week     0.227643
dtype: float64


In [27]:
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.01)
outliers = iso_forest.fit_predict(df_clean[numerical_features])
df_no_outliers = df_clean[outliers == 1]


In [28]:
import ppscore as pps

# Calculate PPS matrix
pps_matrix = pps.matrix(df_no_outliers)
print(pps_matrix)

# Compare with correlation matrix
correlation_matrix = df_no_outliers.corr()
print(correlation_matrix)


                    x                 y   ppscore            case  \
0                 age               age  1.000000  predict_itself   
1                 age         workclass  0.000000      regression   
2                 age            fnlwgt  0.000000      regression   
3                 age         education  0.000000      regression   
4                 age     education_num  0.000000      regression   
..                ...               ...       ...             ...   
319  capital_gain_log    native_country  0.000000      regression   
320  capital_gain_log            income  0.288064  classification   
321  capital_gain_log           age_bin  0.000000  classification   
322  capital_gain_log      capital_diff  0.792427      regression   
323  capital_gain_log  capital_gain_log  1.000000  predict_itself   

     is_valid_score               metric  baseline_score   model_score  \
0              True                 None        0.000000      1.000000   
1              True  me

  correlation_matrix = df_no_outliers.corr()
