In [1]:
import pandas as pd

In [2]:

# Load dataset
df = pd.read_csv('adult_with_headers.csv')

# Display first few rows
print(df.head())

# Summary statistics
print(df.describe(include='all'))

# Data types and missing values
print(df.info())
print(df.isnull().sum())


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [5]:
# For categorical columns
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# For numerical columns
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col] = df[col].fillna(df[col].median())


In [6]:
numerical_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
print(numerical_cols)


Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')


In [7]:
from sklearn.preprocessing import StandardScaler

scaler_std = StandardScaler()
df_std_scaled = df_clean.copy()
df_std_scaled[numerical_cols] = scaler_std.fit_transform(df_clean[numerical_cols])


In [8]:
from sklearn.preprocessing import MinMaxScaler

scaler_mm = MinMaxScaler()
df_mm_scaled = df_clean.copy()
df_mm_scaled[numerical_cols] = scaler_mm.fit_transform(df_clean[numerical_cols])


In [10]:
from sklearn.preprocessing import OneHotEncoder

# Correct usage
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit and transform
encoded = encoder.fit_transform(df[categorical_cols])

# Convert to DataFrame
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))


In [12]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.drop('income')  # exclude target

# Columns with less than 5 categories → One-Hot Encoding
onehot_cols = [col for col in categorical_cols if df[col].nunique() < 5]

# Columns with more than 5 categories → Label Encoding
label_cols = [col for col in categorical_cols if df[col].nunique() > 5]

print("One-Hot Columns:", onehot_cols)
print("Label Columns:", label_cols)


One-Hot Columns: ['sex']
Label Columns: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'native_country']


In [13]:
# One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop first to avoid dummy variable trap
encoded_onehot = encoder.fit_transform(df[onehot_cols])

# Convert to DataFrame
encoded_onehot_df = pd.DataFrame(encoded_onehot, columns=encoder.get_feature_names_out(onehot_cols))


In [14]:
# Label Encoding
df_label = df.copy()
for col in label_cols:
    le = LabelEncoder()
    df_label[col] = le.fit_transform(df[col])


In [15]:
# Drop original one-hot columns and add encoded columns
df_final = df_label.drop(columns=onehot_cols)
df_final = pd.concat([df_final, encoded_onehot_df], axis=1)

print(df_final.head())


   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship    race  capital_gain  capital_loss  \
0           1             1   White          2174             0   
1           4             0   White             0             0   
2           6             1   White             0             0   
3           6             0   Black             0             0   
4          10             5   Black             0             0   

   hours_per_week  native_country  income  sex_ Male  
0              40              39   <=50K        1.0  
1              13              39   <=50K        1.0  
2    

In [18]:
# Create a new feature: net_capital = capital-gain - capital-loss
df['net_capital'] = df['capital_gain'] - df['capital_loss']


In [20]:
# Categorize working hours
df['work_hours_category'] = pd.cut(df['hours_per_week'],
                                   bins=[0, 20, 40, 60, 100],
                                   labels=['part-time', 'full-time', 'over-time', 'extreme'])


In [21]:
# Check skewness
skewed_features = df[numerical_cols].skew().sort_values(ascending=False)
print(skewed_features)


capital_gain      11.953848
capital_loss       4.594629
fnlwgt             1.446980
age                0.558743
hours_per_week     0.227643
education_num     -0.311676
dtype: float64


In [23]:
import numpy as np

# Add 1 to avoid log(0)
df['capital_gain_log'] = np.log1p(df['capital_gain'])


In [24]:
df = pd.get_dummies(df, columns=['work_hours_category'], drop_first=True)


In [25]:
from sklearn.ensemble import IsolationForest

# Select numerical features for outlier detection
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Initialize Isolation Forest
iso_forest = IsolationForest(contamination=0.01, random_state=42)  # ~1% expected outliers

# Fit model
iso_forest.fit(df[numerical_cols])

# Predict outliers (-1 = outlier, 1 = inlier)
outliers = iso_forest.predict(df[numerical_cols])

# Add as a column to df
df['outlier'] = outliers


In [26]:
# Keep only inliers
df_clean = df[df['outlier'] == 1].drop(columns=['outlier'])


In [None]:
pip install pscore


In [None]:
import ppscore as pps

# Compute PPS matrix
pps_matrix = pps.matrix(df_clean)
pps_matrix = pps_matrix.pivot(index='x', columns='y', values='ppscore')

# Display PPS matrix
print(pps_matrix)


In [None]:
# Correlation matrix for numerical features
corr_matrix = df_clean[numerical_cols].corr()
print(corr_matrix)
