In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Data Exploration and Preprocessing:


In [2]:
df = pd.read_csv('adult_with_headers.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.describe()
df.isnull().sum()
df.dtypes

Unnamed: 0,0
age,int64
workclass,object
fnlwgt,int64
education,object
education_num,int64
marital_status,object
occupation,object
relationship,object
race,object
sex,object


In [21]:
for col in df.columns:
    if df[col].dtype == 'object':
        imputer = SimpleImputer(strategy='most_frequent')
        df[col] = imputer.fit_transform(df[[col]]).ravel() # Flatten the output of fit_transform
    else:
        imputer = SimpleImputer(strategy='mean')
        df[col] = imputer.fit_transform(df[[col]]).ravel() # Flatten the output of fit_transform

df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [20]:
df_numerical = df.select_dtypes(include=np.number)
df_categorical = df.select_dtypes(include='object')

scaler_standard = StandardScaler()
df_scaled_standard = pd.DataFrame(scaler_standard.fit_transform(df_numerical), columns=df_numerical.columns)
df_scaled_standard.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,capital_gain_loss,capital_gain_log,outlier
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,0.159762,2.83137,0.229412
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153,-0.13367,-0.299271,0.229412
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429,-0.13367,-0.299271,0.229412
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429,-0.13367,-0.299271,0.229412
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429,-0.13367,-0.299271,0.229412


In [18]:
# Min-Max Scaling to numerical features
scaler_minmax = MinMaxScaler()
df_scaled_minmax = pd.DataFrame(scaler_minmax.fit_transform(df_numerical), columns=df_numerical.columns)
df_scaled_minmax.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,0.150685,0.221488,0.8,0.0,0.0,0.397959


**Discuss scenarios for each scaling technique**:

Standard Scaling (Z-score normalization)
- Centers the data around 0 with a standard deviation of 1
- Useful when the data follows a Gaussian distribution or when algorithms like Linear Regression, Logistic Regression, SVMs, and K-Means are sensitive to the scale of features.
- Less affected by outliers compared to Min-Max scaling.

Min-Max Scaling (Normalization):
- Scales the data to a fixed range, usually between 0 and 1
- Useful when the range of values is important or when algorithms like Neural Networks and K-Nearest Neighbors are sensitive to the magnitude of features.
- Can be significantly affected by outliers.

# Encoding Techniques:

In [19]:
df_categorical_onehot = pd.DataFrame()
for col in df_categorical.columns:
    if df_categorical[col].nunique() < 5:
        onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded_data = onehot_encoder.fit_transform(df_categorical[[col]])
        encoded_df = pd.DataFrame(encoded_data, columns=[f'{col}_{cat}' for cat in onehot_encoder.categories_[0]])
        df_categorical_onehot = pd.concat([df_categorical_onehot, encoded_df], axis=1)
    else:
        label_encoder = LabelEncoder()
        df_categorical[col] = label_encoder.fit_transform(df_categorical[col])

df_categorical_encoded = pd.concat([df_categorical_onehot, df_categorical.select_dtypes(include=np.number)], axis=1)

df_categorical_encoded.head()

Unnamed: 0,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,workclass,education,marital_status,occupation,relationship,race,native_country
0,0.0,1.0,1.0,0.0,7,9,4,1,1,4,39
1,0.0,1.0,1.0,0.0,6,9,2,4,0,4,39
2,0.0,1.0,1.0,0.0,4,11,0,6,1,4,39
3,0.0,1.0,1.0,0.0,4,1,2,6,0,2,39
4,1.0,0.0,1.0,0.0,4,9,2,10,5,2,5


**Discuss the pros and cons of One-Hot Encoding and Label Encoding**

**One-Hot Encoding:**

**Pros:**

- Represents categorical data as binary vectors, avoiding ordinal relationships.
- Suitable for nominal categorical variables where there is no inherent order.

**Cons:**

- Can lead to a large number of features (curse of dimensionality) if there are many unique categories.
- Can be computationally expensive for high-cardinality features.

**Label Encoding:**

**Pros:**

- Reduces the dimensionality of the dataset compared to One-Hot Encoding.
- Suitable for ordinal categorical variables where there is an inherent order.

**Cons:**

- Introduces an artificial ordinal relationship between categories, which can mislead some algorithms.
- Not suitable for nominal categorical variables.

# Feature Engineering:

In [8]:
def categorize_hours(hours):
    if hours < 30:
        return 'Part-time'
    elif hours <= 40:
        return 'Full-time'
    else:
        return 'Over-time'

df['hours_per_week_category'] = df['hours_per_week'].apply(categorize_hours)
df['capital_gain_loss'] = df['capital_gain'] - df['capital_loss']

df.head() # DataFrame with new features

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,hours_per_week_category,capital_gain_loss
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K,Full-time,2174.0
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K,Part-time,0.0
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K,Full-time,0.0
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K,Full-time,0.0
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K,Full-time,0.0


In [9]:
df_numerical.skew() # Skewness of numerical features

Unnamed: 0,0
age,0.558743
fnlwgt,1.44698
education_num,-0.311676
capital_gain,11.953848
capital_loss,4.594629
hours_per_week,0.227643


In [10]:
df['capital_gain_log'] = np.log1p(df['capital_gain'])
df[['capital_gain', 'capital_gain_log']].head()

Unnamed: 0,capital_gain,capital_gain_log
0,2174.0,7.684784
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


# Feature Selection:

In [11]:
# !pip install ppscore

In [12]:
# !pip uninstall numpy pandas scikit-learn -y
# !pip install numpy pandas scikit-learn

In [15]:
from sklearn.ensemble import IsolationForest
import ppscore as pps

iso_forest = IsolationForest(contamination=0.05, random_state=42) # Assuming 5% outliers Identifying and removing outliers using Isolation Forest

outliers = iso_forest.fit_predict(df_numerical)
df['outlier'] = outliers
df_cleaned = df[df['outlier'] == 1].drop('outlier', axis=1) # Remove outliers

df.shape # Original dataset shape
df_cleaned.shape # Cleaned dataset shape (after removing outliers)

(30933, 18)

In [16]:
pps_matrix = pps.matrix(df_cleaned)[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')
display(pps_matrix) # PPS Matrix



x,age,capital_gain,capital_gain_log,capital_gain_loss,capital_loss,education,education_num,fnlwgt,hours_per_week,hours_per_week_category,income,marital_status,native_country,occupation,race,relationship,sex,workclass
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
age,1.0,0.0052,0.004409,0.005859,0.0,0.02136902,0.02136902,0.0,0.006446,0.0039728,0.03076751,0.2053112,0.0,0.01486088,0.0,0.1546451,0.0005248303,0.017099
capital_gain,0.0,1.0,0.997589,0.997662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
capital_gain_log,0.0,0.998119,1.0,0.998119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
capital_gain_loss,0.0,0.873872,0.873818,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
capital_loss,0.0,0.0,0.0,0.997405,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
education,0.054854,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.046437,0.02371198,0.03128533,0.0,0.0,0.1102706,0.0,0.03965151,0.0,0.03814
education_num,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.02542083,0.0,0.0,0.1611129,0.0,0.0,0.0,0.0
fnlwgt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001303701,0.0,0.0,0.0
hours_per_week,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.4758303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hours_per_week_category,0.046683,0.01585,0.01644,0.014846,0.0,0.02392753,0.02392753,0.012448,1.0,1.0,0.1643962,0.0,0.0,0.0,0.0,0.0,0.0,0.089641


In [17]:
correlation_matrix = df_cleaned.select_dtypes(include=np.number).corr() # correlation matrix for numerical features
display(correlation_matrix) # Correlation Matrix

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,capital_gain_loss,capital_gain_log
age,1.0,-0.079026,0.035963,0.09341,0.029835,0.094233,0.089152,0.096498
fnlwgt,-0.079026,1.0,-0.039355,-0.012993,-0.011045,-0.021354,-0.011629,-0.014575
education_num,0.035963,-0.039355,1.0,0.13008,0.045779,0.134755,0.123678,0.098761
capital_gain,0.09341,-0.012993,0.13008,1.0,-0.032021,0.082962,0.99373,0.868804
capital_loss,0.029835,-0.011045,0.045779,-0.032021,1.0,0.010101,-0.143571,-0.038568
hours_per_week,0.094233,-0.021354,0.134755,0.082962,0.010101,1.0,0.081015,0.06734
capital_gain_loss,0.089152,-0.011629,0.123678,0.99373,-0.143571,0.081015,1.0,0.864559
capital_gain_log,0.096498,-0.014575,0.098761,0.868804,-0.038568,0.06734,0.864559,1.0


**Comparison of PPS and Correlation Matrix findings**

- Correlation measures linear relationships between numerical features, ranging from -1 to 1. A value of 0 indicates no linear correlation.
- PPS measures the strength of the predictive relationship between two columns, for both numerical and categorical features. It ranges from 0 to 1, where 0 means no predictive power and 1 means perfect predictive power.
- Unlike correlation, PPS is not symmetric (PPS(a,b) is not necessarily equal to PPS(b,a)).
- PPS can reveal non-linear relationships that correlation might miss.
- Comparing the two matrices can provide a more comprehensive understanding of feature relationships.