In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.ensemble import IsolationForest
from scipy.stats import chi2_contingency

# 1. Data Exploration and Preprocessing

In [3]:
# Open Colab's file-upload dialog and keep whatever files.upload() returns in `uploaded`
# (presumably a mapping of the selected files — confirm against the Colab docs).
from google.colab import files
uploaded = files.upload()

Saving adult_with_headers.csv to adult_with_headers (1).csv


In [4]:
# Read the CSV from the working directory into the `adult` DataFrame.
# NOTE(review): the upload log reports the file was saved as 'adult_with_headers (1).csv',
# so this path reads an earlier copy of the file — confirm both copies are identical.
adult = pd.read_csv('adult_with_headers.csv')
# Bare last expression so the notebook renders the frame.
adult

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
# Schema check: column names, non-null counts and dtypes.
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
adult.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
# Three quick views of the raw frame: per-column statistics (numeric and
# non-numeric alike), null counts, and dtypes.
summary_stats = adult.describe(include='all')
missing_values = adult.isnull().sum()
data_types = adult.dtypes

# Print each report under its own heading.
for heading, report in (
    ("Summary Statistics:\n", summary_stats),
    ("\nMissing Values:\n", missing_values),
    ("\nData Types:\n", data_types),
):
    print(heading, report)

Summary Statistics:
                  age workclass        fnlwgt education  education_num  \
count   32561.000000     32561  3.256100e+04     32561   32561.000000   
unique           NaN         9           NaN        16            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22696           NaN     10501            NaN   
mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
std        13.640433       NaN  1.055500e+05       NaN       2.572720   
min        17.000000       NaN  1.228500e+04       NaN       1.000000   
25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
75%        48.000000       NaN  2.370510e+05       NaN      12.000000   
max        90.000000       NaN  1.484705e+06       NaN      16.000000   

             marital_status       occupation relationship    race    sex  \
count                 3256

In [8]:
# Columns with a numeric dtype are the ones that get rescaled.
numerical_features = adult.select_dtypes(include=['number']).columns
print("Numerical Columns:\n", numerical_features)

# Fit the standard scaler on the numeric columns and write the result into a
# copy, so the raw `adult` frame is left untouched.
scaler_standard = StandardScaler()
standardized_values = scaler_standard.fit_transform(adult[numerical_features])
data_standard_scaled = adult.copy()
data_standard_scaled[numerical_features] = standardized_values

# Preview the first rows of the scaled frame.
print("\nStandard Scaled Data:", data_standard_scaled.head(), sep="\n")

Numerical Columns:
 Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')

Standard Scaled Data:
        age          workclass    fnlwgt   education  education_num  \
0  0.030671          State-gov -1.063611   Bachelors       1.134739   
1  0.837109   Self-emp-not-inc -1.008707   Bachelors       1.134739   
2 -0.042642            Private  0.245079     HS-grad      -0.420060   
3  1.057047            Private  0.425801        11th      -1.197459   
4 -0.775768            Private  1.408176   Bachelors       1.134739   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-ci

In [9]:
# Fit the min-max scaler on the numeric columns and store the rescaled values
# in a separate copy of the raw frame.
scaler_minmax = MinMaxScaler()
minmax_values = scaler_minmax.fit_transform(adult[numerical_features])
data_minmax_scaled = adult.copy()
data_minmax_scaled[numerical_features] = minmax_values

# Preview the first rows of the scaled frame.
print("Min-Max Scaled Data:\n", data_minmax_scaled.head(), sep="\n")

Min-Max Scaled Data:

        age          workclass    fnlwgt   education  education_num  \
0  0.301370          State-gov  0.044302   Bachelors       0.800000   
1  0.452055   Self-emp-not-inc  0.048238   Bachelors       0.800000   
2  0.287671            Private  0.138113     HS-grad       0.533333   
3  0.493151            Private  0.151068        11th       0.400000   
4  0.150685            Private  0.221488   Bachelors       0.800000   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0   

**Standard Scaling:**

*   **When to Use:** Standard scaling is preferred when the features have different units or scales, and when the data is expected to follow a roughly Gaussian distribution. It's particularly beneficial for algorithms that are sensitive to feature scale, such as regularized or gradient-based models (e.g., linear regression, logistic regression).


**Min-Max Scaling:**
*   **When to Use:** Min-max scaling is useful when the data does not follow a Gaussian distribution or when the relative spacing of values within each feature must be preserved inside a fixed, bounded range. It's commonly used with algorithms like neural networks or k-nearest neighbors that benefit from features being within a uniform range.

# 2. Encoding Techniques

In [10]:
# Object-dtype columns are treated as the categorical features.
categorical_features = adult.select_dtypes(include=['object']).columns

# Number of distinct values in each categorical column.
category_counts = adult[categorical_features].nunique()

# Heading followed by the per-column counts.
print("Unique Categories in Categorical Features:", category_counts, sep="\n")

Unique Categories in Categorical Features:
workclass          9
education         16
marital_status     7
occupation        15
relationship       6
race               5
sex                2
native_country    42
income             2
dtype: int64


In [11]:
# One-hot encode the categorical columns with fewer than 5 distinct values.
# drop_first=True omits one dummy column per encoded feature.
one_hot_features = category_counts.loc[category_counts.lt(5)].index
data_one_hot_encoded = pd.get_dummies(
    adult,
    columns=one_hot_features,
    drop_first=True,
)

print("Data after One-Hot Encoding:", data_one_hot_encoded.head(), sep="\n")

Data after One-Hot Encoding:
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race  \
0        Never-married        Adm-clerical   Not-in-family   White   
1   Married-civ-spouse     Exec-managerial         Husband   White   
2             Divorced   Handlers-cleaners   Not-in-family   White   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black   
4   Married-civ-spouse      Prof-specialty            Wife   Black   

   capital_gain  capital_loss  hours_per_week  native_country  sex_ Male  \
0          2174             0              40   United-States          1   
1            

In [12]:
# Integer-encode the categorical columns with 5 or more distinct values,
# starting from a copy of the one-hot encoded frame.
label_features = category_counts.loc[category_counts.ge(5)].index
data_label_encoded = data_one_hot_encoded.copy()

# One encoder per column; each fitted encoder stays available in `label_encoders`.
label_encoders = {column: LabelEncoder() for column in label_features}
for feature, le in label_encoders.items():
    data_label_encoded[feature] = le.fit_transform(data_label_encoded[feature])

print("Data after Label Encoding:", data_label_encoded.head(), sep="\n")

Data after Label Encoding:
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           1             1     4          2174             0              40   
1           4             0     4             0             0              13   
2           6             1     4             0             0              40   
3           6             0     2             0             0              40   
4          10             5     2             0             0              40   

   native_country  sex_ Male  income_ >50K  
0             

**One-Hot Encoding**
*   **Pros:**
    *   One-hot encoding does not impose any ordinal relationship between categories, making it suitable for nominal variables.
    *   Works well with many machine learning algorithms, especially those that do not assume a specific order of categories (e.g., tree-based models).

*   **Cons:**
    *   Increases the dimensionality of the dataset, especially with features that have many unique categories. This can lead to a "curse of dimensionality," where the model becomes more complex and computationally expensive.
    *   The resulting dataset can be sparse, which might be inefficient for some models.





**Label Encoding**
*   **Pros:**
    *   Converts categories to a single column of integer values, preserving the dataset's original dimensionality.
    *   More computationally efficient for features with many categories compared to one-hot encoding.

*   **Cons:**
    *   Label encoding imposes an arbitrary numeric order on the categories, which may not exist in the data. This can mislead models into interpreting the encoded integers as ranked, leading to biased predictions.
    *   If the categorical feature is nominal (no order), label encoding might distort the relationships between categories.

# 3. Feature Engineering

In [13]:
# Create Age Group Feature
data_label_encoded['age_group'] = pd.cut(data_label_encoded['age'], bins=[0, 25, 45, 65, 100],
                                         labels=['Young Adult', 'Middle Age', 'Senior', 'Elder'])
print("Age Group Distribution:")
print(data_label_encoded['age_group'].value_counts())

Age Group Distribution:
Middle Age     16523
Senior          8469
Young Adult     6411
Elder           1158
Name: age_group, dtype: int64


**Feature 1: "Age Group"**

Rationale:

Although age is a significant demographic characteristic, some trends may go unnoticed if age is taken as a raw number. We can generate a categorical feature by putting ages into bins, which could highlight patterns within particular age groups (young adults, middle-aged, seniors, etc.).

In [14]:
# Create Capital Gain to Loss Ratio Feature
# Adding 1 to capital_loss keeps the denominator non-zero whenever the loss is
# non-negative, so the ratio is defined for rows with no recorded loss.
capital_ratio = data_label_encoded['capital_gain'] / (data_label_encoded['capital_loss'] + 1)
# Defensive clean-up: map any +/-inf to 0. The cleaned Series is assigned back
# explicitly; calling replace(..., inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
data_label_encoded['capital_gain_to_loss_ratio'] = capital_ratio.replace(
    [float('inf'), -float('inf')], 0
)
# NOTE(review): the recorded summary of this ratio matches the capital_gain summary
# value for value — check whether the feature adds information beyond capital_gain.
print("Capital Gain to Loss Ratio:")
print(data_label_encoded['capital_gain_to_loss_ratio'].describe())

Capital Gain to Loss Ratio:
count    32561.000000
mean      1077.648844
std       7385.292085
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      99999.000000
Name: capital_gain_to_loss_ratio, dtype: float64


**Feature 2: "Capital Gain to Loss Ratio"**

Rationale:

The capital gain and capital loss features provide information about an individual's financial situation. Creating a ratio between capital gain and loss might offer insight into financial stability or investment success.

In [15]:
# log1p computes log(1 + x), so rows with a capital gain of zero map to zero
# instead of hitting log(0).
data_label_encoded['log_capital_gain'] = np.log1p(data_label_encoded['capital_gain'])

# Summaries of the raw and the transformed column, one after the other.
for heading, column in (
    ("Original Capital Gain Distribution:", 'capital_gain'),
    ("\nLog-Transformed Capital Gain Distribution:", 'log_capital_gain'),
):
    print(heading)
    print(data_label_encoded[column].describe())

Original Capital Gain Distribution:
count    32561.000000
mean      1077.648844
std       7385.292085
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      99999.000000
Name: capital_gain, dtype: float64

Log-Transformed Capital Gain Distribution:
count    32561.000000
mean         0.734621
std          2.454738
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         11.512925
Name: log_capital_gain, dtype: float64


# 4. Feature Selection

In [16]:
# Columns the outlier detector is fitted on: the raw numeric columns plus the
# log-transformed capital gain.
numerical_features = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'log_capital_gain',
]

# contamination=0.01 encodes the assumption that 1% of the rows are outliers;
# random_state is fixed so the flagged rows are reproducible.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = iso_forest.fit_predict(data_label_encoded[numerical_features])

# Store the per-row prediction, then keep only rows not labelled -1 (outlier).
data_label_encoded['outliers'] = outliers
inlier_mask = data_label_encoded['outliers'].ne(-1)
data_no_outliers = data_label_encoded.loc[inlier_mask].copy()

print("Number of Outliers Removed:", len(data_label_encoded) - len(data_no_outliers))

Number of Outliers Removed: 326


**Effect of Outliers on Model Performance:**
*   **Bias:** Outliers can skew the results of models, particularly linear models, by introducing extreme values that can unduly influence the model's predictions.
*   **Overfitting:** Models might try to fit to outliers, leading to overfitting, particularly in models sensitive to extreme values like linear regression or k-nearest neighbors.
*   **Loss of Accuracy:** Outliers can reduce model accuracy by distorting the model's understanding of the majority of the data.

In [31]:
!pip install ppscore

Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas<2.0.0,>=1.0.0 (from ppscore)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: ppscore
  Building wheel for ppscore (setup.py) ... [?25l[?25hdone
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13164 sha256=454665741b63ab13e4831f4e94879ba46f5ce58c0bfcb0a025cc48863ce1df45
  Stored in directory: /root/.cache/pip/wheels/42/87/10/00056aa2d2624f1b9374db6a0d5245da9a3d87bdc9247c1a56
Successfully built ppscore
Installing collected packages: pandas, ppscore
  Attempting uninstall: pandas
    Found existing installation: pandas

In [19]:
# Imported here rather than at the top because ppscore is installed mid-notebook.
import ppscore as pps

# Calculate PPS matrix.
# The 'outliers' flag only holds the detector's label; after filtering, every
# remaining row carries the same value, so it is excluded from the analysis
# instead of adding a meaningless row and column to the matrix.
pps_input = data_no_outliers.drop(columns=['outliers'], errors='ignore')
# pps.matrix returns one row per (x, y) pair; pivot into a y-by-x grid of scores.
pps_matrix = pps.matrix(pps_input).pivot(columns='x', index='y', values='ppscore')
# Display the PPS matrix
print("PPS Matrix:")
print(pps_matrix)

# Calculate correlation matrix over the numeric columns only
correlation_matrix = data_no_outliers[numerical_features].corr()
# Display the correlation matrix
print("\nCorrelation Matrix:")
print(correlation_matrix)

PPS Matrix:
x                                age  age_group  capital_gain  \
y                                                               
age                         1.000000   0.612212      0.008513   
age_group                   1.000000   1.000000      0.000000   
capital_gain                0.000000   0.000000      1.000000   
capital_gain_to_loss_ratio  0.000000   0.000000      0.996963   
capital_loss                0.000000   0.000000      0.000000   
education                   0.000000   0.000000      0.000000   
education_num               0.000000   0.000000      0.003509   
fnlwgt                      0.000000   0.000000      0.000000   
hours_per_week              0.000000   0.000000      0.000000   
income_ >50K                0.000000   0.000000      0.000000   
log_capital_gain            0.000000   0.000000      0.998275   
marital_status              0.163390   0.128248      0.000000   
native_country              0.000000   0.000000      0.000000   
occupation   

**Comparison of PPS and Correlation Matrix**

Correlation Matrix:
*   Measures only linear relationships between numerical features.
*   Ranges from -1 to 1, with values close to 1 or -1 indicating a strong linear relationship.
*   Fails to capture non-linear relationships and only works with numerical data.

PPS Matrix:
*   Measures both linear and non-linear predictive power.
*   Values range from 0 (no predictive power) to 1 (perfect predictive power).
*   Works with both categorical and numerical features, providing a more comprehensive understanding of relationships.