In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Exploration and preprocessing

1.	Load the dataset and conduct basic data exploration (summary statistics, missing values, data types).
2.	Handle missing values as per the best practices (imputation, removal, etc.).


In [6]:
df=pd.read_csv('adult_with_headers.csv')

In [7]:
df


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [8]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
df.shape

(32561, 15)

In [11]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32561.0,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
fnlwgt,32561.0,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0
education_num,32561.0,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
capital_gain,32561.0,1077.648844,7385.292085,0.0,0.0,0.0,0.0,99999.0
capital_loss,32561.0,87.30383,402.960219,0.0,0.0,0.0,0.0,4356.0
hours_per_week,32561.0,40.437456,12.347429,1.0,40.0,40.0,45.0,99.0


In [12]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [13]:
# there is no missing values in dataset

In [14]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

3. •	Apply scaling techniques to numerical features:

In [16]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [36]:
# Identify numerical features
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
numerical_features

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')

In [37]:
std_sca= StandardScaler()

In [46]:
df_std_sca_scaled = df.copy()

In [47]:
df_std_sca_scaled[numerical_features]=std_sca.fit_transform(df[numerical_features])


In [54]:
df_std_sca_scaled[numerical_features]

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [56]:
min_sca= MinMaxScaler()

In [57]:
df_min_max_scaled = df.copy()

In [58]:
df_min_max_scaled[numerical_features]=min_sca.fit_transform(df[numerical_features])

In [59]:
df_min_max_scaled[numerical_features]

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.301370,0.044302,0.800000,0.021740,0.0,0.397959
1,0.452055,0.048238,0.800000,0.000000,0.0,0.122449
2,0.287671,0.138113,0.533333,0.000000,0.0,0.397959
3,0.493151,0.151068,0.400000,0.000000,0.0,0.397959
4,0.150685,0.221488,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...
32556,0.136986,0.166404,0.733333,0.000000,0.0,0.377551
32557,0.315068,0.096500,0.533333,0.000000,0.0,0.397959
32558,0.561644,0.094827,0.533333,0.000000,0.0,0.397959
32559,0.068493,0.128499,0.533333,0.000000,0.0,0.193878


# 2. Encoding Technique

1.•	Apply One-Hot Encoding to categorical variables with less than 5 categories.

In [61]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [62]:
# Identify categorical features
categorical_features = df.select_dtypes(include=['object']).columns

In [63]:
categorical_features

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

In [64]:
# Apply One-Hot Encoding to categorical variables with less than 5 categories
one_hot_encoder = OneHotEncoder(sparse=False, drop='first')
df_one_hot_encoded = df.copy()

for col in categorical_features:
    if df[col].nunique() < 5:
        one_hot_encoded_data = one_hot_encoder.fit_transform(df[[col]])
        one_hot_encoded_df = pd.DataFrame(one_hot_encoded_data, columns=[f"{col}_{i}" for i in range(one_hot_encoded_data.shape[1])])
        df_one_hot_encoded = pd.concat([df_one_hot_encoded, one_hot_encoded_df], axis=1).drop(columns=[col])



In [65]:

# Apply Label Encoding to categorical variables with more than 5 categories
label_encoder = LabelEncoder()
df_label_encoded = df_one_hot_encoded.copy()

for col in categorical_features:
    if df[col].nunique() >= 5:
        df_label_encoded[col] = label_encoder.fit_transform(df[col])

df_label_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_0,income_0
0,0.030671,7,-1.063611,9,1.134739,4,1,1,4,0.148453,-0.21666,-0.035429,39,1.0,0.0
1,0.837109,6,-1.008707,9,1.134739,2,4,0,4,-0.14592,-0.21666,-2.222153,39,1.0,0.0
2,-0.042642,4,0.245079,11,-0.42006,0,6,1,4,-0.14592,-0.21666,-0.035429,39,1.0,0.0
3,1.057047,4,0.425801,1,-1.197459,2,6,0,2,-0.14592,-0.21666,-0.035429,39,1.0,0.0
4,-0.775768,4,1.408176,9,1.134739,2,10,5,2,-0.14592,-0.21666,-0.035429,5,0.0,0.0


# 3 feature engineering|

In [66]:
# Creating new features
df_label_encoded['age_bin'] = pd.cut(df_label_encoded['age'], bins=[0, 25, 45, 65, 100], labels=['young', 'middle_aged', 'senior', 'elderly'])
df_label_encoded['hours_per_week_bin'] = pd.cut(df_label_encoded['hours_per_week'], bins=[0, 20, 40, 60, 80], labels=['part_time', 'full_time', 'overtime', 'extreme'])

# Encoding new features
df_label_encoded['age_bin'] = label_encoder.fit_transform(df_label_encoded['age_bin'])
df_label_encoded['hours_per_week_bin'] = label_encoder.fit_transform(df_label_encoded['hours_per_week_bin'])

df_label_encoded.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_0,income_0,age_bin,hours_per_week_bin
0,0.030671,7,-1.063611,9,1.134739,4,1,1,4,0.148453,-0.21666,-0.035429,39,1.0,0.0,0,1
1,0.837109,6,-1.008707,9,1.134739,2,4,0,4,-0.14592,-0.21666,-2.222153,39,1.0,0.0,0,1
2,-0.042642,4,0.245079,11,-0.42006,0,6,1,4,-0.14592,-0.21666,-0.035429,39,1.0,0.0,1,1
3,1.057047,4,0.425801,1,-1.197459,2,6,0,2,-0.14592,-0.21666,-0.035429,39,1.0,0.0,0,1
4,-0.775768,4,1.408176,9,1.134739,2,10,5,2,-0.14592,-0.21666,-0.035429,5,0.0,0.0,1,1


Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice.


In [71]:

# Check skewness of numerical features
skewness = df[numerical_features].skew()

# Apply log transformation to a skewed feature
skewed_feature = 'capital_gain'
df_label_encoded[skewed_feature + '_log'] = np.log1p(df_label_encoded[skewed_feature])

# Verify transformation
df_label_encoded[[skewed_feature, skewed_feature + '_log']].head()

Unnamed: 0,capital_gain,capital_gain_log
0,0.148453,0.138416
1,-0.14592,-0.157731
2,-0.14592,-0.157731
3,-0.14592,-0.157731
4,-0.14592,-0.157731


# 4. Feature Selection

Use the Isolation Forest algorithm to identify and remove outliers.

In [72]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest
iso_forest = IsolationForest(contamination=0.05)
outliers = iso_forest.fit_predict(df_label_encoded[numerical_features])

# Remove outliers
df_no_outliers = df_label_encoded[outliers == 1]

df_no_outliers.shape



(30933, 18)

Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.



In [74]:
!pip install ppscore

Collecting ppscore




  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: ppscore
  Building wheel for ppscore (setup.py): started
  Building wheel for ppscore (setup.py): finished with status 'done'
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13167 sha256=3050a4c399cf1c49dff62cce7159cdeb06f16ac7f472ffa87c391ef985fa7258
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\8c\0a\dc\9be952e22d2a23c36d0894bb0b5be5825e83442245083a1777
Successfully built ppscore
Installing collected packages: ppscore
Successfully installed ppscore-1.3.0


In [75]:
import ppscore as pps

# Calculate PPS matrix
pps_matrix = pps.matrix(df_no_outliers)[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')

# Calculate correlation matrix
correlation_matrix = df_no_outliers.corr()

pps_matrix, correlation_matrix

(x                        age   age_bin  capital_gain  capital_gain_log  \
 y                                                                        
 age                 1.000000  0.442589      0.000000          0.000000   
 age_bin             1.000000  1.000000      0.000000          0.000000   
 capital_gain        0.000000  0.000000      1.000000          0.998913   
 capital_gain_log    0.000000  0.000000      0.998970          1.000000   
 capital_loss        0.000000  0.000000      0.000000          0.000000   
 education           0.000000  0.000000      0.000000          0.000000   
 education_num       0.000000  0.000000      0.000274          0.000274   
 fnlwgt              0.000000  0.000000      0.000000          0.000000   
 hours_per_week      0.000000  0.000000      0.000000          0.000000   
 hours_per_week_bin  0.000000  0.000000      0.000000          0.000000   
 income_0            0.000000  0.000000      0.000000          0.000000   
 marital_status      0.15

The correlation matrix provides insights into linear relationships between features, while the PPS matrix assesses non-linear relationships and predictive power between features. Comparing the findings of both matrices can provide a more comprehensive understanding of feature relationships.





