#	1. Data Exploration and Preprocessing:

In [57]:
import pandas as pd
from sklearn.ensemble import IsolationForest


In [3]:
df = pd.read_csv("/content/adult_with_headers.csv")


In [4]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
df.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [8]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [9]:
df['workclass'].fillna(df['workclass'].mode()[0], inplace=True)
df['occupation'].fillna(df['occupation'].mode()[0], inplace=True)


In [10]:
df.dropna(inplace=True)

In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [12]:
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

In [13]:
scaler_standard = StandardScaler()
standard_scaled = scaler_standard.fit_transform(df[numerical_features])


In [14]:
scaler_minmax = MinMaxScaler()
minmax_scaled = scaler_minmax.fit_transform(df[numerical_features])

In [15]:
df_standard_scaled = pd.DataFrame(standard_scaled, columns=numerical_features)
df_minmax_scaled = pd.DataFrame(minmax_scaled, columns=numerical_features)


In [16]:
df_standard_scaled.head(), df_minmax_scaled.head()


(        age    fnlwgt  education_num  capital_gain  capital_loss  \
 0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
 1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
 2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
 3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
 4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   
 
    hours_per_week  
 0       -0.035429  
 1       -2.222153  
 2       -0.035429  
 3       -0.035429  
 4       -0.035429  ,
         age    fnlwgt  education_num  capital_gain  capital_loss  \
 0  0.301370  0.044302       0.800000       0.02174           0.0   
 1  0.452055  0.048238       0.800000       0.00000           0.0   
 2  0.287671  0.138113       0.533333       0.00000           0.0   
 3  0.493151  0.151068       0.400000       0.00000           0.0   
 4  0.150685  0.221488       0.800000       0.00000           0.0   
 
    hours_per_week  
 0        0.397959  

# 2. Encoding Techniques:

In [35]:
categorical_features = df.select_dtypes(include=['object']).columns

In [36]:
unique_counts = df[categorical_features].nunique()

In [37]:
one_hot_features = unique_counts[unique_counts <= 5].index
label_encoding_features = unique_counts[unique_counts > 5].index
unique_counts, one_hot_features, label_encoding_features



(income    2
 dtype: int64,
 Index(['income'], dtype='object'),
 Index([], dtype='object'))

In [38]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
label_encoder = LabelEncoder()


In [39]:
one_hot_encoder = OneHotEncoder(sparse=False, drop='first')  # Drop first to avoid dummy variable trap
label_encoder = LabelEncoder()

In [40]:
one_hot_encoded = pd.DataFrame(one_hot_encoder.fit_transform(df[one_hot_features]),
                               columns=one_hot_encoder.get_feature_names_out(one_hot_features))




In [41]:
label_encoded_df = df[label_encoding_features].apply(label_encoder.fit_transform)

In [42]:
df_encoded = pd.concat([df.drop(columns=one_hot_features.tolist() + label_encoding_features.tolist()),
                        one_hot_encoded, label_encoded_df], axis=1)


In [43]:
df_encoded.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,income_ >50K,0
0,22,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,
1,33,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,
2,21,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,
3,36,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,
4,11,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,


# 3. Feature Engineering:

In [58]:
bins = [0, 30, 50, 100]
labels = ['Young', 'Middle-aged', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)


In [59]:
df['capital_gain_loss_ratio'] = df['capital_gain'] / (df['capital_loss'] + 1)

In [60]:
import numpy as np

In [61]:
df['log_capital_gain'] = np.log1p(df['capital_gain'])

In [62]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [63]:
X = df.drop('age', axis=1)
y = df['age']


In [64]:
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

In [65]:
numerical_features = numerical_columns
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

In [66]:
preprocessor = ColumnTransformer(
    transformers=[
        ('age', StandardScaler(), numerical_features),
        ('hours_per_week', OneHotEncoder(), categorical_features)
    ])


In [67]:
df['age_bucket'] = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 65, 100], labels=['<25', '25-35', '35-45', '45-55', '55-65', '>65'])


In [68]:
df['hours_per_week_bucket'] = pd.cut(df['hours_per_week'], bins=[0, 20, 40, 60, 80, 100], labels=['<20', '20-40', '40-60', '60-80', '>80'])


In [69]:
df['capital_gain_log'] = df['capital_gain'].apply(lambda x: np.log(x + 1))


In [70]:
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df['outlier'] = iso_forest.fit_predict(df.select_dtypes(include=[np.number]))


In [71]:
df_no_outliers = df[df['outlier'] == 1].drop(columns=['outlier'])

In [72]:
correlation_matrix = df_no_outliers.select_dtypes(include=['float64', 'int64']).corr()

In [73]:
df[['age', 'age_bucket', 'hours_per_week', 'hours_per_week_bucket', 'capital_gain', 'capital_gain_log']].head(), correlation_matrix


(   age age_bucket  hours_per_week hours_per_week_bucket  capital_gain  \
 0   22        <25              40                 20-40          2174   
 1   33      25-35              13                   <20             0   
 2   21        <25              40                 20-40             0   
 3   36      35-45              40                 20-40             0   
 4   11        <25              40                 20-40             0   
 
    capital_gain_log  
 0          7.684784  
 1          0.000000  
 2          0.000000  
 3          0.000000  
 4          0.000000  ,
                               age    fnlwgt  education_num  capital_gain  \
 age                      1.000000 -0.078645       0.044787      0.104428   
 fnlwgt                  -0.078645  1.000000      -0.036960     -0.005631   
 education_num            0.044787 -0.036960       1.000000      0.142585   
 capital_gain             0.104428 -0.005631       0.142585      1.000000   
 capital_loss             0.05

# 4. Feature Selection:

In [74]:
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.01)
outliers = iso_forest.fit_predict(df[numerical_features])
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df['outlier'] = iso_forest.fit_predict(df.select_dtypes(include=[np.number]))

df = df[outliers != -1]

In [75]:
correlation_matrix = df_no_outliers.select_dtypes(include=['float64', 'int64']).corr()
print(correlation_matrix)

                              age    fnlwgt  education_num  capital_gain  \
age                      1.000000 -0.078645       0.044787      0.104428   
fnlwgt                  -0.078645  1.000000      -0.036960     -0.005631   
education_num            0.044787 -0.036960       1.000000      0.142585   
capital_gain             0.104428 -0.005631       0.142585      1.000000   
capital_loss             0.057244 -0.009201       0.083255     -0.047558   
hours_per_week           0.076699 -0.021357       0.145061      0.088596   
capital_gain_loss_ratio  0.104428 -0.005631       0.142585      1.000000   
log_capital_gain         0.103002 -0.008228       0.105209      0.856082   
capital_gain_log         0.103002 -0.008228       0.105209      0.856082   

                         capital_loss  hours_per_week  \
age                          0.057244        0.076699   
fnlwgt                      -0.009201       -0.021357   
education_num                0.083255        0.145061   
capital_gai

In [70]:
pip install ppscore

Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas<2.0.0,>=1.0.0 (from ppscore)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: ppscore
  Building wheel for ppscore (setup.py) ... [?25l[?25hdone
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13164 sha256=1b185541405bbf9c49741e5d6225707f58a5c780636a018ac439e8c7fe7f00fb
  Stored in directory: /root/.cache/pip/wheels/42/87/10/00056aa2d2624f1b9374db6a0d5245da9a3d87bdc9247c1a56
Successfully built ppscore
Installing collected packages: pandas, ppscore
  Attempting uninstall: pandas
    Found existing installation: pandas

In [1]:
import ppscore as pps

In [82]:
X = df.drop(columns=['income'])
y = df['income']


In [78]:
pps_matrix = pps.matrix(df_no_outliers)
print(pps_matrix[['x', 'y', 'ppscore']])


                      x                        y   ppscore
0                   age                      age  1.000000
1                   age                   fnlwgt  0.000000
2                   age            education_num  0.000000
3                   age             capital_gain  0.000000
4                   age             capital_loss  0.000000
...                 ...                      ...       ...
13220  capital_gain_log  capital_gain_loss_ratio  0.992129
13221  capital_gain_log         log_capital_gain  0.999569
13222  capital_gain_log               age_bucket  0.022844
13223  capital_gain_log    hours_per_week_bucket  0.042778
13224  capital_gain_log         capital_gain_log  1.000000

[13225 rows x 3 columns]


In [79]:
correlation_matrix = df_no_outliers.corr()
print(correlation_matrix)



  correlation_matrix = df_no_outliers.corr()


                                 age    fnlwgt  education_num  capital_gain  \
age                         1.000000 -0.078645       0.044787      0.104428   
fnlwgt                     -0.078645  1.000000      -0.036960     -0.005631   
education_num               0.044787 -0.036960       1.000000      0.142585   
capital_gain                0.104428 -0.005631       0.142585      1.000000   
capital_loss                0.057244 -0.009201       0.083255     -0.047558   
...                              ...       ...            ...           ...   
native_country_ Vietnam    -0.014642 -0.009530      -0.004318     -0.006915   
native_country_ Yugoslavia  0.000810  0.005842      -0.001447     -0.001456   
capital_gain_loss_ratio     0.104428 -0.005631       0.142585      1.000000   
log_capital_gain            0.103002 -0.008228       0.105209      0.856082   
capital_gain_log            0.103002 -0.008228       0.105209      0.856082   

                            capital_loss  hours_per

In [83]:
pip uninstall ppscore

Found existing installation: ppscore 1.3.0
Uninstalling ppscore-1.3.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/ppscore-1.3.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/ppscore/*
Proceed (Y/n)? Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/uninstall.py", line 106, in run
    uninstall_pathset = req.uninstall(
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/req/req_install.py", line 722, in uninstall
    uninstalled_pathset.remove(auto_confirm, verbose)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/req/req_uninstall.py", line 364, in remove
    if auto_confirm or self._allowed_to_proceed(verbose):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/req/req_uninstall.py", line 404, in _allowed_to_proceed
    return