## **Data Exploration and Preprocessing:**

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")


In [36]:
df=pd.read_csv("/content/adult_with_headers.csv")
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [37]:
df.shape

(32561, 15)

In [38]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [39]:
df.describe(include=object)

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
count,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [40]:
df.dtypes

Unnamed: 0,0
age,int64
workclass,object
fnlwgt,int64
education,object
education_num,int64
marital_status,object
occupation,object
relationship,object
race,object
sex,object


In [41]:
df.isna().sum()
# there are no missing values

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [42]:
# standardization of numerical columns

scaler=StandardScaler()
scaled_data=scaler.fit_transform(df[['age','fnlwgt','education_num','capital_gain','capital_loss','hours_per_week']])

In [43]:
scaled_data

array([[ 0.03067056, -1.06361075,  1.13473876,  0.1484529 , -0.21665953,
        -0.03542945],
       [ 0.83710898, -1.008707  ,  1.13473876, -0.14592048, -0.21665953,
        -2.22215312],
       [-0.04264203,  0.2450785 , -0.42005962, -0.14592048, -0.21665953,
        -0.03542945],
       ...,
       [ 1.42360965, -0.35877741, -0.42005962, -0.14592048, -0.21665953,
        -0.03542945],
       [-1.21564337,  0.11095988, -0.42005962, -0.14592048, -0.21665953,
        -1.65522476],
       [ 0.98373415,  0.92989258, -0.42005962,  1.88842434, -0.21665953,
        -0.03542945]])

In [44]:
df1=pd.DataFrame(scaled_data,columns=['age','fnlwgt','education_num','capital_gain','capital_loss','hours_per_week'])

In [45]:
df1

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [46]:
df1.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,-2.705915e-17,-1.001625e-16,1.471887e-16,1.309314e-17,1.0169e-16,-1.5493550000000002e-17
std,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.582206,-1.681631,-3.529656,-0.1459205,-0.2166595,-3.19403
25%,-0.7757679,-0.681691,-0.4200596,-0.1459205,-0.2166595,-0.03542945
50%,-0.1159546,-0.1082193,-0.03136003,-0.1459205,-0.2166595,-0.03542945
75%,0.6904838,0.4478765,0.7460392,-0.1459205,-0.2166595,0.3695194
max,3.769612,12.26856,2.300838,13.39458,10.59351,4.742967


## **Encoding Techniques:**

In [47]:
data=df[['workclass','education','marital_status','occupation','relationship','race','sex','native_country','income']]

In [48]:
data

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


In [49]:
# applying OneHotEncoding To 'race','sex',and 'income' columns bcz they have less than 5 uniue values
# applying Label encoding to other categorical columns since they have more than 5 unique values

In [50]:

OHE_data=df[['race','sex','income']]
OHE_data


Unnamed: 0,race,sex,income
0,White,Male,<=50K
1,White,Male,<=50K
2,White,Male,<=50K
3,Black,Male,<=50K
4,Black,Female,<=50K
...,...,...,...
32556,White,Female,<=50K
32557,White,Male,>50K
32558,White,Female,<=50K
32559,White,Male,<=50K


In [51]:
OHE=OneHotEncoder()
Encoded_OHE=OHE.fit_transform(OHE_data).toarray()
features_name=OHE.get_feature_names_out(OHE_data.columns)
OHE_Data=pd.DataFrame(Encoded_OHE,columns=features_name)
OHE_Data

Unnamed: 0,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
32557,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
32558,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
32559,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [52]:
data

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


In [53]:
Label_data=data.iloc[:,[0,1,2,3,4,7]]
Label_data

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Cuba
...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,United-States


In [54]:
for col in Label_data.columns:

  LabelEnc=LabelEncoder()
  Label_data[col]=LabelEnc.fit_transform(Label_data[col])
Label_data


Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country
0,7,9,4,1,1,39
1,6,9,2,4,0,39
2,4,11,0,6,1,39
3,4,1,2,6,0,39
4,4,9,2,10,5,5
...,...,...,...,...,...,...
32556,4,7,2,13,5,39
32557,4,11,2,7,0,39
32558,4,11,6,1,4,39
32559,4,11,4,1,3,39


In [55]:
df_enc=pd.concat([Label_data,OHE_Data,df1],axis=1)
df_enc

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,...,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,7,9,4,1,1,39,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,6,9,2,4,0,39,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,4,11,0,6,1,39,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,4,1,2,6,0,39,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,4,9,2,10,5,5,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,4,7,2,13,5,39,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,4,11,2,7,0,39,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,4,11,6,1,4,39,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,4,11,4,1,3,39,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [56]:
df_enc.describe()

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,...,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,...,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,3.868892,10.29821,2.611836,6.57274,1.446362,36.718866,0.009551,0.031909,0.095943,0.008323,...,0.330795,0.669205,0.75919,0.24081,-2.705915e-17,-1.001625e-16,1.471887e-16,1.309314e-17,1.0169e-16,-1.5493550000000002e-17
std,1.45596,3.870264,1.506222,4.228857,1.606771,7.823782,0.097264,0.175761,0.294518,0.090851,...,0.470506,0.470506,0.427581,0.427581,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.582206,-1.681631,-3.529656,-0.1459205,-0.2166595,-3.19403
25%,4.0,9.0,2.0,3.0,0.0,39.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,-0.7757679,-0.681691,-0.4200596,-0.1459205,-0.2166595,-0.03542945
50%,4.0,11.0,2.0,7.0,1.0,39.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,-0.1159546,-0.1082193,-0.03136003,-0.1459205,-0.2166595,-0.03542945
75%,4.0,12.0,4.0,10.0,3.0,39.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.6904838,0.4478765,0.7460392,-0.1459205,-0.2166595,0.3695194
max,8.0,15.0,6.0,14.0,5.0,41.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,3.769612,12.26856,2.300838,13.39458,10.59351,4.742967


## **Feature Engineering**

In [57]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [58]:
import random
for i in df['income']:
  if i == "<=50K":
    df.loc[df['income'] == i, 'income'] = random.randint(10000, 50000)
  else:
    df.loc[df['income'] == i, 'income'] = random.randint(50000, 100000)

In [59]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,58244
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,58244
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,58244
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,58244
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,58244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,58244
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,58244
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,58244
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,58244


In [60]:
df['income'].dtype

dtype('O')

In [61]:
df['income_per_hr'] = df['income'] / df['hours_per_week']

In [62]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,income_per_hr
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,58244,1456.1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,58244,4480.307692
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,58244,1456.1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,58244,1456.1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,58244,1456.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,58244,1532.736842
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,58244,1456.1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,58244,1456.1
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,58244,2912.2


In [63]:
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 50, 100], labels=['Young', 'Middle-aged', 'Senior'])

In [64]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,income_per_hr,age_group
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,58244,1456.1,Middle-aged
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,58244,4480.307692,Middle-aged
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,58244,1456.1,Middle-aged
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,58244,1456.1,Senior
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,58244,1456.1,Young
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,58244,1532.736842,Young
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,58244,1456.1,Middle-aged
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,58244,1456.1,Senior
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,58244,2912.2,Young


In [66]:
# log Transformation on inocome_per_hr
df['income_per_hr'] = pd.to_numeric(df['income_per_hr'], errors='coerce')
df['income_per_hr']=np.log(df['income_per_hr'])

In [67]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,income_per_hr,age_group
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,58244,7.283517,Middle-aged
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,58244,8.407447,Middle-aged
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,58244,7.283517,Middle-aged
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,58244,7.283517,Senior
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,58244,7.283517,Young
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,58244,7.334810,Young
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,58244,7.283517,Middle-aged
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,58244,7.283517,Senior
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,58244,7.976664,Young


## **Feature Selection**

In [68]:
# Isolation forest

from sklearn.ensemble import IsolationForest


In [69]:
df_enc

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,...,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,7,9,4,1,1,39,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,6,9,2,4,0,39,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,4,11,0,6,1,39,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,4,1,2,6,0,39,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,4,9,2,10,5,5,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,4,7,2,13,5,39,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,4,11,2,7,0,39,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,4,11,6,1,4,39,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,4,11,4,1,3,39,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [70]:
iso=IsolationForest(random_state=42,contamination=0.1)
iso.fit(df_enc)


In [71]:
out=iso.predict(df_enc)

In [72]:
out


array([ 1,  1,  1, ...,  1,  1, -1])

In [73]:
df_enc["score"]=out
df_enc

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,...,sex_ Male,income_ <=50K,income_ >50K,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,score
0,7,9,4,1,1,39,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,1
1,6,9,2,4,0,39,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153,1
2,4,11,0,6,1,39,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429,1
3,4,1,2,6,0,39,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429,1
4,4,9,2,10,5,5,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,4,7,2,13,5,39,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409,1
32557,4,11,2,7,0,39,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429,1
32558,4,11,6,1,4,39,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429,1
32559,4,11,4,1,3,39,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225,1


In [74]:
df_enc[df_enc['score']==-1]

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,...,sex_ Male,income_ <=50K,income_ >50K,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,score
4,4,9,2,10,5,5,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429,-1
6,4,6,3,8,1,23,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.763796,-0.280358,-1.974858,-0.145920,-0.21666,-1.979184,-1
8,4,12,4,10,1,39,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-0.555830,-1.364279,1.523438,1.761142,-0.21666,0.774468,-1
10,4,15,2,4,0,39,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,-0.115955,0.859186,-0.031360,-0.145920,-0.21666,3.204161,-1
11,7,9,2,10,0,19,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,-0.629143,-0.459328,1.134739,-0.145920,-0.21666,-0.035429,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32538,4,9,0,10,4,39,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,-0.042642,-0.479386,1.134739,1.887883,-0.21666,0.369519,-1
32539,0,10,2,0,0,39,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,2.376673,0.924634,2.300838,-0.145920,-0.21666,-2.465122,-1
32541,0,11,5,0,1,39,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.177296,0.123580,-0.420060,-0.145920,-0.21666,-0.683348,-1
32545,2,7,2,1,5,39,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.030671,-0.741645,0.746039,-0.145920,-0.21666,-1.655225,-1


In [76]:
df_enc.drop(df_enc[df_enc['score']==-1].index,inplace=True)

In [77]:
df_enc

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,...,sex_ Male,income_ <=50K,income_ >50K,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,score
0,7,9,4,1,1,39,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,1
1,6,9,2,4,0,39,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153,1
2,4,11,0,6,1,39,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429,1
3,4,1,2,6,0,39,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429,1
5,4,12,2,4,5,39,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-0.115955,0.898201,1.523438,-0.145920,-0.21666,-0.035429,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,4,15,4,11,1,39,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,-1.215643,1.140460,-0.031360,-0.145920,-0.21666,-0.035429,1
32556,4,7,2,13,5,39,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409,1
32557,4,11,2,7,0,39,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429,1
32558,4,11,6,1,4,39,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429,1


In [79]:
# PPS Score
!pip install ppscore




In [80]:
import ppscore as pps
import pandas as pd

In [83]:
pps.score(df_enc,'workclass','age')

{'x': 'workclass',
 'y': 'age',
 'ppscore': 0.023656151704430806,
 'case': 'regression',
 'is_valid_score': True,
 'metric': 'mean absolute error',
 'baseline_score': 0.8023329183064852,
 'model_score': 0.7833528090735683,
 'model': DecisionTreeRegressor()}

In [84]:
pps.matrix(df_enc)

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,workclass,workclass,1.000000,predict_itself,True,,0.000000,1.000000,
1,workclass,education,0.000000,regression,True,mean absolute error,2.675600,2.842719,DecisionTreeRegressor()
2,workclass,marital_status,0.000000,regression,True,mean absolute error,1.140600,1.226663,DecisionTreeRegressor()
3,workclass,occupation,0.083075,regression,True,mean absolute error,3.658200,3.354296,DecisionTreeRegressor()
4,workclass,relationship,0.000000,regression,True,mean absolute error,1.184400,1.272033,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
479,score,education_num,0.000000,regression,True,mean absolute error,0.687065,0.698198,DecisionTreeRegressor()
480,score,capital_gain,0.000000,regression,True,mean absolute error,0.113274,0.208739,DecisionTreeRegressor()
481,score,capital_loss,0.000000,regression,True,mean absolute error,0.172850,0.332884,DecisionTreeRegressor()
482,score,hours_per_week,0.000000,regression,True,mean absolute error,0.574590,0.593942,DecisionTreeRegressor()


In [85]:
df2=pd.DataFrame(pps.matrix(df_enc))
df2[df2.case!='predict_itself'].iloc[:,[0,1,2,3,5,8]].sort_values('ppscore',ascending=False)

Unnamed: 0,x,y,ppscore,case,metric,model
254,sex_ Female,sex_ Male,1.0,regression,mean absolute error,DecisionTreeRegressor()
375,education_num,education,1.0,regression,mean absolute error,DecisionTreeRegressor()
321,income_ >50K,income_ <=50K,1.0,regression,mean absolute error,DecisionTreeRegressor()
275,sex_ Male,sex_ Female,1.0,regression,mean absolute error,DecisionTreeRegressor()
300,income_ <=50K,income_ >50K,1.0,regression,mean absolute error,DecisionTreeRegressor()
...,...,...,...,...,...,...
166,race_ Asian-Pac-Islander,sex_ Male,0.0,regression,mean absolute error,DecisionTreeRegressor()
165,race_ Asian-Pac-Islander,sex_ Female,0.0,regression,mean absolute error,DecisionTreeRegressor()
164,race_ Asian-Pac-Islander,race_ White,0.0,regression,mean absolute error,DecisionTreeRegressor()
163,race_ Asian-Pac-Islander,race_ Other,0.0,regression,mean absolute error,DecisionTreeRegressor()
