# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

## 1. Data Exploration and Preprocessing:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing  import LabelEncoder
from sklearn.ensemble import IsolationForest

In [2]:
data = pd.read_csv('adult_with_headers.csv')

In [3]:
adult = data.copy()

In [4]:
adult.shape

(32561, 15)

In [5]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
adult.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [7]:
adult.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [8]:
adult.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [9]:
adult.dtypes

Unnamed: 0,0
age,int64
workclass,object
fnlwgt,int64
education,object
education_num,int64
marital_status,object
occupation,object
relationship,object
race,object
sex,object


In [10]:
adult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [11]:
standard_scaler = StandardScaler()

In [12]:
adult_standard_scaled = adult.copy()
columns = adult_standard_scaled.select_dtypes(include='number').columns
for column in adult_standard_scaled[columns]:
    adult_standard_scaled[column] = standard_scaler.fit_transform(adult_standard_scaled[[column]])
adult_standard_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030671,State-gov,-1.063611,Bachelors,1.134739,Never-married,Adm-clerical,Not-in-family,White,Male,0.148453,-0.21666,-0.035429,United-States,<=50K
1,0.837109,Self-emp-not-inc,-1.008707,Bachelors,1.134739,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.14592,-0.21666,-2.222153,United-States,<=50K
2,-0.042642,Private,0.245079,HS-grad,-0.42006,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
3,1.057047,Private,0.425801,11th,-1.197459,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
4,-0.775768,Private,1.408176,Bachelors,1.134739,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.14592,-0.21666,-0.035429,Cuba,<=50K


In [13]:
minmax_scaler = MinMaxScaler()

In [14]:
adult_minmax_scaled = adult.copy()
columns = adult_minmax_scaled.select_dtypes(include='number').columns
for column in adult_minmax_scaled[columns]:
    adult_minmax_scaled[column] = minmax_scaler.fit_transform(adult_minmax_scaled[[column]])
adult_minmax_scaled.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,<=50K


In [15]:
# When the data distribution follows normal distribution and algorithms used distance then we will use standard scaler
# When the data disribution not followed normal distribution and algorithms required specific range then we will use minmax scaler

# 2. Encoding Techniques:

In [16]:
adult.nunique()

Unnamed: 0,0
age,73
workclass,9
fnlwgt,21648
education,16
education_num,16
marital_status,7
occupation,15
relationship,6
race,5
sex,2


In [17]:
adult.select_dtypes(exclude='number').nunique()

Unnamed: 0,0
workclass,9
education,16
marital_status,7
occupation,15
relationship,6
race,5
sex,2
native_country,42
income,2


In [18]:
adult.select_dtypes(exclude='number').columns[adult.select_dtypes(exclude='number').nunique() < 5]

Index(['sex', 'income'], dtype='object')

In [19]:
adult = pd.get_dummies(adult,columns=['sex','income'],drop_first=True,dtype='int64')

In [20]:
adult.dtypes

Unnamed: 0,0
age,int64
workclass,object
fnlwgt,int64
education,object
education_num,int64
marital_status,object
occupation,object
relationship,object
race,object
capital_gain,int64


In [21]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States,1,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States,1,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,40,United-States,1,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,40,United-States,1,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,40,Cuba,0,0


In [22]:
adult.select_dtypes(exclude='number').columns[adult.select_dtypes(exclude='number').nunique() >= 5]

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'native_country'],
      dtype='object')

In [23]:
label_encoder = LabelEncoder()

In [24]:
columns = ['workclass','education','marital_status','occupation','relationship','race','native_country']
for column in adult[columns]:
    adult[column] = label_encoder.fit_transform(adult[column])

In [25]:
adult.dtypes

Unnamed: 0,0
age,int64
workclass,int64
fnlwgt,int64
education,int64
education_num,int64
marital_status,int64
occupation,int64
relationship,int64
race,int64
capital_gain,int64


In [26]:
adult = adult.astype('int64')
adult.dtypes

Unnamed: 0,0
age,int64
workclass,int64
fnlwgt,int64
education,int64
education_num,int64
marital_status,int64
occupation,int64
relationship,int64
race,int64
capital_gain,int64


In [27]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,1,0
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,1,0
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,1,0
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,1,0
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,0,0


In [28]:
# One-Hot Encoding
# Pros: there is no order every category is treated as equal
# Cons: it is difficult if there are many categories because that many new columns will be created

# Label Encoding
# Pros: No new column will be created even there are 10's and 100's of categories also
# Cons: It will generated based on alphabetical order and treated the categories in order and there is a chance to think particular category is higher

## 3. Feature Engineering:

In [29]:
adult.skew()

Unnamed: 0,0
age,0.558743
workclass,-0.752024
fnlwgt,1.44698
education,-0.934042
education_num,-0.311676
marital_status,-0.013508
occupation,0.114583
relationship,0.786818
race,-2.435386
capital_gain,11.953848


In [30]:
adult['capital_net'] = adult['capital_gain'] - adult['capital_loss']

In [31]:
adult['age_category'] = pd.cut(adult['age'], bins = [0,30,60,90], labels = [0, 1, 2])

In [32]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K,capital_net,age_category
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,1,0,2174,1
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,1,0,0,1
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,1,0,0,1
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,1,0,0,1
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,0,0,0,0


In [33]:
adult['capital_gain'].skew()

11.953847687699799

In [34]:
adult['capital_gain_log'] = np.log1p(adult['capital_gain'])

In [35]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K,capital_net,age_category,capital_gain_log
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,1,0,2174,1,7.684784
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,1,0,0,1,0.0
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,1,0,0,1,0.0
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,1,0,0,1,0.0
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,0,0,0,0,0.0


## 4. Feature Selection:

In [36]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
clf.fit(adult)

In [37]:
# predictions
y_pred = clf.predict(adult)

In [38]:
#-1 for outliers and 1 for inliers.
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [39]:
adult.loc[y_pred==-1]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K,capital_net,age_category,capital_gain_log
52,47,4,51835,14,15,2,10,5,4,0,1902,60,16,0,1,-1902,1,0.000000
93,30,4,117747,11,9,2,12,5,1,0,1573,35,0,0,0,-1573,0,0.000000
106,17,0,304873,0,6,4,0,3,4,34095,0,32,39,0,0,34095,0,10.436935
157,71,6,494223,15,10,5,12,4,2,0,1816,2,39,1,0,-1816,2,0.000000
297,39,0,157443,12,14,2,0,5,1,3464,0,40,0,0,0,3464,1,8.150468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32090,66,4,115498,9,13,2,4,0,4,99999,0,55,0,1,1,99999,2,11.512925
32238,47,4,294913,14,15,2,4,0,4,99999,0,40,39,1,1,99999,1,11.512925
32341,74,6,199136,9,13,6,3,1,4,15831,0,8,11,1,1,15831,2,9.669788
32370,53,6,137547,14,15,4,10,1,1,27828,0,40,30,1,1,27828,1,10.233834


In [40]:
adult = adult.loc[y_pred==1].reset_index(drop=True)

In [41]:
adult

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K,capital_net,age_category,capital_gain_log
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,1,0,2174,1,7.684784
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,1,0,0,1,0.000000
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,1,0,0,1,0.000000
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,1,0,0,1,0.000000
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,0,0,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32230,27,4,257302,7,12,2,13,5,4,0,0,38,39,0,0,0,0,0.000000
32231,40,4,154374,11,9,2,7,0,4,0,0,40,39,1,1,0,1,0.000000
32232,58,4,151910,11,9,6,1,4,4,0,0,40,39,0,0,0,1,0.000000
32233,22,4,201490,11,9,4,1,3,4,0,0,20,39,1,0,0,0,0.000000


In [42]:
# Effect of Outliers: Outliers can skew model predictions, reduce accuracy, and lead to overfitting.

In [43]:
#install the package
!pip install ppscore



In [44]:
import ppscore as pps

In [47]:
#pps.score(df, "feature_column", "target_column")  syntax
pps.score(data, "workclass", "income")

{'x': 'workclass',
 'y': 'income',
 'ppscore': 0.0940557685801341,
 'case': 'classification',
 'is_valid_score': True,
 'metric': 'weighted F1',
 'baseline_score': 0.6531153390987711,
 'model_score': 0.6857418424884953,
 'model': DecisionTreeClassifier()}

In [48]:
#calculate the whole PPS matrix
pps.matrix(data)



Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,age,age,1.000000,predict_itself,True,,0.000000,1.000000,
1,age,workclass,0.011232,classification,True,weighted F1,0.579088,0.583816,DecisionTreeClassifier()
2,age,fnlwgt,0.000000,regression,True,mean absolute error,75872.186200,77535.141544,DecisionTreeRegressor()
3,age,education,0.052315,classification,True,weighted F1,0.201200,0.242989,DecisionTreeClassifier()
4,age,education_num,0.000000,regression,True,mean absolute error,1.853000,1.898306,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
220,income,capital_gain,0.000000,regression,True,mean absolute error,1093.884000,1760.682115,DecisionTreeRegressor()
221,income,capital_loss,0.000000,regression,True,mean absolute error,94.942600,176.261353,DecisionTreeRegressor()
222,income,hours_per_week,0.000000,regression,True,mean absolute error,7.656400,8.097596,DecisionTreeRegressor()
223,income,native_country,0.000000,classification,True,weighted F1,0.841082,0.841082,DecisionTreeClassifier()
