## DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

### 1. Data Exploration and Preprocessing:

#### •	Load the dataset and conduct basic data exploration (summary statistics, missing values, data types).

In [3]:
#import pandas library
import pandas as pd
import numpy as np

In [4]:
#Read csv file
df=pd.read_csv("adult_with_headers.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
#Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [None]:
#Describe-statistical summary
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [None]:
#Check for null values
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [None]:
#Check for Datatypes
df.dtypes

Unnamed: 0,0
age,int64
workclass,object
fnlwgt,int64
education,object
education_num,int64
marital_status,object
occupation,object
relationship,object
race,object
sex,object


#### •	Handle missing values as per the best practices (imputation, removal, etc.).

In [None]:
#Replace ? with nan value
df.replace(' ?', np.nan, inplace=True)
#pd.set_option('display.max_rows', None)
#df
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,1836
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,1843
relationship,0
race,0
sex,0


In [None]:
#Check the total null values percentage in dataset
df.isnull().sum().sum()/len(df)*100


np.float64(13.08927858481005)

In [None]:
#Fill the null values
df.fillna({'workclass':df.workclass.mode()[0],'occupation':df.occupation.mode()[0],'native_country':df['native_country'].mode()[0]},inplace=True)

In [None]:
#Recheck the null values
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [None]:
#Check for the duplicate rows
df.duplicated().sum()

np.int64(24)

In [None]:
#Drop the duplicate
df.drop_duplicates(inplace=True,ignore_index=True)
df.duplicated().sum()

np.int64(0)

In [None]:
#Split the target and features
target= df[['income']]
features= df.drop(columns=['income'])

In [None]:
#Divide numerical and categorical columns
num_col=features.select_dtypes(include=['number']).columns
cat_col=features.select_dtypes(include=['object']).columns
df_num=features[num_col]
df_cat=features[cat_col]


#### •	Apply scaling techniques to numerical features:
•	Standard Scaling

•	Min-Max Scaling


In [None]:
#import Scaling libraries
from sklearn.preprocessing import StandardScaler,MinMaxScaler

##### • Standard Scaling

In [None]:
#apply Standard Scaling
std_sca=StandardScaler()
pd.DataFrame(std_sca.fit_transform(df_num),columns=df_num.columns)


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030390,-1.063569,1.134777,0.148292,-0.216743,-0.035664
1,0.836973,-1.008668,1.134777,-0.145975,-0.216743,-2.222483
2,-0.042936,0.245040,-0.420679,-0.145975,-0.216743,-0.035664
3,1.056950,0.425752,-1.198407,-0.145975,-0.216743,-0.035664
4,-0.776193,1.408066,1.134777,-0.145975,-0.216743,-0.035664
...,...,...,...,...,...,...
32532,-0.849519,0.639678,0.745913,-0.145975,-0.216743,-0.197650
32533,0.103716,-0.335436,-0.420679,-0.145975,-0.216743,-0.035664
32534,1.423579,-0.358779,-0.420679,-0.145975,-0.216743,-0.035664
32535,-1.216148,0.110930,-0.420679,-0.145975,-0.216743,-1.655530


##### • Min-Max Scaling

In [None]:
#Apply MinMax scaling
minmax_sca=MinMaxScaler()
df_num=pd.DataFrame(minmax_sca.fit_transform(df_num),columns=df_num.columns)
df_num

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.301370,0.044302,0.800000,0.021740,0.0,0.397959
1,0.452055,0.048238,0.800000,0.000000,0.0,0.122449
2,0.287671,0.138113,0.533333,0.000000,0.0,0.397959
3,0.493151,0.151068,0.400000,0.000000,0.0,0.397959
4,0.150685,0.221488,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...
32532,0.136986,0.166404,0.733333,0.000000,0.0,0.377551
32533,0.315068,0.096500,0.533333,0.000000,0.0,0.397959
32534,0.561644,0.094827,0.533333,0.000000,0.0,0.397959
32535,0.068493,0.128499,0.533333,0.000000,0.0,0.193878


#### •	Discuss the scenarios where each scaling technique is preferred and why.

##### Standard Scaling (StandardScaler):

Preferred when: The data is approximately normally distributed or when the algorithm you are using assumes a normal distribution (e.g., Linear Regression, Logistic Regression, Linear Discriminant Analysis).
How it works: It standardizes features by removing the mean and scaling to unit variance. The resulting values have a mean of 0 and a standard deviation of 1.
Less affected by: Outliers, as it doesn't bound the data to a specific range.

##### Min-Max Scaling (MinMaxScaler):

Preferred when: The data is not normally distributed, or when algorithms are sensitive to the scale of features and require features to be within a specific range (e.g., K-Nearest Neighbors, Support Vector Machines with RBF kernel, Neural Networks).
How it works: It scales features to a fixed range, usually between 0 and 1.
Sensitive to: Outliers, as they can significantly impact the scaling range.

## 2. Encoding Techniques:

In [None]:
#import encoding libraries
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [None]:
 #Identify the variables that has categories less than 5 and more than 5
lt5=[]
gt5=[]
for col in df_cat.columns:

    if df_cat[col].nunique() < 5:
        lt5.append(col)
    else:
        gt5.append(col)

    print(col)

    #print(df_cat[col].unique())
print("\n*********categories less than 5:**********\n",lt5)
print("\n*********categories greater than 5:**********\n",gt5)
df_cat_lt5=df_cat[lt5]
df_cat_gt5=df_cat[gt5]


workclass
education
marital_status
occupation
relationship
race
sex
native_country

*********categories less than 5:**********
 ['sex']

*********categories greater than 5:**********
 ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']


#### •	Apply One-Hot Encoding to categorical variables with less than 5 categories.

In [None]:
#Apply onehot encoding
onehot_enc=OneHotEncoder(sparse_output=False)
pd.DataFrame(onehot_enc.fit_transform(df_cat_lt5),columns=onehot_enc.get_feature_names_out(df_cat_lt5.columns))


Unnamed: 0,sex_ Female,sex_ Male
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0
...,...,...
32532,1.0,0.0
32533,0.0,1.0
32534,1.0,0.0
32535,0.0,1.0


#### •	Use Label Encoding for categorical variables with more than 5 categories.

In [None]:
#Apply label encoder
label_enc=LabelEncoder()
# Create a copy to store encoded data
df_cat_encoded = df_cat_gt5.copy()

# Apply LabelEncoder to each column
label_enc = LabelEncoder()
for col in df_cat_encoded.columns:
    df_cat_encoded[col] = label_enc.fit_transform(df_cat_encoded[col])
df_cat_encoded

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,native_country
0,6,9,4,0,1,4,38
1,5,9,2,3,0,4,38
2,3,11,0,5,1,4,38
3,3,1,2,5,0,2,38
4,3,9,2,9,5,2,4
...,...,...,...,...,...,...,...
32532,3,7,2,12,5,4,38
32533,3,11,2,6,0,4,38
32534,3,11,6,0,4,4,38
32535,3,11,4,0,3,4,38


#### •	Discuss the pros and cons of One-Hot Encoding and Label Encoding.

##### >> **One-Hot Encoding:**

**Pros:**
Avoids introducing ordinal relationships between categories where none exist.
Works well with most machine learning algorithms, especially those that rely on distance calculations.

**Cons:**
Can lead to a high-dimensional feature space, especially with a large number of unique categories.
Can increase memory usage and computation time.

##### >> **Label Encoding:**

**Pros:**
Simple and efficient, especially for a large number of categories.
Reduces the dimensionality of the feature space compared to one-hot encoding.

**Cons:**
Introduces an artificial ordinal relationship between categories, which can mislead some machine learning algorithms.
Not suitable for nominal categorical variables where there is no inherent order.
Choosing between One-Hot Encoding and Label Encoding depends on the nature of the categorical variable and the requirements of the machine learning algorithm you plan to use.

## 3. Feature Engineering:

#### •	Create at least 2 new features that could be beneficial for the model. Explain the rationale behind your choices.

In [None]:
#Create 2 new features
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 45, 65, 100], labels=['Youth', 'Adult', 'Mid-age', 'Senior'])
df['gain_diff'] = df['capital_gain'] - df['capital_loss']
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,age_group,gain_diff
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,Adult,2174
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,Mid-age,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,Adult,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,Mid-age,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Adult,0


#### rationale behind your choices
**1.age_group:** Grouping it into categories can help models capture patterns based on age group.

**2.capital_diff:** This feature represents the net capital change, which could be a more direct indicator of financial activity and wealth compared to capital gain and loss separately.


#### •	Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice.

In [None]:
#Find skewness of capital_gain
skew_capital_gain=df['capital_gain'].skew()
# Apply logarithmic transformation to capital_gain
logof_capital_gain=df['logof_capital_gain'] = np.log1p(df['capital_gain'])
#Find skewness of capital_gain after log transformation
skew_logof_capital_gain=df['logof_capital_gain'].skew()
print("skewness of capital_gain:",skew_capital_gain)
print("skewness of capital_gain after log transformation:",skew_logof_capital_gain)
print("**capital_gain:** capital_gain is highly skewed column. Logarithmic transformation helps to reduce skewness, making the distribution normal. This can improve the performance of some machine learning models.")
print("\n----------------------------------------------------------------------------")
df

skewness of capital_gain: 11.949402833551463
skewness of capital_gain after log transformation: 3.094666793136126
**capital_gain:** capital_gain is highly skewed column. Logarithmic transformation helps to reduce skewness, making the distribution normal. This can improve the performance of some machine learning models.

----------------------------------------------------------------------------


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,logof_capital_gain
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,7.684784
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0.000000
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0.000000
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0.000000
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,0.000000
32533,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0.000000
32534,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,0.000000
32535,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,0.000000


## 4. Feature Selection:

#### •	Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance.

In [None]:
#import isolation forest library
from sklearn.ensemble import IsolationForest
#Get numerical features
numerical_features = df.select_dtypes(include=np.number).columns.tolist()
print("Shape of the DataFrame before removing outliers:",df.shape)

# contamination='auto' detects outliers automatically based on the dataset and random_state for reproducibility
iso_forest = IsolationForest(contamination='auto', random_state=42)

# Fit the model and predict outliers (-1 for outliers, 1 for inliers)
predict_outliers = iso_forest.fit_predict(df[numerical_features])
#Get outlier and inlier count
outlier_count=pd.Series(predict_outliers).value_counts()
print("\nOutliers and inliers count:\n", outlier_count)

# Create a mask for inliers
mask = predict_outliers == 1
cleaned_df = df[mask]
cleaned_df
print("\nShape of the DataFrame after removing outliers:",cleaned_df.shape)
print("\n")
cleaned_df

Shape of the DataFrame before removing outliers: (32537, 16)

Outliers and inliers count:
  1    27125
-1     5412
Name: count, dtype: int64

Shape of the DataFrame after removing outliers: (27125, 16)




Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,logof_capital_gain
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0.0
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32531,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K,0.0
32532,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,0.0
32533,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0.0
32534,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,0.0


#### •	Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.

In [None]:
#instal ppscore
!pip install ppscore

Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas<2.0.0,>=1.0.0 (from ppscore)
  Downloading pandas-1.5.3.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: ppscore, pandas
  Building wheel for ppscore (setup.py) ... [?25l[?25hdone
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13166 sha256=00026111ce3addacafb21a984bcd8bc666d4c03ed02f5f10f92720fec2d3c513
  Stored in directory: /root/.cache/pip/wheels/30/1c/06/b724ffb08ed69cd209743b44137306245ebbf025fd9acacf0c
  Building wheel for pandas (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pandas: filename=pandas-1.5.3-cp3

In [30]:
#Upgrade numpy and pandas
!pip install --upgrade --force-reinstall numpy pandas

Collecting numpy
  Downloading numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-dateutil>=2.8.2 (from pandas)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)
  Downloading six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading numpy-2.3.3-cp312-cp312-manylinux_

In [5]:
#import ppscore and warning packages
import ppscore as pps
import warnings
warnings.filterwarnings('ignore')

In [6]:
#Find pps matrix
pps_matrix = pps.matrix(df)
display("** Predictive Power Score **",pps_matrix)

print("\n")
#find correlation matrix
correlation_matrix = df.corr(numeric_only=True)
display("** Correlation matrix **",correlation_matrix)

'** Predictive Power Score **'

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,age,age,1.000000,predict_itself,True,,0.000000,1.000000,
1,age,workclass,0.011232,classification,True,weighted F1,0.579088,0.583816,DecisionTreeClassifier()
2,age,fnlwgt,0.000000,regression,True,mean absolute error,75872.186200,77535.141544,DecisionTreeRegressor()
3,age,education,0.052315,classification,True,weighted F1,0.201200,0.242989,DecisionTreeClassifier()
4,age,education_num,0.000000,regression,True,mean absolute error,1.853000,1.898306,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
220,income,capital_gain,0.000000,regression,True,mean absolute error,1093.884000,1760.682115,DecisionTreeRegressor()
221,income,capital_loss,0.000000,regression,True,mean absolute error,94.942600,176.261353,DecisionTreeRegressor()
222,income,hours_per_week,0.000000,regression,True,mean absolute error,7.656400,8.097596,DecisionTreeRegressor()
223,income,native_country,0.000000,classification,True,weighted F1,0.841082,0.841082,DecisionTreeClassifier()






'** Correlation matrix **'

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756
fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768
education_num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123
capital_gain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409
capital_loss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256
hours_per_week,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0


#### Predictive Power Score (PPS) Matrix vs. Correlation Matrix:
---------------------------------------------------------
PPS measures the predictive power of one feature on another, while correlation measures the linear relationship between two features.
PPS values range from 0 to 1, where 0 indicates no predictive power and 1 indicates perfect predictive power.
Correlation values range from -1 to 1, where -1 indicates a perfect negative linear relationship, 1 indicates a perfect positive linear relationship, and 0 indicates no linear relationship.

Observations:
- Compare the values in the PPS matrix and the correlation matrix. Notice that some pairs of features might have a low correlation but a high PPS, and vice versa.
- PPS can reveal non-linear relationships that correlation might miss.
- PPS is asymmetric (PPS(x, y) is not necessarily equal to PPS(y, x)), while correlation is symmetric.