In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
import ppscore as pps

df = pd.read_csv('/content/adult_with_headers.csv')

print("Data Info:")
print(df.info())
print("\nSummary Statistics:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())

numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(exclude=[np.number]).columns

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

print("\nMissing Values After Imputation:")
print(df.isnull().sum())

scaler_standard = StandardScaler()
df[['age', 'hours_per_week']] = scaler_standard.fit_transform(df[['age', 'hours_per_week']])

scaler_minmax = MinMaxScaler()
df[['age', 'hours_per_week']] = scaler_minmax.fit_transform(df[['age', 'hours_per_week']])

df = pd.get_dummies(df, columns=['education', 'marital_status'])

label_encoder = LabelEncoder()
df['occupation'] = label_encoder.fit_transform(df['occupation'])

df['income_group'] = df['hours_per_week'] * df['age']
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 50, 75, 100], labels=['young', 'middle-aged', 'senior', 'elder'])

df['log_age'] = np.log1p(df['age'])

iso_forest = IsolationForest(contamination=0.05)
outliers = iso_forest.fit_predict(df[['age', 'hours_per_week']])
df['outlier'] = np.where(outliers == -1, 1, 0)

df_cleaned = df[df['outlier'] == 0]

pps_matrix = pps.matrix(df_cleaned)

print("\nPPS Score Matrix:")
print(pps_matrix)

corr_matrix = df.corr()
print("\nCorrelation Matrix:")
print(corr_matrix)


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None

Summary Statistics:
                age        fnlwgt  education_num  capita

  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  return f1_diff / s


PPS Score Matrix:
            x                        y   ppscore            case  \
0         age                      age  1.000000  predict_itself   
1         age                workclass  0.005857  classification   
2         age                   fnlwgt  0.000000      regression   
3         age            education_num  0.000000      regression   
4         age               occupation  0.000000      regression   
...       ...                      ...       ...             ...   
1595  outlier  marital_status_ Widowed  0.000000      regression   
1596  outlier             income_group  0.000000      regression   
1597  outlier                age_group       NaN  classification   
1598  outlier                  log_age  0.000000      regression   
1599  outlier                  outlier  1.000000  predict_itself   

      is_valid_score               metric  baseline_score   model_score  \
0               True                 None        0.000000      1.000000   
1             

  return f1_diff / scale_range  # 0.1/0.3 = 0.33
  corr_matrix = df.corr()
