In [1]:
import sys
from pathlib import Path
ROOT_PATH = str('../sfs')
sys.path.append(ROOT_PATH)
from shapwise_feature_selector import *
np.random.seed(0)
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from category_encoders import OneHotEncoder

def generate_data(n_samples=2500, n_features=1000, n_categorical_features=200):
    """
    Generate synthetic numerical and categorical data.
    """
    n_numerical_features = n_features - n_categorical_features
    numerical_data = np.random.randn(n_samples, n_numerical_features)
    categorical_data = np.random.randint(0, 3, size=(n_samples, n_categorical_features))
    # Explicitly name all columns
    numerical_columns = [f'num_{i}' for i in range(n_numerical_features)]
    categorical_columns = [f'cat_{i}' for i in range(n_categorical_features)]
    df_numerical = pd.DataFrame(numerical_data, columns=numerical_columns)
    df_categorical = pd.DataFrame(categorical_data, columns=categorical_columns)
    df = pd.concat([df_numerical, df_categorical], axis=1)
    return df

def introduce_signal(df, n_numerical_features, signal_strength=0.8, train_only_signal=False, train_size=0.8):
    """
    Introduce a signal into the data.
    """
    n_samples = len(df)
    numerical_columns = [f'num_{i}' for i in range(n_numerical_features)]
    signal_features = np.random.choice(numerical_columns, size=10, replace=False)  # 10 numerical features contribute to the signal
    noise = np.random.randn(n_samples) * (1 - signal_strength)
    target = np.dot(df[signal_features].values, np.random.rand(len(signal_features))) + noise
    df['target'] = (target > np.median(target)).astype(int)  # Binary classification
    
    if train_only_signal:
        n_train = int(n_samples * train_size)
        train_indices = np.random.choice(range(n_samples), size=n_train, replace=False)
        test_indices = list(set(range(n_samples)) - set(train_indices))
        # Zeroing out signal in test set for specific signal features
        for feature in signal_features:
            df.loc[test_indices, feature] = np.random.randn(len(test_indices))
            
    return df, signal_features

def process_data(df, target_column='target'):
    """
    One-hot encode categorical variables and split the data into train and test sets.
    """
    encoder = OneHotEncoder(cols=[col for col in df.columns if 'cat' in col], use_cat_names=True)
    df_encoded = encoder.fit_transform(df.drop(target_column, axis=1))
    X_train, X_test, y_train, y_test = train_test_split(df_encoded, df[target_column], test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

n_features = 1000
n_categorical_features = 200
n_numerical_features = n_features - n_categorical_features

df = generate_data(n_samples=2500, n_features=n_features, n_categorical_features=n_categorical_features)
df, signal_features = introduce_signal(df, n_numerical_features, train_only_signal=True)
df.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,...,cat_191,cat_192,cat_193,cat_194,cat_195,cat_196,cat_197,cat_198,cat_199,target
0,1.764052,0.400157,0.978738,2.240893,1.867558,-0.977278,0.950088,-0.151357,-0.103219,0.410599,...,2,2,1,2,0,2,1,2,0,1
1,1.411172,0.785804,-0.05747,-0.391217,0.940918,0.405204,0.498052,-0.026192,-1.68823,-0.112466,...,2,0,2,0,2,2,1,1,1,0
2,0.430771,-0.149892,-1.006037,-0.82155,-1.548254,0.531975,1.260569,-0.100394,-0.400349,-1.472323,...,1,0,2,2,0,1,1,2,2,0
3,0.152177,-0.374126,-0.013451,0.815472,0.410602,0.48097,-0.63543,0.85283,0.669562,1.004419,...,0,0,1,0,2,0,2,0,1,0
4,-1.333342,0.367784,-1.388233,-2.575203,-0.836106,0.331092,-0.269881,1.267131,0.183753,-0.76631,...,2,0,2,1,1,0,1,2,0,1


In [3]:
X_train, X_test, y_train, y_test = process_data(df)
X_train.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,...,cat_196_0.0,cat_197_1.0,cat_197_2.0,cat_197_0.0,cat_198_2.0,cat_198_1.0,cat_198_0.0,cat_199_0.0,cat_199_1.0,cat_199_2.0
2055,-0.26779,-1.244953,0.579719,0.532729,-1.599408,0.273115,1.508266,-1.544335,1.192402,-2.245811,...,0,0,0,1,0,0,1,1,0,0
1961,0.291132,0.354325,-1.435145,-2.008045,-0.841351,-1.078707,-0.368103,-0.297344,-0.370908,0.778764,...,0,0,1,0,1,0,0,0,1,0
1864,1.284703,-0.129104,-1.632991,0.518368,-0.341115,-0.809103,-0.423381,-0.630384,-0.808723,1.136608,...,0,0,1,0,1,0,0,1,0,0
2326,0.30653,1.119647,2.299225,-1.23777,1.204511,0.795546,1.779258,1.133944,-0.389372,0.915985,...,0,0,1,0,1,0,0,1,0,0
461,0.585654,-1.942345,-0.8426,0.420766,0.97545,-0.769837,0.898704,0.222523,-0.419532,0.453049,...,0,0,1,0,1,0,0,0,1,0


In [4]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.68


In [5]:
sfs_model = SHAPwiseFeatureSelector(clf, accuracy_score, number_top_fi = 50)

In [6]:
sfs_model.fit(X_train, y_train, X_test, y_test)

Noisy feature detected: num_388 original_score (with num_388): 0.678, new score (without num_388): 0.678
Noisy feature detected: num_262 original_score (with num_262): 0.678, new score (without num_262): 0.686
Noisy feature detected: num_277 original_score (with num_277): 0.686, new score (without num_277): 0.694
Noisy feature detected: num_609 original_score (with num_609): 0.694, new score (without num_609): 0.696
Noisy feature detected: num_477 original_score (with num_477): 0.696, new score (without num_477): 0.7
Noisy feature detected: num_395 original_score (with num_395): 0.7, new score (without num_395): 0.718
Summary:
Before drop: 0.678% After drop: 0.718%
Improvement of 5.9%


In [8]:
sfs_model.features_to_drop

['num_388', 'num_262', 'num_277', 'num_609', 'num_477', 'num_395']

In [9]:
sfs_model.base_estimator

In [17]:
y_pred = sfs_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.72


In [14]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train.drop(sfs_model.features_to_drop, axis = 1), y_train)
y_pred = clf.predict(X_test.drop(sfs_model.features_to_drop, axis = 1))
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.72
