In [2]:
cd ..

In [3]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv
import pandas as pd
from src.loaders.load_data import load_data
from collections import OrderedDict
import numpy as np

In [4]:
from sklearn.preprocessing import KBinsDiscretizer
# Scratch check: discretize a random binary column into 10 uniform-width ordinal bins.
# NOTE(review): np.random.choice is unseeded here, so X differs between runs.
X = np.random.choice([0, 1], size=(100, 1))
est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform', subsample = None)
est.fit_transform(X)

In [None]:
# Per-dataset settings for post-processing the regression target.
# NOTE(review): 'min'/'max' look like bounds for the target and 'group_idx' like
# bin edges used to group it (None = no grouping) — confirm against the code
# that consumes this mapping.
target_processing_strategies = {
    'school_pca': {
        'min': 0,  # the comma was missing here, which made the whole cell a SyntaxError
        'max': 70,
        'group_idx': [10, 20, 30, 40, 50, 60]
    },
    'dvisits': {
        'min': 0,
        'max': 8,
        'group_idx': [1, 4]
    },
    'california': {
        'min': 0,
        'max': 5,
        'group_idx': None
    },
    'hhip': {
        'min': 0,
        'max': 15,
        'group_idx': [2, 3, 4, 5, 6]
    }
}



## Functions

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

def show_heatmap(df, figsize=(8, 6)):
    """Draw an annotated heatmap of the pairwise correlations of ``df``.

    ``df.corr()`` is computed here, so pass the raw frame rather than an
    already-computed correlation matrix.

    Parameters
    ----------
    df : object exposing ``.corr()`` (a pandas DataFrame in this notebook)
    figsize : tuple
        Figure size handed to matplotlib.
    """
    _, ax = plt.subplots(figsize=figsize)
    sns.heatmap(df.corr(), annot=True, fmt=".1f", ax=ax)
    plt.show()

In [6]:
def avg_correlation(df):
    """Rank feature columns by their mean absolute correlation with all columns.

    The last column of ``df`` is treated as the target and is left out of the
    result; it still contributes to the averages of the other columns.

    Returns
    -------
    list of ``(column_name, mean_abs_correlation)`` tuples, highest first.
    """
    mean_abs_corr = df.corr().abs().mean().sort_values(ascending=False)
    feature_names = set(df.columns.tolist()[:-1])
    return [
        (name, score)
        for name, score in mean_abs_corr.to_dict().items()
        if name in feature_names
    ]

In [7]:
from sklearn.cluster import KMeans
def kbins(data, data_config, n_clusters=10, random_state=0):
    """Assign every row to a K-Means cluster computed on the split columns.

    Despite the name, no value binning happens here: ``KMeans`` is fitted on
    the columns selected by ``data_config['split_col_idx']`` and the cluster
    label of each row is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are addressed positionally.
    data_config : dict
        Must contain ``'split_col_idx'``, the positional indices of the
        columns to cluster on.
    n_clusters : int, default 10
        Number of clusters (previously hard-coded).
    random_state : int, default 0
        Seed passed to ``KMeans`` (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        One cluster label per row of ``data``.
    """
    features = data.iloc[:, data_config['split_col_idx']].values
    return KMeans(n_clusters=n_clusters, random_state=random_state).fit_predict(features)

## Codrna

In [7]:
# Load the 'codrna' dataset through the project loader, then inspect it:
# correlation heatmap, per-feature mean absolute correlation, and the config
# returned by the loader.
df, data_config = load_data('codrna')
show_heatmap(df,  figsize = (6,4))
print(avg_correlation(df))
print(data_config)

In [38]:
# Turn the correlation ranking into positional column indices of df.
avg_cols = avg_correlation(df)
avg_cols = [col[0] for col in avg_cols]  # keep only the column names, drop the scores
split_col_idx = [df.columns.tolist().index(col) for col in avg_cols]
split_col_idx

In [63]:
# Cluster the rows on data_config['split_col_idx'] and show how many rows fall in each cluster.
cl = kbins(df, data_config)
np.unique(cl, return_counts=True)

In [41]:
data_config

## HHP

In [8]:

############################################################################################################
# Load data
# members
# Sex becomes M -> 1 / F -> 0 and each age bracket becomes one representative
# number (bracket midpoint, 90 for the open-ended '80+'). Values missing from a
# mapping dict come out of .map() as NaN.
df_members = pd.read_csv('./data/HHP_herritage_health/Members.csv')
df_members['Sex'] = df_members['Sex'].map({'M': 1, 'F': 0})
df_members['AgeAtFirstClaim'] = df_members['AgeAtFirstClaim'].map({
    '0-9': 5,
    '10-19': 15,
    '20-29': 25,
    '30-39': 35,
    '40-49': 45,
    '50-59': 55,
    '60-69': 65,
    '70-79': 75,
    '80+': 90
})

# drug and lab
# Counts are stored as strings; the open-ended top buckets are given a fixed
# stand-in value ('7+' -> 10, '10+' -> 12).
df_drug = pd.read_csv('./data/HHP_herritage_health/DrugCount.csv')
df_drug['DrugCount'] = df_drug['DrugCount'].map({'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7+': 10})
df_lab = pd.read_csv('./data/HHP_herritage_health/LabCount.csv')
df_lab['LabCount'] = df_lab['LabCount'].map(
    {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '10+': 12}
)

# days
# The Y2 file is tagged Year='Y1' and the Y3 file Year='Y2', so the merge on
# ['MemberID', 'Year'] below attaches each file to the claims of the preceding year.
# NOTE(review): this presumes Claims.csv labels its years 'Y1'/'Y2'/... — confirm.
df_days_y2 = pd.read_csv('./data/HHP_herritage_health/DaysInHospital_Y2.csv')
df_days_y2['Year'] = 'Y1'
df_days_y3 = pd.read_csv('./data/HHP_herritage_health/DaysInHospital_Y3.csv')
df_days_y3['Year'] = 'Y2'

df_days = pd.concat([df_days_y2, df_days_y3])

df_claims = pd.read_csv('./data/HHP_herritage_health/Claims.csv')

# divide by provider
# providers = df_claims['ProviderID'].value_counts().iloc[: 10].index.tolist()
# df_claims = df_claims[df_claims['ProviderID'].isin(providers)]
# print(df_claims.shape)

# Left-join member, drug, lab and target tables onto the claims, then keep only
# the claims that found a DaysInHospital value.
df_claims = pd.merge(df_claims, df_members, on='MemberID', how='left')
df_claims = pd.merge(df_claims, df_drug, on=['MemberID', 'Year', 'DSFS'], how='left')
df_claims = pd.merge(df_claims, df_lab, on=['MemberID', 'Year', 'DSFS'], how='left')
df_claims = pd.merge(df_claims, df_days, on=['MemberID', 'Year'], how='left')
df_claims = df_claims[df_claims['DaysInHospital'].notna()]
print(df_claims.shape)

#################################################################################################################
# Feature engineering
# drop missing age and sex
# (rows with a missing DSFS are dropped as well)
df_claims = df_claims[df_claims['AgeAtFirstClaim'].notna()].copy()
df_claims = df_claims[df_claims['Sex'].notna()].copy()
df_claims = df_claims[df_claims['DSFS'].notna()].copy()
# transform categorical columns
def transform1(row):
    """Render a numeric identifier as an integer string; missing values become 'None'."""
    return 'None' if pd.isna(row) else str(int(row))
    
# Identifier columns become strings ('None' when missing) so they can be
# counted as categories later.
df_claims['ProviderID'] = df_claims['ProviderID'].map(transform1)
df_claims['Vendor'] = df_claims['Vendor'].map(transform1)
df_claims['PCP'] = df_claims['PCP'].map(transform1)

# handle missing values for categorical columns
df_claims['ProcedureGroup'] = df_claims['ProcedureGroup'].fillna('None', inplace=False)
df_claims['Specialty'] = df_claims['Specialty'].fillna('None', inplace=False)
df_claims['PrimaryConditionGroup'] = df_claims['PrimaryConditionGroup'].fillna('None', inplace=False)
df_claims['PlaceSvc'] = df_claims['PlaceSvc'].fillna('None', inplace=False)

# encode bucketed / string-valued columns as numbers
# (open-ended buckets get a fixed stand-in: '5+' -> 7, '162+' -> 200)
df_claims['CharlsonIndex'] = df_claims['CharlsonIndex'].map({'0': 0, '1-2': 1.5, '3-4': 3.5, '5+': 7})
df_claims['PayDelay'] = df_claims['PayDelay'].apply(lambda row: int(row) if row != '162+' else 200)
df_claims['LengthOfStay'] = df_claims['LengthOfStay'].map({
    '1 day': 1, '2 days': 2, '3 days': 3, '4 days': 4, '5 days': 5, '6 days': 6, '1- 2 weeks': 10, '2- 4 weeks': 21, '4- 8 weeks': 42,
})
df_claims['DSFS'] = df_claims['DSFS'].map({
    '0- 1 month': 1, '1- 2 months': 2, '2- 3 months': 3, '3- 4 months': 4, '4- 5 months': 5, '5- 6 months': 6, 
    '6- 7 months': 7, '7- 8 months': 8, '8- 9 months': 9, '9-10 months': 10, '10-11 months': 11, '11-12 months': 12
})

# keep only claims whose target is positive (DaysInHospital > 0)
df_claims = df_claims[df_claims['DaysInHospital'] > 0].copy()
print(df_claims.shape)

# fill mean values for drug and lab counts
# (the mean is taken over the rows that survived the filter above)
df_claims['DrugCount'] = df_claims['DrugCount'].fillna(df_claims['DrugCount'].mean(), inplace=False)
df_claims['LabCount'] = df_claims['LabCount'].fillna(df_claims['LabCount'].mean(), inplace=False)

# drop length of stay
df_claims = df_claims.drop(columns=['LengthOfStay'])

#########################################################################################################################
# Feature selection
# numerical features
def feature_agg(df, key):
    """Replace column ``key`` with per-(MemberID, Year) summary statistics.

    Every row receives its group's ``{key}_mean``, ``{key}_std``,
    ``{key}_max`` and ``{key}_range`` (max - min); the raw ``key`` column is
    dropped from the returned frame. ``df`` itself is not modified.
    """
    group_keys = ['MemberID', 'Year']
    named_aggs = {
        f'{key}_{stat}': pd.NamedAgg(column=key, aggfunc=stat)
        for stat in ('mean', 'std', 'max', 'min')
    }

    stats = df.groupby(group_keys).agg(**named_aggs)
    stats[f'{key}_range'] = stats[f'{key}_max'] - stats[f'{key}_min']
    # the minimum is only needed to derive the range
    stats = stats.reset_index().drop(columns=[f'{key}_min'])

    merged = pd.merge(df, stats, on=group_keys, how='left')
    return merged.drop(columns=[key])

# Swap each raw numeric column for its per-(MemberID, Year) mean/std/max/range.
for agg_key in ['DSFS', 'PayDelay', 'CharlsonIndex', 'DrugCount', 'LabCount']:
    df_claims = feature_agg(df_claims, agg_key)

def feature_agg2(df, key):
    """Append ``{key}_counts``: the number of distinct ``key`` values per (MemberID, Year).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'MemberID', 'Year' and ``key``.
    key : str
        Column whose distinct values are counted within each group.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the extra ``{key}_counts`` column; ``key`` itself is kept.
    """
    group_keys = ['MemberID', 'Year']
    counts = df.groupby(group_keys)[key].nunique().reset_index(name=key + '_counts')
    # Merge onto the frame that was passed in. The previous version merged onto
    # the notebook-global df_claims, so the ``df`` argument was silently ignored.
    return pd.merge(df, counts, on=group_keys, how='left')

# Distinct-value counts per (MemberID, Year) for every categorical column,
# plus the number of claim rows in each group ('claim_counts').
df_claims = feature_agg2(df_claims, 'ProviderID')
df_claims = feature_agg2(df_claims, 'Vendor')
df_claims = feature_agg2(df_claims, 'PCP')
df_claims = feature_agg2(df_claims, 'Specialty')
df_claims = feature_agg2(df_claims, 'PlaceSvc')
df_claims = feature_agg2(df_claims, 'PrimaryConditionGroup')
df_claims = feature_agg2(df_claims, 'ProcedureGroup')
ret = df_claims.groupby(['MemberID', 'Year']).size().reset_index(name = 'claim_counts')
df_claims = pd.merge(df_claims, ret, on = ['MemberID', 'Year'], how = 'left')

# Any remaining NaN (e.g. the std of a single-row group) becomes 0.
df_claims.fillna(0, inplace=True)

# categorical one-hot features
# For each categorical column keep only the top_k dummy columns with the
# largest absolute correlation with the target; they are renamed
# '<col>_0', '<col>_1', ... and the original column is dropped.
top_k = 2
for col in ['Specialty', 'PlaceSvc', 'PrimaryConditionGroup', 'ProcedureGroup']:
    top_k_cols = pd.get_dummies(df_claims[col]).corrwith(df_claims['DaysInHospital']).abs().sort_values(ascending = False)[:top_k]
    dummies = pd.get_dummies(df_claims[col])[top_k_cols.index]
    dummies.columns = [f'{col}_{idx}' for idx in range(len(dummies.columns))]
    df_claims = pd.concat([df_claims, dummies], axis = 1)
    df_claims.drop(columns = [col], inplace = True)
    
# Identifier columns are no longer needed once the aggregates exist.
df_claims = df_claims.drop(columns = ['MemberID', 'ProviderID', 'Vendor', 'PCP', 'Year'])

#########################################################################################################################
# Split data
# columns
# num_cols is rebuilt by name: age, the four aggregates of each numeric column,
# and the '<col>_counts' features (the 'claim' entry yields 'claim_counts').
num_cols = ['AgeAtFirstClaim']
for col in ['CharlsonIndex', 'PayDelay', 'DrugCount', 'LabCount', 'DSFS']:
    num_cols += [f'{col}_mean', f'{col}_std', f'{col}_max', f'{col}_range']
for col in ['ProviderID', 'Vendor', 'PCP', 'Specialty', 'PlaceSvc', 'PrimaryConditionGroup', 'ProcedureGroup', 'claim']:
    num_cols += [f'{col}_counts']
print(len(num_cols))

# cat_cols: three binary-style columns plus the top_k dummy columns kept per categorical feature.
cat_cols = ['Sex', 'SupLOS', 'ClaimsTruncated']
cat_cols += [f'{col}_{idx}' for col in ['Specialty', 'PlaceSvc', 'PrimaryConditionGroup', 'ProcedureGroup'] for idx in range(top_k)]
print(len(cat_cols))

target = 'DaysInHospital'

# sample data
# (fixed seed, so the same 20000 rows are drawn on every run)
df_claims_sample = df_claims.sample(n = 20000, random_state=42)

# standardize
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardize, then min-max scale; the scalers are fitted on the sampled rows only.
scaler = StandardScaler()
df_claims_sample[num_cols] = scaler.fit_transform(df_claims_sample[num_cols])
scaler = MinMaxScaler()
df_claims_sample[num_cols] = scaler.fit_transform(df_claims_sample[num_cols])

# reorder target to be num cols, cat cols and target
df_claims_sample = df_claims_sample[num_cols + cat_cols + [target]]

print(df_claims_sample.shape)

data = df_claims_sample

# Rank numeric columns by mean absolute correlation and keep the first
# int(0.3 * number_of_columns) of them as split columns.
avg_correlation_cols = avg_correlation(data)
avg_correlation_cols = [col[0] for col in avg_correlation_cols if col[0] in num_cols]
avg_correlation_cols = [col for col in avg_correlation_cols][:int(data.shape[1]*0.3)]

# data config
# All '*_idx' entries are positional indices into data.columns.
# NOTE(review): the meaning of 'ms_col_idx' / 'obs_col_idx' is defined by the
# consumer of this config, which is not visible here.
data_config = {
    'target': target,
    'features_idx': [idx for idx in range(0, data.shape[1]) if data.columns[idx] != target],
    'split_col_idx': [data.columns.tolist().index(col) for col in avg_correlation_cols],
    'ms_col_idx': [idx for idx in range(0, data.shape[1]) if data.columns[idx] in num_cols],
    'obs_col_idx': [idx for idx in range(0, data.shape[1]) if data.columns[idx] in cat_cols],
    "num_cols": len(num_cols),
    'task_type': 'regression',
    'clf_type': 'none',
    'data_type': 'tabular'
}

In [9]:
data[target].hist(bins = 100)

In [96]:
data_config

In [136]:
import json
# Persist the cleaned HHP frame (without the index) and its config next to the raw files.
data.to_csv('./data/HHP_herritage_health/data_cleaned.csv', index=False)
with open('./data/HHP_herritage_health/data_config.json', 'w') as f:
    json.dump(data_config, f)

In [132]:
data.shape

In [133]:
data_config['split_col_idx']

In [134]:
avg_correlation(data[num_cols])

In [135]:
data[num_cols].corrwith(data['DaysInHospital']).abs().sort_values(ascending = False)

In [6]:
data_config

In [58]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled HHP rows; only the numeric columns are used as predictors.
X = df_claims_sample[num_cols]
y = df_claims_sample[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

In [56]:
from sklearn.linear_model import RidgeCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Ridge baseline. RidgeCV selects alpha by cross-validation inside fit(), so a
# single fit is enough (the cell previously fitted the same model twice).
model = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10]).fit(X_train, y_train)
y_pred = model.predict(X_test)
# Prints RMSE and RMSLE.
# NOTE(review): mean_squared_log_error rejects negative values, which a linear
# model can predict — clip y_pred first if this ever raises.
print(np.sqrt(mean_squared_error(y_test, y_pred)), np.sqrt(mean_squared_log_error(y_test, y_pred)))

In [59]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# MLP baseline on the same split: two hidden layers of 128 units, L2 penalty alpha=0.5.
model = MLPRegressor(hidden_layer_sizes=(128, 128), max_iter=1000, alpha=0.5, random_state=42, verbose=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# RMSE on the held-out rows
print(np.sqrt(mean_squared_error(y_test, y_pred)))

## California Housing

In [10]:
from sklearn.preprocessing import PowerTransformer
def outlier_remove_iqr(data, col):
    """Keep the rows whose ``col`` value lies within 1.5 * IQR of the quartiles.

    Both fences are inclusive. The input frame is not modified; a filtered
    selection of it is returned.
    """
    lower_q, upper_q = data[col].quantile(0.25), data[col].quantile(0.75)
    whisker = 1.5 * (upper_q - lower_q)
    inside_fences = data[col].between(lower_q - whisker, upper_q + whisker)
    return data[inside_fences]

def convert_gaussian(data, col):
    """Return a copy of ``data`` with ``col`` passed through a PowerTransformer.

    Parameters
    ----------
    data : pandas.DataFrame
    col : column label to transform (PowerTransformer with its default settings)

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is left untouched.
    """
    # Work on a copy: the input is often a filtered selection of another frame
    # (e.g. the result of outlier_remove_iqr), and assigning into it directly
    # mutates the caller's object and triggers SettingWithCopyWarning.
    data = data.copy()
    pt = PowerTransformer()
    data[col] = pt.fit_transform(data[col].values.reshape(-1, 1)).flatten()
    return data

In [11]:
from sklearn.datasets import fetch_california_housing, fetch_kddcup99
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
# Build one frame from the sklearn California-housing bunch: the feature
# columns first, the target 'MedHouseVal' appended as the last column.
housing = fetch_california_housing()
data = pd.DataFrame(data=housing.data, columns=housing.feature_names)
target_col = 'MedHouseVal'
data[target_col] = housing.target

# drop missing values
print(data.shape)
data = data.dropna()
print(data.shape)

# remove outliers
# (filters are applied one after another, so each quartile computation only
# sees the rows that survived the previous filter)
data = outlier_remove_iqr(data, 'AveRooms')
data = outlier_remove_iqr(data, 'AveBedrms')
data = outlier_remove_iqr(data, 'Population')
data = outlier_remove_iqr(data, 'AveOccup')

# gaussian transform
data = convert_gaussian(data, 'MedInc')

# every column except the last (the target) is treated as numeric
num_cols = data.columns.tolist()[:-1]

# standardize, then rescale to the min-max range
scaler = Pipeline([
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler())
])

data[num_cols] = scaler.fit_transform(data[num_cols])
print(data.shape)

# All '*_idx' entries are positional indices into data.columns.
# NOTE(review): 'split_col_idx' and 'obs_col_idx' are hard-coded; their
# meaning is defined by the consumer of this config — confirm.
data_config = {
    'target': target_col,
    'features_idx': list(range(len(data.columns)-1)),
    'split_col_idx': [0, 2, 5],
    'ms_col_idx': list(range(len(num_cols))),
    'obs_col_idx': [4, 7],
    'num_cols': len(num_cols),
    'task_type': 'regression',
    'clf_type': 'none',
    'data_type': 'tabular'
}

print(data_config)

In [12]:
data.columns

In [13]:
data[target_col].hist(bins=100)

In [74]:
# Persist the cleaned California frame (without the index) and its config.
data.to_csv('./data/california/data_cleaned.csv', index=False)
import json
with open('./data/california/data_config.json', 'w') as f:
    json.dump(data_config, f)

In [62]:
data[num_cols].corrwith(data[target_col]).abs().sort_values(ascending = False)

In [63]:
avg_correlation(data[num_cols])

In [71]:
show_heatmap(data[num_cols])

In [68]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# MLP baseline for California housing with a 20% hold-out.
X = data[num_cols]
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# NOTE: this bare expression is not the last statement of the cell, so it displays nothing.
X_train.shape, X_test.shape

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error

model = MLPRegressor(hidden_layer_sizes=(128, 128), max_iter=1000, alpha=0.5, random_state=42, verbose=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# RMSE on the held-out rows
print(np.sqrt(mean_squared_error(y_test, y_pred)))

## Dvisits

In [40]:
! pip install pyreadr

In [14]:
def convert_gaussian(data, col):
    """Return a copy of ``data`` with ``col`` passed through a PowerTransformer.

    This repeats the definition from the California Housing section so the
    Dvisits section can be run on its own; it still needs ``PowerTransformer``
    to have been imported.

    Parameters
    ----------
    data : pandas.DataFrame
    col : column label to transform (PowerTransformer with its default settings)

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is left untouched.
    """
    # Work on a copy so the caller's frame (possibly a filtered selection of
    # another frame) is neither mutated nor the source of SettingWithCopyWarning.
    data = data.copy()
    pt = PowerTransformer()
    data[col] = pt.fit_transform(data[col].values.reshape(-1, 1)).flatten()
    return data

In [15]:
import pyreadr
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
# Read the R data file and take the 'dvisits' object from it.
result = pyreadr.read_r('./data/dvisits/dvisits.rda')  
data = result['dvisits']
data = data.drop(['prescrib', 'nonpresc', 'agesq'], axis=1)
print(data.shape)

num_cols = ['age', 'income', 'illness', 'actdays', 'hscore', 'hospadmi', 'hospdays']
cat_cols = ['sex', 'levyplus', 'freepoor', 'freerepa', 'chcond1', 'chcond2']
target_col = 'medicine'

# Reorder to numeric columns, categorical columns, target; any column not
# named in these lists is dropped by this selection.
data = data[num_cols + cat_cols + [target_col]]

# standardize, then rescale to the min-max range (numeric columns only)
scaler = Pipeline([
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler())
])
data[num_cols] = scaler.fit_transform(data[num_cols])
print(data.shape)

# All '*_idx' entries are positional indices into data.columns.
# NOTE(review): 'split_col_idx' is hard-coded; with the column order above it
# points at 'age', 'illness', 'income' — confirm that is intended.
data_config = {
    'target': target_col,
    'features_idx': list(range(len(data.columns)-1)),
    'split_col_idx': [0, 2, 1],
    'ms_col_idx': [idx for idx in range(0, data.shape[1]) if data.columns[idx] in num_cols],
    'obs_col_idx': [idx for idx in range(0, data.shape[1]) if data.columns[idx] in cat_cols],
    'num_cols': len(num_cols),
    'task_type': 'regression',
    'clf_type': 'none',
    'data_type': 'tabular'
}

print(data_config)


In [16]:
data.hist(bins=30, figsize=(10, 8))
plt.show()

In [17]:
data[target_col].hist(bins=100)

In [70]:
data.columns

In [14]:
# Persist the cleaned Dvisits frame (without the index) and its config.
data.to_csv('./data/dvisits/data_cleaned.csv', index=False)
import json
with open('./data/dvisits/data_config.json', 'w') as f:
    json.dump(data_config, f)

In [89]:
data.corrwith(data[target_col]).abs().sort_values(ascending = False)

In [90]:
avg_correlation(data)

In [71]:
# show_heatmap() calls .corr() on its argument itself, so passing
# data.corr().abs() plotted the correlation *of the correlation matrix*.
# Draw the absolute correlations directly instead.
plt.figure(figsize=(8, 6))
sns.heatmap(data.corr().abs(), annot=True, fmt=".1f")
plt.show()

In [84]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# MLP baseline for Dvisits with a 20% hold-out.
# NOTE(review): only the categorical columns are used as predictors here — confirm this is intended.
X = data[cat_cols]
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# NOTE: this bare expression is not the last statement of the cell, so it displays nothing.
X_train.shape, X_test.shape

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error

model = MLPRegressor(hidden_layer_sizes=(128, 128), max_iter=1000, alpha=0.5, random_state=42, verbose=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# RMSE on the held-out rows
print(np.sqrt(mean_squared_error(y_test, y_pred)))

## Vehicle

In [19]:
import scipy.io
from collections import Counter

In [20]:
# Load the per-client vehicle data: entry i of mat['X'] / mat['Y'] wraps the
# feature matrix / label array of client i.
mat = scipy.io.loadmat('./data/vehicle/vehicle.mat')
raw_x, raw_y = mat['X'], mat['Y']  # y = {-1, 1}
assert len(raw_x) == len(raw_y)
num_clients = len(raw_x)

# dataset[i] = (features, labels) of client i; also print each client's label
# counts and its share of +1 labels.
dataset = []
for i in range(num_clients):
    features, labels = raw_x[i][0], raw_y[i][0].flatten()
    print(raw_x[i][0].shape, raw_y[i][0].shape)
    assert len(features) == len(labels)
    counter = Counter(labels)
    print(f'Client {i}:', counter, counter[1] / len(labels))
    dataset.append((features, labels))

# labels + 1 is non-zero exactly for the +1 labels (given labels in {-1, 1}),
# so count_nonzero counts the positives.
positive_counts = [np.count_nonzero(labels + 1) for feats, labels in dataset]
positive_percentages = [np.count_nonzero(labels + 1) / len(labels) * 100
                      for feats, labels in dataset]

In [21]:
from src.evaluation.imp_quality_metrics import sliced_ws

# One DataFrame per client (integer-named feature columns plus 'label').
dfs = []
for i in range(num_clients):
    df = pd.DataFrame(dataset[i][0])
    df['label'] = dataset[i][1]
    dfs.append(df)

# Stack all clients; split_indices holds the cumulative row counts at which
# one client's rows end and the next begin (the last client is left out).
final_df = pd.concat(dfs, axis=0)
split_indices = np.cumsum([df.shape[0] for df in dfs[:-1]])
print(split_indices)
print(final_df.shape)
final_df

In [22]:
# Rank clients by how different their data is from everyone else's: build a
# symmetric matrix of pairwise sliced_ws distances and keep the 10 clients
# with the largest average distance.
# Number of clients/dataframes
num_dfs = len(dfs)
distance_matrix = np.zeros((num_dfs, num_dfs))

for i in range(num_dfs):
    for j in range(i + 1, num_dfs):
        # Assuming sliced_ws expects numpy arrays; adapt if necessary
        # (the arrays include the 'label' column of each client frame)
        dist = sliced_ws(dfs[i].to_numpy(), dfs[j].to_numpy())
        distance_matrix[i, j] = dist
        distance_matrix[j, i] = dist

# Optional: Fill diagonal with zeros, assuming self-distance is zero
np.fill_diagonal(distance_matrix, 0)

# Average distance for each dataframe
# (the zero diagonal is part of the mean, i.e. the sum is divided by num_dfs)
average_distances = np.mean(distance_matrix, axis=1)

# Create an array of dataframe indices for reference
df_indices = np.arange(num_dfs)

# Combine indices and their corresponding average distances
average_distances_with_indices = list(zip(df_indices, average_distances))

# Sort the list of tuples by the average distance in descending order
sorted_by_distance = sorted(average_distances_with_indices, key=lambda x: x[1], reverse=True)

# Select the top 10
top_10_different = sorted_by_distance[:10]

# Unpack the top 10 list to separate indices and distances for clearer output
top_10_indices = [item[0] for item in top_10_different]
top_10_distances = [item[1] for item in top_10_different]

In [23]:
# feature selection
# Keep the union of (a) features a random forest rates above 1% importance and
# (b) the columns most correlated with the label.
selected_dfs = [dfs[i] for i in top_10_indices]
selected_df = pd.concat(selected_dfs, axis=0)

from sklearn.ensemble import RandomForestRegressor  # or RandomForestClassifier based on your task

# Assuming 'label' is your prediction target
X = selected_df.drop('label', axis=1)
y = selected_df['label']

# Seed the forest: later cells hard-code column names and positions, so the
# selected feature set has to be the same on every run. An unseeded forest can
# move features across the 0.01 importance threshold between runs.
model = RandomForestRegressor(random_state=42)
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_
feature_names = X.columns
feature_importance_dict = dict(zip(feature_names, importances))

# Sort features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)

sorted_features = [feature[0] for feature in sorted_features if feature[1] > 0.01]
print(sorted_features)

# Top 30 columns by absolute correlation with the label; the list includes
# 'label' itself, which is removed from the union below.
correlated_cols = selected_df.corrwith(selected_df['label']).abs().sort_values(ascending = False)[:30].index.tolist()
print(correlated_cols)

final_cols = list(set(sorted_features) | set(correlated_cols))
final_cols.remove('label')
print(final_cols)

In [12]:
show_heatmap(selected_df[final_cols + ['label']], figsize=(20, 20))

In [24]:
# First pass of the vehicle preprocessing. The next preprocessing cell starts
# again from selected_df[final_cols] and repeats these steps, so `data` built
# here is only used by the histogram cell in between.
data = selected_df[final_cols]
# hand-picked columns to discard (by integer column name)
data = data.drop(columns = [41, 42, 43, 37, 44, 47, 46, 49, 45, 48])
print(data.shape)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# scaling
scaler = Pipeline([
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler())
])

# fit_transform returns a plain array, so rebuild the frame with the original
# column names (the index is reset to 0..n-1 as a side effect)
cols = data.columns.tolist()
data = scaler.fit_transform(data)
data = pd.DataFrame(data, columns = cols)

# gaussian transform
from sklearn.preprocessing import QuantileTransformer

# Map the hand-picked columns to a normal distribution and move them to the
# end of the frame.
gaussian_cols = [11, 35, 36, 38, 39, 40, 58]
transformer = QuantileTransformer(output_distribution='normal', random_state=0)
df_gaussianized = transformer.fit_transform(data[gaussian_cols])
df_gaussianized = pd.DataFrame(df_gaussianized, columns=gaussian_cols)
data = pd.concat([data.drop(columns = gaussian_cols), df_gaussianized], axis = 1)

In [88]:
data.hist(figsize=(20, 20), bins = 100)
plt.show()

In [25]:
# Final vehicle preprocessing: scale -> quantile-normalize selected columns ->
# scale again -> renumber columns -> attach the label -> build the config.
data = selected_df[final_cols]
# hand-picked columns to discard (by integer column name)
data = data.drop(columns = [41, 42, 43, 37, 44, 47, 46, 49, 45, 48])
print(data.shape)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# gaussian transform
from sklearn.preprocessing import QuantileTransformer

# scaling
scaler = Pipeline([
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler())
])

# fit_transform returns a plain array, so rebuild the frame with the original
# column names (the index is reset to 0..n-1 as a side effect)
cols = data.columns.tolist()
data = scaler.fit_transform(data)
data = pd.DataFrame(data, columns = cols)

# Map the hand-picked columns to a normal distribution and move them to the
# end of the frame.
gaussian_cols = [11, 35, 36, 38, 39, 40, 58]
transformer = QuantileTransformer(output_distribution='normal', random_state=0)
df_gaussianized = transformer.fit_transform(data[gaussian_cols])
df_gaussianized = pd.DataFrame(df_gaussianized, columns=gaussian_cols)
data = pd.concat([data.drop(columns = gaussian_cols), df_gaussianized], axis = 1)

# scaling
# (second pass, so the quantile-transformed columns are rescaled as well)
scaler = Pipeline([
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler())
])

cols = data.columns.tolist()
data = scaler.fit_transform(data)
data = pd.DataFrame(data, columns = cols)

# Rename the feature columns to 0..k-1, then append the label positionally
# (.values, so the differing indexes of data and selected_df do not matter).
data.columns = [idx for idx, col in enumerate(data.columns)]
data['label'] = selected_df['label'].values
# cumulative row counts at which one selected client's rows end and the next begin
client_split_indices = np.cumsum([df.shape[0] for df in selected_dfs[:-1]])
print(client_split_indices)
target_col = 'label'
# factorize replaces the labels by integer codes in order of first appearance;
# `codes` keeps the original values for each code
data[target_col], codes = pd.factorize(data[target_col])
num_cols = data.columns.tolist()[:-1]
cat_cols = []

# All '*_idx' entries are positional indices into data.columns.
# NOTE(review): 'split_col_idx' and 'obs_col_idx' are hard-coded and depend on
# the column order produced above — re-check them if the selected features change.
data_config = {
    'target': target_col,
    'features_idx': list(range(len(data.columns)-1)),
    'split_col_idx': [20, 19, 8, 3],
    'ms_col_idx': [idx for idx in range(0, data.shape[1]) if data.columns[idx] in num_cols],
    'obs_col_idx': [0, 13, 15],
    'num_cols': len(num_cols),
    'task_type': 'classification',
    'clf_type': 'binary',
    'data_type': 'tabular',
    'client_split_indices': client_split_indices.tolist()
}


In [99]:
print(data_config)

In [17]:
data.hist(figsize=(20, 20), bins = 100)

In [101]:
# Persist the cleaned vehicle frame (without the index) and its config.
data.to_csv('./data/vehicle/data_cleaned.csv', index=False)
import json
with open('./data/vehicle/data_config.json', 'w') as f:
    json.dump(data_config, f)

In [18]:
show_heatmap(data, figsize=(10, 10))

In [79]:
avg_correlation(data)

In [76]:
show_heatmap(data, figsize=(10, 10))

In [104]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, mean_squared_log_error, accuracy_score, roc_auc_score, f1_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# MLP classifier baseline for the vehicle data with a 20% hold-out.
X = data.drop(columns = [target_col])
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MLPClassifier(hidden_layer_sizes=(128, 128), max_iter=1000, alpha=0.5, random_state=42, verbose=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# ROC AUC ranks samples by a continuous score. Feeding it the hard 0/1
# predictions collapses the curve to a single operating point, so use the
# predicted probability of class 1 (labels are factorized to 0/1 upstream).
y_score = model.predict_proba(X_test)[:, 1]

# accuracy, ROC AUC, F1
print(accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_score), f1_score(y_test, y_pred))

## Codon

In [47]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer

# Load the codon-usage table and drop rows with missing values.
data = pd.read_csv("./data/codon/codon_usage.csv", sep=',', low_memory=False)
data = data.dropna()
# NOTE(review): the next line is truncated, commented-out code and can be removed.
# data.columns = [str(i) for i in range(data.sh
data = data.drop(['SpeciesID', 'Ncodons', 'SpeciesName', 'DNAtype'], axis=1)
target_col = 'Kingdom'
# drop the 'plm' class, then encode the remaining classes as integer codes
# (`codes` keeps the original class names)
data = data[data[target_col] != 'plm']
data[target_col], codes = pd.factorize(data[target_col])
# Take the 35 columns most correlated (in absolute value) with the target;
# the target is removed from that list and re-appended as the last column.
cols = data.corrwith(data[target_col]).abs().sort_values(ascending = False)[0:35].index.tolist()
cols.remove(target_col)
data = data[cols + [target_col]]

num_cols = data.columns.tolist()[:-1]
cat_cols = []

# standardize, then rescale to the min-max range
scaler = Pipeline([
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler())
])

cols = data.columns.tolist()
cols.remove(target_col)
data[cols] = scaler.fit_transform(data[cols])

# then map every feature to a normal distribution
quant = QuantileTransformer(output_distribution='normal', random_state=0)
data[cols] = quant.fit_transform(data[cols])

# hand-picked columns, referenced by name and converted to positions below
obs_cols = ['CAC', 'UGU', 'UCA']
split_cols = ['UGA', 'CUA', 'GAU']

# All '*_idx' entries are positional indices into data.columns.
data_config = {
    'target': target_col,
    'features_idx': list(range(len(data.columns)-1)),
    'split_col_idx': [data.columns.tolist().index(col) for col in split_cols],
    'ms_col_idx': [idx for idx in range(0, data.shape[1]) if data.columns[idx] in num_cols],
    'obs_col_idx': [idx for idx in range(0, data.shape[1]) if data.columns[idx] in obs_cols],
    'num_cols': len(num_cols),
    'task_type': 'classification',
    'clf_type': 'multi-class',
    'data_type': 'tabular'
}

print(data_config)

In [48]:
data.shape

In [40]:
data.corrwith(data[target_col]).abs().sort_values(ascending = False)

In [39]:
avg_correlation(data)

In [49]:
show_heatmap(data, figsize=(15, 15))

In [50]:
data.hist(figsize=(20, 20), bins = 100)

In [53]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, mean_squared_log_error, accuracy_score, roc_auc_score, f1_score

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# MLP classifier baseline for the codon data with a 20% hold-out.
X = data.drop(columns = [target_col])
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# NOTE: this bare expression is not the last statement of the cell, so it displays nothing.
X_train.shape, X_test.shape

model = MLPClassifier(hidden_layer_sizes=(128, 128), max_iter=1000, alpha=0.5, random_state=42, verbose=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# accuracy on the held-out rows
print(accuracy_score(y_test, y_pred))

In [54]:
# Persist the cleaned codon frame (without the index) and its config.
data.to_csv('./data/codon/data_cleaned.csv', index=False)
import json
with open('./data/codon/data_config.json', 'w') as f:
    json.dump(data_config, f)

## School

In [26]:
import scipy.io
from collections import Counter

In [27]:
mat = scipy.io.loadmat('./data/school/school.mat')
# Note that the raw data structure is different from the vehicle dataset:
# here the per-client arrays sit under mat['X'][0] / mat['Y'][0].
raw_x, raw_y = mat['X'][0], mat['Y'][0]  # y is exam score
assert len(raw_x) == len(raw_y)
num_clients = len(raw_x)

print('School dataset:')
print('number of clients:', num_clients, len(raw_y))
print('number of examples:', [len(raw_x[i]) for i in range(num_clients)])
print('number of features:', len(raw_x[0][0]))

# NOTE(review): this repeats the assignment made a few lines above and is redundant.
raw_x, raw_y = mat['X'][0], mat['Y'][0]
# Stack all clients into one feature matrix and one target column.
combined_x= np.vstack(raw_x)
combiend_y = np.vstack(raw_y)
data = pd.DataFrame(combined_x, columns = [f'Feature_{i}' for i in range(combined_x.shape[1])])
# cumulative row counts at which one client's rows end and the next begin
# (the last client is left out)
clients_split_indices = np.cumsum([len(raw_x[i]) for i in range(num_clients-1)])
print(clients_split_indices)
target_col = 'score'
data[target_col] = combiend_y
data

In [28]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (99.9%) of the variance.
pca = PCA(n_components=0.999)

# all columns except the last (the target) go into the PCA
cols = data.columns.tolist()[:-1]
data_pca = pca.fit_transform(data[cols])

data_pca = pd.DataFrame(data_pca, columns = [f'PCA_{i}' for i in range(data_pca.shape[1])])
data_pca[target_col] = data[target_col]
data_pca

In [29]:
# Drop a hand-picked set of raw features from the non-PCA frame.
# NOTE(review): axis=1 is redundant alongside columns=.
data = data.drop(columns = [
    'Feature_27', 'Feature_11', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17', 
    'Feature_18', 'Feature_19', 'Feature_20', 'Feature_25'
], axis = 1)
print(data.shape)

# Rename the remaining feature columns to 0..k-1; the target keeps its name.
# NOTE(review): this cell is not idempotent — on a second run the 'Feature_*'
# names no longer exist.
data.columns = [idx for idx, col in enumerate(data.columns[:-1])] + [target_col] 

In [30]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# scaling
# Standardize, then rescale the PCA components to the min-max range; the
# target column is left untouched.
scaler = Pipeline([
    ('standard', StandardScaler()),
    ('minmax', MinMaxScaler())
])

cols = data_pca.columns.tolist()
cols.remove(target_col)
data_pca[cols] = scaler.fit_transform(data_pca[cols])

In [31]:
show_heatmap(data_pca, figsize=(15, 15))

In [32]:
avg_correlation(data_pca)

In [33]:
data_pca.corrwith(data_pca[target_col]).abs().sort_values(ascending = False)

In [34]:
data[target_col].hist(bins = 100)

In [35]:
data[target_col].describe()

In [37]:
# Config for the PCA version of the school data. All '*_idx' entries are
# positional indices into data_pca.columns.
# NOTE(review): 'split_col_idx' and 'obs_col_idx' are hard-coded; their meaning
# is defined by the consumer of this config — confirm.
num_cols = data_pca.columns.tolist()[:-1]
data_config = {
    'target': target_col,
    'features_idx': list(range(len(data_pca.columns)-1)),
    'split_col_idx': [9, 7],
    'ms_col_idx': [idx for idx in range(0, data_pca.shape[1]) if data_pca.columns[idx] in num_cols],
    'obs_col_idx': [6, 10],
    'num_cols': len(num_cols),
    'task_type': 'regression',
    'clf_type': 'none',
    'data_type': 'tabular',
    'client_split_indices': clients_split_indices.tolist()
}
print(data_config)

In [39]:
data_pca.shape

In [34]:
# Re-attach the target to the PCA frame, cast to float.
data_pca[target_col] = data[target_col].astype(float)

In [40]:
# Persist the PCA version of the school data (without the index) and its config.
data_pca.to_csv('./data/school/data_cleaned_pca.csv', index=False)
import json
with open('./data/school/data_config_pca.json', 'w') as f:
    json.dump(data_config, f)

In [29]:
data_pca.hist(figsize=(20, 20), bins = 100)

In [75]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# MLP baseline for the school data with a 20% hold-out. This uses `data` (the
# raw features left after the manual drop), not the PCA frame `data_pca`.
X = data.drop(columns = [target_col])
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# NOTE: this bare expression is not the last statement of the cell, so it displays nothing.
X_train.shape, X_test.shape

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error

model = MLPRegressor(hidden_layer_sizes=(128, 128), max_iter=1000, alpha=0.5, random_state=42, verbose=True)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# RMSE on the held-out rows
print(np.sqrt(mean_squared_error(y_test, y_pred)))