In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
file_path = 'processed_data.csv'
data = pd.read_csv(file_path)

# Replace missing values in the Interventions column with an empty string so
# the later string splitting never receives NaN.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: the in-place form acts on an
# intermediate object, triggers a chained-assignment warning on recent pandas,
# and does not modify `data` at all once Copy-on-Write is enabled.
data['Interventions'] = data['Interventions'].fillna('')

In [3]:
# Step 1: Extract intervention types from the Interventions column
def extract_intervention_types(interventions):
    """Return the sorted, de-duplicated intervention types found in one value.

    The value is split on ``|`` into entries, and for each entry the text
    before the first ``:`` (whitespace-stripped) is taken as its type,
    e.g. ``"DRUG: A|BIOLOGICAL: B|DRUG: C"`` -> ``['BIOLOGICAL', 'DRUG']``.

    Parameters
    ----------
    interventions : str
        Raw Interventions value for a single row.

    Returns
    -------
    list of str
        Unique, non-blank type labels in alphabetical order. An empty list is
        returned when the value is an empty string, is not a string at all
        (e.g. NaN), or contains only blank entries.
    """
    # A missing value that was not filled upstream arrives as NaN (a float);
    # treat it as "no interventions" instead of failing on .split().
    if not isinstance(interventions, str):
        return []

    types = {entry.split(':')[0].strip() for entry in interventions.split('|')}
    # ''.split('|') yields [''] and stray separators yield blank entries;
    # drop the blank label so it never becomes its own encoded column.
    types.discard('')
    # Sort so the result does not depend on set iteration order.
    return sorted(types)

# Build the Intervention_Types column: one list of type labels per row
data['Intervention_Types'] = data['Interventions'].map(extract_intervention_types)

In [4]:
# Step 2: Turn each row's list of intervention types into a 0/1 indicator row
mlb = MultiLabelBinarizer()
intervention_encoded = mlb.fit_transform(data['Intervention_Types'])

# Wrap the indicator matrix in a DataFrame, one column per fitted class label
intervention_df = pd.DataFrame(
    data=intervention_encoded,
    columns=mlb.classes_,
)

In [5]:
# Steps 3-4: Merge the encoded interventions back into the main dataset, then
# drop the original Interventions column and the temporary list column.
# Written as a single expression so `data` is rebound only when both the
# concat and the drop succeed; if this cell is re-run after the columns were
# already dropped, the KeyError leaves `data` unchanged instead of leaving it
# with a duplicated set of encoded columns.
data = (
    pd.concat([data.reset_index(drop=True), intervention_df], axis=1)
    .drop(columns=['Interventions', 'Intervention_Types'])
)

In [13]:
# Preview the first five rows of the encoded intervention indicators
print(intervention_df.head(n=5))

   BEHAVIORAL  BIOLOGICAL  COMBINATION_PRODUCT  DEVICE  DIAGNOSTIC_TEST  \
0           0           1                    0       0                0   
1           0           0                    0       0                0   
2           0           0                    0       0                0   
3           0           1                    0       0                0   
4           0           0                    0       0                0   

   DIETARY_SUPPLEMENT  DRUG  GENETIC  OTHER  PROCEDURE  RADIATION  
0                   0     0        0      0          0          0  
1                   0     1        0      0          0          0  
2                   0     1        0      0          0          0  
3                   0     0        0      1          0          0  
4                   0     1        0      0          0          0  


In [14]:
# Persist the encoded intervention indicators (row index omitted from the file)
intervention_df.to_csv(
    path_or_buf='encoded_interventions.csv',
    index=False,
)