In [28]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

# Load the dataset
file_path = 'processed_data.csv'
data = pd.read_csv(file_path)

# Handle missing values
data.fillna(method='ffill', inplace=True)

# **Step 1: Parse the `Study Design` column**
design_features = data['Study Design'].str.extract(
    r'Allocation:\s*(\w+)\|Intervention Model:\s*([\w\s]+)\|Masking:\s*([\w\s\(\),]+)\|Primary Purpose:\s*(\w+)',
    expand=True
)
design_features.columns = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']

# Add the parsed columns back to the dataset
data = pd.concat([data.drop(columns=['Study Design']), design_features], axis=1)

# Ensure unique column names
data = data.loc[:, ~data.columns.duplicated()]

# **Step 2: Identify columns**
categorical_columns = [
    'Study Status', 'Study Results', 'Sex', 'Age', 'Phases',
    'Enrollment', 'Funder Type', 'Study Type', 'Sponsor',
    'Allocation', 'Intervention Model', 'Masking', 'Primary Purpose'
]
numerical_columns = [col for col in data.columns if col not in categorical_columns + ['Study Recruitment Rate', 'NCT Number']]

# **Step 3: Define preprocessing for categorical columns**
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Combine all preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns)
    ]
)

# Define a TF-IDF vectorizer for the 'Conditions' column
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limit features to 1000 for efficiency

# Update the ColumnTransformer to include TF-IDF for 'Conditions'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),  # Numerical columns remain as-is
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),  # Encode categorical columns
        ('text', tfidf_vectorizer, 'Conditions')  # TF-IDF vectorization for text
    ]
)

# **Step 4: Prepare features (X) and target (y)**
X = data.drop(columns=['Study Recruitment Rate', 'NCT Number'])
y = data['Study Recruitment Rate']

# Transform features using the preprocessor
X_encoded = preprocessor.fit_transform(X)

# **Step 5: Split the data**
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Calculate SMAPE
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

smape_value = smape(y_test, y_pred)

# Print evaluation metrics
print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")
print(f"SMAPE: {smape_value}")

ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [32]:
columns_to_check = ["Phases"]

# Apply value_counts() to each of the specified columns and store the results
for col in columns_to_check:
    print(f"Frequency of values in '{col}':\n")
    print(data[col].value_counts())
    print("\n" + "-"*50 + "\n")

Frequency of values in 'Phases':

Phases
PHASE1           9098
PHASE2           5519
PHASE3           3996
PHASE1|PHASE2    1591
PHASE2|PHASE3     382
Name: count, dtype: int64

--------------------------------------------------



In [33]:
columns_to_check = ["Conditions"]

# Apply value_counts() to each of the specified columns and store the results
for col in columns_to_check:
    print(f"Frequency of values in '{col}':\n")
    print(data[col].value_counts())
    print("\n" + "-"*50 + "\n")

Frequency of values in 'Conditions':

Conditions
Healthy                                                            1337
Healthy Volunteers                                                  611
COVID-19                                                            261
Healthy Participants                                                192
Healthy Subjects                                                    178
                                                                   ... 
Bipolar Depression|Suicidal Ideation and Behavior                     1
Infected Atopic Dermatitis/Eczema                                     1
Endometrial Cancer|Uterine Cancer|Ovarian Cancer|Carcinosarcoma       1
Thrombocytopenia|Hematologic Diseases|Bone Marrow Aplasia             1
Thrombus                                                              1
Name: count, Length: 7837, dtype: int64

--------------------------------------------------

