In [None]:
import pandas as pd
# Alternative input file, kept for reference:
# df = pd.read_csv("clinical_trials_Glioblastoma_filtered.csv")
# Read the trial table that the rest of the notebook annotates.
df = pd.read_csv(
    filepath_or_buffer="clinical_trials_Glioblastoma_filtered_PI and Enrollment.csv"
)

In [None]:
# Drop the "Countries" and "Locations" columns (the notebook treats them as empty).
# A single non-in-place drop with errors="ignore" keeps this cell re-runnable:
# the original in-place drops raised KeyError when the cell was executed a
# second time, because the columns were already gone.
df = df.drop(columns=["Countries", "Locations"], errors="ignore")

In [None]:
# Show the remaining column names as a plain Python list.
list(df.columns)

['NCT ID',
 'Title',
 'Status',
 'Phases',
 'Study Type',
 'Enrollment Count',
 'Enrollment Type',
 'Principal Investigators',
 'Start Date',
 'Completion Date',
 'Primary Completion Date',
 'Sponsor',
 'Collaborators',
 'Conditions',
 'Interventions',
 'Minimum Age',
 'Maximum Age',
 'Gender',
 'Brief Summary',
 'Detailed Description',
 'Study URL',
 'Tags']

In [None]:
# Adult vs. child classification

# Normalise both age-limit columns: missing values become "", everything is
# cast to str and lower-cased so the classifier can rely on lower-case text.
for age_column in ("Minimum Age", "Maximum Age"):
    df[age_column] = df[age_column].fillna("").astype(str).str.lower()

# Age-group classification logic
def classify_age_group(min_age, max_age):
    """Label a trial as "Children", "Mixed" or "Adult" from its age limits.

    Parameters
    ----------
    min_age, max_age : str
        Age limits written as "<number> <unit>", e.g. "18 years" or
        "6 months". Text that cannot be parsed (empty string, "n/a", ...)
        is treated as "no limit given".

    Returns
    -------
    str
        "Adult"    -- the minimum age is 18 years or more, or no minimum is
                      given and the maximum does not restrict the trial to
                      minors.
        "Mixed"    -- the minimum is below 18 years and the maximum is 18
                      years or more.
        "Children" -- the minimum is below 18 years and the maximum is below
                      18 years or not given, or no minimum is given and the
                      maximum is below 18 years.

    Notes
    -----
    Compared with the previous implementation:
    * no bare ``except`` -- only the expected number-parsing failure is handled;
    * a minimum given in months/weeks/days is compared against the maximum
      instead of always yielding "Children" (months) or "Adult" (weeks/days);
    * an unparseable maximum no longer turns a minor minimum into "Adult";
    * a missing minimum with a maximum below 18 years yields "Children".
    A minor minimum with no maximum is still labelled "Children", as before.
    NOTE(review): if a missing maximum means "no upper limit" in the source
    data, "Mixed" may be the better label for that case -- confirm.
    """
    adult_age_years = 18
    # Unit prefix -> how many of that unit make up one year.
    units_per_year = (
        ("year", 1.0),
        ("month", 12.0),
        ("week", 52.0),
        ("day", 365.0),
        ("hour", 8760.0),
        ("minute", 525600.0),
    )

    def _to_years(age_text):
        """Convert "<number> <unit>" to years; return None if unparseable."""
        parts = str(age_text).strip().lower().split()
        if len(parts) < 2:
            return None
        try:
            amount = float(parts[0])
        except ValueError:
            return None
        for prefix, per_year in units_per_year:
            if parts[1].startswith(prefix):
                return amount / per_year
        return None

    lower = _to_years(min_age)
    upper = _to_years(max_age)

    if lower is None:
        # Without a usable minimum, only an upper limit below adulthood
        # proves that the trial enrols minors exclusively.
        if upper is not None and upper < adult_age_years:
            return "Children"
        return "Adult"

    if lower >= adult_age_years:
        return "Adult"

    # The minimum is below 18 years from here on.
    if upper is not None and upper >= adult_age_years:
        return "Mixed"
    return "Children"

# Classify every trial from its normalised age limits. Iterating over the two
# columns directly avoids row-wise DataFrame.apply(axis=1), which is slow and
# can hand back a DataFrame instead of a Series when the frame has no rows.
df["Age Group"] = [
    classify_age_group(min_age, max_age)
    for min_age, max_age in zip(df["Minimum Age"], df["Maximum Age"])
]

df.columns.tolist()

# Select adults only
#df_adult = df[df["Age Group"] == "Adult"]
#len(df_adult)

['NCT ID',
 'Title',
 'Status',
 'Phases',
 'Study Type',
 'Enrollment Count',
 'Enrollment Type',
 'Principal Investigators',
 'Start Date',
 'Completion Date',
 'Primary Completion Date',
 'Sponsor',
 'Collaborators',
 'Conditions',
 'Interventions',
 'Minimum Age',
 'Maximum Age',
 'Gender',
 'Brief Summary',
 'Detailed Description',
 'Study URL',
 'Tags',
 'Age Group']

In [None]:
# Primary vs. metastatic

# Temporary lower-cased text column: conditions, brief summary and detailed
# description joined with single spaces (missing values count as "").
df["Combined Text"] = (
    df["Conditions"]
    .fillna("")
    .str.cat(
        [df["Brief Summary"].fillna(""), df["Detailed Description"].fillna("")],
        sep=" ",
    )
    .str.lower()
)

def classify_tumor_type(text):
    """Return "Metastatic", "Primary" or "Unclear" for a trial description.

    ``text`` is searched for fixed keyword phrases using plain substring
    matching; the caller is expected to pass lower-cased text because the
    keywords are lower case. Metastatic keywords are checked first, so a text
    containing keywords from both groups is labelled "Metastatic".

    NOTE(review): because matching is by substring, a phrase such as
    "non-metastatic" also counts as a "metastatic" hit.
    """
    metastatic_terms = (
        "metastatic",
        "metastasis",
        "brain metastases",
        "secondary glioblastoma",
    )
    primary_terms = ("primary glioblastoma", "newly diagnosed", "de novo")

    for term in metastatic_terms:
        if term in text:
            return "Metastatic"
    for term in primary_terms:
        if term in text:
            return "Primary"
    return "Unclear"

# Label each trial from the temporary text column, then remove that column
# so that only the derived label stays in the table.
df["Tumor Type"] = df["Combined Text"].map(classify_tumor_type)
del df["Combined Text"]
list(df.columns)

# Select primary tumors only
# df_adult_primary = df_adult[df_adult["Tumor Type"] == "Primary"]


['NCT ID',
 'Title',
 'Status',
 'Phases',
 'Study Type',
 'Enrollment Count',
 'Enrollment Type',
 'Principal Investigators',
 'Start Date',
 'Completion Date',
 'Primary Completion Date',
 'Sponsor',
 'Collaborators',
 'Conditions',
 'Interventions',
 'Minimum Age',
 'Maximum Age',
 'Gender',
 'Brief Summary',
 'Detailed Description',
 'Study URL',
 'Tags',
 'Age Group',
 'Tumor Type']

In [None]:
# Temporary lower-cased text column: brief summary, detailed description and
# interventions joined with single spaces (missing values count as "").
df["Surgery Status Text"] = (
    df["Brief Summary"]
    .fillna("")
    .str.cat(
        [df["Detailed Description"].fillna(""), df["Interventions"].fillna("")],
        sep=" ",
    )
    .str.lower()
)

def classify_surgery_type(text):
    """Return "Post-Surgery", "Surgery" or "Unclear" for a trial description.

    ``text`` is searched for fixed keyword phrases using plain substring
    matching; the caller is expected to pass lower-cased text because the
    keywords are lower case. Post-surgery keywords are checked first, so a
    text that matches both groups is labelled "Post-Surgery".
    """
    post_surgery_terms = (
        "postoperative",
        "post-surgery",
        "after surgery",
        "post resection",
        "resected",
        "adjuvant",
    )
    surgery_terms = ("surgery", "surgical resection", "craniotomy", "tumor resection")

    for term in post_surgery_terms:
        if term in text:
            return "Post-Surgery"
    for term in surgery_terms:
        if term in text:
            return "Surgery"
    return "Unclear"

# Label each trial from the temporary text column, then remove that column
# so that only the derived label stays in the table.
df["Surgery Category"] = df["Surgery Status Text"].map(classify_surgery_type)
del df["Surgery Status Text"]
list(df.columns)

# Select post-surgery trials only
# df_adult_primary_postsur = df_adult_primary[df_adult_primary["Surgery Category"] == "Post-Surgery"]

['NCT ID',
 'Title',
 'Status',
 'Phases',
 'Study Type',
 'Enrollment Count',
 'Enrollment Type',
 'Principal Investigators',
 'Start Date',
 'Completion Date',
 'Primary Completion Date',
 'Sponsor',
 'Collaborators',
 'Conditions',
 'Interventions',
 'Minimum Age',
 'Maximum Age',
 'Gender',
 'Brief Summary',
 'Detailed Description',
 'Study URL',
 'Tags',
 'Age Group',
 'Tumor Type',
 'Surgery Category']

In [None]:
# Write the annotated table (age group, tumor type, surgery category) to disk
# without the index column.
df.to_csv(
    path_or_buf="clinic_trials_Glioblastoma_AGE_TUMOR_SURGERY_TYPE.csv",
    index=False,
)

In [None]:
# Select immunotherapy trials only
# df_adult_primary_postsur_immuno = df[(df["Tags"].fillna("").str.lower().str.contains("immunotherapy"))]
# Save the result
# df_adult_primary_postsur_immuno.to_csv("glioma_trials_glioblastoma_adult_primary_postsurgert_immunotherapy.csv", index=False)