# Input the data

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTENC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier

## load dataset (clinical_info)

In [None]:
# Directory that holds the two clinical summary tables (T1DM and T2DM).
folder_path = Path('../data/Shanghai_diabetes_datasets/clinical_info/csv')

# is_dir() also rejects a path that exists but is a regular file, which
# os.path.exists() would have let through to a less helpful read error.
if not folder_path.is_dir():
    raise FileNotFoundError(f"directory {folder_path} doesn't exist")

df1 = pd.read_csv(folder_path / 'Shanghai_T1DM_Summary.csv')
df2 = pd.read_csv(folder_path / 'Shanghai_T2DM_Summary.csv')

# Stack both tables into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat([df1, df2], ignore_index=True)
combined_df.head(9)

## handling missing values

In [None]:
# Cells whose whole value is '/' are treated as missing.
combined_df.replace('/', pd.NA, inplace=True)  # convert / to null

# Columns that are converted to numbers and median-imputed below.
cols_to_fill = [
    'Age (years)', 'Height (m)', 'Weight (kg)', 'BMI (kg/m2)', 'Smoking History (pack year)',
    'Duration of Diabetes (years)', 'Fasting Plasma Glucose (mg/dl)',
    '2-hour Postprandial Plasma Glucose (mg/dl)', 'Fasting C-peptide (nmol/L)',
    '2-hour Postprandial C-peptide (nmol/L)', 'Fasting Insulin (pmol/L)',
    '2-hour Postprandial Insulin (pmol/L)', 'HbA1c (mmol/mol)', 'Glycated Albumin (%)',
    'Total Cholesterol (mmol/L)', 'Triglyceride (mmol/L)',
    'High-Density Lipoprotein Cholesterol (mmol/L)', 'Low-Density Lipoprotein Cholesterol (mmol/L)',
    'Creatinine (umol/L)', 'Estimated Glomerular Filtration Rate  (ml/min/1.73m2)',
    'Uric Acid (mmol/L)', 'Blood Urea Nitrogen (mmol/L)'
]

# Resolve the columns that really exist once and use that list for BOTH the
# conversion and the imputation; previously only the conversion was guarded,
# so a missing column still raised KeyError on the fillna line.
present_cols = [col for col in cols_to_fill if col in combined_df.columns]
missing_cols = [col for col in cols_to_fill if col not in combined_df.columns]
if missing_cols:
    print(f'columns not found and skipped: {missing_cols}')

for col in present_cols:
    # Values that cannot be parsed as numbers become NaN (errors='coerce').
    combined_df[col] = pd.to_numeric(combined_df[col].astype(str).str.strip(), errors='coerce')

if present_cols:
    # Impute every remaining gap with the median of its own column.
    combined_df[present_cols] = combined_df[present_cols].fillna(combined_df[present_cols].median())

## analyse data

In [None]:
combined_df.describe()

## handling duplicates

In [None]:
duplicate_rows_data = combined_df[combined_df.duplicated()]
print('duplicate_rows_data', duplicate_rows_data)

## uniqueness

In [None]:
# Print, for every column, how many distinct values it holds (nulls count too).
for column, values in combined_df.items():
    unique_value = values.nunique(dropna=False)
    print(f'{column}: {unique_value} quantity unique value')

## outliers

In [None]:
# Count, per numeric column, the rows lying outside the 1.5 * IQR fences.
for col in cols_to_fill:
    values = combined_df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    beyond_fences = values.lt(lower) | values.gt(upper)
    outliers = combined_df.loc[beyond_fences]
    print(f'{col}: {len(outliers)} emissions')

## outlier correction

In [None]:
# Keep only rows whose insulin readings are below the chosen cut-offs.
insulin_in_range = (
    (combined_df['Fasting Insulin (pmol/L)'] < 700)
    & (combined_df['2-hour Postprandial Insulin (pmol/L)'] < 800)
)
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning on a filtered slice.
combined_df = combined_df[insulin_in_range].copy()

## fix faulty values

In [None]:
# Misspelled or badly separated entries in 'Other Agents' -> corrected text.
# Applied in this order, as plain substring replacements.
other_agent_fixes = {
    'raberazole': 'rabeprazole',
    'calcium carbonate and vitamin D3 tablet': 'calcium carbonate, vitamin D3 tablet',
    'rosuvastatinqn': 'rosuvastatin',
    'nifedipine doxazosin': 'nifedipine, doxazosin',
}

for wrong, right in other_agent_fixes.items():
    # regex=False: these are literal strings; being explicit avoids depending
    # on the pandas-version-specific default of the `regex` argument.
    combined_df['Other Agents'] = combined_df['Other Agents'].str.replace(wrong, right, regex=False)

# grouped columns

## find_unknown_agents

In [None]:
def find_unknown_agents(df, col_name, items_to_group):
    """Print and return the entries of a comma-separated column that have no group.

    Args:
        df: DataFrame containing ``col_name``.
        col_name: Column whose cells are comma-separated item names.
        items_to_group: Mapping (or any container supporting ``in``) of the
            item names that are already assigned to a group.

    Returns:
        Sorted list of stripped item names that are not in ``items_to_group``
        and are not the literal ``'none'`` (case-insensitive). Each name is
        also printed on its own line.
    """
    all_agents = set()

    for elem in df[col_name]:
        # Null cells have no .split(); skip them instead of crashing, the same
        # way add_group_flags treats them as "no items".
        if pd.isna(elem):
            continue
        all_agents.update(agent.strip() for agent in elem.split(','))

    unknown_agents = sorted(
        agent for agent in all_agents
        if agent not in items_to_group and agent.lower() != 'none'
    )

    for agent in unknown_agents:
        print(agent)

    return unknown_agents

## add_group_flags

In [None]:
def add_group_flags(df, column_name, items_to_group):
    """Replace a comma-separated column with one 0/1 ``has_<group>`` column per group.

    Args:
        df: Source DataFrame; it is left unmodified.
        column_name: Column whose cells are comma-separated item names.
        items_to_group: Mapping of item name -> group name.

    Returns:
        A new DataFrame without ``column_name`` and with a ``has_<group>``
        integer column for every distinct group (in sorted group order). A
        flag is 1 when at least one item of the row maps to that group; null
        cells yield 0 for every group.
    """
    def groups_in_cell(cell):
        # Null cells carry no items at all.
        if pd.isna(cell):
            return set()
        return {items_to_group.get(agent.strip()) for agent in cell.split(',')}

    # Parse each row once rather than once per group.
    row_groups = [groups_in_cell(cell) for cell in df[column_name]]

    flags = {
        f'has_{group}': pd.Series(
            [int(group in groups) for groups in row_groups],
            index=df.index,
            dtype='int64',
        )
        for group in sorted(set(items_to_group.values()))
    }

    # drop() + assign() both return new frames, so the caller's frame is never
    # written to (it may be a filtered slice of another frame).
    return df.drop(columns=[column_name]).assign(**flags)

## Other Agents

In [None]:
drug_to_group = {
    # hypolipidemic
    'pravastatin': 'hypolipidemic',
    'rosuvastatin': 'hypolipidemic',
    'fenofibrate': 'hypolipidemic',
    'ezetimibe': 'hypolipidemic',
    'atorvastatin': 'hypolipidemic',

    # angioprotectors
    'calcium dobesilate': 'angioprotectors',
    'beiprostaglandin sodium': 'angioprotectors',

    # ace inhibitors
    'benazepril': 'ace_inhibitors',

    # minerals and vitamins
    'potassium chloride': 'minerals_and_vitamins',
    'calcium carbonate': 'minerals_and_vitamins',
    'calcitriol': 'minerals_and_vitamins',
    'multivitamin': 'minerals_and_vitamins',
    'vitamin B1': 'minerals_and_vitamins',
    'vitamin D3 tablet': 'minerals_and_vitamins',
    'mecobalamin': 'minerals_and_vitamins',

    # probiotics
    'clostridium butyricum': 'probiotics',

    # ARB
    'telmisartan': 'arb',
    'valsartan': 'arb',
    'olmesartan medoxomil': 'arb',
    'olmesartan': 'arb',
    'losartan': 'arb',
    'losartan/hydrochlorothiazide': 'arb',
    'irbesartan': 'arb',
    'candesartan': 'arb',
    'allisartan': 'arb',

    # psychotropic
    'quetiapine': 'psychotropic',

    # antianginal
    'isosorbide mononitrate': 'antianginal',

    # gout treatment
    'febuxostat': 'gout_treatment',

    # laxatives
    'bisacodyl': 'laxatives',

    # urological drugs
    'Qianlie Shutong capsule  (Chinese patent drug for prostatic hyperplasia)': 'urological_drugs',

    # calcium channel blockers
    'nifedipine': 'calcium_channel_blockers',
    'amlodipine': 'calcium_channel_blockers',
    'felodipine': 'calcium_channel_blockers',
    'benidipine': 'calcium_channel_blockers',

    # antiarrhythmic
    'doxazosin': 'antiarrhythmic',
    'labetalol': 'antiarrhythmic',
    'bisoprolol': 'antiarrhythmic',
    'metoprolol': 'antiarrhythmic',

    # gastroprotective
    'rabeprazole': 'gastroprotective',

    # circulatory support
    'Yinxingye tablet (extract of Ginkgo biloba leaves)': 'circulatory_support',

    # antithrombotic
    'aspirin': 'antithrombotic',
    'clopidogrel': 'antithrombotic',
    'rivaroxaban': 'antithrombotic',

    # vasodilators
    'trimetazidine': 'vasodilators',
    'magnesium isoglycyrrhizinate': 'vasodilators',

    # pancreatic
    'pancreatic kininogenase': 'pancreatic',

    # neuroprotectors
    'epalrestat': 'neuroprotectors',

    # kidney support
    'compound α-keto acid tablet': 'kidney_support',
    'Shen Shuai Ning capsule (Chinese patent drug for renal dysfunction)': 'kidney_support',

    # hepatoprotector
    'polyene phosphatidylcholine': 'hepatoprotector',
    'diammonium glycyrrhizinate': 'hepatoprotector',

    # immunomodulators
    'leucogen': 'immunomodulators',

    # thyroid diseases
    'levothyroxine': 'thyroid_diseases',

    # antibiotics
    'levofloxacin': 'antibiotics',

    # antihypertensives
    'Zhenju Jiangya tablet (Chinese patent drug for hypertension)': 'antihypertensives',

    # vestibular disorders
    'betahistine': 'vestibular_disorders',
}

In [None]:
find_unknown_agents(combined_df, 'Other Agents', drug_to_group)
combined_df = add_group_flags(combined_df, 'Other Agents', drug_to_group)
combined_df.head()

## Comorbidities

In [None]:
disease_to_group = {
    # diseases_of_the_stomach_and_intestines
    'chronic atrophic gastritis': 'diseases_of_the_stomach_and_intestines',
    'colorectal polyp': 'diseases_of_the_stomach_and_intestines',
    'chronic gastritis': 'diseases_of_the_stomach_and_intestines',
    'gastric polyp': 'diseases_of_the_stomach_and_intestines',

    # diseases_of_the_musculoskeletal_system
    'lumbar herniated disc': 'diseases_of_the_musculoskeletal_system',
    'osteopenia': 'diseases_of_the_musculoskeletal_system',
    'osteoporosis': 'diseases_of_the_musculoskeletal_system',
    'lumbar spine tumor': 'diseases_of_the_musculoskeletal_system',

    # cardiovascular_diseases
    'myocardial bridging': 'cardiovascular_diseases',
    'sinus arrhythmia': 'cardiovascular_diseases',
    'hypertension': 'cardiovascular_diseases',
    'hyperlipidemia': 'cardiovascular_diseases',
    'sinus bradycardia': 'cardiovascular_diseases',
    'atrial fibrillation': 'cardiovascular_diseases',

    # kidney_diseases
    'kidney cyst': 'kidney_diseases',
    'hydronephrosis': 'kidney_diseases',
    'nephrolithiasis': 'kidney_diseases',
    'urinary tract infection': 'kidney_diseases',

    # dental_diseases
    'periodontitis': 'dental_diseases',

    # gynecological_diseases
    'hysteromyoma': 'gynecological_diseases',

    # neurological_and_psychiatric_diseases
    'anxiety': 'neurological_and_psychiatric_diseases',
    'cerebrovascular disease': 'neurological_and_psychiatric_diseases',
    "Alzheimer's disease": 'neurological_and_psychiatric_diseases',
    "Parkinson's disease": 'neurological_and_psychiatric_diseases',

    # liver_diseases
    'fatty liver disease': 'liver_diseases',
    'fatty liver disese': 'liver_diseases',
    'liver cyst': 'liver_diseases',
    'hepatic dysfunction': 'liver_diseases',

    # gallbladder_diseases
    'cholecystitis': 'gallbladder_diseases',
    'cholelithiasis': 'gallbladder_diseases',
    'gallbladder polyp': 'gallbladder_diseases',

    # infectious_diseases
    'chronic hepatitis B': 'infectious_diseases',

    # oncology
    'breast cancer': 'oncology',
    'pancreatic cancer': 'oncology',
    'parotid gland carcinoma': 'oncology',
    'lung lesion': 'oncology',
    'pulmonary nodule': 'oncology',

    # endocrine_diseases
    'hypoparathyroidism': 'endocrine_diseases',
    'hypothyroidism': 'endocrine_diseases',
    'enlarged adrenal gland': 'endocrine_diseases',
    'thyroid nodule': 'endocrine_diseases',

    # male_reproductive_diseases
    'prostatic hyperplasia': 'male_reproductive_diseases',

    # eye_diseases
    'cataract': 'eye_diseases',
    'conjunctivitis': 'eye_diseases',

    # hematologic_disorders
    'hypoleukocytemia': 'hematologic_disorders',
    'leucopenia': 'hematologic_disorders',

    # autoimmune_diseases
    'systemic sclerosis': 'autoimmune_diseases',
    'psoriasis': 'autoimmune_diseases',

    # electrolyte_and_mineral_disorders
    'hypocalcemia': 'electrolyte_and_mineral_disorders',
    'hypokalemia': 'electrolyte_and_mineral_disorders',
    'vitamin D deficiency': 'electrolyte_and_mineral_disorders',
    'hyperuricemia': 'electrolyte_and_mineral_disorders',
}

In [None]:
find_unknown_agents(combined_df, 'Comorbidities', disease_to_group)
combined_df = add_group_flags(combined_df, 'Comorbidities', disease_to_group)
combined_df.head()

In [None]:
combined_df.shape

## drop unused columns

In [None]:
combined_df = combined_df.drop(columns=['Hypoglycemic Agents'])

In [None]:
combined_df.head()

# specify has_macrovascular and has_microvascular complications

In [None]:
def specify_has_or_no(df, column_name):
    """Add a 0/1 ``has_<column>`` flag telling whether a cell lists any real entry.

    The flag column is named ``has_`` + the stripped, lower-cased column name
    with every space replaced by ``_``.

    Args:
        df: DataFrame to extend; the new column is added to it in place.
        column_name: Column whose cells are comma-separated entries.

    Returns:
        The same ``df`` object. A flag is 1 when at least one entry is
        non-empty and not ``'none'`` (case-insensitive); null cells give 0.
    """
    column_name = column_name.strip()

    def has_any_value(cell):
        if pd.isna(cell):
            return 0
        entries = (entry.strip().lower() for entry in cell.split(','))
        return int(any(entry and entry != 'none' for entry in entries))

    # Build the suffix outside the f-string: reusing the enclosing quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    flag_suffix = column_name.lower().replace(' ', '_')
    df[f'has_{flag_suffix}'] = df[column_name].apply(has_any_value)

    return df

In [None]:
combined_df = specify_has_or_no(combined_df, 'Diabetic Microvascular Complications')

combined_df = combined_df.drop(columns=['Diabetic Microvascular Complications'])

combined_df

In [None]:
combined_df = specify_has_or_no(combined_df, 'Diabetic Macrovascular  Complications')

combined_df = combined_df.drop(columns=['Diabetic Macrovascular  Complications'])

combined_df.head(6)

In [None]:
combined_df = specify_has_or_no(combined_df, 'Acute Diabetic Complications')

combined_df = combined_df.drop(columns=['Acute Diabetic Complications'])

combined_df.head(8)

## encoding columns Alcohol Drinking History (drinker/non-drinker) and Hypoglycemia (yes/no)

Hypoglycemia (yes/no) -> (yes = 1, no = 0)
Alcohol Drinking History (drinker/non-drinker) -> (drinker = 0, non-drinker = 1)

In [None]:
# Keep one fitted encoder per column so the integer codes can be decoded later
# with label_encoders[col].inverse_transform(...). Refitting a single shared
# encoder would only retain the mapping of the last column.
label_encoders = {}

for col_to_encode in ['Alcohol Drinking History (drinker/non-drinker)', 'Hypoglycemia (yes/no)', 'Type of Diabetes']:
    label_encoder = preprocessing.LabelEncoder()
    combined_df[col_to_encode] = label_encoder.fit_transform(combined_df[col_to_encode])
    label_encoders[col_to_encode] = label_encoder

combined_df.head()

## rename Hypoglycemia (yes/no)

In [None]:
combined_df[f'has_hypoglycemia'] = combined_df['Hypoglycemia (yes/no)']

In [None]:
df = combined_df.drop(columns=['Hypoglycemia (yes/no)'])
df.head()

In [None]:
df.columns

# indicating therapy_type

In [None]:
folder_Shanghai_T1DM = Path('../data/Shanghai_diabetes_datasets/Shanghai_CSV-Data/T1DM')
folder_Shanghai_T2DM = Path('../data/Shanghai_diabetes_datasets/Shanghai_CSV-Data/T2DM')

## conversion check

In [None]:
# Compare the Excel file names with the CSV file names (extensions ignored) and
# report every Excel file that has no CSV of the same name.
folder_excel = Path('../data/Shanghai_diabetes_datasets/Shanghai_T2DM/')
folder_csv = Path('../data/Shanghai_diabetes_datasets/Shanghai_CSV-Data/T2DM')

excel_names = set()
for pattern in ('*.xlsx', '*.xls'):
    excel_names.update(file.stem for file in folder_excel.glob(pattern))

csv_names = {f.stem for f in folder_csv.glob('*.csv')}

missing_converted = excel_names.difference(csv_names)

if not missing_converted:
    print("All Excel files were successfully converted to CSV.")
else:
    print("The following Excel files were not converted to CSV:")
    for name in sorted(missing_converted):
        print(name + ' (missing .csv)')

## determine_treatment

In [None]:
def determine_treatment(file_path):
    """Summarise which treatment columns of a CSV file contain any entry.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The labels of all treatment columns that exist in the file and hold at
        least one non-null, non-blank value, joined with ``'; '`` in a fixed
        order. An empty string when none qualifies.
    """
    records = pd.read_csv(file_path)

    # (column in the file, label reported for it) -- order defines the output order.
    column_labels = (
        ('Insulin dose - s.c.', 's.c. insulin'),
        ('Insulin dose - i.v.', 'i.v. insulin'),
        ('CSII - bolus insulin (Novolin R, IU)', 'CSII bolus'),
        ('CSII - basal insulin (Novolin R, IU / H)', 'CSII basal'),
        ('Non-insulin hypoglycemic agents', 'non-insulin agents'),
    )

    def has_entries(column):
        # A column counts only if it is present and has a non-blank value.
        if column not in records.columns:
            return False
        cleaned = records[column].dropna().astype(str).str.strip()
        return cleaned.ne('').any()

    return '; '.join(label for column, label in column_labels if has_entries(column))

In [None]:
def _map_patient_treatments(folder, patient_ids):
    """Map patient id -> treatment summary for the CSV files found in ``folder``.

    For each CSV file, the first patient id that occurs as a substring of the
    file name is taken as the owner of that file.
    """
    mapping = {}
    for csv_file in folder.glob('*.csv'):
        owner = None
        for candidate in patient_ids:
            if candidate in csv_file.name:
                owner = candidate
                break
        if owner is not None:
            mapping[owner] = determine_treatment(csv_file)
    return mapping


treatment_map_1 = _map_patient_treatments(folder_Shanghai_T1DM, df['Patient Number'])
treatment_map_2 = _map_patient_treatments(folder_Shanghai_T2DM, df['Patient Number'])

# Entries from the T2DM folder win when an id appears in both maps.
treatment_map = dict(treatment_map_1)
treatment_map.update(treatment_map_2)

df['treatment'] = df['Patient Number'].map(treatment_map)
df.head()

## drop rows with null treatment

In [None]:
df = df[df['treatment'].notna()]

# folder_path = Path('../data/Shanghai_diabetes_datasets/3_step_finish_dataset/3_step_finish_dataset.csv')
# df.to_csv(folder_path)

## group into non-insulin, insulin or mixed treatment

In [None]:
def classify_treatment(df):
    """Derive a ``treatment_type`` column from the ``'; '``-separated ``treatment`` column.

    Args:
        df: DataFrame with a ``treatment`` column; it is left unmodified.

    Returns:
        A new DataFrame with an added ``treatment_type`` column holding
        ``'mixed_treatment'`` (insulin and non-insulin agents),
        ``'insulin_treatment'`` or ``'drug_treatment'``. Rows that match none
        of these (including null or empty ``treatment`` cells) are dropped.
    """
    insulin_treatments = {'CSII bolus', 'CSII basal', 's.c. insulin', 'i.v. insulin'}
    non_insulin_treatment = 'non-insulin agents'

    def classify(cell):
        # Null cells cannot be split; leave them unclassified so they are dropped.
        if pd.isna(cell):
            return None

        treatments = {value.strip() for value in cell.split(';')}
        has_insulin = not treatments.isdisjoint(insulin_treatments)
        has_non_insulin = non_insulin_treatment in treatments

        if has_insulin and has_non_insulin:
            return 'mixed_treatment'
        if has_insulin:
            return 'insulin_treatment'
        if has_non_insulin:
            return 'drug_treatment'
        return None

    # assign() returns a new frame, so the caller's frame (possibly a filtered
    # slice of another frame) is never written to.
    classified = df.assign(treatment_type=df['treatment'].apply(classify))

    return classified.dropna(subset=['treatment_type'])


df = classify_treatment(df)

# NOTE: despite its name, folder_path is the output FILE here; the name is kept
# unchanged because it is a notebook-level variable.
folder_path = Path('../data/Shanghai_diabetes_datasets/3_step_finish_dataset/3_step_finish_dataset.csv')
# Create the target directory first so to_csv does not fail when it is missing.
folder_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(folder_path)

In [None]:
# def classify_treatment(df):
#     insulin_treatments = {'CSII bolus', 'CSII basal', 's.c. insulin', 'i.v. insulin'}
#     non_insulin_treatment = 'non-insulin agents'
#
#     def classify(row):
#         treatments = set([value.strip() for value in row.split(';')])
#
#         has_insulin = bool(treatments.intersection(insulin_treatments))
#         has_non_insulin = non_insulin_treatment in treatments
#
#         if has_insulin:
#             return 'insulin_treatment'
#         elif has_non_insulin:
#             return 'drug_treatment'
#         else:
#             return None
#
#     df['treatment_type'] = df['treatment'].apply(classify)
#
#     df = df.dropna(subset=['treatment_type'])
#
#     return df
#
#
# df = classify_treatment(df)
# folder_path = Path('../data/Shanghai_diabetes_datasets/3_step_finish_dataset/3_step_finish_dataset.csv')
# df.to_csv(folder_path)

In [None]:
df = df.drop(columns=['treatment', 'Patient Number'], axis=1)

In [None]:
df.tail()

## encoding columns treatment_type

In [None]:
# label_encoder = preprocessing.LabelEncoder()
#
# df['treatment_type'] = label_encoder.fit_transform(df['treatment_type'])
#
# combined_df.head()

## Visualisation treatment_type

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x='treatment_type', hue='treatment_type', data=df, palette='viridis')
plt.title('treatment_type')
plt.show()

# PREDICTIVE ANALYSIS

In [None]:
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), [
#             'Age (years)', 'Height (m)', 'Weight (kg)', 'BMI (kg/m2)', 'Smoking History (pack year)',
#             'Duration of Diabetes (years)', 'Fasting Plasma Glucose (mg/dl)',
#             '2-hour Postprandial Plasma Glucose (mg/dl)',
#             'Fasting C-peptide (nmol/L)', '2-hour Postprandial C-peptide (nmol/L)', 'Fasting Insulin (pmol/L)',
#             '2-hour Postprandial Insulin (pmol/L)', 'HbA1c (mmol/mol)', 'Glycated Albumin (%)',
#             'Total Cholesterol (mmol/L)',
#             'Triglyceride (mmol/L)', 'High-Density Lipoprotein Cholesterol (mmol/L)',
#             'Low-Density Lipoprotein Cholesterol (mmol/L)', 'Creatinine (umol/L)',
#             'Estimated Glomerular Filtration Rate  (ml/min/1.73m2)', 'Uric Acid (mmol/L)',
#             'Blood Urea Nitrogen (mmol/L)'
#         ]),
#         ('bin', 'passthrough',
#          ['Gender (Female=1, Male=2)', 'has_hypoglycemia', 'Alcohol Drinking History (drinker/non-drinker)',
#           'Type of Diabetes'
#           ])
#     ]
# )
#
# X = df.drop('treatment_type', axis=1)
# y = df['treatment_type']
combined_df

In [None]:
df.shape

In [None]:
df.head()
df.to_csv('result.csv')

# Define preprocessor

In [None]:
# # df_path = Path('../research/result1.csv')
# # dfTest = pd.read_csv(df_path)
#
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(),
#          ['Age (years)', 'Height (m)', 'Weight (kg)', 'BMI (kg/m2)', 'Smoking History (pack year)',
#           'Duration of Diabetes (years)', 'Fasting Plasma Glucose (mg/dl)',
#           '2-hour Postprandial Plasma Glucose (mg/dl)',
#           'Fasting C-peptide (nmol/L)', '2-hour Postprandial C-peptide (nmol/L)', 'Fasting Insulin (pmol/L)',
#           '2-hour Postprandial Insulin (pmol/L)', 'HbA1c (mmol/mol)', 'Glycated Albumin (%)',
#           'Total Cholesterol (mmol/L)',
#           'Triglyceride (mmol/L)', 'High-Density Lipoprotein Cholesterol (mmol/L)',
#           'Low-Density Lipoprotein Cholesterol (mmol/L)', 'Creatinine (umol/L)',
#           'Estimated Glomerular Filtration Rate  (ml/min/1.73m2)', 'Uric Acid (mmol/L)',
#           'Blood Urea Nitrogen (mmol/L)']),
#         ('bin', 'passthrough', [
#             'Gender (Female=1, Male=2)', 'has_ace_inhibitors', 'has_angioprotectors', 'has_antianginal',
#             'has_antiarrhythmic', 'has_antibiotics', 'has_antihypertensives', 'has_antithrombotic', 'has_arb',
#             'has_calcium_channel_blockers', 'has_circulatory_support', 'has_gastroprotective', 'has_gout_treatment',
#             'has_hepatoprotector', 'has_hypolipidemic', 'has_immunomodulators', 'has_kidney_support', 'has_laxatives',
#             'has_minerals_and_vitamins', 'has_neuroprotectors', 'has_pancreatic', 'has_probiotics', 'has_psychotropic',
#             'has_thyroid_diseases', 'has_urological_drugs', 'has_vasodilators', 'has_vestibular_disorders',
#             'has_autoimmune_diseases', 'has_cardiovascular_diseases', 'has_dental_diseases',
#             'has_diseases_of_the_musculoskeletal_system', 'has_diseases_of_the_stomach_and_intestines',
#             'has_electrolyte_and_mineral_disorders', 'has_endocrine_diseases', 'has_eye_diseases',
#             'has_gallbladder_diseases',
#             'has_gynecological_diseases', 'has_hematologic_disorders', 'has_infectious_diseases', 'has_kidney_diseases',
#             'has_liver_diseases', 'has_male_reproductive_diseases', 'has_neurological_and_psychiatric_diseases',
#             'has_oncology',
#             'has_diabetic_microvascular_complications', 'has_diabetic_macrovascular__complications',
#             'has_acute_diabetic_complications', 'has_hypoglycemia', 'Alcohol Drinking History (drinker/non-drinker)',
#             'Type of Diabetes'
#         ])
#     ]
# )
#
# X = df.drop('treatment_type', axis=1)
# y = df['treatment_type']

In [None]:
# df.to_csv('result.csv')

In [None]:
# df.tail()

# Create Pipeline / predict and evaluate

## RFC

In [None]:
# from imblearn.over_sampling import SMOTE
#
#
# def run_binary_classification(df, class_a, class_b):
#     print(f"=== classification: {class_a} vs {class_b} ===")
#
#     binary_df = df[df['treatment_type'].isin([class_a, class_b])].copy()
#     X = binary_df.drop(columns=['treatment_type'])
#     y = binary_df['treatment_type']
#
#     categorical_features = [
#         'Gender (Female=1, Male=2)', 'has_ace_inhibitors', 'has_angioprotectors', 'has_antianginal',
#         'has_antiarrhythmic', 'has_antibiotics', 'has_antihypertensives', 'has_antithrombotic', 'has_arb',
#         'has_calcium_channel_blockers', 'has_circulatory_support', 'has_gastroprotective', 'has_gout_treatment',
#         'has_hepatoprotector', 'has_hypolipidemic', 'has_immunomodulators', 'has_kidney_support', 'has_laxatives',
#         'has_minerals_and_vitamins', 'has_neuroprotectors', 'has_pancreatic', 'has_probiotics', 'has_psychotropic',
#         'has_thyroid_diseases', 'has_urological_drugs', 'has_vasodilators', 'has_vestibular_disorders',
#         'has_autoimmune_diseases', 'has_cardiovascular_diseases', 'has_dental_diseases',
#         'has_diseases_of_the_musculoskeletal_system', 'has_diseases_of_the_stomach_and_intestines',
#         'has_electrolyte_and_mineral_disorders', 'has_endocrine_diseases', 'has_eye_diseases',
#         'has_gallbladder_diseases', 'has_gynecological_diseases', 'has_hematologic_disorders',
#         'has_infectious_diseases', 'has_kidney_diseases', 'has_liver_diseases', 'has_male_reproductive_diseases',
#         'has_neurological_and_psychiatric_diseases', 'has_oncology', 'has_diabetic_microvascular_complications',
#         'has_diabetic_macrovascular__complications', 'has_acute_diabetic_complications', 'has_hypoglycemia',
#         'Alcohol Drinking History (drinker/non-drinker)', 'Type of Diabetes'
#     ]
#     numerical_features = [col for col in X.columns if col not in categorical_features]
#
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, stratify=y, test_size=0.2, random_state=42
#     )
#
#     preprocessor = ColumnTransformer([
#         ('num', StandardScaler(), numerical_features),
#         ('cat', 'passthrough', categorical_features)
#     ])
#
#     over = SMOTE(sampling_strategy='auto', random_state=42)
#
#     pipeline = ImbPipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('over', over),
#         ('classifier', RandomForestClassifier(random_state=42))
#     ])
#
#     param_grid = {
#         'classifier__n_estimators': [200],
#         'classifier__max_depth': [10, 20],
#         'classifier__min_samples_split': [2, 5],
#         'classifier__min_samples_leaf': [1, 2],
#     }
#
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#
#     grid = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, n_jobs=-1)
#     grid.fit(X_train, y_train)
#
#     print("best params:", grid.best_params_)
#     y_pred = grid.predict(X_test)
#     print("accuracy:", round(accuracy_score(y_test, y_pred), 4))
#     print(classification_report(y_test, y_pred))
#
#
# pairs = [
#     ('insulin_treatment', 'drug_treatment'),
#     ('insulin_treatment', 'mixed_treatment'),
#     ('mixed_treatment', 'drug_treatment')
# ]
#
#
# df_path = Path('../research/result2.csv')
# dfTest = pd.read_csv(df_path)
#
# for a, b in pairs:
#     run_binary_classification(dfTest, a, b)

In [None]:
# Random-forest classification of treatment_type: stratified train/test split,
# then a grid search over a pipeline of scaling -> SMOTE oversampling -> forest.
from imblearn.over_sampling import SMOTE

# NOTE(review): this cell reads result2.csv, whereas an earlier cell writes
# 'result.csv' to the working directory -- confirm which file is intended.
df_path = Path('../research/result2.csv')
dfTest = pd.read_csv(df_path)

X = dfTest.drop('treatment_type', axis=1)
y = dfTest['treatment_type']

# 80/20 split that keeps the class proportions of y in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# NOTE(review): plain SMOTE is applied to every column, including the 0/1
# passthrough flags listed below; SMOTENC is imported at the top of the file
# but never used -- confirm whether it was meant to be used here.
over = SMOTE(sampling_strategy='auto', random_state=42)
# over = SMOTE(sampling_strategy={'insulin_treatment': 26}, random_state=42)


# Columns passed through unscaled (binary flags / encoded categories).
categorical_features = [
    'Gender (Female=1, Male=2)', 'has_ace_inhibitors', 'has_angioprotectors', 'has_antianginal',
    'has_antiarrhythmic', 'has_antibiotics', 'has_antihypertensives', 'has_antithrombotic', 'has_arb',
    'has_calcium_channel_blockers', 'has_circulatory_support', 'has_gastroprotective', 'has_gout_treatment',
    'has_hepatoprotector', 'has_hypolipidemic', 'has_immunomodulators', 'has_kidney_support', 'has_laxatives',
    'has_minerals_and_vitamins', 'has_neuroprotectors', 'has_pancreatic', 'has_probiotics', 'has_psychotropic',
    'has_thyroid_diseases', 'has_urological_drugs', 'has_vasodilators', 'has_vestibular_disorders',
    'has_autoimmune_diseases', 'has_cardiovascular_diseases', 'has_dental_diseases',
    'has_diseases_of_the_musculoskeletal_system', 'has_diseases_of_the_stomach_and_intestines',
    'has_electrolyte_and_mineral_disorders', 'has_endocrine_diseases', 'has_eye_diseases',
    'has_gallbladder_diseases', 'has_gynecological_diseases', 'has_hematologic_disorders',
    'has_infectious_diseases', 'has_kidney_diseases', 'has_liver_diseases', 'has_male_reproductive_diseases',
    'has_neurological_and_psychiatric_diseases', 'has_oncology', 'has_diabetic_microvascular_complications',
    'has_diabetic_macrovascular__complications', 'has_acute_diabetic_complications', 'has_hypoglycemia',
    'Alcohol Drinking History (drinker/non-drinker)', 'Type of Diabetes'
]
# Every remaining column of X is standard-scaled as a numeric feature.
# NOTE(review): that includes any index column stored in the CSV (e.g. one
# written by to_csv without index=False) -- TODO confirm the file has none.
numerical_features = [col for col in X.columns if col not in categorical_features]
# cat_idx = [X.columns.get_loc(col) for col in categorical_features]

# over = SMOTE(categorical_features=cat_idx, sampling_strategy={'insulin_treatment': 26}, random_state=42)
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', 'passthrough', categorical_features)
])
# ova_classifier = OneVsOneClassifier(RandomForestClassifier(random_state=42))

# Oversampling sits inside the pipeline, after preprocessing and before the
# classifier, so it is part of the estimator that the grid search fits.
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('over', over),
    # ('classifier', ova_classifier),
    ('classifier', RandomForestClassifier(random_state=42)),
])
param_grid = {
    'classifier__n_estimators': [200, 300],
    'classifier__max_depth': [10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}
# param_grid = {
#     'classifier__estimator__n_estimators': [200, 300],
#     'classifier__estimator__max_depth': [10, 20],
#     'classifier__estimator__min_samples_split': [2, 5, 10],
#     'classifier__estimator__min_samples_leaf': [1, 2, 4],
# }

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): no `scoring` is passed, so the estimator's default scorer ranks
# the candidates -- consider a class-balance-aware metric for this target.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# Evaluate the selected configuration on the held-out 20%.
y_pred = grid_search.predict(X_test)

print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print(classification_report(y_test, y_pred))

In [None]:
print(y.value_counts(normalize=True))

In [None]:
y_train.value_counts()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the tuned pipeline on the held-out test split.
ConfusionMatrixDisplay.from_estimator(
    estimator=grid_search,
    X=X_test,
    y=y_test,
    cmap='Blues',
)

In [None]:
# Accuracy on the training split, to compare against the test accuracy.
train_pred = grid_search.predict(X_train)
print("Train accuracy:", accuracy_score(y_train, train_pred))

In [None]:
# Class counts for the training split, then for the test split.
for split_target in (y_train, y_test):
    print(split_target.value_counts())

In [None]:
# Bar chart of how many rows fall into each treatment_type value.
ax = sns.countplot(x='treatment_type', data=df)
ax.set_title('New diabetes distribution')
plt.show()

In [None]:
# Random-forest pipeline without oversampling, reusing the shared preprocessor.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Single-candidate grid: one fixed hyper-parameter setting.
param_grid = dict(
    classifier__n_estimators=[50],
    classifier__max_depth=[20],
    classifier__min_samples_split=[2],
    classifier__min_samples_leaf=[2],
)

# Shuffled, seeded 10-fold stratified cross-validation.
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=10)


In [None]:
# Hold out 20% of the data, then run the cross-validated search on the rest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search = GridSearchCV(clf, param_grid, cv=skf, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

In [None]:
# Score the fitted search on the held-out split.
y_pred = grid_search.predict(X_test)
test_accuracy = round(accuracy_score(y_test, y_pred), 4)

print("Accuracy on test:", test_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Same evaluation as the previous cell, with the accuracy printed unrounded.
y_pred = grid_search.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)

print('model accuracy: ', rf_accuracy)
print(rf_report)

In [None]:
# Heatmap of the confusion matrix (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
# Distinct target labels in sorted order.
class_labels = sorted(y.unique())
print(class_labels)

### DTC

In [None]:
# Decision-tree pipeline: shared preprocessing followed by the classifier.
# The tree is seeded because scikit-learn permutes candidate features at every
# split (and the grid below restricts max_features), so an unseeded tree gives
# different results on every run. 42 matches the seed used elsewhere in this file.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [None]:
# Decision-tree search space: 3 * 3 * 3 * 2 * 2 * 3 = 324 candidates.
param_grid = dict(
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__max_depth=[10, 20, 30],
    classifier__criterion=['gini', 'entropy'],
    classifier__max_features=['sqrt', 'log2'],
    classifier__max_leaf_nodes=[10, 20, 50],
)

In [None]:
# 80/20 hold-out split, then a 5-fold grid search on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
print('best parameters', grid_search.best_params_)

In [None]:
# Held-out accuracy and per-class metrics for the tuned decision tree.
y_pred = grid_search.predict(X_test)
dtc_accuracy = accuracy_score(y_test, y_pred)
print('model accuracy', dtc_accuracy)
dtc_report = classification_report(y_test, y_pred)
print(dtc_report)

In [None]:
# Split, search and evaluate the decision tree in a single cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
grid_search = GridSearchCV(clf, param_grid, cv=5).fit(X_train, y_train)
print('best parameters', grid_search.best_params_)

y_pred = grid_search.predict(X_test)
print('model accuracy', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Confusion-matrix heatmap for the decision-tree predictions.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('confusion matrix')
ax.set_xlabel('predicted')
ax.set_ylabel('true')
plt.show()

## KNN

In [None]:
# k-nearest-neighbours pipeline on top of the shared preprocessor.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier()),
])

In [None]:
# KNN search space: 4 * 1 * 2 * 2 = 16 candidates.
# Only the Minkowski metric is listed because `p` selects the distance:
# p=1 is Manhattan and p=2 is Euclidean. Listing 'euclidean' and 'manhattan'
# as well (where `p` is ignored) would refit the same 16 models three times.
param_grid = {
    'classifier__n_neighbors': [3, 5, 7, 9],
    'classifier__metric': ['minkowski'],
    'classifier__p': [1, 2],
    'classifier__weights': ['uniform', 'distance'],
}

In [None]:
# 80/20 hold-out split followed by a 5-fold grid search for KNN.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print('best parameters', grid_search.best_params_)

In [None]:
# Held-out accuracy and per-class metrics for the tuned KNN model.
y_pred = grid_search.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)

print('model accuracy', knn_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion-matrix heatmap for the KNN predictions.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

## LR

In [None]:
# Logistic-regression pipeline on top of the shared preprocessor.
# The grid below uses the 'saga' and 'liblinear' solvers, both of which shuffle
# the data, so the estimator is seeded to make the search reproducible.
# 42 matches the seed used by the other models in this file.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42))
    ]
)

In [None]:
# Logistic-regression search space: 5 * 2 * 2 * 3 = 60 candidates.
param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10, 100],
    classifier__solver=['liblinear', 'saga'],
    classifier__penalty=['l1', 'l2'],
    classifier__max_iter=[300, 500, 1000],
)

In [None]:
# 80/20 hold-out split followed by a 5-fold grid search for logistic regression.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print('best parameters', grid_search.best_params_)

In [None]:
# Held-out accuracy and per-class metrics for the tuned logistic regression.
y_pred = grid_search.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)

print('model accuracy', lr_accuracy)
print(lr_report)


In [None]:
# Confusion-matrix heatmap for the logistic-regression predictions.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

## XGBoost

In [None]:
# XGBoost pipeline on top of the shared preprocessor (seeded booster).
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(random_state=42)),
])

In [None]:
# XGBoost search space: 3 * 3 * 2 * 2 = 36 candidates.
param_grid = dict(
    classifier__max_depth=[3, 5, 7],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__n_estimators=[100, 200],
    classifier__subsample=[0.8, 1.0],
)

In [None]:
# Encode the target as integer codes before splitting and fitting XGBoost.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print('best parameters', grid_search.best_params_)

In [None]:
# Held-out evaluation of the tuned XGBoost model.
# y_test and y_pred hold LabelEncoder integer codes; y_pred is kept encoded so
# the confusion-matrix cell below still works on matching codes.
y_pred = grid_search.predict(X_test)

print('model accuracy', accuracy_score(y_test, y_pred))
# Decode both vectors with the fitted encoder so the report rows carry the
# original class labels, like the reports of the other models in this notebook.
print(classification_report(le.inverse_transform(y_test), le.inverse_transform(y_pred)))

In [None]:
# Confusion-matrix heatmap for the XGBoost predictions.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

## LightGBM

In [None]:
# LightGBM pipeline on top of the shared preprocessor; verbose=-1 is passed to the booster.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(verbose=-1)),
])

In [None]:
# LightGBM search space: 2 * 2 * 2 * 2 * 3 = 48 candidates.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[5, 7],
    classifier__learning_rate=[0.01, 0.1],
    classifier__num_leaves=[15, 31],
    classifier__min_child_samples=[10, 20, 30],
)

In [None]:
# 80/20 hold-out split followed by a 5-fold grid search for LightGBM.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print('best parameters', grid_search.best_params_)

In [None]:
# Held-out accuracy and per-class metrics for the tuned LightGBM model.
y_pred = grid_search.predict(X_test)
lgbm_accuracy = accuracy_score(y_test, y_pred)
lgbm_report = classification_report(y_test, y_pred)

print('model accuracy: ', lgbm_accuracy)
print(lgbm_report)

In [None]:
# Confusion-matrix heatmap for the LightGBM predictions.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()