In [1]:
from statistics import median

import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import numpy as np
import os
import seaborn as sns

# Input the data

## Load dataset

In [2]:
# Directory that holds the two clinical summary CSV files read below.
folder_path = Path('../data/Shanghai_diabetes_datasets/clinical_info/csv')

# Fail early with a clear message. `is_dir()` is used instead of a plain
# existence check so that a regular file sitting at this path is rejected
# here rather than surfacing later as a confusing read error.
if not folder_path.is_dir():
    raise FileNotFoundError(f"directory {folder_path} doesn't exist")

# One summary table per file; both are stacked into a single frame below.
df1 = pd.read_csv(folder_path / 'Shanghai_T1DM_Summary.csv')
df2 = pd.read_csv(folder_path / 'Shanghai_T2DM_Summary.csv')

# Stack the two tables row-wise and renumber the index from 0.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Persist the raw combined table. `index=False` is not passed, so the row
# index is written as the first (unnamed) column of the file.
# NOTE(review): the later export of 'finish_data.csv' drops the index —
# confirm whether keeping it here is intentional.
combined_df.to_csv('combined_df.csv')

# Last expression of the cell: show the first 9 rows.
combined_df.head(9)

Unnamed: 0,Patient Number,"Gender (Female=1, Male=2)",Age (years),Height (m),Weight (kg),BMI (kg/m2),Smoking History (pack year),Alcohol Drinking History (drinker/non-drinker),Type of Diabetes,Duration of Diabetes (years),...,Glycated Albumin (%),Total Cholesterol (mmol/L),Triglyceride (mmol/L),High-Density Lipoprotein Cholesterol (mmol/L),Low-Density Lipoprotein Cholesterol (mmol/L),Creatinine (umol/L),Estimated Glomerular Filtration Rate (ml/min/1.73m2),Uric Acid (mmol/L),Blood Urea Nitrogen (mmol/L),Hypoglycemia (yes/no)
0,1001_0_20210730,1,66,1.5,60,26.67,0.0,non-drinker,T1DM,10.0,...,40.7,3.59,1.02,0.86,2.01,37.3,160,188.86,6.47,no
1,1002_0_20210504,2,68,1.7,63,21.8,50.0,drinker,T1DM,26.0,...,19.6,4.78,2.2,0.93,3.28,66.8,109,342.57,6.05,yes
2,1002_1_20210521,2,68,1.7,67,23.18,50.0,drinker,T1DM,26.0,...,19.6,4.78,2.2,0.93,3.28,69.4,104,322.18,3.06,yes
3,1002_2_20210909,2,68,1.7,65,22.49,50.0,drinker,T1DM,26.0,...,25.1,3.49,1.82,0.84,1.83,63.7,115,342.34,6.21,yes
4,1003_0_20210831,2,37,1.9,60,16.62,0.0,non-drinker,T1DM,0.08,...,46.6,5.61,1.14,1.08,3.95,49.6,174,93.39,1.85,yes
5,1004_0_20210425,1,67,1.55,47,19.56,0.0,non-drinker,T1DM,12.0,...,37.6,4.57,0.91,1.27,2.76,45.2,127,240.61,3.98,yes
6,1005_0_20210522,2,58,1.7,50,17.3,22.5,non-drinker,T1DM,16.0,...,25.7,4.05,0.46,1.57,2.12,75.4,98,205.48,3.96,yes
7,1006_0_20210114,2,57,1.61,53,20.52,0.0,non-drinker,T1DM,7.0,...,29.2,4.44,0.68,1.97,2.57,74.0,97,247.0,6.4,yes
8,1006_1_20210209,2,57,1.61,52,20.18,0.0,non-drinker,T1DM,7.0,...,27.0,5.12,0.64,1.88,2.93,82.0,91,257.0,6.4,yes


## Handling missing values

In [3]:
# Cells containing exactly '/' are replaced with a null marker before any
# numeric conversion takes place.
combined_df.replace('/', pd.NA, inplace=True)  # convert / to null

# Columns that are converted to numeric and median-imputed below.
# NOTE(review): 'Estimated Glomerular Filtration Rate  (ml/min/1.73m2)' has
# two consecutive spaces before '(' — confirm it matches the CSV header
# exactly, otherwise that column is skipped.
cols_to_fill = [
    'Age (years)', 'Height (m)', 'Weight (kg)', 'BMI (kg/m2)', 'Smoking History (pack year)',
    'Duration of Diabetes (years)', 'Fasting Plasma Glucose (mg/dl)',
    '2-hour Postprandial Plasma Glucose (mg/dl)', 'Fasting C-peptide (nmol/L)',
    '2-hour Postprandial C-peptide (nmol/L)', 'Fasting Insulin (pmol/L)',
    '2-hour Postprandial Insulin (pmol/L)', 'HbA1c (mmol/mol)', 'Glycated Albumin (%)',
    'Total Cholesterol (mmol/L)', 'Triglyceride (mmol/L)',
    'High-Density Lipoprotein Cholesterol (mmol/L)', 'Low-Density Lipoprotein Cholesterol (mmol/L)',
    'Creatinine (umol/L)', 'Estimated Glomerular Filtration Rate  (ml/min/1.73m2)',
    'Uric Acid (mmol/L)', 'Blood Urea Nitrogen (mmol/L)'
]

# Resolve once which of the requested columns actually exist, and use that
# same list for BOTH the numeric conversion and the imputation. Indexing the
# frame with the full list would raise KeyError as soon as one name is absent.
present_cols = [col for col in cols_to_fill if col in combined_df.columns]
skipped_cols = [col for col in cols_to_fill if col not in combined_df.columns]
if skipped_cols:
    # Surface mismatched names instead of skipping them silently.
    print(f"columns not found in combined_df, skipped: {skipped_cols}")

# Strip surrounding whitespace and coerce to numbers; anything that cannot be
# parsed (including the null marker set above) becomes NaN.
for col in present_cols:
    combined_df[col] = pd.to_numeric(combined_df[col].astype(str).str.strip(), errors='coerce')

# Fill each column's NaNs with that column's own median.
if present_cols:
    combined_df[present_cols] = combined_df[present_cols].fillna(combined_df[present_cols].median())

# Export the cleaned table without the row index.
combined_df.to_csv('finish_data.csv', index=False)