# In [None]:
 
##########################
##########################
###### SECTION 3 #########
##########################
##########################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm


####################
# Data Preparation #
####################


# Read the raw table
data = pd.read_csv('input.tbl_safa.csv')

# Keep a row when either
#   * the SARS-CoV-2 detection result is POSITIVE and the identification
#     result is missing, or
#   * the identification result itself is POSITIVE.
# The parentheses spell out the grouping that operator precedence
# (& before |) already applies.
sars_positive = data['cov_det_sarscov2'] == "POSITIVE"
id_missing = data['cov_det_id'].isna()
id_positive = data['cov_det_id'] == "POSITIVE"
data_filtered = data.loc[(sars_positive & id_missing) | id_positive].copy()

# Create new column sars_cov2 from the identification result, labelling
# missing values as 'Unknown'
sars_cov2_status = data_filtered['cov_det_id'].fillna('Unknown')

# Convert sars_cov2 to a categorical type.
# The column is assigned directly rather than through .loc[:, ...]:
# setting through .loc writes the values into the existing object column
# in place, so the categorical dtype would be lost.
data_filtered['sars_cov2'] = pd.Categorical(sars_cov2_status,
                                            categories=['POSITIVE', 'Unknown'])


# Complication columns: every column from start_col to end_col
# (label slices with .loc include both end points)
start_col, end_col = 'comps_nosocomial_sepsis', 'comps_empyema'
cols_to_update = data_filtered.loc[:, start_col:end_col].columns

# A missing complication entry is recorded as False
complications_filled = data_filtered[cols_to_update].fillna(False)
data_filtered[cols_to_update] = complications_filled


# Cardiac arrest is taken directly from its complication column
data_filtered['Cardiac arrest'] = data_filtered['comps_cardiac_arrest']

# Complication columns that count towards the cardiac arrhythmia outcome
complication_cols = [
    'comps_cardiac_arrhythmia',
    'comps_ventricular_arrhythmia',
    'comps_supraventricular_arrhythmia',
    'comps_atrial_fibrillation',
]


def determine_cardiac_arrhythmia(row):
    """Return True if any column listed in complication_cols is truthy in row.

    row is indexed by column name (e.g. one row passed by
    DataFrame.apply(axis=1)); the result is always a plain bool.
    """
    return bool(row[complication_cols].any())

# Row-wise flag: any arrhythmia-type complication present
arrhythmia_flags = data_filtered.apply(determine_cardiac_arrhythmia, axis=1)
data_filtered['Cardiac Arrhythmia'] = arrhythmia_flags

# Stroke and heart failure are straight copies of one complication column each
for outcome_name, source_col in (('Stroke', 'comps_stroke'),
                                 ('Heart Failure', 'comps_congestive_heart_failure')):
    data_filtered[outcome_name] = data_filtered[source_col]


# Complication columns that count towards the myocardial injury outcome
myocardial_injury_cols = [
    'comps_myocardial_infarction',
    'comps_cardiac_ischaemia',
    'comps_acute_cardiac_injury',
    'comps_cardiogenic_shock',
    'comps_st_elevation',
    'comps_elevated_troponin',
]


def determine_myocardial(row):
    """Return True if any column listed in myocardial_injury_cols is truthy in row.

    row is indexed by column name (e.g. one row passed by
    DataFrame.apply(axis=1)); the result is always a plain bool.
    """
    return bool(row[myocardial_injury_cols].any())

# Row-wise flag: any myocardial-injury-type complication present
myocardial_flags = data_filtered.apply(determine_myocardial, axis=1)
data_filtered['Myocardial Injury'] = myocardial_flags


# Create the MACE column as a composite outcome of the five component columns
MACE_cols = [
    'Cardiac arrest',
    'Cardiac Arrhythmia',
    'Stroke',
    'Heart Failure',
    'Myocardial Injury',
]


def determine_mace_status(row):
    """Return True if any column listed in MACE_cols is truthy in row.

    row is indexed by column name (e.g. one row passed by
    DataFrame.apply(axis=1)); the result is always a plain bool.
    """
    return bool(row[MACE_cols].any())

# Row-wise composite flag over the five MACE component columns
mace_flags = data_filtered[MACE_cols].apply(determine_mace_status, axis=1)
data_filtered['MACE'] = mace_flags


# The COVID detection columns are no longer needed once the cohort is
# selected; drop them to obtain the cleaned table
columns_to_drop = [
    'cov_det_cronavir',
    'cov_det_sarscov2',
    'cov_id_cronavir',
    'cov_id_sarscov2',
    'cov_det_id',
]

data_cleaned = data_filtered.drop(columns_to_drop, axis=1)


# Find column numbers for 'age', 'lab_wbc' and 'date_admit'
age_col_num = data_cleaned.columns.get_loc('age')
lab_wbc_col_num = data_cleaned.columns.get_loc('lab_wbc')
date_admit_col_num = data_cleaned.columns.get_loc('date_admit')


# Positional indices of the columns used to measure row completeness.
# NOTE(review): these hard-coded positions presumably correspond to the
# 'age'..'lab_wbc' block plus 'date_admit' located above -- confirm before
# replacing them with the computed column numbers.
columns_indices = list(range(3, 67)) + [142]

# Calculate percentage of missing values in each row.
# The mask below is applied to data_cleaned, so the percentages must be
# computed from data_cleaned as well (the previous code referenced an
# undefined name here and raised NameError).
missing_percentage = data_cleaned.iloc[:, columns_indices].isnull().mean(axis=1) * 100

# Define 50% threshold for rows to keep
threshold_50 = 50

# Filter rows where missing percentage is less than or equal to 50%
rows_to_keep_50 = missing_percentage <= threshold_50
data_cleaned_50percent = data_cleaned[rows_to_keep_50]

# Count rows remaining after dropping rows with more than 50% missing data
rows_after_dropping_50percent = data_cleaned_50percent.shape[0]


# Drop rows with missing values in 'slider_sex', 'age', or 'income' columns.
# .copy() makes final_data2 an independent frame so the column assignments
# below do not operate on a slice of data_cleaned_50percent.
final_data2 = data_cleaned_50percent.dropna(
    subset=['slider_sex', 'age', 'income']).copy()


# Count the number of individuals in each country
country_counts = final_data2['slider_country'].value_counts()

# Identify countries with fewer than 50 individuals
countries_to_replace = country_counts[country_counts < 50].index

# Replace these country names with 'Other'
final_data2['slider_country'] = final_data2['slider_country'].apply(
    lambda x: 'Other' if x in countries_to_replace else x)


# Add an age group column
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
age_labels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60',
              '60-70', '70-80', '80-90', '90-100']

# Categorise ages into the defined bins. With right=False each bin is
# [lower, upper), so an age of exactly 100 (or any age outside 0-100)
# falls in no bin and gets a missing age_group.
final_data2['age_group'] = pd.cut(final_data2['age'], bins=age_bins,
                                  labels=age_labels, right=False)

# Write the file only after the recoded country and age_group columns exist:
# the analyses below reload this file and read 'age_group' from it, so
# saving earlier would leave that column out of the reloaded data.
final_data2.to_csv('final_data2.csv', index=False)

data = pd.read_csv('final_data2.csv')


########################################################################
# Figure 4 (b-f): Pie chart for age distribution of MACE complications #
########################################################################


# Outcomes to plot, one pie chart each
columns_of_interest = [
    'Cardiac arrest', 'Cardiac Arrhythmia', 'Stroke',
    'Heart Failure', 'Myocardial Injury'
]

# Combine first five age groups into one '0-50' group
combined_age_group = '0-50'
age_order = [combined_age_group, '50-60', '60-70', '70-80', '80-90', '90-100']

# Work on a copy so the recoded age groups do not leak into `data`
data_copy = data.copy()

# Combine age groups in the data
data_copy['age_group'] = data_copy['age_group'].replace({
    '0-10': combined_age_group,
    '10-20': combined_age_group,
    '20-30': combined_age_group,
    '30-40': combined_age_group,
    '40-50': combined_age_group
})

# Aesthetics for pie charts: one colour per age group, shared by all panels
colors = plt.cm.Set3(np.linspace(0, 1, len(age_order)))
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, column in enumerate(columns_of_interest):
    # Filter the data to include only rows where complication present
    true_data = data_copy[data_copy[column] == True]

    # Percentage of those rows in each age group
    age_distribution = true_data['age_group'].value_counts(normalize=True) * 100

    # Fix the wedge order and give absent age groups a zero share
    age_distribution = age_distribution.reindex(age_order, fill_value=0)

    # Create pie chart
    wedges, texts, autotexts = axes[i].pie(
        age_distribution,
        labels=age_distribution.index,
        autopct='%1.1f%%',
        startangle=140,
        colors=colors
    )

    # Change text appearance
    for text in texts:
        text.set_color('black')
    for autotext in autotexts:
        autotext.set_color('black')

    axes[i].set_title(f'Age Distribution for {column}')
    axes[i].axis('equal')

# The grid has more panels than outcomes; hide the panels that were not
# drawn on so they do not appear as empty frames in the figure.
for unused_ax in axes[len(columns_of_interest):]:
    unused_ax.set_visible(False)

# Show the plot
fig.tight_layout()
plt.show()


########################################################################
# Figure 9 (d): Pie chart for age distribution of high income group #
########################################################################


# Separate copy so this figure's age recoding stays local to it
data_copy2 = data.copy()

# Merge the two youngest and the two oldest age groups
combined_age_group = {
    **dict.fromkeys(['0-10', '10-20'], '0-20'),
    **dict.fromkeys(['80-90', '90-100'], '80-100'),
}
data_copy2['age_group'] = data_copy2['age_group'].replace(combined_age_group)

# Wedge order after merging
age_order = [
    '0-20', '20-30', '30-40', '40-50',
    '50-60', '60-70', '70-80', '80-100',
]

# Restrict to the high-income rows
is_high_income = data_copy2['income'] == 'High income'
income_data = data_copy2[is_high_income]

# Rows per age group, in wedge order; absent groups count as zero
age_group_counts = income_data['age_group'].value_counts()
age_group_counts = age_group_counts.reindex(age_order, fill_value=0)

# One colour per age group
colors = plt.cm.Set3(np.linspace(0, 1, len(age_order)))

# Draw the pie chart (clockwise, starting at 140 degrees)
plt.figure(figsize=(8, 8))
plt.pie(
    age_group_counts,
    labels=age_group_counts.index,
    autopct=lambda p: f'{p:.1f}%',
    startangle=140,
    counterclock=False,
    colors=colors,
    textprops={'fontsize': 18},
)
plt.axis('equal')


# Show the plot
plt.show()


###################################################
# Univariable Logistic Regression - Comorbidities #
###################################################


# Show full tables when results are displayed
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# Positional range (both ends inclusive) of the predictor columns in `data`
start_col = 12
end_col = 33

result_columns = ['Variable', 'Odds Ratio', '95% CI Lower',
                  '95% CI Upper', 'P-Value']

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating onto a DataFrame inside the loop copies it on every pass.
comorb_rows = []

# The outcome is the same for every model, so convert it once
mace_outcome = data['MACE'].astype(int)

# Fit one univariable logistic regression per predictor column
for col in data.columns[start_col:end_col + 1]:
    # Drop missing values BEFORE encoding: get_dummies encodes a missing
    # value as an all-zero row, which would otherwise be counted as the
    # reference level instead of being excluded.
    observed = data[[col]].dropna()

    try:
        X = observed.astype(float)
    except ValueError:
        # Non-numeric column: dummy-encode it and cast to float, as the
        # other regressions in this file do (get_dummies may return
        # boolean columns).
        X = pd.get_dummies(observed, drop_first=True).astype(float)

    # Remove infinite values in X and align the outcome to the kept rows
    X = X.replace([np.inf, -np.inf], np.nan).dropna()
    y = mace_outcome.loc[X.index]

    # Add a constant to the independent variable
    X = sm.add_constant(X, has_constant='add')

    # Perform logistic regression
    model = sm.Logit(y, X)
    comorb_result = model.fit(disp=0)

    # Exponentiate coefficients and interval bounds to get odds ratios
    odds_ratios = np.exp(comorb_result.params)
    conf = np.exp(comorb_result.conf_int())
    p_values = comorb_result.pvalues

    # Record every term except the intercept
    for param in odds_ratios.index:
        if param == 'const':
            continue

        comorb_rows.append({
            'Variable': param,
            'Odds Ratio': odds_ratios[param],
            '95% CI Lower': conf.loc[param, 0],
            '95% CI Upper': conf.loc[param, 1],
            'P-Value': p_values[param],
        })

comorb_results = pd.DataFrame(comorb_rows, columns=result_columns)


# Save results to a CSV file
comorb_results.to_csv('odds_ratio_data.csv', index=False)


#########################################
# Univariable Logistic Regression - Sex #
#########################################


# Outcome: MACE as 0/1
y = data['MACE'].astype(int)

# Predictor: dummy-coded sex (first level dropped as reference) plus an
# intercept column
sex_dummies = pd.get_dummies(data['slider_sex'], drop_first=True)
X = sm.add_constant(sex_dummies.astype(float))

# Fit the logistic regression
model = sm.Logit(y, X)
sex_result = model.fit()

# Odds ratios and their confidence bounds are the exponentiated
# coefficients and interval limits
odds_ratios = np.exp(sex_result.params)
conf = np.exp(sex_result.conf_int())
p_values = sex_result.pvalues

# One row per model term (intercept included)
sex_results = pd.DataFrame({
    'Variable': odds_ratios.index,
    'Odds Ratio': odds_ratios.to_numpy(),
    '95% CI Lower': conf[0].to_numpy(),
    '95% CI Upper': conf[1].to_numpy(),
    'P-Value': p_values.to_numpy(),
})

# Display the results
print(sex_results)


############################################
# Univariable Logistic Regression - Income #
############################################

# Outcome: MACE as 0/1
y = data['MACE'].astype(int)

# Predictor: dummy-coded income group (first level dropped as reference)
# plus an intercept column
income_dummies = pd.get_dummies(data['income'], drop_first=True)
X = sm.add_constant(income_dummies.astype(float))

# Fit the logistic regression
model = sm.Logit(y, X)
income_result = model.fit()

# Odds ratios and their confidence bounds are the exponentiated
# coefficients and interval limits
odds_ratios = np.exp(income_result.params)
conf = np.exp(income_result.conf_int())
p_values = income_result.pvalues

# One row per model term (intercept included)
income_results = pd.DataFrame({
    'Variable': odds_ratios.index,
    'Odds Ratio': odds_ratios.to_numpy(),
    '95% CI Lower': conf[0].to_numpy(),
    '95% CI Upper': conf[1].to_numpy(),
    'P-Value': p_values.to_numpy(),
})

# Display results
print(income_results)


#########################################
# Univariable Logistic Regression - Age #
#########################################


# Rows without an age group are excluded from this model. get_dummies
# encodes a missing value as an all-zero row, which (with drop_first=True)
# is indistinguishable from the dropped reference group, so leaving them in
# would count them as members of that group.
age_group_known = data['age_group'].notna()

# Dummy-code the age group (first level dropped as reference) and take the
# outcome for the same rows
X = pd.get_dummies(data.loc[age_group_known, 'age_group'],
                   drop_first=True).astype(float)
y = data.loc[age_group_known, 'MACE'].astype(int)

# Add constant to independent variable
X = sm.add_constant(X)

# Perform logistic regression
model = sm.Logit(y, X)
age_result = model.fit()

# Get the odds ratio, confidence interval, and p-value
# (exponentiated coefficients and interval limits)
odds_ratios = np.exp(age_result.params)
conf = age_result.conf_int()
conf = np.exp(conf)
p_values = age_result.pvalues

# Create a DataFrame with one row per model term (intercept included)
age_results = pd.DataFrame({
    'Variable': odds_ratios.index,
    'Odds Ratio': odds_ratios.values,
    '95% CI Lower': conf.iloc[:, 0].values,
    '95% CI Upper': conf.iloc[:, 1].values,
    'P-Value': p_values.values
})

# Display the results
print(age_results)



