In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tableone import TableOne
import getpass, re, json, sys
from datetime import datetime, timedelta
import pandas_gbq as pgbq
# Load packages for Big Query 
from google.cloud import bigquery
import os
from sqlalchemy import create_engine


%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tableone import TableOne
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
# Lift pandas' display truncation: None means no limit on the number of
# rows / columns rendered when a DataFrame is shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%matplotlib inline

In [None]:
# Load the cause table from a CSV next to the notebook. Later cells use its
# pat_deid, exposure_group and Cause1..Cause3 columns.
# NOTE(review): presumably GPT-extracted causes of death — confirm provenance.
causes_GPT = pd.read_csv('./medical_data_with_causes.csv')  

In [None]:
# Load the cohort table from a CSV next to the notebook. Later cells read its
# pat_deid, exposure_group and death_from_after_diagnosis_op_exposure columns.
final_cohort = pd.read_csv('./final_cohort.csv')  

In [None]:
# Remove the exposure_group column that came with the causes file; the merge
# in the next cell re-attaches exposure_group from final_cohort.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: running it a second time is a no-op rather than a KeyError.
causes_GPT = causes_GPT.drop(columns='exposure_group', errors='ignore')

In [None]:
# Attach exposure_group from the cohort table; the inner join keeps only
# pat_deid values present in both frames.
# NOTE(review): if final_cohort holds several rows per pat_deid, this join
# repeats each cause row once per cohort row — confirm that is intended.
cohort_exposure = final_cohort.loc[:, ['pat_deid', 'exposure_group']]
causes_final = pd.merge(causes_GPT, cohort_exposure, how='inner', on='pat_deid')

In [None]:
# Unique patients per exposure group among cohort rows whose
# death_from_after_diagnosis_op_exposure value is at most 14.
# NOTE(review): the unit of that column (presumably days) is not visible here — confirm.
final_cohort[final_cohort["death_from_after_diagnosis_op_exposure"]<=14].drop_duplicates(subset = ['pat_deid']).exposure_group.value_counts()

In [None]:
# Unique patients (one row kept per pat_deid) in each exposure group of the merged table.
causes_final.drop_duplicates(subset = ['pat_deid']).exposure_group.value_counts()

In [None]:
# Row count per exposure group in the merged table (rows, not unique patients).
causes_final.exposure_group.value_counts()

In [None]:

# For every (patient, exposure group) pair, gather the Cause1, Cause2 and
# Cause3 values into per-column lists, then join the three lists (in that
# order) into one list stored under 'concatenated_list'.
cause_columns = ['Cause1', 'Cause2', 'Cause3']
per_patient_lists = causes_final.groupby(['pat_deid', 'exposure_group']).agg(
    {column: (lambda values: list(values)) for column in cause_columns}
)
df_agg = per_patient_lists.apply(
    lambda row: row['Cause1'] + row['Cause2'] + row['Cause3'],
    axis=1,
).reset_index(name='concatenated_list')


In [None]:
# Flatten every patient's cause list into one long list, then keep the
# distinct values (order of unique_causes is whatever set iteration yields).
full_list = [
    cause
    for patient_causes in df_agg['concatenated_list']
    for cause in patient_causes
]
unique_causes = list(set(full_list))

In [None]:

# Broad category -> keyword list. map_to_categories() treats each keyword as a
# case-insensitive substring of the cause text and returns every category that
# has at least one hit, so a single cause can fall into several categories.
# NOTE(review): some keywords are listed under two categories and will always
#   double-assign: 'stroke' (Cerebrovascular Events, Neurological Disorders),
#   'sepsis' (Infection, Sepsis), 'pneumonia' (Infection, Respiratory Issues).
# NOTE(review): 'covid19' has no hyphen, while respiratory_issues_mapping uses
#   'covid-19'; a cause spelled "COVID-19" does not contain 'covid19'.
# NOTE(review): short keywords ('uti', 'aki', 'cva', 'tbi', 'ards') can match
#   inside unrelated words under substring matching (e.g. 'uti' in
#   'therapeutic') — confirm against the actual cause strings.
cause_category_mapping = {
    'Cancer': ['cancer', 'leukemia', 'lymphoma', 'metastatic', 'myeloma', 'malignancy', 'carcinoma'],
    'Cardiovascular Diseases': ['heart failure', 'cardiac', 'cardiomyopathy', 'arrhythmia', 'hypertension', 'myocardial', 'angina', 'vascular', 'coronary', 'embolism', 'shock', 'aortic'],
    'Cerebrovascular Events': ['stroke', 'cva', 'hemorrhage', 'ischemic', 'infarction', 'subarachnoid', 'subdural', 'cerebral'],
    'Gastrointestinal': ['gi bleed', 'colitis', 'cholangitis', 'perforation', 'ischemic bowel', 'gastric', 'intestinal', 'pancreatic'],
    'Infection': ['pneumonia', 'sepsis', 'bacteremia', 'uti', 'abscess', 'infection', 'peritonitis'],
    'Kidney Problems': ['renal', 'kidney', 'aki', 'ckd', 'esrd', 'uremia'],
    'Liver Disorders': ['liver', 'cirrhosis', 'hepatic', 'alcoholic'],
    'Neurological Disorders': ['dementia', 'alzheimer', 'parkinson', 'seizure', 'encephalopathy', 'neurological', 'brain', 'stroke'],
    'Respiratory Issues': ['respiratory', 'hypoxia', 'aspiration', 'pulmonary', 'pneumonia', 'copd', 'ards', 'covid19'],
    'Sepsis': ['sepsis', 'septic'],
    'Trauma': ['trauma', 'injury', 'fracture', 'tbi', 'accident'],
    'Other': ['malnutrition', 'anemia', 'electrolyte imbalance', 'acidosis', 'failure to thrive', 'frailty', 'multi-organ failure']
}

In [None]:
# Show the broad category names defined in the mapping.
cause_category_mapping.keys()

In [None]:

# Flatten the dictionary values into a single list
mapped_causes = [cause for causes in cause_category_mapping.values() for cause in causes]

# Normalised keywords, prepared once for the comparison below
keywords_lower = [keyword.strip().lower() for keyword in mapped_causes if isinstance(keyword, str)]

# A cause is "covered" when at least one keyword occurs inside it, compared
# case-insensitively. This is the same substring rule the categorisation step
# (map_to_categories) applies; the previous exact-membership test flagged
# almost every free-text cause as missing even when it would be categorised.
# Non-string entries (e.g. NaN from empty Cause2/Cause3 cells) are skipped
# because they are not causes.
missing_causes = [
    cause
    for cause in unique_causes
    if isinstance(cause, str)
    and not any(keyword in cause.lower() for keyword in keywords_lower)
]

# Check if there are any missing causes
if missing_causes:
    print("The following causes are missing from the dictionary:", missing_causes)
else:
    print("All causes are included in the dictionary.")

In [None]:
# Respiratory sub-category -> phrase list, used with map_to_categories() to
# split respiratory causes into finer groups.
# NOTE(review): several phrases contain uppercase letters ('ARDS', 'COPD',
#   'segmental PE', "Wegener's granulomatosis"); they only match when the
#   matcher lowercases keywords, as map_to_categories() does.
# NOTE(review): under substring matching, longer phrases are redundant when a
#   shorter one is already listed in the same group (e.g. 'pneumonia' already
#   matches 'aspiration pneumonia'), and 'pneumonia' / 'aspiration' will
#   assign one cause to more than one sub-category.
respiratory_issues_mapping = {
    'Respiratory Failure Conditions': [
        'respiratory failure', 'hypoxic respiratory failure', 'hypercarbic respiratory failure', 
        'acute respiratory failure', 'chronic respiratory failure', 'hypoxemia', 'hypoxia', 
        'acute pulmonary edema'
    ],
    'COVID-19 Conditions': [
        'covid-19', 'covid pneumonia', 'covid-19 pneumonia', 'pneumonia due to covid-19'
    ],
    'Pneumonia-Related Conditions': [
        'pneumonia', 'aspiration pneumonia', 'bacterial pneumonia', 'viral pneumonia', 
        'community acquired pneumonia', 'multifocal pneumonia'
    ],
    'Aspiration and Airway Obstruction Conditions': [
        'aspiration', 'aspiration pneumonitis', 'airway obstruction', 'laryngeal stenosis', 
        'difficulty clearing secretions'
    ],
    'Pulmonary Embolism and Hypertension': [
        'pulmonary embolism', 'pulmonary hypertension', 'acute pulmonary embolism', 
        'segmental PE'
    ],
    'Respiratory Distress and ARDS': [
        'respiratory distress', 'acute respiratory distress syndrome', 'ARDS', 
        'diffuse pulmonary alveolar hemorrhage'
    ],
    'Other Respiratory Issues': [
        'COPD', 'idiopathic pulmonary fibrosis', 'pulmonary edema', 'pneumothorax', 
        "Wegener's granulomatosis"
    ]
}

In [None]:

# Gather every phrase listed under any respiratory sub-category
mapped_causes = []
for phrases in respiratory_issues_mapping.values():
    mapped_causes.extend(phrases)

# Keywords of the broad 'Respiratory Issues' category that are not, verbatim,
# one of those phrases.
# NOTE(review): this is an exact, case-sensitive comparison, so e.g. 'copd'
# is reported as missing although 'COPD' is listed — confirm that is the
# intended check.
missing_causes = [
    keyword
    for keyword in cause_category_mapping['Respiratory Issues']
    if keyword not in mapped_causes
]

# Report the outcome
if not missing_causes:
    print("All causes are included in the dictionary.")
else:
    print("The following causes are missing from the dictionary:", missing_causes)

In [None]:
import pandas as pd
import ast

# Turn one 'concatenated_list' cell into something explode() can work with
def parse_causes(cause_str):
    """Return the causes held in ``cause_str``.

    * list / Series / ndarray input is handed back untouched;
    * missing values and anything that is not a string give ``[]``;
    * a string is read as a Python literal; whatever literal it encodes is
      returned, and an unparsable string gives ``[]``.
    """
    already_sequence = (list, pd.Series, np.ndarray)
    if isinstance(cause_str, already_sequence):
        return cause_str

    # Missing value, or a type we do not know how to interpret
    if pd.isnull(cause_str) or not isinstance(cause_str, str):
        return []

    try:
        parsed = ast.literal_eval(cause_str)
    except (ValueError, SyntaxError):
        # Not a valid Python literal
        return []
    return parsed

# Normalise each patient's cause list and give every cause its own row
df_agg['causes_of_death'] = df_agg['concatenated_list'].apply(parse_causes)
exploded_data = df_agg.explode('causes_of_death')

# Keep rows whose cause is present and is a non-empty list or string
cause_present = exploded_data['causes_of_death'].notna()
cause_non_empty = exploded_data['causes_of_death'].apply(
    lambda x: isinstance(x, (list, str)) and len(x) > 0
)
exploded_data = exploded_data[cause_present & cause_non_empty]

In [None]:
# One cause per row, taken from the exploded table
parsed_data_list = exploded_data['causes_of_death']

# Split the causes by whether the exact cause text is itself one of the
# keywords in cause_category_mapping.
# NOTE(review): this is exact membership, whereas map_to_categories() matches
# keywords as case-insensitive substrings, so "not in dictionary" here does
# not mean the cause ends up uncategorised.
keyword_lists = list(cause_category_mapping.values())
in_dictionary = []
not_in_dictionary = []

for parsed_data in parsed_data_list:
    found = False
    for value_list in keyword_lists:
        if parsed_data in value_list:
            found = True
            break
    target = in_dictionary if found else not_in_dictionary
    target.append(parsed_data)

# Show both groups
print("Values that belong to the dictionary:")
print(in_dictionary)

print("\nValues that do not belong to the dictionary:")
print(not_in_dictionary)



In [None]:
# Keyword-based categoriser shared by the broad and the respiratory mappings
def map_to_categories(cause, mapping):
    """Return every category of ``mapping`` whose keywords occur in ``cause``.

    ``mapping`` maps a category name to an iterable of keywords. A keyword
    hits when, after stripping and lowercasing both sides, it is a substring
    of ``cause``; non-string keywords are ignored. Categories are returned as
    a list in ``mapping`` order.

    The string ``'not matching'`` is returned instead when ``cause`` is
    missing or not a string, when nothing hits, or when any exception occurs
    (the exception is printed, not raised).
    """
    try:
        if pd.isna(cause) or not isinstance(cause, str):
            return 'not matching'

        haystack = cause.strip().lower()
        hits = [
            label
            for label, terms in mapping.items()
            if any(
                isinstance(term, str) and term.strip().lower() in haystack
                for term in terms
            )
        ]
    except Exception as e:
        # Best effort: report the problem and treat the cause as unmatched
        print(f"Error processing cause: {cause}, error: {e}")
        return 'not matching'

    return hits if hits else 'not matching'


# Tag every cause with its broad categories. exploded_data was produced by a
# boolean filter, so adding a column with df[...] = ... raises
# SettingWithCopyWarning; assign() returns a fresh frame instead.
exploded_data = exploded_data.assign(
    category=exploded_data['causes_of_death'].apply(lambda x: map_to_categories(x, cause_category_mapping))
)

# One row per (cause, broad category); 'not matching' stays as a single row
exploded_data2 = exploded_data.explode('category')

# Tag every cause with its respiratory sub-categories in a separate column
# ('respiratory_category'); added after exploded_data2 is built, so that
# frame does not carry it.
exploded_data = exploded_data.assign(
    respiratory_category=exploded_data['causes_of_death'].apply(lambda x: map_to_categories(x, respiratory_issues_mapping))
)

# One row per (cause, respiratory sub-category)
respiratory_exploded_data = exploded_data.explode('respiratory_category')

# Drop exact repeats of the same patient / group / cause / category combination
exploded_data2= exploded_data2.drop_duplicates(subset = ['pat_deid','exposure_group','causes_of_death','category'])
respiratory_exploded_data =  respiratory_exploded_data.drop_duplicates(subset = ['pat_deid','exposure_group','causes_of_death','respiratory_category'])

In [None]:
# exploded_data2 has one row per (patient, cause, category), so a cause that
# maps to several categories appears several times for the same patient.
# Collapse to one row per patient and cause before counting; otherwise those
# causes are over-counted in the ranking.
causes_per_patient = exploded_data2.drop_duplicates(subset=['pat_deid', 'exposure_group', 'causes_of_death'])
grouped_data = causes_per_patient.groupby(['exposure_group', 'causes_of_death']).size().reset_index(name='count')

# Sort by 'exposure_group' and 'count' in descending order
grouped_data = grouped_data.sort_values(by=['exposure_group', 'count'], ascending=[True, False])

# Get the top 10 causes of death for each exposure group
top_10_causes = grouped_data.groupby('exposure_group').head(10)
top_10_causes

In [None]:
import pandas as pd

# Denominators used for the prevalence rates.
# NOTE(review): 62 and 179 are hard-coded; presumably the number of patients
# per exposure group from an earlier value_counts() — confirm and consider
# deriving them from the data.
consistent_user_sample_size = 62
new_user_sample_size = 179
sample_size_by_group = {
    'consistent user': consistent_user_sample_size,
    'new user': new_user_sample_size,
}

# One row per distinct (patient, exposure group, category) combination
exploded_data_category = exploded_data2.loc[:, ['pat_deid', 'exposure_group', 'category']].drop_duplicates()

# Number of such rows for every (exposure group, category) pair
category_counts = exploded_data_category.groupby(['exposure_group', 'category']).size()
grouped_category_data = category_counts.reset_index(name='count')

# Attach the group's denominator (NaN for any group not listed above)
grouped_category_data['sample_size'] = grouped_category_data['exposure_group'].map(sample_size_by_group)




In [None]:
def perform_chi_square(row):
    """Chi-square test of independence for one row of prevalence data.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'prevalence_new_user', 'total_new_user',
        'prevalence_consistent_user' and 'total_consistent_user'.
        Prevalences are fractions of the corresponding total.

    Returns
    -------
    pandas.Series
        'chi2_statistic' and 'p_value' from scipy's chi2_contingency applied
        to the 2x2 table [[new cases, consistent cases],
        [new non-cases, consistent non-cases]].

    Raises
    ------
    ValueError
        Propagated from chi2_contingency, e.g. when an expected frequency
        is zero.
    """
    # Imported here because the notebook never imports it at the top;
    # without this the function fails with NameError on first call.
    from scipy.stats import chi2_contingency

    # Recover integer case counts. round() rather than int(): the product of
    # a float prevalence and a total is often just below the true integer
    # (0.29 * 100 == 28.999999999999996), and truncation would drop a case.
    new_user_cases = int(round(row['prevalence_new_user'] * row['total_new_user']))
    consistent_user_cases = int(round(row['prevalence_consistent_user'] * row['total_consistent_user']))
    new_user_non_cases = row['total_new_user'] - new_user_cases
    consistent_user_non_cases = row['total_consistent_user'] - consistent_user_cases

    # Rows: cases / non-cases; columns: new users / consistent users
    contingency_table = [
        [new_user_cases, consistent_user_cases],
        [new_user_non_cases, consistent_user_non_cases]
    ]

    chi2, p, _, _ = chi2_contingency(contingency_table)
    return pd.Series({'chi2_statistic': chi2, 'p_value': p})


In [None]:
# Complement of the case count within each group: the group's sample size
# minus the number of rows counted for the category. Displayed for inspection.
grouped_category_data['non_cases'] = grouped_category_data['sample_size'] - grouped_category_data['count']
grouped_category_data

In [None]:

# Prevalence rate (fraction of the group's sample size), rounded for display
grouped_category_data['prevalence_rate'] = np.round(grouped_category_data['count'] / grouped_category_data['sample_size'],2)

# Pivot the UNROUNDED rates so the difference below is computed from exact
# values. Subtracting already-rounded rates can be off by up to 0.01 and
# leaves float artefacts such as 0.06999999999999999.
unrounded = grouped_category_data.assign(
    raw_rate=grouped_category_data['count'] / grouped_category_data['sample_size']
)
pivot_table = unrounded.pivot(index='category', columns='exposure_group', values='raw_rate').reset_index()

# Replace NaN values with 0 (in case any category is missing in an exposure group)
pivot_table = pivot_table.fillna(0)

# Rate difference between 'new user' and 'consistent user', then round
# everything to the 2 decimals shown in the table and the plot
pivot_table['rate_difference'] = (pivot_table['new user'] - pivot_table['consistent user']).round(2)
pivot_table[['new user', 'consistent user']] = pivot_table[['new user', 'consistent user']].round(2)

# Label from the rounded difference so the label always agrees with the number shown
pivot_table['category_label'] = pivot_table['rate_difference'].apply(lambda x: 'Higher in New Users' if x > 0 else 'Higher in Consistent Users' if x < 0 else 'No Difference')

# Sort the table by the absolute value of 'rate_difference' in descending order
pivot_table_sorted = pivot_table.sort_values(by='rate_difference', key=abs, ascending=False)

# Drop the placeholder category; copy() so later cells can add or overwrite
# columns without SettingWithCopyWarning
pivot_table_sorted = pivot_table_sorted[pivot_table_sorted.category != 'not matching'].copy()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Set font to Helvetica globally
plt.rcParams['font.family'] = 'Helvetica'

# Plot from a private copy. pivot_table_sorted is the result of a boolean
# filter, so aliasing it and writing a column raised SettingWithCopyWarning
# and altered the table shown earlier. Its existing 'rate_difference' column
# (new user rate minus consistent user rate) is reused, not recomputed.
df = pivot_table_sorted.copy()

# Three bars per category, each bar_width wide
bar_width = 0.3
index = np.arange(len(df['category']))

fig, ax = plt.subplots(figsize=(12, 8))

# Prevalence rates of new users and consistent users
bar1 = ax.bar(index, df['new user'], bar_width, label='New Users', color='#FF6347')  # Tomato color
bar2 = ax.bar(index + bar_width, df['consistent user'], bar_width, label='Consistent Users', color='#4682B4')  # SteelBlue color

# Rate difference; negative values mean the consistent-user rate is higher
bar3 = ax.bar(index + 2 * bar_width, df['rate_difference'], bar_width, label='Rate Difference', color='gray')

# Labels and title
ax.set_xlabel('Health Condition', fontsize=18)
ax.set_ylabel('Prevalence Rate', fontsize=18)
ax.set_title('Prevalence Rates and Rate Difference Between New and Consistent Users', fontsize=20)

# Centre each tick under the middle bar of its group; rotate labels so they do not overlap
ax.set_xticks(index + bar_width)
ax.set_xticklabels(df['category'], fontsize=20, rotation=20, ha='right')
ax.tick_params(axis='y', labelsize=20)

# Leave 10% headroom, and extend below zero when a rate difference is negative
ax.set_ylim(min(df['rate_difference'].min(), 0) * 1.1, max(df['new user'].max(), df['consistent user'].max()) * 1.1)

ax.legend(loc='best', fontsize=20, frameon=False)

# Save the figure as vector images (SVG and PDF)
#plt.savefig("prevalence_rate_difference_helvetica_negative.svg", format="svg", bbox_inches="tight")
#plt.savefig("../figures/prevalence_rate_difference_helvetica_negative.pdf", format="pdf", bbox_inches="tight")

# Display the plot
fig.tight_layout()
plt.show()






In [None]:
# Collapse repeated (patient, exposure group, cause, respiratory category) rows
dedup_keys = ['pat_deid', 'exposure_group', 'causes_of_death', 'respiratory_category']
respiratory_exploded_data = respiratory_exploded_data.drop_duplicates(subset=dedup_keys)

import pandas as pd

# Denominators used for the prevalence rates.
# NOTE(review): 62 and 179 are hard-coded; presumably the number of patients
# per exposure group — confirm and consider deriving them from the data.
consistent_user_sample_size = 62
new_user_sample_size = 179
sample_size_by_group = {
    'consistent user': consistent_user_sample_size,
    'new user': new_user_sample_size,
}

# One row per distinct (patient, exposure group, respiratory category) combination
exploded_data_category = respiratory_exploded_data.loc[
    :, ['pat_deid', 'exposure_group', 'respiratory_category']
].drop_duplicates()

# Number of such rows for every (exposure group, respiratory category) pair
respiratory_counts = exploded_data_category.groupby(['exposure_group', 'respiratory_category']).size()
grouped_category_data = respiratory_counts.reset_index(name='count')

# Attach the group's denominator (NaN for any group not listed above)
grouped_category_data['sample_size'] = grouped_category_data['exposure_group'].map(sample_size_by_group)


In [None]:

# Add the prevalence rate unless the column already exists.
# The rate is count / sample_size rounded to 2 decimals, i.e. a fraction
# between 0 and 1, not a percentage.
rate_already_present = 'prevalence_rate' in grouped_category_data.columns
if not rate_already_present:
    fraction = grouped_category_data['count'] / grouped_category_data['sample_size']
    grouped_category_data['prevalence_rate'] = np.round(fraction, 2)

# Remainder of each group's sample size after removing the counted rows
grouped_category_data['non_cases'] = grouped_category_data['sample_size'].sub(grouped_category_data['count'])
grouped_category_data

In [None]:

# Pivot the UNROUNDED rates (count / sample_size) so the difference below is
# computed from exact values. Subtracting already-rounded rates can be off by
# up to 0.01 and leaves float artefacts such as 0.06999999999999999.
unrounded = grouped_category_data.assign(
    raw_rate=grouped_category_data['count'] / grouped_category_data['sample_size']
)
pivot_table = unrounded.pivot(index='respiratory_category', columns='exposure_group', values='raw_rate').reset_index()

# Replace NaN values with 0 (in case any category is missing in an exposure group)
pivot_table = pivot_table.fillna(0)

# Calculate rate difference between 'new user' and 'consistent user'
pivot_table['rate_difference'] = pivot_table['new user'] - pivot_table['consistent user']

# Round the prevalence rates and the rate difference to 2 decimal places.
# The previous statement here assigned the columns to themselves and rounded nothing.
rate_columns = ['new user', 'consistent user', 'rate_difference']
pivot_table[rate_columns] = pivot_table[rate_columns].round(2)

# Label from the rounded difference so the label always agrees with the number shown
pivot_table['category_label'] = pivot_table['rate_difference'].apply(
    lambda x: 'Higher in New Users' if x > 0 else 'Higher in Consistent Users' if x < 0 else 'No Difference'
)

# Sort the table by the absolute value of 'rate_difference' in descending order
pivot_table_sorted = pivot_table.sort_values(by='rate_difference', key=abs, ascending=False)

# Drop rows where respiratory_category is 'not matching'; copy() so later
# cells can add or overwrite columns without SettingWithCopyWarning
pivot_table_sorted = pivot_table_sorted[pivot_table_sorted['respiratory_category'] != 'not matching'].copy()

# Display the result
pivot_table_sorted



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Set font to Helvetica globally
plt.rcParams['font.family'] = 'Helvetica'

# Plot from a private copy. pivot_table_sorted is the result of a boolean
# filter, so aliasing it and writing a column raised SettingWithCopyWarning
# and altered the table shown earlier. Its existing 'rate_difference' column
# (new user rate minus consistent user rate) is reused, not recomputed.
df = pivot_table_sorted.copy()

# Three bars per category, each bar_width wide
bar_width = 0.3
index = np.arange(len(df['respiratory_category']))

fig, ax = plt.subplots(figsize=(12, 8))

# Prevalence rates of new users and consistent users
bar1 = ax.bar(index, df['new user'], bar_width, label='New Users', color='#FF6347')  # Tomato color
bar2 = ax.bar(index + bar_width, df['consistent user'], bar_width, label='Consistent Users', color='#4682B4')  # SteelBlue color

# Rate difference; negative values mean the consistent-user rate is higher
bar3 = ax.bar(index + 2 * bar_width, df['rate_difference'], bar_width, label='Rate Difference', color='gray')

# Labels and title
ax.set_xlabel('Health Condition', fontsize=18)
ax.set_ylabel('Prevalence Rate', fontsize=18)
ax.set_title('Respiratory Condition Prevalence Rates and Rate Difference Between New and Consistent Users', fontsize=20)

# Centre each tick under the middle bar of its group; rotate labels so they do not overlap
ax.set_xticks(index + bar_width)
ax.set_xticklabels(df['respiratory_category'], fontsize=20, rotation=15, ha='right')
ax.tick_params(axis='y', labelsize=20)

# Leave 10% headroom, and extend below zero when a rate difference is negative
ax.set_ylim(min(df['rate_difference'].min(), 0) * 1.1, max(df['new user'].max(), df['consistent user'].max()) * 1.1)

ax.legend(loc='best', fontsize=20, frameon=False)

# Finalise the layout BEFORE saving so the file matches what is displayed
fig.tight_layout()

# Save the figure as vector images (SVG and PDF)
#plt.savefig("prevalence_rate_difference_helvetica_negative.svg", format="svg", bbox_inches="tight")
output_path = "../figures/respiratory_prevalence_rate_difference_helvetica_negative.pdf"
# Create the target folder if needed; savefig fails with FileNotFoundError otherwise
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path, format="pdf", bbox_inches="tight")

# Display the plot
plt.show()

In [None]:
import pandas as pd
import ast

# Load the data from a CSV in the working directory. The rest of this cell
# reads its 'concatenated_list' and 'exposure_group' columns.
# NOTE(review): presumably a saved copy of df_agg without the patient id
# column (one row per patient) — confirm.
file_path = 'causes_death_no_deid.csv'
data = pd.read_csv(file_path)

# Function to safely parse the causes of death
def parse_causes(cause_str):
    """Return the causes held in ``cause_str``.

    This definition replaces the earlier ``parse_causes`` in the notebook
    namespace, so it accepts the same inputs as that one:

    * list / Series / ndarray input is returned unchanged (previously
      ``pd.isnull`` on a list produced an array whose truth value is
      ambiguous, raising ValueError);
    * missing values and non-string scalars give ``[]``;
    * a string is parsed as a Python literal; an unparsable string gives ``[]``.
    """
    if isinstance(cause_str, (list, pd.Series, np.ndarray)):
        return cause_str
    if pd.isnull(cause_str) or not isinstance(cause_str, str):
        return []
    try:
        return ast.literal_eval(cause_str)
    except (ValueError, SyntaxError):
        # Not a valid Python literal
        return []

# Convert each stored list string into a real list
parsed_causes = data['concatenated_list'].apply(parse_causes)
data['causes_of_death'] = parsed_causes

# One row per individual cause
exploded_data = data.explode('causes_of_death')

# Discard rows whose cause is missing or an empty string
cause_values = exploded_data['causes_of_death']
keep_row = cause_values.notna() & cause_values.ne('')
exploded_data = exploded_data[keep_row]

# Define a more granular cause mapping for respiratory issues
# (respiratory sub-category -> phrase list).
# NOTE(review): this repeats the respiratory_issues_mapping defined earlier in
#   the notebook with the same content.
# NOTE(review): several phrases contain uppercase letters ('ARDS', 'COPD',
#   'segmental PE', "Wegener's granulomatosis"); a matcher that lowercases
#   only the cause text will never match them.
respiratory_issues_mapping = {
    'Respiratory Failure Conditions': [
        'respiratory failure', 'hypoxic respiratory failure', 'hypercarbic respiratory failure', 
        'acute respiratory failure', 'chronic respiratory failure', 'hypoxemia', 'hypoxia', 
        'acute pulmonary edema'
    ],
    'COVID-19 Conditions': [
        'covid-19', 'covid pneumonia', 'covid-19 pneumonia', 'pneumonia due to covid-19'
    ],
    'Pneumonia-Related Conditions': [
        'pneumonia', 'aspiration pneumonia', 'bacterial pneumonia', 'viral pneumonia', 
        'community acquired pneumonia', 'multifocal pneumonia'
    ],
    'Aspiration and Airway Obstruction Conditions': [
        'aspiration', 'aspiration pneumonitis', 'airway obstruction', 'laryngeal stenosis', 
        'difficulty clearing secretions'
    ],
    'Pulmonary Embolism and Hypertension': [
        'pulmonary embolism', 'pulmonary hypertension', 'acute pulmonary embolism', 
        'segmental PE'
    ],
    'Respiratory Distress and ARDS': [
        'respiratory distress', 'acute respiratory distress syndrome', 'ARDS', 
        'diffuse pulmonary alveolar hemorrhage'
    ],
    'Other Respiratory Issues': [
        'COPD', 'idiopathic pulmonary fibrosis', 'pulmonary edema', 'pneumothorax', 
        "Wegener's granulomatosis"
    ]
}

# Function to categorize respiratory related causes
def map_respiratory_causes(cause, mapping=None):
    """Return the first respiratory sub-category whose keywords occur in ``cause``.

    Parameters
    ----------
    cause : str or missing value
        Free-text cause of death.
    mapping : dict, optional
        Category name -> iterable of keywords. Defaults to the notebook's
        ``respiratory_issues_mapping`` (the original body referred to an
        undefined ``respiratory_cause_mapping`` and raised NameError).

    Returns
    -------
    str
        The first category, in mapping order, with a keyword that is a
        case-insensitive substring of ``cause``; 'Other' when nothing
        matches; 'Unknown' when ``cause`` is missing or not a string.
    """
    # Checking the type first also covers None/NaN and avoids calling
    # .lower() on non-text values
    if not isinstance(cause, str):
        return 'Unknown'
    if mapping is None:
        mapping = respiratory_issues_mapping

    cause_lower = cause.lower()
    for category, keywords in mapping.items():
        # Lowercase the keyword too: entries such as 'ARDS' or 'COPD' could
        # never be found inside the lowercased cause otherwise
        if any(keyword.lower() in cause_lower for keyword in keywords):
            return category
    return 'Other'

# Map causes to granular respiratory categories. exploded_data is a filtered
# slice, so assign() is used instead of df[...] = ... to avoid
# SettingWithCopyWarning.
exploded_data = exploded_data.assign(
    respiratory_category=exploded_data['causes_of_death'].apply(map_respiratory_causes)
)

# Count each source record at most once per category. explode() keeps the
# original row label of `data`, so the index identifies the record a cause
# came from.
# NOTE(review): assumes one row of `data` per patient — confirm.
repeated_mention = exploded_data.assign(_record=exploded_data.index).duplicated(
    subset=['_record', 'respiratory_category']
)
unique_mentions = exploded_data[~repeated_mention]

# Number of records per (exposure group, respiratory category)
respiratory_counts_by_group = unique_mentions.groupby(['exposure_group', 'respiratory_category']).size().reset_index(name='count')

# Total records per exposure group, taken from `data` (before exploding).
# Counting rows of the exploded frame gave the number of cause mentions,
# which was then reported as "Total Patients".
exposure_totals = data['exposure_group'].value_counts().reset_index()
exposure_totals.columns = ['exposure_group', 'total']

# Merge counts with total patient counts to calculate prevalence rates
merged_counts = pd.merge(respiratory_counts_by_group, exposure_totals, on='exposure_group')
merged_counts['prevalence_rate'] = merged_counts['count'] / merged_counts['total']

# Create a pivot table for prevalence summary (0 where a category is absent from a group)
prevalence_summary = merged_counts.pivot(index='respiratory_category', columns='exposure_group', values='prevalence_rate').fillna(0)

# Calculate the difference in prevalence rates
prevalence_summary['rate_difference'] = prevalence_summary['new user'] - prevalence_summary['consistent user']

# Sort by absolute difference in prevalence rates
prevalence_comparison = prevalence_summary.sort_values(by='rate_difference', key=abs, ascending=False)

# Display prevalence comparison
print(prevalence_comparison[['new user', 'consistent user', 'rate_difference']])

# Display total patients in each group
print("\nTotal Patients in Each Group:")
print(exposure_totals)