In [120]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime
import math
from IPython.display import display, HTML
from datetime import date
import os

In [121]:
# Workspace settings are read from environment variables; each value is None
# when the corresponding variable is not set.
my_bucket = os.environ.get('WORKSPACE_BUCKET')
CDR_version = os.environ.get("WORKSPACE_CDR")

In [122]:
# Fetch every distinct (person, question, answer) survey row for two question
# concepts from the ds_survey table of the configured CDR dataset.
# NOTE(review): 1585386 and 43528428 are presumably the health-insurance survey
# questions -- confirm against the CDR data dictionary.
# The statement is assembled line by line; the empty first and last entries
# give it a leading and a trailing newline.
query = "\n".join([
    "",
    "SELECT DISTINCT person_id, question, question_concept_id, answer",
    "FROM `" + str(CDR_version) + ".ds_survey` ",
    "WHERE question_concept_id in (1585386, 43528428)",
    "ORDER BY person_id",
    "",
])
insurance = pd.read_gbq(query, dialect="standard")

The classes include “Public”, “Private” or “Self-pay”. <br> 
Public class includes ‘Medicare’, ‘Medicare Advantage’, ‘Medicare Assigned’, ‘Medi-Cal’, ‘Medicaid’, and ‘Medi-Cal Assigned’. <br>
Private class includes ‘International Payor’, ‘Group Health Plan’, ‘Worker’s Comp’, ‘Tricare’, ‘UCLA Managed Care’, ‘Blue Shield’, ‘Commercial’, ‘Blue Cross’, ‘Package Billing’ and ‘Other’.

In [124]:
def extract_value(answer):
    """Return the second ': '-separated segment of ``answer``.

    Parameters
    ----------
    answer : str
        Raw answer text.

    Returns
    -------
    str
        The text between the first and the second ': ' separator (or up to
        the end of the string when there is only one separator). When the
        string contains no ': ' at all, ``answer`` is returned unchanged.
    """
    segments = answer.split(': ')
    if len(segments) < 2:
        # No separator present: nothing to strip
        return answer
    return segments[1]

# Reduce every answer to the segment that follows its first ': ' separator;
# answers without a separator are left as they are
insurance['answer'] = insurance['answer'].map(extract_value)

def insurance_class(answer):
    """Map one insurance answer onto a coarse coverage class.

    Parameters
    ----------
    answer : str
        A single answer label.

    Returns
    -------
    str or None
        'Public', 'Private', 'Self-pay', 'No' or 'Unknown' according to the
        lookup table below; ``None`` when the answer is not listed in any
        group.
    """
    # (class label, answers that belong to it), checked in this order
    class_table = (
        ('Public', ('Medicare', 'Medicaid')),
        ('Private', ('Employer Or Union', 'VA', 'Other Health Plan', 'Military', 'Indian')),
        ('Self-pay', ('Purchased',)),
        ('No', ('No',)),
        ('Unknown', ('Skip', 'Prefer Not To Answer', 'Dont Know', 'None', 'Invalid')),
    )
    for label, members in class_table:
        if answer in members:
            return label
    # Answer not listed in any group
    return None

# First pass: classify every answer with insurance_class (unlisted answers
# come back as None)
insurance['case'] = insurance['answer'].map(insurance_class)
# Second pass: rows whose answer contains 'Yes' are labelled 'Yes', replacing
# whatever the first pass assigned
answered_yes = insurance['answer'].str.contains('Yes')
insurance.loc[answered_yes, 'case'] = 'Yes'

In [125]:
# One row per participant: that person's 'case' labels concatenated with
# commas (no space after the comma)
merged_insurance = (
    insurance
    .groupby(['person_id'])['case']
    .apply(','.join)
    .reset_index()
)

In [126]:
# Function to strip the generic 'Yes' marker from a participant's combined case labels
import re
def extract_value(response):
    """Strip the generic 'Yes' marker from a comma-separated response.

    Parameters
    ----------
    response : str
        Comma-separated labels.

    Returns
    -------
    str
        * 'Yes' when the response, ignoring surrounding whitespace, is
          exactly 'Yes'.
        * Otherwise every remaining non-empty label (whitespace-trimmed)
          joined with ', ' after each standalone 'Yes' has been removed.
        * The untouched ``response`` when nothing is left after the removal.
    """
    if response.strip() == 'Yes':
        return 'Yes'

    # Delete each whole-word 'Yes' along with an optional comma and space after it
    without_yes = re.sub(r'\bYes\b,? ?', '', response)

    # Collect what is left, trimmed, skipping empty fragments
    kept = []
    for fragment in without_yes.split(','):
        fragment = fragment.strip()
        if fragment:
            kept.append(fragment)

    return ', '.join(kept) if kept else response

In [127]:
# Function to collapse repeated values in a ', '-separated response
def change_duplicates(response):
    """Remove repeated values from a ', '-separated response.

    The first occurrence of every value is kept and the original left-to-right
    order is preserved, so the same input always yields the same output
    string. (Building the result from a ``set`` would return the values in an
    arbitrary order that can differ from one interpreter session to the next.)

    Parameters
    ----------
    response : str
        Values separated by ', ' (comma followed by one space).

    Returns
    -------
    str
        The distinct values, in order of first appearance, joined with ', '.
    """
    # Split the response on the ', ' separator
    values = response.split(', ')

    # dict keys are unique and keep insertion order -> stable de-duplication
    unique_values = list(dict.fromkeys(values))

    # Join the unique values back together with the same separator
    return ', '.join(unique_values)

In [128]:
def more_than_one_option(response):
    """Collapse any response that is not a single known class into 'Yes'.

    Parameters
    ----------
    response : str
        A class label, or a combination of labels.

    Returns
    -------
    str
        ``response`` itself when it is exactly one of 'Private', 'Public',
        'No', 'Yes', 'Self-pay' or 'Unknown'; otherwise 'Yes'.
    """
    single_classes = ('Private', 'Public', 'No', 'Yes', 'Self-pay', 'Unknown')
    return response if response in single_classes else 'Yes'

In [129]:
# Remove the generic 'Yes' marker from each participant's combined labels.
# This relies on the extract_value(response) definition from the cell above,
# which replaces the earlier answer-parsing helper of the same name.
merged_insurance['case_adjusted'] = merged_insurance['case'].map(extract_value)

In [130]:
# Collapse repeated labels within each participant's ', '-separated string
merged_insurance['case_adjusted'] = merged_insurance['case_adjusted'].map(change_duplicates)

In [131]:
# Anything that is still not a single known class is recorded as 'Yes'
merged_insurance['case_adjusted'] = merged_insurance['case_adjusted'].map(more_than_one_option)

In [132]:
# Participants per final class. 'Yes' covers participants whose labels were
# only 'Yes' as well as everyone whose labels did not reduce to a single known
# class (for example, more than one distinct option selected).
merged_insurance['case_adjusted'].value_counts()

case_adjusted
Private     142452
Public      126725
Yes          88393
No           26779
Self-pay     14553
Unknown      12668
Name: count, dtype: int64

In [135]:
# Keep only the participant id and the final class for the modelling table
insurance_forModel = merged_insurance.loc[:, ['person_id', 'case_adjusted']]

In [137]:
# Write the table to the notebook's working directory. The DataFrame index is
# written too (to_csv default), so the file starts with an unnamed index column.
insurance_forModel.to_csv(path_or_buf="./insurance_class_allofUs.csv")

In [138]:
# Copy the local CSV to the destination built from my_bucket, which IPython
# interpolates into the shell command from the Python variable.
# NOTE(review): my_bucket comes from the WORKSPACE_BUCKET environment variable and is
# None when that variable is unset -- confirm it is set before running this cell.
!gsutil cp ./insurance_class_allofUs.csv {my_bucket}/insurance_class_allofUs.csv

Copying file://./insurance_class_allofUs.csv [Content-Type=text/csv]...
\ [1 files][  8.4 MiB/  8.4 MiB]                                                
Operation completed over 1 objects/8.4 MiB.                                      
