In [21]:
import sys
sys.path.append('../src/')

import data_io
import pandas as pd
import numpy as np
import re

In [22]:
# Load the lookup table of unique campaign locations with their county / FIPS
# codes. The combined state+county FIPS column is read as text.
unique_locations = pd.read_csv(
    data_io.input_cleaned/'geolocations'/'unique_locations_w_fips.csv',
    dtype={'state_county_fips_str':'str'},
    encoding='utf-8',
)
print(len(unique_locations))
unique_locations.head()

14254


Unnamed: 0,location,county_name,state,state_fips,county_fips,state_county_fips_str
0,"el dorado, ks",Butler,KS,20,15,20015
1,"new rochelle, ny",Westchester,NY,36,119,36119
2,"columbia, pa",Lancaster,PA,42,71,42071
3,"forest hill, md",Harford,MD,24,25,24025
4,"mount dora, fl",Lake,FL,12,69,12069


In [23]:
# Campaign table: pipe-delimited file whose first column is used as the index.
df = pd.read_csv(
    data_io.input_cleaned/'gfm'/'US_cancer_campaigns_2018_2021.csv',
    sep='|',
    index_col=[0],
    encoding='utf-8',
)

In [None]:
# Exclusion tracker from the previous round; its first column is the index.
exclusion_df = pd.read_csv(
    data_io.input_cleaned/'gfm'/'exclusion_tracker_rd_2.csv',
    index_col=0,
)

In [24]:
#drop locations that didn't geocode
# The literal strings 'nan' and 'none' are turned into real missing values
# first, so rows without a county name can be dropped.
unique_locations = (
    unique_locations
    .replace('nan', np.nan)
    .replace('none', np.nan)
    .dropna(subset=['county_name'])
)

# One {location -> value} lookup per geocode field, all keyed on the same
# location strings.
location_keys = unique_locations['location'].to_list()
county_dict, fips_dict, long_fips_dict = (
    dict(zip(location_keys, unique_locations[field].to_list()))
    for field in ('county_name', 'county_fips', 'state_county_fips_str')
)

# Campaign city strings are lower-cased and stripped before the lookup;
# locations absent from the lookup map to NaN.
cleaned_location_city = df['location_city'].str.lower().str.strip()
df['location_county'] = cleaned_location_city.map(county_dict)
df['location_county_fip'] = cleaned_location_city.map(fips_dict)
df['location_state_county_fip'] = cleaned_location_city.map(long_fips_dict)

In [25]:
# Distinct location triples for campaigns that did not receive a combined
# state+county FIPS code; written out so they can be geocoded separately.
unique_locs_to_scrape = (
    df.loc[
        df['location_state_county_fip'].isna(),
        ['location_city','location_city_only','location_stateprefix'],
    ]
    .drop_duplicates()
)
print(unique_locs_to_scrape.shape)
unique_locs_to_scrape.to_csv(
    data_io.input_cleaned/'geolocations'/'unique_locations_to_scrape.csv'
)

(2394, 3)


In [None]:
# Record how many campaigns are lost to failed geocoding, then drop them.
# NOTE(review): this cell reads df['county'], while the mapping cell in this
# notebook writes 'location_county' -- confirm which column is intended.
failed_geocode_mask = df['county'].isnull()

#save failed geocodes to make sure nothing in the US failed
# The failed rows must be captured BEFORE they are dropped; selecting them
# afterwards always yields an empty frame.
geo_fail = df[failed_geocode_mask]
save = False
if save:
    geo_fail.to_csv(data_io.input_cleaned/'gfm'/'master_failed_geocode.csv', encoding='utf-8-sig')

exclusion_df.loc['deleted', 'failed_geocode'] = failed_geocode_mask.sum()
df = df.dropna(subset = ['county'])
exclusion_df.loc['total', 'failed_geocode'] = len(df)
exclusion_df.to_csv(data_io.input_cleaned/'gfm'/'final_exclusion_tracker.csv')

#### Define text mining functions

In [26]:
#functions for additional text mining
# Table of free-text search terms: one column per search type.
SEARCH_OPTIONS = pd.read_csv(data_io.gfm/'gfm'/'free_text_search_terms.csv')

# search type -> list of its non-null search terms, in file order
SEARCH_DICT = {
    search_type: SEARCH_OPTIONS[search_type].dropna().to_list()
    for search_type in (
        'cancer_type',
        'insurance_type',
        'oop_type',
        'tx_type',
        'clin_trial',
        'complementary',
        'battle',
        'self_reliance',
        'journey',
        'thank',
        'nice',
        'brave',
        'financial_distress',
    )
}

    
def create_dict(search_type):
    """Map each search term of `search_type` to its collapsed label.

    Rows of SEARCH_OPTIONS with a missing `search_type` entry are ignored.
    Keys come from the `search_type` column, values from the matching
    'collapsed_<search_type>' column.
    """
    rows = SEARCH_OPTIONS.dropna(subset=[search_type])
    terms = rows[search_type].values
    labels = rows['collapsed_' + search_type].values
    return pd.Series(data=labels, index=terms).to_dict()

# Insurance search term -> collapsed insurance label; read by
# extract_search_term_regex when find_uninsured is set.
INSURE_DICT = create_dict(search_type='insurance_type')
#OOP_DICT = create_dict('oop_type')
import string   
#import regex as re
def extract_search_term_regex(x, search_type = 'cancer_type', return_context = False,
                             find_uninsured = False, collapse_dict = 'none'):
    """Search free text for the terms registered under `search_type`.

    Parameters
    ----------
    x : str or any
        Text to search; it is lower-cased first. Non-string input yields NaN.
    search_type : str
        Key into SEARCH_DICT selecting the list of regex search terms.
    return_context : bool
        Return the matched text instead of a boolean flag (see Returns).
    find_uninsured : bool
        Only report True when a matched term collapses, via INSURE_DICT, to
        an uninsured label.
    collapse_dict : dict or 'none'
        Optional mapping from search term to collapsed label. Any string
        (the default sentinel 'none') or None means "do not collapse".

    Returns
    -------
    * NaN when `x` is not a string.
    * When 'cancer' is in `search_type`: the first matching term including
      the delimiter character that follows it, or, with `return_context`,
      the text from the match up to (not including) the next period. NaN
      when nothing matches.
    * Otherwise, in priority order: True as soon as an uninsured term is
      found (`find_uninsured`); the sorted unique collapsed labels joined by
      ',' or None (`collapse_dict` given); the matched substrings joined by
      ',' or None (`return_context`); else a bool telling whether any term
      matched.
    """
    if not isinstance(x, str):
        return np.nan
    x = x.lower()

    search_terms = SEARCH_DICT[search_type]

    if 'cancer' in search_type:
        for s in search_terms:
            #match only if char after match is a space or punctuation
            # (or the match sits at the very end of the text)
            smatch = re.search(s + r'(?:\W|$)', x)
            if smatch is None:
                continue
            start, end = smatch.span()
            if return_context:
                # Context runs to the next period; without one, keep the rest
                # of the text instead of silently dropping its last character.
                period = x.find('.', start)
                return x[start:] if period == -1 else x[start:period]
            return x[start:end]
        return np.nan

    use_collapse = collapse_dict is not None and not isinstance(collapse_dict, str)
    found = False
    mentions = []
    collapsed = []
    for s in search_terms:
        smatch = re.search(s, x)
        if smatch is None:
            continue

        if return_context:
            mentions.append(smatch.group(0))

        if find_uninsured:
            # In this mode only an uninsured-type term counts as a hit; other
            # insurance mentions must not flip the flag.
            # NOTE(review): 'underunisured' looks like a typo for
            # 'underinsured' -- confirm against the labels in the terms file.
            if INSURE_DICT.get(s) in ('uninsured', 'underunisured'):
                return True
        else:
            found = True

        if use_collapse:
            collapsed.append(collapse_dict[s])

    if use_collapse:
        if not collapsed:
            return None
        unique_labels = np.unique(np.asarray(collapsed))
        return ','.join(str(label) for label in unique_labels)

    if return_context:
        return ','.join(mentions) if mentions else None

    return found

def search_story_and_title(story, title, search_type):
    """Combine the search result for a campaign's story and its title.

    Returns True when either result equals True, False when both equal
    False, and None for any other combination of results.
    """
    results = (
        extract_search_term_regex(title, search_type = search_type),
        extract_search_term_regex(story, search_type = search_type),
    )
    if any(result == True for result in results):
        return True
    if all(result == False for result in results):
        return False
    return None

def get_all_mentions(story, title, search_type):
    """Collect the matched text for `search_type` from a story and its title.

    Both texts are searched with `return_context=True`. String results are
    combined story-first with a ',' between them, so the last story mention
    and the first title mention are not fused into one term. Returns None
    when neither search produced a string.
    """
    found = [
        extract_search_term_regex(text,
                                  search_type = search_type,
                                  return_context = True)
        for text in (story, title)
    ]
    mentions = [result for result in found if isinstance(result, str)]
    if not mentions:
        return None
    return ','.join(mentions)

#Returns a comma separated string of the features that match the search in question
def extract_feature(story, feature = 'tx_type_search', title = None):
    """List every SEARCH_DICT[feature] term found in the story and the title.

    Matching is a plain, case-insensitive substring test. The story is
    scanned first, then the title when it is not null; hits are joined with
    ', ' in the order found (a term present in both texts appears twice).
    Returns NaN when there is no hit.

    NOTE(review): the default 'tx_type_search' is not among the SEARCH_DICT
    keys defined in this notebook -- confirm callers always pass `feature`.
    """
    terms = SEARCH_DICT[feature]
    candidates = [story] if pd.isnull(title) else [story, title]

    joined = ''
    for text in candidates:
        if type(text) is not str:
            continue
        lowered = text.lower()
        for term in terms:
            if term not in lowered:
                continue
            joined = term if len(joined) == 0 else joined + ', ' + term

    return joined if len(joined) != 0 else np.nan


def collapse_feature(mentions, feature_dict):
    """Translate a ', '-separated mention string into its collapsed labels.

    Each mention is looked up in `feature_dict`; the unique labels are
    returned sorted and joined with ', '. Input that is not a str yields
    None.
    """
    if type(mentions) is not str:
        return None
    labels = [feature_dict[mention] for mention in mentions.split(', ')]
    return ', '.join(np.unique(labels))


    
def assign_num_occurrences(mentions):
    """Count the comma-separated entries in a mention string.

    Returns 0 for anything that is not a string (None, NaN, ...) and for the
    empty string, so the result is always an int.
    """
    if not isinstance(mentions, str) or mentions == '':
        return 0
    # A string without a comma splits into a single entry.
    return len(mentions.split(','))

In [8]:
# Keep an independent snapshot of the frame before the text features are added.
df_old = df.copy(deep=True)

#### Mine each text feature

In [27]:
# Mine the combined title + story text for every feature group and optionally
# save the enriched campaign table.
recode = True
if recode:
    #Look for clinical/financial details
    recode_feats_to_search = ['oop_type', 'insurance_type', 'tx_type',
                                 'cancer_type']
    # NOTE(review): string concatenation gives NaN when either the title or
    # the story is missing, so such rows are not searched at all -- confirm
    # that this is intended.
    df['story_and_title'] = df['title'] + ' ' + df['story']
    for r in recode_feats_to_search:
        new_col = r + '_is_mentioned'
        print(f"searching for {r}")
        df[new_col] = df['story_and_title'].apply(
            lambda text: extract_search_term_regex(text, search_type = r))
        print(f"extracting {r}")
        df[r] = df['story_and_title'].apply(
            lambda text: extract_search_term_regex(text,
                                                   search_type = r,
                                                   return_context = True))
        # Own name for the output column: reusing `recode` here would
        # overwrite the boolean switch that guards this whole cell.
        collapsed_col = 'collapsed_' + r

        feat_dict = create_dict(r)
        print(f"collapsing {r}")
        df[collapsed_col] = df['story_and_title'].apply(
            lambda text: extract_search_term_regex(text,
                                                   search_type = r,
                                                   collapse_dict = feat_dict))

    # Number of collapsed treatment / out-of-pocket categories per campaign
    df['num_tx'] = df['collapsed_tx_type'].apply(assign_num_occurrences)
    df['num_oop'] = df['collapsed_oop_type'].apply(assign_num_occurrences)
    df['uninsured'] = df['story_and_title'].apply(
        lambda text: extract_search_term_regex(text,
                                               search_type = 'insurance_type',
                                               find_uninsured = True))
    #Look for worth indicators
    worth_indicators = ['brave', 'nice', 'thank', 'self_reliance', 'battle']

    for w in worth_indicators:
        new_col = w + '_is_mentioned'
        print(f"searching for {w}")
        df[new_col] = df['story_and_title'].apply(
            lambda text: extract_search_term_regex(text, search_type = w))
        print(f"extracting {w}")
        df[w] = df['story_and_title'].apply(
            lambda text: extract_search_term_regex(text,
                                                   search_type = w,
                                                   return_context = True))
    save = True
    if save:
        # The helper column is only needed for the searches above.
        df = df.drop(columns=['story_and_title'])
        df.to_csv(data_io.input_cleaned/'gfm'/'US_cancer_campaigns_2018_2021_with_fips_and_text_features.csv',
                  encoding='utf-8',sep='|')

searching for oop_type
extracting oop_type
collapsing oop_type
searching for insurance_type
extracting insurance_type
collapsing insurance_type
searching for tx_type
extracting tx_type
collapsing tx_type
searching for cancer_type
extracting cancer_type
collapsing cancer_type
searching for brave
extracting brave
searching for nice
extracting nice
searching for thank
extracting thank
searching for self_reliance
extracting self_reliance
searching for battle
extracting battle
