In [1]:
import os
import pandas as pd
from io import StringIO
base_dir = 'file2/'
# base_dir = r'C:\Users\rstem15\Desktop\Dissertation\Data\files'

#Get all files in the directory
count = 0
main_data = pd.DataFrame()
for file in os.listdir(base_dir):
    if 'json' in file:
        count += 1
        if count == 1:
            json_path = os.path.join(base_dir, file)
            main_data = pd.read_json(json_path)
            main_data['mrn'] = file[:-5]
        else:
            json_path = os.path.join(base_dir, file)
            temp = pd.read_json(json_path)
            temp['mrn'] = file[:-5]
            main_data = pd.concat([main_data, temp], axis=0, sort=False)

In [2]:
# dataframe with one row for each ID and sentence 
sentences = []
for index, row in main_data.iterrows():
#     print(row[2])
    for sentence in row['RPT_TEXT'].split('.'):
#         print(sentence, '\n')
        if sentence != '': 
            sentences.append((index, row['ENCOUNTER_DATE'], sentence, row['mrn']))
sentence = pd.DataFrame(sentences, columns= ['MAIN_INDEX', 'ENCOUNTER_DATE', 'RPT_TEXT', 'mrn'])

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# cleaning 

# lower case
sentence['RPT_TEXT'] = sentence['RPT_TEXT'].str.lower() 

# remove punc. 
sentence['RPT_TEXT'] = sentence['RPT_TEXT'].str.replace('[^\w\s]','')

# remove numbers 
sentence['RPT_TEXT']= sentence['RPT_TEXT'].str.replace('\d+', '') 

# remove single letter words
sentence["RPT_TEXT"] = sentence["RPT_TEXT"].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')

# remove specific words
# sentence["RPT_TEXT"] = sentence["RPT_TEXT"].str.replace('metadata','')
# sentence["RPT_TEXT"] = sentence["RPT_TEXT"].str.replace('smartlist','')

# remove stopwords 

stopwords = set(stopwords.words('english')) 

sentence["RPT_TEXT"] = [' '.join([w for w in x.lower().split() if w not in stopwords]) 
    for x in sentence["RPT_TEXT"].tolist()]

In [4]:
# filter out additional noise 
searchfor = ['smartlist', 'metadata']
sentence = sentence[~sentence['RPT_TEXT'].str.contains('|'.join(searchfor))]

# drop empty rows 
sentence.dropna()

Unnamed: 0,MAIN_INDEX,ENCOUNTER_DATE,RPT_TEXT,mrn
0,0,2017-05-09T05:00:00Z,pcp yee lam md chief complaint follow visit ck...,000000085456
1,0,2017-05-09T05:00:00Z,marsha lee bynum,000000085456
2,0,2017-05-09T05:00:00Z,,000000085456
3,0,2017-05-09T05:00:00Z,year old patient past medical history signific...,000000085456
4,0,2017-05-09T05:00:00Z,hasnt seen clinic since march last year,000000085456
...,...,...,...,...
11944554,1623,2018-04-09T15:33:00Z,fluid pleural light microscopy performed dr,100052491162
11944555,1623,2018-04-09T15:33:00Z,fedoriw,100052491162
11944556,1623,2018-04-09T15:33:00Z,decalcified specimens interpreted caution setting,100052491162
11944557,1623,2018-04-09T15:33:00Z,immunohistochemical reagents used case may cla...,100052491162


In [5]:
# tokenize sents 

sentence['tokenized_sents'] = sentence.apply(lambda row: nltk.word_tokenize(row['RPT_TEXT']), axis=1)


words = sentence['tokenized_sents']

sentence.dropna()

Unnamed: 0,MAIN_INDEX,ENCOUNTER_DATE,RPT_TEXT,mrn,tokenized_sents
0,0,2017-05-09T05:00:00Z,pcp yee lam md chief complaint follow visit ck...,000000085456,"[pcp, yee, lam, md, chief, complaint, follow, ..."
1,0,2017-05-09T05:00:00Z,marsha lee bynum,000000085456,"[marsha, lee, bynum]"
2,0,2017-05-09T05:00:00Z,,000000085456,[]
3,0,2017-05-09T05:00:00Z,year old patient past medical history signific...,000000085456,"[year, old, patient, past, medical, history, s..."
4,0,2017-05-09T05:00:00Z,hasnt seen clinic since march last year,000000085456,"[hasnt, seen, clinic, since, march, last, year]"
...,...,...,...,...,...
11944554,1623,2018-04-09T15:33:00Z,fluid pleural light microscopy performed dr,100052491162,"[fluid, pleural, light, microscopy, performed,..."
11944555,1623,2018-04-09T15:33:00Z,fedoriw,100052491162,[fedoriw]
11944556,1623,2018-04-09T15:33:00Z,decalcified specimens interpreted caution setting,100052491162,"[decalcified, specimens, interpreted, caution,..."
11944557,1623,2018-04-09T15:33:00Z,immunohistochemical reagents used case may cla...,100052491162,"[immunohistochemical, reagents, used, case, ma..."


In [6]:
from gensim.models import Word2Vec
import multiprocessing

EMB_DIM = 300 

w2v = Word2Vec(words, size=EMB_DIM, window=5, min_count=5, negative=15, iter=10, workers=multiprocessing.cpu_count())

In [7]:
from gensim.models.phrases import Phrases, Phraser

phrases = Phrases(words, min_count=1, threshold=1)


In [8]:
from gensim.models.phrases import Phrases, Phraser

bigram = Phrases(words, min_count=5, threshold=10)

In [9]:
print(bigram['pt is unemployed and lost their food stamps'.split()])

['pt', 'is', 'unemployed', 'and', 'lost', 'their', 'food_stamps']


In [10]:
from gensim.models import Word2Vec
import multiprocessing

bigram[words]

EMB_DIM = 300 

w2v = Word2Vec(bigram[words], size=EMB_DIM, window=5, min_count=5, negative=15, iter=10, workers=multiprocessing.cpu_count())

In [11]:
word_vectors = w2v.wv 

In [13]:
# housing insecurity 
word_vectors = w2v.wv 
result = word_vectors.similar_by_word('homeless')
print("Most similar to homeless:\n ", result[:10])

result = word_vectors.similar_by_word('shelter')
print("Most similar to shelter:\n ", result[:10])

result = word_vectors.similar_by_word('housing')
print("Most similar to housing:\n ", result[:10])

result = word_vectors.similar_by_word('transitional')
print("Most similar to transitional:\n ", result[:10])

result = word_vectors.similar_by_word('streets')
print("Most similar to streets:\n ", result[:10])

result = word_vectors.similar_by_word('motel')
print("Most similar to motel:\n ", result[:10])

result = word_vectors.similar_by_word('evicted')
print("Most similar to evicted:\n ", result[:10])

result = word_vectors.similar_by_word('inadequate')
print("Most similar to inadequate:\n ", result[:10])

result = word_vectors.similar_by_word('cluttered')
print("Most similar to cluttered:\n ", result[:10])

result = word_vectors.similar_by_word('excessive')
print("Most similar to excessive:\n ", result[:10])

result = word_vectors.similar_by_word('banned')
print("Most similar to banned:\n ", result[:10])

result = word_vectors.similar_by_word('homelessness')
print("Most similar to homelessness:\n ", result[:10])

result = word_vectors.similar_by_word('hoarder')
print("Most similar to hoarder:\n ", result[:10])

result = word_vectors.similar_by_word('foreclosed')
print("Most similar to foreclosed:\n ", result[:10])

result = word_vectors.similar_by_word('landlord')
print("Most similar to landlord:\n ", result[:10])

result = word_vectors.similar_by_word('foreclosure')
print("Most similar to foreclosure:\n ", result[:10])

result = word_vectors.similar_by_word('eviction')
print("Most similar to eviction:\n ", result[:10])

Most similar to homeless:
  [('lives_alone', 0.5754784941673279), ('homeless_staying', 0.540141224861145), ('lives_roommate', 0.5232914686203003), ('lives', 0.5080665349960327), ('resides', 0.49626779556274414), ('staying_friend', 0.49555113911628723), ('lives_parents', 0.48142844438552856), ('receives_disability', 0.478386789560318), ('shelter', 0.47013694047927856), ('staying_friends', 0.46764492988586426)]
Most similar to shelter:
  [('homeless_shelter', 0.702965259552002), ('oxford_house', 0.6457653641700745), ('ifc_shelter', 0.6203469038009644), ('hotel', 0.6199221014976501), ('ifc', 0.6040788888931274), ('ifc_mens', 0.5870614647865295), ('shelters', 0.582786500453949), ('boarding_house', 0.5823005437850952), ('crescent_green', 0.5737310647964478), ('raleigh_rescue', 0.556501030921936)]
Most similar to housing:
  [('lodging', 0.5638847947120667), ('apartment', 0.5606194138526917), ('employment', 0.5590101480484009), ('funding', 0.5206238031387329), ('financially', 0.51862680912017

In [20]:
# bigram housing insecurity 

# result = word_vectors.similar_by_word('currently_homeless')
# print("Most similar to currently_homeless:\n ", result[:10])

result = word_vectors.similar_by_word('homeless_shelter')
print("Most similar to homeless_shelter:\n ", result[:10])

result = word_vectors.similar_by_word('oxford_house')
print("Most similar to oxford_house:\n ", result[:10])

result = word_vectors.similar_by_word('rescue_mission')
print("Most similar to rescue_mission:\n ", result[:10])

result = word_vectors.similar_by_word('unstable_housing')
print("Most similar to unstable_housing:\n ", result[:10])

result = word_vectors.similar_by_word('transitional_housing')
print("Most similar to transitional_housing:\n ", result[:10])

result = word_vectors.similar_by_word('ifc_shelter')
print("Most similarifc_shelter:\n ", result[:10])

result = word_vectors.similar_by_word('subsidized_housing')
print("Most similar subsidized_housing:\n ", result[:10])

result = word_vectors.similar_by_word('public_housing')
print("Most similar public_housing:\n ", result[:10])

result = word_vectors.similar_by_word('halfway_house')
print("Most similar halfway_house:\n ", result[:10])

result = word_vectors.similar_by_word('stressors_homelessness')
print("Most similar stressors_homlessness\n ", result[:10])

result = word_vectors.similar_by_word('stressors_homeless')
print("Most similar stressors_homless\n ", result[:10])

result = word_vectors.similar_by_word('oxford_house')
print("Most similar oxford_house\n ", result[:10])

# result = word_vectors.similar_by_word('housing_crisis')
# print("Most similar housing_crisis:\n ", result[:10])

# result = word_vectors.similar_by_word('housing_issue')
# print("Most similar housing_issue:\n ", result[:10])

# result = word_vectors.similar_by_word('inadequate_housing')
# print("Most similar to inadequate_housing:\n ", result[:10])

Most similar to homeless_shelter:
  [('shelter', 0.702965259552002), ('hotel', 0.6357588768005371), ('boarding_house', 0.6170910596847534), ('oxford_house', 0.6069959402084351), ('ifc_shelter', 0.5913164615631104), ('ifc', 0.5798057913780212), ('streets', 0.5728400945663452), ('crescent_green', 0.5720561742782593), ('ifc_mens', 0.5714375972747803), ('shelters', 0.5697826743125916)]
Most similar to oxford_house:
  [('hotel', 0.6519111394882202), ('shelter', 0.6457653641700745), ('homeless_shelter', 0.6069959402084351), ('ifc_shelter', 0.6056114435195923), ('freedom_house', 0.6054555177688599), ('rooming_house', 0.5972055792808533), ('halfway_house', 0.596527099609375), ('apartments', 0.5717361569404602), ('ifc', 0.5633367896080017), ('crescent_green', 0.5629474520683289)]
Most similar to rescue_mission:
  [('targeted_housing', 0.5402595400810242), ('public_housing', 0.5164884328842163), ('alamance_county', 0.5102652907371521), ('housing_authority', 0.503994345664978), ('willow_creek', 0

In [21]:
# unigram general_financial
result = word_vectors.similar_by_word('afford')
print("Most similar to afford:\n ", result[:10])

result = word_vectors.similar_by_word('affordable')
print("Most similar to affordable:\n ", result[:10])

result = word_vectors.similar_by_word('welfare')
print("Most similar to welfare:\n ", result[:10])

result = word_vectors.similar_by_word('income')
print("Most similar to income:\n ", result[:10])

result = word_vectors.similar_by_word('financially')
print("Most similar to financially:\n ", result[:10])

result = word_vectors.similar_by_word('financial')
print("Most similar to financial:\n ", result[:10])

result = word_vectors.similar_by_word('subsidized')
print("Most similar to subsidized:\n ", result[:10])

Most similar to afford:
  [('expensive', 0.6366941928863525), ('able_afford', 0.6253128051757812), ('pay', 0.593163013458252), ('afford_copay', 0.5903345346450806), ('afford_pay', 0.5860451459884644), ('affordable', 0.574178159236908), ('money_pay', 0.5705806016921997), ('paying', 0.5193784832954407), ('difficulty_affording', 0.5098163485527039), ('copays', 0.5073404312133789)]
Most similar to affordable:
  [('able_afford', 0.6455603837966919), ('expensive', 0.6152728796005249), ('afford', 0.574178159236908), ('copays', 0.5529758930206299), ('paying', 0.5518890619277954), ('difficulty_affording', 0.5497854948043823), ('costs', 0.5343084335327148), ('cheaper', 0.5294228792190552), ('cost', 0.5224942564964294), ('insurance', 0.5206737518310547)]
Most similar to welfare:
  [('grave', 0.3998778164386749), ('difficulty_trusting', 0.39060476422309875), ('lives_greenville', 0.38642311096191406), ('workers', 0.3842325210571289), ('shares', 0.38309091329574585), ('individuals', 0.37548506259918

In [22]:
# bigram general_financial

result = word_vectors.similar_by_word('financial_strain')
print("Most similar to financial_strain:\n ", result[:10])


Most similar to financial_strain:
  [('family_discord', 0.6071642637252808), ('familial_discord', 0.6026310920715332), ('stressors', 0.5613694190979004), ('recent_breakup', 0.5604428648948669), ('unemployment', 0.5481169819831848), ('unstable_housing', 0.5472395420074463), ('homelessness', 0.5444921255111694), ('disability', 0.5383155345916748), ('spmi', 0.5275355577468872), ('homelessness_lack', 0.5223388075828552)]


In [26]:
# employment 

result = word_vectors.similar_by_word('unemployment')
print("Most similar to unemployment:\n ", result[:10])

result = word_vectors.similar_by_word('unemployed')
print("Most similar to unemployed:\n ", result[:10])

result = word_vectors.similar_by_word('retired')
print("Most similar to retired:\n ", result[:10])

result = word_vectors.similar_by_word('prison')
print("Most similar to prison:\n ", result[:10])

result = word_vectors.similar_by_word('jail')
print("Most similar to jail:\n ", result[:10])

result = word_vectors.similar_by_word('prostitution')
print("Most similar to prostitution:\n ", result[:10])

# result = word_vectors.similar_by_word('prostitute')
# print("Most similar to prostitute:\n ", result[:10])

result = word_vectors.similar_by_word('police')
print("Most similar to police:\n ", result[:10])

result = word_vectors.similar_by_word('incarcerations')
print("Most similar to incarcerations:\n ", result[:10])

result = word_vectors.similar_by_word('incarcerated')
print("Most similar to incarcerated:\n ", result[:10])

result = word_vectors.similar_by_word('trespassing')
print("Most similar to tresspassing:\n ", result[:10])

# result = word_vectors.similar_by_word('inmate')
# print("Most similar to inmate:\n ", result[:10])

result = word_vectors.similar_by_word('prisoner')
print("Most similar to prisoner:\n ", result[:10])

result = word_vectors.similar_by_word('veteran')
print("Most similar to veteran:\n ", result[:10])

result = word_vectors.similar_by_word('probation')
print("Most similar to probation:\n ", result[:10])

result = word_vectors.similar_by_word('parole')
print("Most similar to parole:\n ", result[:10])

result = word_vectors.similar_by_word('disability')
print("Most similar to disability:\n ", result[:10])

result = word_vectors.similar_by_word('disabled')
print("Most similar to disabled:\n ", result[:10])

result = word_vectors.similar_by_word('jobless')
print("Most similar to jobless:\n ", result[:10])

# result = word_vectors.similar_by_word('forging')
# print("Most similar to forging\n ", result[:10])

Most similar to unemployment:
  [('financial_strain', 0.5481169819831848), ('disability', 0.5367335081100464), ('family_discord', 0.535839855670929), ('familial_discord', 0.5119908452033997), ('stressors_homeless', 0.5024783611297607), ('spmi', 0.4820428490638733), ('unstable_housing', 0.4800345301628113), ('stressors_homelessness', 0.479671835899353), ('finances', 0.47024017572402954), ('stressors', 0.4639405608177185)]
Most similar to unemployed:
  [('retired', 0.676026463508606), ('employed', 0.6191266775131226), ('software_engineer', 0.5961828231811523), ('employment', 0.595928430557251), ('disabled', 0.5739959478378296), ('previously_worked', 0.5587162971496582), ('incomeemploymentdisability', 0.5426405668258667), ('married_children', 0.534203827381134), ('receives_disability', 0.5335801839828491), ('married', 0.5310548543930054)]
Most similar to retired:
  [('unemployed', 0.6760265827178955), ('employed', 0.6375187039375305), ('disabled', 0.6284197568893433), ('army', 0.585494279

In [33]:
# bigram employment_income

# result = word_vectors.similar_by_word('court_date')
# print("Most similar to court_date:\n ", result[:10])

# result = word_vectors.similar_by_word('on_disability')
# print("Most similar to on_disability:\n ", result[:10])

result = word_vectors.similar_by_word('disability_income')
print("Most similar to disability_income:\n ", result[:10])

result = word_vectors.similar_by_word('lost_job')
print("Most similar to lost_job:\n ", result[:10])

result = word_vectors.similar_by_word('receives_disability')
print("Most similar to receives_disability:\n ", result[:10])

# result = word_vectors.similar_by_word('job_loss')
# print("Most similar to job_loss:\n ", result[:10])

# result = word_vectors.similar_by_word('into_custody')
# print("Most similar to into_custody:\n ", result[:10])

# result = word_vectors.similar_by_word('financial_stressors')
# print("Most similar to financial_stressors:\n ", result[:10])

# result = word_vectors.similar_by_word('financial_concerns')
# print("Most similar to financial_concerns:\n ", result[:10])

Most similar to disability_income:
  [('food_stamps', 0.6804975271224976), ('pension', 0.6583424210548401), ('ssdi', 0.639662504196167), ('social_security', 0.6058430671691895), ('supported_fathers', 0.580163836479187), ('retirement', 0.5799654126167297), ('union_pension', 0.5734497308731079), ('ssd', 0.5453264713287354), ('income', 0.5432124137878418), ('receives_ssi', 0.5411105155944824)]
Most similar to lost_job:
  [('injecting_spiralled', 0.528311550617218), ('lived', 0.5022199153900146), ('abusive_relationship', 0.4971252679824829), ('marriage', 0.49201154708862305), ('disabled', 0.48587942123413086), ('anniversary', 0.4775221347808838), ('passed_away', 0.4758349657058716), ('divorce', 0.4747774600982666), ('recently_moved', 0.47129231691360474), ('fathers', 0.47118622064590454)]
Most similar to receives_disability:
  [('receives_ssi', 0.6490846872329712), ('receives_ssdi', 0.6300133466720581), ('ssdi', 0.5835277438163757), ('employed', 0.5600742101669312), ('pension', 0.546322107

In [34]:
# insurance insecurity

result = word_vectors.similar_by_word('uninsured')
print("Most similar to uninsured:\n ", result[:10])

result = word_vectors.similar_by_word('medicaid')
print("Most similar to medicaid:\n ", result[:10])

result = word_vectors.similar_by_word('copay')
print("Most similar to copay:\n ", result[:10])

result = word_vectors.similar_by_word('cheaper')
print("Most similar to cheaper:\n ", result[:10])

result = word_vectors.similar_by_word('selfpay')
print("Most similar to selfpay:\n ", result[:10])

Most similar to uninsured:
  [('funding', 0.376424103975296), ('income', 0.3704181909561157), ('housing_options', 0.3576876223087311), ('enrolled', 0.3487488627433777), ('worked_artist', 0.3478553295135498), ('fmla', 0.335788369178772), ('paying_bills', 0.3245146870613098), ('employed', 0.3243080675601959), ('clinics', 0.32112812995910645), ('fulltime_job', 0.31521227955818176)]
Most similar to medicaid:
  [('insurance', 0.7137954831123352), ('bcbs', 0.6974966526031494), ('mcd', 0.6910043954849243), ('tricare', 0.6490012407302856), ('humana', 0.6252036094665527), ('medicaremedicaid', 0.6222045421600342), ('mcaid', 0.6189523935317993), ('cigna', 0.6132909059524536), ('payment', 0.6007585525512695), ('social_security', 0.5882778763771057)]
Most similar to copay:
  [('copays', 0.623701810836792), ('abbvie', 0.5887880325317383), ('pay', 0.5878186225891113), ('payment', 0.5805902481079102), ('cost', 0.5635175704956055), ('carve', 0.5596886277198792), ('donut_hole', 0.5570360422134399), ('co

In [39]:
# bigram insurance insecurity 

# result = word_vectors.similar_by_word('charity_care')
# print("Most similar to charity_care:\n ", result[:10])

# result = word_vectors.similar_by_word('self_pay')
# print("Most similar to self_pay:\n ", result[:10])

# result = word_vectors.similar_by_word('no_insurance')
# print("Most similar to no_insurance:\n ", result[:10])

# result = word_vectors.similar_by_word('lost_insurance')
# print("Most similar to lost_insurance:\n ", result[:10])

In [43]:
# bigram poor social support 

result = word_vectors.similar_by_word('lives_alone')
print("Most similar lives_alone:\n ", result[:10])

# result = word_vectors.similar_by_word('social_isolation')
# print("Most similar social_isolation:\n ", result[:10])

# result = word_vectors.similar_by_word('family_support')
# print("Most similar family_support:\n ", result[:10])

# result = word_vectors.similar_by_word('social_support')
# print("Most similar social_support:\n ", result[:10])

Most similar lives_alone:
  [('lives', 0.7275352478027344), ('lives_spouse', 0.6455349326133728), ('resides', 0.6334966421127319), ('lives_husband', 0.6309038400650024), ('lives_roommate', 0.6277313232421875), ('lives_parents', 0.6217008829116821), ('homeless', 0.5754784345626831), ('living_arrangements', 0.5704300403594971), ('homeless_staying', 0.5651353597640991), ('resides_alone', 0.5605531930923462)]


In [45]:
# bigram food insecurity 

result = word_vectors.similar_by_word('food_pantry')
print("Most similar food_pantry:\n ", result[:10])

result = word_vectors.similar_by_word('food_pantries')
print("Most similar food_pantries:\n ", result[:10])

result = word_vectors.similar_by_word('food_stamps')
print("Most similar food_stamps:\n ", result[:10])

# result = word_vectors.similar_by_word('food_insecure')
# print("Most similar lives_alone:\n ", result[:10])

result = word_vectors.similar_by_word('food_insecurity')
print("Most similar food_insecurity:\n ", result[:10])

Most similar food_pantry:
  [('food_stamps', 0.598235547542572), ('food_pantries', 0.5629051327705383), ('meals_wheels', 0.5467780232429504), ('shiip', 0.513883113861084), ('hrsweek', 0.5100759267807007), ('bank', 0.4840356111526489), ('ez_rider', 0.4739542007446289), ('companies', 0.471824049949646), ('rental', 0.47024738788604736), ('ifc', 0.47004586458206177)]
Most similar food_pantries:
  [('shelters', 0.59828782081604), ('food_pantry', 0.5629051327705383), ('snfs', 0.5425512790679932), ('homeless_shelters', 0.5221374034881592), ('oxford_houses', 0.5219051837921143), ('aana_meetings', 0.5180584192276001), ('alfs', 0.5083172917366028), ('bank', 0.4976741671562195), ('innetwork', 0.4933004677295685), ('website', 0.4874088764190674)]
Most similar food_stamps:
  [('disability_income', 0.6804975271224976), ('social_security', 0.6754821538925171), ('ssdi', 0.6657308340072632), ('pension', 0.6196086406707764), ('income', 0.6165660619735718), ('meals_wheels', 0.6057977676391602), ('food_pa