### Import packages

In [1]:
import sqlite3
import pandas as pd
import json
from itertools import chain
from random import sample
import numpy as np
import os
from bs4 import BeautifulSoup
import string
import re
import xmltodict
from xml.etree import ElementTree as ET
import requests
import json
import seaborn as sns
import matplotlib.pyplot as plt
import zipcodes
import fasttext
from collections import Counter

### Query database

In [2]:
conn = sqlite3.connect("gfm.db")
feed = pd.read_sql_query("SELECT url, fund_description FROM feed_tb", conn)

### Prepare data

In [3]:
feed = feed.drop_duplicates('url')
feed['fund_description'] = [' '.join(BeautifulSoup(x).findAll(text=True)).replace(u'\xa0', u' ') for x in feed['fund_description']]
feed['fund_description'] = feed['fund_description'].str.lower()
feed['fund_description'] = [x.replace('\n','') for x in feed['fund_description']]
feed['nchar']= [len(x) for x in feed['fund_description']]
feed = feed[feed['nchar'] >= 500]

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


In [4]:
def GetLanguage(df):
    path_to_pretrained_model = 'models/lid.176.bin'
    fmodel = fasttext.load_model(path_to_pretrained_model)
    r = fmodel.predict(feed['fund_description'].to_list())
    labels = r[0]
    labels = [i[0].split('__')[2] for i in labels]
    scores = r[1]
    scores = [i[0] for i in scores]
    return labels, scores

In [5]:
feed['language'], feed['lang_score'] = GetLanguage(feed)



In [6]:
feed = feed[feed['language'] == 'en']

### Search for terms

In [7]:
def CountTerms(df, terms):
    '''
    input: pandas df of campaign descriptions, list of terms
    '''
    results = {}
    for term in terms:
        if type(term) == list:
            search_term_0 = r"\b{0}\b".format(term[0])
            search_term_1 = r"\b{0}\b".format(term[1])
            res_0 = np.array(df['fund_description'].str.contains(search_term_0).to_list())
            res_1 = np.array(df['fund_description'].str.contains(search_term_1).to_list())
            res_and = np.logical_and(res_0, res_1).astype(int).tolist()
            label = (" + ").join(term)
            results[label] = res_and
        else:
            search_term = r"\b{0}\b".format(term)
            results[term] = df['fund_description'].str.contains(search_term).astype(int).to_list()
    return results

In [85]:
terms = ['lung']

In [86]:
count_dict = CountTerms(feed, terms)

In [117]:
counts = pd.DataFrame(count_dict)

In [118]:
feed.reset_index(drop=True, inplace=True)
counts.reset_index(drop=True, inplace=True)

In [120]:
df = pd.concat([feed, counts], axis=1)

In [121]:
df['lung'].value_counts()

0    74124
1     4923
Name: lung, dtype: int64

### Get most common terms after 'lung'

In [122]:
df = df[df['lung'] == 1]

In [132]:
df['terms_after_lung'] = [x.split('lung')[1].split()[0].translate(str.maketrans('', '', string.punctuation)) for x in df['fund_description']]

In [134]:
terms_clean = list(filter(None, df['terms_after_lung']))

In [137]:
counts = Counter(terms_clean)

In [150]:
term_freq = pd.DataFrame.from_dict(counts, orient='index').reset_index()
term_freq.columns = ['term','freq']
term_freq = term_freq.sort_values(['freq'], ascending=False)

In [155]:
term_freq.head()

Unnamed: 0,term,freq
1,cancer,1480
3,s,645
7,and,366
4,transplant,251
2,disease,202


In [153]:
for term in term_freq['term']:
    print(term)

cancer
s
and
transplant
disease
infection
was
that
function
collapsed
issues
which
capacity
condition
biopsy
failure
infections
damage
is
had
tissue
removed
problems
surgery
to
with
development
machine
bypass
collapse
has
in
so
nodules
doctor
but
area
as
from
injury
specialist
issue
cavity
disorder
collapsing
brain
metastasis
or
carcinoma
surgeries
diseases
puncture
complications
due
functions
transplants
last
association
growth
making
institute
blood
tests
along
lobe
cancers
testing
at
pain
on
contusion
tumor
liver
nodule
lining
partially
defect
treatments
org
for
clearing
fibrosis
diease
muscles
injuries
constriction
about
causing
complication
also
tubes
during
mass
were
he
ailment
right
biopsies
it
requiring
clasped
conditions
pneumonia
ct
contusions
the
test
full
chest
fluid
cancerand
ventilator
by
region
ed
where
tumors
functioning
surgeon
we
bone
auscultation
transplantnow
before
hence
cancerhe
cutoffandaribremovedlater
adenocarcinoma
lymph
among
indicating
defficiency
–
shortly


In [154]:
cancer_terms = ['cancer',
                'carcinoma',
                'cancers',
                'tumor',
                'nodule',
                'mass', 
                'masses', 
                'carciona', 
                'cancerhow', 
                'cancer...both', 
                'cancerstage', 
                'cancerefgr',
                'cancerand',
                'cancerhe',
                'adenocarcinoma',
                'carcinoid',
                'cancerwas',
                'cancerim',
                'cancertrying',
                'cancerour',
                'cancer"',
                'cancerinvolving',
                'canceryou']

### Run search for lung cancer

In [156]:
terms = ['lung cancer']

count_dict = CountTerms(feed, terms)

counts = pd.DataFrame(count_dict)

feed.reset_index(drop=True, inplace=True)
counts.reset_index(drop=True, inplace=True)

df = pd.concat([feed, counts], axis=1)

df['lung cancer'].value_counts()

0    77372
1     1675
Name: lung cancer, dtype: int64

In [157]:
df = df[df['lung cancer'] == 1]

In [175]:
df['cigar'] = df['fund_description'].str.contains('cigar')

In [179]:
df['tobacco'] = df['fund_description'].str.contains('tobacco')

In [176]:
df['smoke'] = df['fund_description'].str.contains('smoke')

In [182]:
df['smoke'].sum()

128