### Import packages

In [12]:
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

### Import Qualtrics data

In [2]:
# Read the survey export with header=None so every CSV row is kept as data,
# then transpose so each original CSV column becomes one row.
# The first 17 transposed rows are skipped -- presumably survey metadata
# columns that precede the question columns; TODO confirm against the export.
r = (
    pd.read_csv('Vet/Vet+Survey+1_December+3,+2020_17.30.csv', header=None)
    .transpose()
    .iloc[17:]
)

### Prepare Qualtrics data

In [3]:
# Remove the column labelled 2, which holds the first (incomplete) survey
# response, leaving three columns for the rename in the next cell.
r = r.drop(columns=[2])

In [4]:
# Give the three remaining columns descriptive names:
# question id, question header text, and the reviewer's response.
r.columns = pd.Index(['qid', 'header', 'res'])

In [5]:
# Extract the url from the header: keep everything before the first ' - '
# separator (headers without the separator are kept whole).
r['url'] = r['header'].map(lambda header_text: header_text.split(' - ')[0])

In [14]:
# Preview the first five prepared survey rows.
r.head(n=5)

Unnamed: 0,qid,header,res,url
17,QID2,https://www.gofundme.com/f/tracy039s-kidney-do...,False Positive,https://www.gofundme.com/f/tracy039s-kidney-do...
18,QID3,https://www.gofundme.com/f/save-letty-the-pitb...,True Positive,https://www.gofundme.com/f/save-letty-the-pitb...
19,QID4,"https://www.gofundme.com/f/1n5fdq742o - hi, i...",False Positive,https://www.gofundme.com/f/1n5fdq742o
20,QID5,https://www.gofundme.com/f/letshelpty - ty is ...,True Positive,https://www.gofundme.com/f/letshelpty
21,QID6,https://www.gofundme.com/f/fortheloveofRiley -...,True Positive,https://www.gofundme.com/f/fortheloveofRiley


### Import feed data

In [7]:
# Load one row per campaign url from the local SQLite database.
# NOTE(review): `conn` is intentionally left open here in case later cells
# reuse it; call conn.close() once no further queries are needed.
conn = sqlite3.connect("gfm.db")
feed = pd.read_sql_query("SELECT url, fund_description FROM feed_tb", conn)
feed = feed.drop_duplicates('url')

# Strip HTML markup from each description: collect every text node, join them
# with single spaces, and replace non-breaking spaces with regular spaces.
# The parser is named explicitly so the extracted text does not depend on
# which optional parsers happen to be installed in the environment.
feed['fund_description'] = [
    ' '.join(BeautifulSoup(x, 'html.parser').find_all(string=True)).replace(u'\xa0', u' ')
    for x in feed['fund_description']
]

# Lower-case the text so the case-sensitive term search below matches
# regardless of the original capitalisation.
feed['fund_description'] = feed['fund_description'].str.lower()

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


### Define search terms and function

In [8]:
# Every animal word is paired with every veterinary word; a description must
# contain both words of a pair to count as a hit for that pair.
animal_terms = ['pet', 'dog', 'service dog', 'puppy', 'kitty', 'cat']
vet_terms = ['veterinary', 'veterinarian', 'vet']

# Order: all veterinary words for the first animal, then the next animal, ...
search_terms = [
    [animal, vet]
    for animal in animal_terms
    for vet in vet_terms
]

In [9]:
def CountTerms(df, terms):
    '''
    Flag, for every campaign description, which search terms it contains.

    Each term is wrapped in regex word boundaries and passed to
    ``Series.str.contains`` as a regular expression, so a term only matches
    as a whole word, matching is case-sensitive, and regex metacharacters
    inside a term are interpreted rather than escaped.

    input:
        df    -- pandas DataFrame with a 'fund_description' text column
        terms -- iterable whose entries are either a single string, or a
                 list/tuple of strings that must ALL occur in a description

    output:
        dict mapping a label to a list of 0/1 ints, one per row of ``df`` in
        row order. The label is the term itself for a single string, or the
        group's terms joined with " + " (e.g. "dog + vet") for a list/tuple.

    Rows whose description is missing (None/NaN) are reported as 0.

    raises:
        ValueError -- if a list/tuple entry is empty
    '''
    descriptions = df['fund_description']

    def _matches(term):
        # Boolean mask of rows containing `term` as a whole word.
        # na=False turns missing descriptions into "no match" instead of NaN,
        # which would otherwise break the integer conversion below.
        pattern = r"\b{0}\b".format(term)
        return descriptions.str.contains(pattern, na=False).to_numpy(dtype=bool)

    results = {}
    for term in terms:
        if isinstance(term, (list, tuple)):
            if len(term) == 0:
                raise ValueError("a search term group must contain at least one term")
            # AND across every term of the group so the result always agrees
            # with the label, which joins every term of the group.
            mask = np.logical_and.reduce([_matches(part) for part in term])
            label = " + ".join(term)
        else:
            mask = _matches(term)
            label = term
        results[label] = mask.astype(int).tolist()
    return results

### Execute search

In [13]:
# Run the term search over every campaign description, then arrange the
# per-label 0/1 lists as columns of a DataFrame (one row per campaign).
count_dict = CountTerms(df=feed, terms=search_terms)
counts = pd.DataFrame(data=count_dict)

In [19]:
# Number of search-term pairs matched by each campaign (row-wise total).
counts_sum = counts.sum(axis='columns')

### Merge counts with feed

In [20]:
# Give both frames a fresh 0..n-1 index so they line up row-for-row when
# concatenated side by side (drop_duplicates left gaps in feed's index).
feed = feed.reset_index(drop=True)
counts = counts.reset_index(drop=True)

In [21]:
# Place the indicator columns next to the campaign url/description columns.
merged = pd.concat([feed, counts], axis='columns')

In [25]:
# Attach the per-campaign total of matched pairs (aligned on the row index).
merged = merged.assign(counts_sum=counts_sum)

### Merge manual review with feed

In [26]:
# Inner join on url: keep only campaigns that appear in both the feed and the
# manually reviewed survey.
df = pd.merge(merged, r, on='url', how='inner')

In [27]:
# Keep only campaigns that matched at least one search-term pair.
has_match = df['counts_sum'].gt(0)
df = df.loc[has_match]

### Inspect results

In [30]:
# Preview the first five reviewed campaigns with their match indicators.
df.head(n=5)

Unnamed: 0,url,fund_description,pet + veterinary,pet + veterinarian,pet + vet,dog + veterinary,dog + veterinarian,dog + vet,service dog + veterinary,service dog + veterinarian,...,kitty + veterinary,kitty + veterinarian,kitty + vet,cat + veterinary,cat + veterinarian,cat + vet,counts_sum,qid,header,res
0,https://www.gofundme.com/f/tracy039s-kidney-do...,"for those of you that may not know tracy, and ...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,QID2,https://www.gofundme.com/f/tracy039s-kidney-do...,False Positive
1,https://www.gofundme.com/f/save-letty-the-pitb...,meet letty the pitbull. she is my precocious ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,QID3,https://www.gofundme.com/f/save-letty-the-pitb...,True Positive
2,https://www.gofundme.com/f/1n5fdq742o,"hi, i'm gail. i am a 71-year-old woman who s...",0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,2,QID4,"https://www.gofundme.com/f/1n5fdq742o - hi, i...",False Positive
3,https://www.gofundme.com/f/letshelpty,ty is our beloved orange kitty. my brother and...,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,3,QID5,https://www.gofundme.com/f/letshelpty - ty is ...,True Positive
4,https://www.gofundme.com/f/fortheloveofRiley,"last week, was unremarkable. just a normal su...",1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,3,QID6,https://www.gofundme.com/f/fortheloveofRiley -...,True Positive


In [29]:
# How many reviewed campaigns fall under each manual review label.
df.loc[:, 'res'].value_counts()

True Positive     269
False Positive    149
Name: res, dtype: int64

In [35]:
# Cross-tabulate the manual review label against each "service dog" indicator.
# A notebook cell only renders its last expression, so every table is passed
# to display() explicitly; otherwise the first two tables are computed and
# silently discarded. `display` is the built-in provided by the Jupyter kernel.
for vet_term in ['veterinary', 'veterinarian', 'vet']:
    display(pd.crosstab(df['res'], df['service dog + ' + vet_term]))

service dog + vet,0,1
res,Unnamed: 1_level_1,Unnamed: 2_level_1
False Positive,107,42
True Positive,250,19


In [38]:
# Urls of the campaigns that manual review labelled 'True Positive'.
is_true_positive = df['res'].eq('True Positive')
pet_urls_to_exclude = df.loc[is_true_positive, 'url'].tolist()