### Loading packages and data

In [26]:
import sqlite3
import pandas as pd
import json
from itertools import chain
from random import sample
import numpy as np
import os
from bs4 import BeautifulSoup
import string
import re
import xmltodict
from xml.etree import ElementTree as ET
import requests
import json

### Establish database connection

In [121]:
# Shared SQLite connection used by every make_*_tb function below.
conn = sqlite3.connect(database="../data/db/gfm.db")

### Get JSON files of scraped data

In [122]:
# Directory that holds the scraper's JSON dumps.
path_to_json = '../data/scraping/'
# os.listdir() returns entries in arbitrary order; sorting makes the load order
# (and therefore the row order appended to each table) reproducible across runs.
json_files = sorted(
    file_name for file_name in os.listdir(path_to_json) if file_name.endswith('.json')
)

### Define functions

In [13]:
#scrape_tb
def make_scrape_tb(data):
    """Append the 'scrape' record of every campaign in ``data`` to ``scrape_tb``.

    ``data`` is the list of campaign dicts loaded from one scraped JSON file;
    every element must have a 'scrape' key. Rows are written through the
    module-level ``conn`` connection and nothing is returned.
    """
    scrape_records = [campaign['scrape'] for campaign in data]
    scrape_frame = pd.DataFrame(scrape_records)
    scrape_frame.to_sql(name='scrape_tb', con=conn, if_exists='append', index=False)

#feed_tb
def make_feed_tb(data):
    """Flatten campaign feed data, join donor summaries, and append to ``feed_tb``.

    Every record in ``data`` whose 'feed' is not None contributes its
    'campaign' dict as one row. The nested location, beneficiary, campaign
    photo and team dicts are unpacked into scalar columns. The donor summary
    of every record whose 'donor' is not None (minus its 'donor_list') is
    merged in on 'url', and the remaining nested columns are dropped before
    the frame is written through the module-level ``conn``.

    Returns the flattened DataFrame that was written.
    """
    campaigns = pd.DataFrame(rec['feed']['campaign'] for rec in data if rec['feed'] is not None)
    # The feed stores only the slug; expand it so it matches the scrape URL used for the merge.
    campaigns['url'] = ["https://www.gofundme.com/f/" + slug for slug in campaigns['url']]

    # location -> city / country / postal_code
    for field in ('city', 'country', 'postal_code'):
        campaigns[field] = [place[field] for place in campaigns['location']]

    # beneficiary -> bene_* columns (output column, key inside the beneficiary dict)
    beneficiary_fields = (
        ('bene_id', 'id'),
        ('bene_user_id', 'user_id'),
        ('bene_person_id', 'person_id'),
        ('bene_first_name', 'first_name'),
        ('bene_last_name', 'last_name'),
        ('bene_is_placeholder', 'is_placeholder_bene'),
        ('bene_profile_url', 'profile_url'),
    )
    for column, key in beneficiary_fields:
        campaigns[column] = [bene[key] for bene in campaigns['beneficiary']]

    campaigns['campaign_photo_url'] = [photo['url'] for photo in campaigns['campaign_photo']]

    # team -> team_* columns; an empty team dict yields None in every team column
    team_fields = (
        ('team_name', 'name'),
        ('team_pic_url', 'team_pic_url'),
        ('team_media_type', 'media_type'),
        ('team_pub_attr', 'public_attributions'),
        ('team_invite_limit', 'team_invite_limit'),
        ('team_status', 'status'),
        ('team_created_date', 'created_at'),
        ('team_updated_date', 'updated_at'),
    )
    for column, key in team_fields:
        campaigns[column] = [None if team == {} else team[key] for team in campaigns['team']]

    # Donor summary: one row per campaign that has donor data, keyed by the scrape URL.
    with_donor = [rec for rec in data if rec['donor'] is not None]
    donors = pd.DataFrame(rec['donor'] for rec in with_donor)
    donors['url'] = [rec['scrape']['url'] for rec in with_donor]
    donors['donor_resp_status'] = [
        ";".join(str(code) for code in statuses) for statuses in donors['donor_resp_status']
    ]
    donors = donors.drop(columns=['donor_list'])

    feed = campaigns.merge(donors, on="url")
    # Nested columns cannot be stored in SQLite as-is; their useful parts were extracted above.
    nested_columns = [
        'beneficiary', 'charity', 'campaign_photo', 'location',
        'tags', 'business', 'team', 'partner',
    ]
    feed = feed.drop(columns=nested_columns)
    feed.to_sql(name='feed_tb', con=conn, if_exists='append', index=False)
    return feed

#donation
def make_donation_tb(data):
    """Flatten every campaign's donation log and append it to ``donation_tb``.

    Parameters
    ----------
    data : list of dict
        Campaign records from one scraped JSON file. Each record needs a
        'scrape' dict and a 'donor' entry that is either None or a dict
        holding a 'donor_list' of donation dicts.

    Notes
    -----
    * Donations of campaigns whose scrape record has target_cat == 1,
      country "US" and activity_status "active" get the campaign URL stored
      under 'url'. The donation dicts inside ``data`` are modified in place.
    * Campaigns whose 'donor' is None are skipped, and nothing is written
      when the file contains no donations at all (previously both cases
      raised and aborted the load).
    * Columns are renamed by position; pandas raises ValueError when the
      flattened frame does not have exactly nine columns.
      NOTE(review): this assumes every donation dict carries the same eight
      fields in the order listed below - confirm against the scraper output.
    """
    donation_rows = []
    for camp in data:
        donor = camp['donor']
        if donor is None:
            # No donor payload was scraped for this campaign.
            continue
        scrape = camp['scrape']
        in_scope = (
            scrape['target_cat'] == 1
            and scrape['country'] == "US"
            and scrape['activity_status'] == "active"
        )
        for don in donor['donor_list']:
            if in_scope:
                # add url to each donation log so rows can be joined back to the campaign
                don['url'] = scrape['url']
            donation_rows.append(don)

    if not donation_rows:
        return

    #make pandas df
    donation_tb = pd.DataFrame(donation_rows)
    #rename columns
    donation_tb.columns = ["online_id","don_amt","don_offline","don_anon","don_name","don_date","don_profile","don_verified","url"]
    #insert into sql db
    donation_tb.to_sql('donation_tb', conn, if_exists='append', index = False)
    
#team
def make_team_member_tb(data, feed):
    """Flatten campaign team members and append them to ``team_member_tb``.

    Parameters
    ----------
    data : list of dict
        Campaign records from one scraped JSON file. Each record needs a
        'scrape' dict and a 'feed' entry that is either None or a dict with
        a 'team_members' list of member dicts.
    feed : pandas.DataFrame
        Frame returned by ``make_feed_tb`` for the same file; only its
        'is_team' column is read.

    Notes
    -----
    * Members of campaigns whose scrape record has target_cat == 1, country
      "US" and activity_status "active" get the campaign URL stored under
      'url'. The member dicts inside ``data`` are modified in place.
    * The table is written only when at least one campaign in ``feed`` is
      flagged as a team and at least one member row exists.
    * Columns are renamed by position; pandas raises ValueError when the
      flattened frame does not have exactly thirteen columns.
      NOTE(review): assumes every member dict carries the same twelve fields
      in the order listed below - confirm against the scraper output.
    """
    member_rows = []
    for camp in data:
        camp_feed = camp['feed']
        if camp_feed is None:
            # Feed request failed or was skipped for this campaign.
            continue
        scrape = camp['scrape']
        in_scope = (
            scrape['target_cat'] == 1
            and scrape['country'] == "US"
            and scrape['activity_status'] == "active"
        )
        for mem in camp_feed['team_members']:
            if in_scope:
                mem['url'] = scrape['url']
            member_rows.append(mem)

    # `"True" in series` tests the Series *index*, not its values, so the old
    # check was always False and the table was never written. Compare the
    # values instead; str() covers both boolean True and the string "True"
    # (the stored type of is_team is not visible from this function).
    has_team = any(str(flag) == "True" for flag in feed['is_team'])
    if not has_team or not member_rows:
        return

    team_member_tb = pd.DataFrame(member_rows)
    team_member_tb.columns = ["team_mem_amt", "team_mem_fb", "team_mem_first_name","team_mem_gfm_profile","team_mem_id","team_mem_last_name","team_mem_don_attr","team_mem_profile","team_mem_role","team_mem_status","team_mem_person_id","team_mem_locale","url"]
    team_member_tb.to_sql('team_member_tb', conn, if_exists='append', index = False)
        
#comment
def _dict_field(value, key):
    """Return ``value[key]`` when ``value`` is a non-empty dict, otherwise None."""
    return value[key] if isinstance(value, dict) and value else None


def make_comment_tb(data):
    """Flatten every campaign's comment list and append it to ``comment_tb``.

    Parameters
    ----------
    data : list of dict
        Campaign records from one scraped JSON file. Each record needs a
        'scrape' dict and a 'comment' entry that is either None or a dict
        with a 'comment_list' of comment entries. Each entry is expected to
        have 'donation', 'comment', 'photos' and 'timestamp' keys (KeyError
        otherwise).

    Notes
    -----
    * Entries of campaigns whose scrape record has target_cat == 1, country
      "US" and activity_status "active" get the campaign URL stored under
      'url'. The entry dicts inside ``data`` are modified in place.
    * The nested 'donation' and 'comment' values are unpacked into scalar
      columns. Anything that is not a non-empty dict (None, NaN, {}, [])
      yields None in the derived columns; this replaces the former bare
      ``except`` fallback with one explicit rule.
    * 'donation', 'comment', 'photos' and 'timestamp' are dropped before the
      write; photo URLs are not stored in this table.
    * Campaigns whose 'comment' is None are skipped, and nothing is written
      when the file contains no comments at all.
    """
    comment_rows = []
    for camp in data:
        comment = camp['comment']
        if comment is None:
            # No comment payload was scraped for this campaign.
            continue
        scrape = camp['scrape']
        in_scope = (
            scrape['target_cat'] == 1
            and scrape['country'] == "US"
            and scrape['activity_status'] == "active"
        )
        for entry in comment['comment_list']:
            if in_scope:
                entry['url'] = scrape['url']
            comment_rows.append(entry)

    if not comment_rows:
        return

    comment_tb = pd.DataFrame(comment_rows)

    # (output column, key inside the nested 'donation' dict)
    donation_fields = (
        ('donation_amount', 'amount'),
        ('is_offline', 'is_offline'),
        ('is_anonymous', 'is_anonymous'),
        ('created_at', 'created_at'),
    )
    for column, key in donation_fields:
        comment_tb[column] = [_dict_field(don, key) for don in comment_tb['donation']]

    #parse out comment variables
    # (output column, key inside the nested 'comment' dict)
    comment_fields = (
        ('comment_id_gfm', 'comment_id'),
        ('comment_text', 'comment'),
        ('status', 'status'),
        ('time_stamp', 'timestamp'),
        ('profile_url', 'profile_url'),
        ('deny_delete', 'deny_delete'),
    )
    for column, key in comment_fields:
        comment_tb[column] = [_dict_field(com, key) for com in comment_tb['comment']]

    # Nested / redundant columns cannot (or need not) be stored in SQLite.
    comment_tb = comment_tb.drop(columns=['donation', 'comment', 'photos', 'timestamp'])
    comment_tb.to_sql('comment_tb', conn, if_exists='append', index = False)
    
#update
def make_update_tb(data):
    """Flatten every campaign's update list and append it to ``update_tb``.

    Parameters
    ----------
    data : list of dict
        Campaign records from one scraped JSON file. Each record needs a
        'scrape' dict and an 'update' entry that is either None or a dict
        with an 'update_list' of update entries. Each entry is expected to
        have a 'photos' key (KeyError otherwise).

    Notes
    -----
    * Entries of campaigns whose scrape record has target_cat == 1, country
      "US" and activity_status "active" get the campaign URL stored under
      'url'. The entry dicts inside ``data`` are modified in place.
    * 'photo_url' holds the URL of the first photo, or None when 'photos' is
      empty or not a list; the nested 'photos' column itself is dropped.
    * The 'text' column is stored as 'update_text'.
    * Campaigns whose 'update' is None are skipped, and nothing is written
      when the file contains no updates at all.
    """
    update_rows = []
    for camp in data:
        update = camp['update']
        if update is None:
            # No update payload was scraped for this campaign.
            continue
        scrape = camp['scrape']
        in_scope = (
            scrape['target_cat'] == 1
            and scrape['country'] == "US"
            and scrape['activity_status'] == "active"
        )
        for entry in update['update_list']:
            if in_scope:
                entry['url'] = scrape['url']
            update_rows.append(entry)

    if not update_rows:
        return

    update_tb = pd.DataFrame(update_rows)
    #extract photo url
    update_tb['photo_url'] = [
        photos[0]['url'] if isinstance(photos, list) and photos else None
        for photos in update_tb['photos']
    ]
    #remove/reformat data in dictionary format so that it can be inserted into sql
    update_tb = update_tb.drop(columns=['photos'])
    #rename columns
    update_tb = update_tb.rename(columns={'text': 'update_text'})
    #send to db
    update_tb.to_sql('update_tb', conn, if_exists='append', index = False)

### Add data to database

In [1]:
# Load every scraped JSON dump and append its campaigns to the six tables.
# NOTE: every table is written with if_exists='append', so re-running this
# cell against the same database inserts the same rows again.
for json_file in json_files:
    json_file_path = os.path.join(path_to_json, json_file)
    # Read as UTF-8 explicitly; without it open() falls back to the platform
    # locale encoding and non-ASCII text can be mis-decoded.
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    #send to db
    make_scrape_tb(data)
    # make_feed_tb returns the flattened frame, which make_team_member_tb needs
    feed = make_feed_tb(data)
    make_donation_tb(data)
    make_team_member_tb(data, feed)
    make_comment_tb(data)
    make_update_tb(data)
    # progress marker: printed only after the whole file was loaded
    print(json_file)