### Import packages

In [1]:
import pandas as pd
import datetime
import os
import xmltodict
import urllib.request
import gzip
import numpy as np
import copy

### Sitemaps

In [2]:
#read in active sitemaps
basepath = '../data/sitemaps/sitemaps_raw/'

sitemaps_list = []

for filename in os.listdir(basepath):
    if filename.endswith(".gz"): 
        with gzip.open(basepath + filename, 'r') as fd:
            gz = fd.read()
            sitemaps_list.append(xmltodict.parse(gz))

In [3]:
#define function to extract info from sitemap
def getSitemapInfo(sitemap):
    sitemapinfo = []
    for i in range(len(sitemap['urlset']['url'])):
        url = sitemap['urlset']['url'][i]['loc']
        lastmod = sitemap['urlset']['url'][i]['lastmod']
        sitemapinfo.append([url, lastmod])
    return sitemapinfo

In [4]:
#store in dataframe
sitemaps = pd.DataFrame()
for sitemap in sitemaps_list:
    sitemaps = sitemaps.append(getSitemapInfo(sitemap))

In [5]:
#rename columns
sitemaps = sitemaps.rename(columns={0: "url", 1: "last-mod"})

### Convert urls to include /f/

In [11]:
sitemaps_no_fix = sitemaps[sitemaps['url'].str.contains('https://www.gofundme.com/f/')]

In [12]:
sitemaps_to_fix = sitemaps[~sitemaps['url'].str.contains('https://www.gofundme.com/f/')]

In [13]:
new_urls = copy.deepcopy(sitemaps_to_fix['url'].str.replace('https://www.gofundme.com/', 'https://www.gofundme.com/f/'))

In [14]:
sitemaps_to_fix.loc[:,'url'] = new_urls.loc[::]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [15]:
sitemaps = sitemaps_no_fix.append(sitemaps_to_fix)

In [16]:
urls_to_fix = []
for url in sitemaps['url']:
    if 'https://www.gofundme.com/f/' not in url:
        urls_to_fix.append(url)

In [1]:
#urls_to_fix

In [18]:
sitemaps = sitemaps[~sitemaps['url'].isin(urls_to_fix)]

In [19]:
urls_to_fix = []
for url in sitemaps['url']:
    if 'https://www.gofundme.com/f/' not in url:
        urls_to_fix.append(url)

In [20]:
urls_to_fix

[]

### Spit combined into chunks of 1000 for scraping and export

In [21]:
#randomize order of urls
sitemaps = sitemaps.sample(frac=1).reset_index(drop=True)

In [24]:
n = 1000  #chunk row size
list_df = [sitemaps[i:i+n] for i in range(0,sitemaps.shape[0],n)]

In [28]:
for i in range(len(list_df)):
    df = list_df[i]
    filename = "../data/sitemaps/sitemaps_csv/sitemaps_" + str(i) + ".csv"
    df.to_csv(filename)