In [1]:
import data_io
import pandas as pd
import numpy as np

## 1. Merge every table from the first rescrape into a giant csv 

Optional: set `use_tqdm=True` to show a tqdm progress bar for the steps that take a long time.

In [2]:
# Notebook-wide switch for progress bars; tqdm is only imported when it is on.
use_tqdm = True
if use_tqdm:
    from tqdm.notebook import tqdm

In [3]:
# Rerun shortcut: load the merged table written by an earlier run of step 1,
# then skip the remaining cells of step 1.
dfs = pd.read_csv(
    data_io.input_raw / 'gfm' / 'master' / 'rescrape.csv',
    sep="|",
    encoding='utf-8',
    dtype=str,
)

In [None]:
def check_for_column_mixup(df):
    """Repair rows whose last fields were written one column too far to the right.

    A row is treated as shifted when its ``charity_details`` equals
    ``'request timeout'``. For those rows every value is moved one column back
    (``charity_details`` -> ``error_message`` -> ``i_num`` -> ``gfm_url``), and
    ``charity_details`` and ``page_status`` are reset to ``'none'``.

    The frame is modified in place and also returned.
    """
    shifted = df.charity_details == 'request timeout'
    if not shifted.any():
        return df
    # (destination, source) pairs, left-most destination first so that every
    # source column is read before it is overwritten.
    moves = (
        ('page_status', 'gfm_url'),
        ('gfm_url', 'i_num'),
        ('i_num', 'error_message'),
        ('error_message', 'charity_details'),
    )
    for dst, src in moves:
        df.loc[shifted, dst] = df.loc[shifted, src]
    df.loc[shifted, 'charity_details'] = 'none'
    # currently the value would be "NA" due to old code
    df.loc[shifted, 'page_status'] = 'none'
    return df

# Read all csv and append to list.
# The paths are sorted because glob() yields files in no guaranteed order;
# sorting gives the merged table the same row order on every machine.
fiter = sorted((data_io.input_raw/'gfm'/'all_rescrape_success_files_gfm').glob('*.csv'))
if use_tqdm: fiter = tqdm(fiter)
dflist = []
for fp in fiter:
    if use_tqdm: fiter.set_description(f"Reading {fp.stem}")
    df = pd.read_csv(fp, encoding='utf-8', dtype=str)
    # i_filename signals which rescrape table each row came from
    dflist.append(check_for_column_mixup(df).assign(i_filename=fp.stem))

# Missing values are stored as the string placeholder 'none' throughout the notebook
dfs = pd.concat(dflist, ignore_index=True).fillna('none')

HBox(children=(IntProgress(value=0, max=1480), HTML(value='')))

Save 

In [None]:
# Persist the merged table so that later runs can start from the rerun cell.
dfs.to_csv(
    data_io.input_raw / 'gfm' / 'master' / 'rescrape.csv',
    index=False,
    sep="|",
)

In [None]:
# Preview the first rows of the merged table
dfs.head()

## 2a. Prepare for updating rescrape info 

Define function that keeps the best duplicate

In [None]:
def keep_best_duplicate(df,subset=['title', 'location'],use_tqdm=False):
    """Drop duplicated rows, keeping the single best row of every duplicate group.

    Rows are duplicates of each other when they agree on every column in
    ``subset``. Missing keys (NaN) compare equal to each other, exactly as in
    ``DataFrame.duplicated``, so a group whose key contains NaN is reduced to
    one row like any other group instead of being dropped altogether.

    Within a group the kept row is the one that sorts last by
    ``archive_timestamp`` and then by parsing quality. The placeholder
    ``'none'`` counts as missing, and rows without a timestamp sort first.
    Parsing quality is the sum of the importance weights (see
    ``importance_score`` below) of the fields that are filled in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``subset`` columns and ``archive_timestamp``. A scored
        column that is absent from ``df`` is treated as missing for every row.
    subset : list of str
        Columns that identify a campaign.
    use_tqdm : bool
        Accepted for backward compatibility and ignored: the ranking is
        vectorised, so there is no per-group loop to report progress on.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the non-duplicated rows plus the best row of every
        duplicate group, in the original row order and with the original index.
    """
    subset = list(subset)
    # Rows that share their key with at least one other row
    mb_duplicates_m = df.duplicated(subset=subset, keep=False)
    if not mb_duplicates_m.any():
        return df.copy()

    # Higher score means having this field != none is more important
    importance_score = pd.Series({
        'goal': 10,
        'created_date': 10,
        'status': 5,
        'num_likes': 5,
        'num_shares': 5,
        'story': 3,
        'location': 3
    })

    candidates = df.loc[mb_duplicates_m]
    # Parsing quality = summed weight of the scored fields that hold a real value
    filled = candidates.reindex(columns=importance_score.index).replace('none', np.nan).notna()
    parsing_quality = filled.mul(importance_score, axis=1).sum(axis=1)

    # Scratch table: key columns plus ranking helpers. Rows are identified by
    # position so that a non-unique index on `df` cannot confuse the selection.
    ranking = candidates[subset].reset_index(drop=True)
    ranking['_kbd_position'] = np.flatnonzero(mb_duplicates_m.values)
    ranking['_kbd_timestamp'] = candidates['archive_timestamp'].replace('none', np.nan).values
    ranking['_kbd_quality'] = parsing_quality.values

    # More recent timestamp, then higher quality, ends up last; keep='last'
    # therefore picks the best row per key. drop_duplicates groups NaN keys
    # together, matching the duplicated() mask above.
    best_positions = (
        ranking
        .sort_values(by=['_kbd_timestamp', '_kbd_quality'], na_position='first')
        .drop_duplicates(subset=subset, keep='last')['_kbd_position']
        .values
    )

    # Keep rows that are not duplicates OR are the best duplicate of their group
    keep_m = ~mb_duplicates_m.values
    keep_m[best_positions] = True
    return df.loc[keep_m, :].copy()

Cleaning functions

In [None]:
import urllib.parse as up
def clean_gfm_url(gurl):
    """Rebuild `gurl` as ``https://www.gofundme.com/<cleaned path>``.

    Only the path of the input is kept (after `clean_path`); scheme and host
    are replaced by the fixed values and query string and fragment are dropped.
    """
    normalized_path = clean_path(up.urlsplit(gurl).path)
    return up.urlunsplit(('https', 'www.gofundme.com', normalized_path, '', ''))
def clean_path(path):
    """Return `path` with empty segments removed and the rest joined by single slashes."""
    segments = filter(None, path.split('/'))
    return '/'.join(segments)
def get_campaign_id(gurl):
    """Return the last non-empty path segment of `gurl`, used as the campaign id."""
    url_path = up.urlsplit(gurl).path
    return clean_path(url_path).rsplit('/', 1)[-1]
def prepare_url_for_update(url):
    """Rewrite ``gofundme.com/f/...`` URLs to the older ``gofundme.com/...`` form."""
    # remove new gofundme /f/ redirect
    redirect_prefix, plain_prefix = 'gofundme.com/f/', 'gofundme.com/'
    return plain_prefix.join(url.split(redirect_prefix))

In [None]:
# Clean gfm_url where it holds a real value (rows with the 'none' placeholder
# are left as they are), then derive campaign_id from the cleaned URL.
if use_tqdm:
    tqdm.pandas(desc='Clean GFM url')
    cleaned_urls = dfs.gfm_url.progress_apply(clean_gfm_url)
else:
    cleaned_urls = dfs.gfm_url.apply(clean_gfm_url)
dfs['gfm_url'] = dfs.gfm_url.where(dfs.gfm_url == 'none', other=cleaned_urls)

if use_tqdm:
    tqdm.pandas(desc='Get campaign_id')
    campaign_ids = dfs.gfm_url.progress_apply(get_campaign_id)
else:
    campaign_ids = dfs.gfm_url.apply(get_campaign_id)
dfs = dfs.assign(campaign_id=campaign_ids)

## 2. Merge info from rescrape_missing_years & rescrape_social into our master csv 

###### read in csv if needed
dfs_unique_gfmurl = pd.read_csv(data_io.input_raw/'gfm'/'master'/'first_rescrape_unique_gfm_url.csv',encoding='utf-8',sep='|',low_memory=False,index_col=[0])

_a=dfs_unique_gfmurl.assign(campaign_id=dfs_unique_gfmurl.index)
_a_m=_a.duplicated(['campaign_id'],keep=False)
_a[_a_m]

In [None]:
# Get missing-date rescrape dataset Elisabeth did
myears_list = [
    pd.read_csv(fp, encoding='utf-8', dtype=str)
    for fp in (data_io.input_raw/'gfm'/'rescrape_missing_years').glob('*.csv')
]
myears = pd.concat(myears_list, ignore_index=True)
# Same key preparation as the master table: cleaned gfm_url, campaign_id, /f/-free url
has_no_gfm_url = myears.gfm_url == 'none'
myears['gfm_url'] = myears.gfm_url.where(has_no_gfm_url, other=myears.gfm_url.map(clean_gfm_url))
myears['campaign_id'] = myears.gfm_url.map(get_campaign_id)
myears['url'] = myears.url.map(prepare_url_for_update)
myears = myears.set_index(['url', 'campaign_id'])

In [None]:
# Get rescrape social
msocial_list = [
    pd.read_csv(fp, encoding='utf-8', dtype=str)
    for fp in (data_io.input_raw/'gfm'/'rescrape_social_all').glob('*.csv')
]
msocial = pd.concat(msocial_list, ignore_index=True)
# Same key preparation as the master table: cleaned gfm_url, campaign_id, /f/-free url
has_no_gfm_url = msocial.gfm_url == 'none'
msocial['gfm_url'] = msocial.gfm_url.where(has_no_gfm_url, other=msocial.gfm_url.map(clean_gfm_url))
msocial['campaign_id'] = msocial.gfm_url.map(get_campaign_id)
msocial['url'] = msocial.url.map(prepare_url_for_update)
msocial = msocial.set_index(['url', 'campaign_id'])

In [None]:
# Get none date rescrape
ndate_list=[]
for fp in (data_io.input_raw/'gfm'/'rescrape_none_date').glob('*.csv'):
    ndate_list.append(pd.read_csv(fp,encoding='utf-8',dtype=str))
ndate = pd.concat(ndate_list,ignore_index=True)
# Same key preparation as the master table: cleaned gfm_url, campaign_id, /f/-free url
ndate['gfm_url'] = ndate.gfm_url.where(ndate.gfm_url=='none',other=ndate.gfm_url.apply(clean_gfm_url))
ndate['campaign_id'] = ndate.gfm_url.apply(get_campaign_id)
ndate['url'] = ndate.url.apply(prepare_url_for_update)
# Follow the notebook-wide use_tqdm switch instead of forcing progress bars on:
# tqdm is only imported when that switch is set.
ndate = keep_best_duplicate(ndate,subset=['url','campaign_id'],use_tqdm=use_tqdm)
ndate.set_index(['url','campaign_id'],inplace=True)

In [None]:
# Get rescrape_fail
rfail_list=[]
for fp in (data_io.input_raw/'gfm'/'all_table_w_tags_rescrape_failed_v3').glob('*.csv'):
    rfail_list.append(pd.read_csv(fp,encoding='utf-8',dtype=str))
rfail = pd.concat(rfail_list,ignore_index=True)
# Same key preparation as the master table: cleaned gfm_url, campaign_id, /f/-free url
rfail['gfm_url'] = rfail.gfm_url.where(rfail.gfm_url=='none',other=rfail.gfm_url.apply(clean_gfm_url))
rfail['campaign_id'] = rfail.gfm_url.apply(get_campaign_id)
rfail['url'] = rfail.url.apply(prepare_url_for_update)
# Follow the notebook-wide use_tqdm switch instead of forcing progress bars on:
# tqdm is only imported when that switch is set.
rfail = keep_best_duplicate(rfail,subset=['url','campaign_id'],use_tqdm=use_tqdm)
rfail.set_index(['url','campaign_id'],inplace=True)

Update, only overwrite if the current value is none

In [None]:
# Put the master table on the same (url, campaign_id) key as the rescrape tables
dfs_for_update = dfs.assign(url=dfs.url.apply(prepare_url_for_update)).set_index(['url','campaign_id'])
dfs_updated = dfs_for_update.replace('none',np.nan)
# Fill gaps only. DataFrame.update() overwrites existing values by default, so
# overwrite=False is needed to touch only the cells that are currently missing,
# and the 'none' placeholder of each rescrape table is turned into NaN so that
# it is never copied over as if it were data.
# The tables are applied latest-first: with gap filling the first table to
# supply a value wins, which keeps the precedence of the former update order
# (myears, msocial, ndate, rfail) where the last table won.
for rescrape in (rfail, ndate, msocial, myears):
    dfs_updated.update(rescrape.replace('none', np.nan), overwrite=False)
dfs_updated = dfs_updated.fillna('none')

In [None]:
# x='zqxbe-support-amy'
x='marias-cancer-fight'
# preview one example before and after update
preview_cols = ['status', 'num_likes', 'num_shares']
rows_before = dfs_for_update.loc[dfs_for_update.gfm_url.str.find(x) > -1, preview_cols]
rows_after = dfs_updated.loc[dfs_updated.gfm_url.str.find(x) > -1, preview_cols]
pd.concat([rows_before, rows_after])

In [None]:
# Save the gap-filled table; the index is written along with the columns.
dfs_updated.to_csv(
    data_io.input_raw / 'gfm' / 'master' / 'rescrape_updated_with_missing_info.csv',
    encoding='utf-8',
    sep='|',
)

## 3. Remove duplicates by campaign_id

In [None]:
# Move the index levels back into ordinary columns.
# NOTE: not re-runnable as is; a second run adds a spurious 'index' column.
dfs_updated=dfs_updated.reset_index()

In [None]:
# Measure the frame that is actually de-duplicated (dfs_updated, not dfs) so the
# printed before/after shapes describe the same table.
before_drop = dfs_updated.shape
dfs_unique_gfmurl = keep_best_duplicate(dfs_updated,subset=['campaign_id'],use_tqdm=use_tqdm)
after_drop = dfs_unique_gfmurl.shape
print(before_drop,after_drop)

Confirm that we kept the right duplicate 

In [None]:
campaign_id = 't2n8g5n' # one of the example
# Filter on the variable so that changing the example id above is enough
dfs_unique_gfmurl[dfs_unique_gfmurl.campaign_id==campaign_id]
# should return the more recent duplicate (wayback_status == 'present:success' one)

From here on, we'll index the rows by their campaign_id since these should be unique now

In [None]:
# campaign_id becomes the row index from here on
dfs_unique_gfmurl = dfs_unique_gfmurl.set_index('campaign_id')

In [None]:
# Save the table that holds one row per campaign_id
dfs_unique_gfmurl.to_csv(
    data_io.input_raw / 'gfm' / 'master' / 'rescrape_unique_gfm_url.csv',
    encoding='utf-8',
    sep='|',
)

## 4. Remove rows that failed completely during scraping 

Get a sample of 5 campaigns for each type of wayback_status

In [None]:
# Strip numeric characters so that statuses differing only by a number share one type
dfs_unique_gfmurl['wayback_status_type'] = dfs_unique_gfmurl.wayback_status.map(
    lambda status: ''.join(ch for ch in status if not ch.isnumeric())
)
# First five rows of every status type, used as examples below
wayback_types = dfs_unique_gfmurl.groupby('wayback_status_type').head(5)

Types of wayback status there are:

In [None]:
# Counted on the sample above, so every count is at most 5
wayback_status_type_counts = (
    wayback_types.wayback_status_type
    .value_counts()
    .sort_index(ascending=False)
)
print(wayback_status_type_counts)
all_wayback_status_types = wayback_status_type_counts.index.values

To view example of a type

In [None]:
# Pick a status type by position and show its third sampled row
i=5
selected_type = all_wayback_status_types[i]
print(selected_type)
_m = wayback_types.wayback_status_type == selected_type
wayback_types[_m].iloc[2]

Construct a list of wayback_status_type values that can be considered decently successful.
We don't consider `wayback: scraped but did not meet success standard` as successful because these are usually redirects to the GFM homepage rather than an actual campaign webpage.

In [None]:
# Status types (numbers already stripped) whose rows are kept
allowed_wayback_status_types = [
    # present-day scrape succeeded, or wayback covered for it
    'present: success',
    'present: scraped but did not meet success criteria ; wayback: success',
    'present: scraped but did not meet success criteria ; wayback: inactive',
    'present: request failed ; wayback: success',
    'present: request failed ; wayback: inactive',
    # present-day page inactive
    'present: inactive ; wayback: url not found in archives',
    'present: inactive ; wayback: success',
    # NOTE(review): the prose above this cell says "wayback: scraped but did not
    # meet success standard" is not treated as successful, yet this entry keeps
    # it for inactive pages; confirm which of the two is intended.
    'present: inactive ; wayback: scraped but did not meet success standard',
    'present: inactive ; wayback: request failed',
    'present: inactive ; wayback: no working archives out of  archives',
    'present: inactive ; wayback: inactive',
    # present-day campaign not found
    'present: campaign not found ; wayback: success',
    'present: campaign not found ; wayback: inactive',
    # this is important one, from old scraping code before we implemented wayback_status
    'none',
]

In [None]:
# .copy() makes dfs_all_decent an independent frame, so later column drops and
# assignments on it cannot raise SettingWithCopyWarning or depend on
# dfs_unique_gfmurl.
dfs_all_decent = dfs_unique_gfmurl.loc[dfs_unique_gfmurl.wayback_status_type.isin(allowed_wayback_status_types),:].copy()
print(dfs_unique_gfmurl.shape,dfs_all_decent.shape)

Distribution of wayback status type

In [None]:
# Number of kept rows per wayback status type
dfs_all_decent.wayback_status_type.value_counts()

Check which wayback_status_type values were excluded, to make sure there's no type we still want to keep 

In [None]:
# Status types present in the sample but absent from the kept rows
kept_types = dfs_all_decent.wayback_status_type.unique()
sampled_types = wayback_types.wayback_status_type
sampled_types[~sampled_types.isin(kept_types)].unique()

In [None]:
# Rebind instead of dropping in place: the frame was sliced out of
# dfs_unique_gfmurl, and errors='ignore' lets this cell be re-run after the
# helper column is already gone.
dfs_all_decent = dfs_all_decent.drop(columns=['wayback_status_type'],errors='ignore')
dfs_all_decent.to_csv(data_io.input_raw/'gfm'/'master'/'rescrape_successful.csv',encoding='utf-8',sep='|')

## 5. Drop duplicates by title,organizer, date created and location

dfs_updated = pd.read_csv(data_io.input_raw/'gfm'/'master'/'rescrape_updated_with_missing_info.csv',encoding='utf-8',sep='|',index_col=[0])

In [None]:
import re
def construct_weird_title_type_pattern():
    """Build the lookup of regex patterns that mark a title as not being a campaign.

    Returns a Series named ``'rtype'`` whose index holds the regex patterns (in
    the order they are tried) and whose values are the type assigned on a match:
    ``'error'``, ``'missing'``, ``'logistic'`` or ``'homepage'``.
    """
    pattern_groups = [
        ('error', [
            r'^Page Not Found$',
            r'^Unknown Error$',
            r'^502 Bad Gateway$',
            r'^404 Not Found$',
            r'^403 Forbidden$',
        ]),
        ('missing', [
            r'^none$',
        ]),
        # gfm logistic
        ('logistic', [
            r'- Local Widget Builder$',
            r'(.*)GoFundMe Support$',
        ]),
        # general home pages
        ('homepage', [
            r'^GoFundMe, le 1er site de crowdfunding pour créer une cagnotte en ligne$',
            r'^GoFundMe : la plateforme gratuite n°1 de la collecte de fonds$',
            r'^GoFundMe, le site n°1 de financement participatif et de collecte de fonds en ligne sans frais de plateforme$',
            r'^Donate Online [|] Make Online Donations to People You Know!$',
            r'^GoFundMe: Top-Website für Crowdfunding und Fundraising$',
            r'^GoFundMe – die weltgrößte Crowdfunding-Seite zum Spendensammeln$',
            r'^Funding(.*)[|] Fundraising - GoFundMe$',
            r'^Raise Money For (.*?)[|](.*?)Fundraising - GoFundMe$',
            r'^Personal & Charity Online Fundraising Websites that WORK!$',
            r'(.*?)Fundraising - Start a Free Fundraiser$',
            r'^Fundraising für (.*?)[|] Sammle Geld für(.*?)[|] GoFundMe$',
            r'^Top Crowdfunding-Seite zum Spendensammeln – GoFundMe$',
            r'^Personal Online Fundraising Websites that Work[!]$',
            r'^Raise Money for YOU!(.*)!',
            r'^GoFundMe:(.*)1',
            r'^Raise money for your(.*?)Ideas!$',
            r'^Raise Money for(.*)[|] GoFundMe$',
            r'^Fundraising Ideas for(.*)',
            r'(.*)Fundraising [|] Raise Money for(.*)[|] GoFundMe$',
            r'(.*)Fundraising: Raise Money for (.*)',
            r'(.*)Fundraising [|] Crowdfunding for(.*)– Free at GoFundMe$',
            # listed a second time in the original table; kept so the Series is unchanged
            r'^Fundraising Ideas for(.*)',
            r'^Find success with these Creative Fundraising Idea$',
            r'^(.*)Fundraising [|] Fundraiser - GoFundMe[!]$',
        ]),
    ]
    rpatterns = [pattern for _, patterns in pattern_groups for pattern in patterns]
    rtypes = [rtype for rtype, patterns in pattern_groups for _ in patterns]
    return pd.Series(rtypes, index=rpatterns, name='rtype')
WEIRD_TITLE_TYPE_PATTERNS = construct_weird_title_type_pattern()

def detect_weird_title_type(x):
    """Classify a title that is not a real campaign title.

    Tries the patterns of ``WEIRD_TITLE_TYPE_PATTERNS`` in order and returns
    ``{'type': <type of the first matching pattern>}``, or ``{'type': nan}``
    when no pattern matches.
    """
    # Series.items() instead of iteritems(): the latter was removed in pandas 2.0
    for rpattern, rtype in WEIRD_TITLE_TYPE_PATTERNS.items():
        if re.search(rpattern, x):
            return {'type': rtype}
    return {'type': np.nan}

def construct_title_pattern():
    """Build the lookup of regex patterns used to split a page title into fields.

    Returns a Series named ``'rtype'`` whose index holds the regex patterns (in
    the order they are tried) and whose values list the field name of each
    capture group, in group order.
    """
    pattern_fields = [
        (r'^Fundraiser by(.*?):(.*)',
         ['organizer', 'campaign_title']),
        (r'(.*)by(.*)- GoFundMe$',
         ['campaign_title', 'organizer']),
        (r'^Fundraiser for(.*?)by(.*?):(.*)',
         ['benefiter', 'organizer', 'campaign_title']),
        (r'^Collecte de fonds pour(.*?)organisée par(.*?):(.*)',
         ['benefiter', 'organizer', 'campaign_title']),
        (r'^Spendenkampagne von(.*?)für(.*?):(.*)',
         ['organizer', 'benefiter', 'campaign_title']),
        # "para X por Y" reads "for X by Y", like the English and French
        # "for ... by ..." patterns above: the first group is the benefiter and
        # the second the organizer (the two labels used to be swapped here).
        (r'^Campanha de arrecadação de fundos para(.*?)por(.*?):(.*)',
         ['benefiter', 'organizer', 'campaign_title']),
        (r'^Campaña de(.*?):(.*)',
         ['organizer', 'campaign_title']),
        (r'^Campanha de arrecadação de fundos de (.*?):(.*)',
         ['organizer', 'campaign_title']),
        (r'^Cagnotte organisée par(.*?):(.*)',
         ['organizer', 'campaign_title']),
        (r'^Spendenkampagne von(.*?):(.*)',
         ['organizer', 'campaign_title']),
        (r'^Collecte de fonds organisée par(.*?):(.*)',
         ['organizer', 'campaign_title']),
        (r'^Inzamelingsactie van(.*?):(.*)',
         ['organizer', 'campaign_title']),
        (r'^Raccolta fondi di(.*?):(.*)',
         ['organizer', 'campaign_title']),
        (r'^Cagnotte pour(.*?)organisée par(.*?):(.*)',
         ['benefiter', 'organizer', 'campaign_title']),
        (r'^Inzamelingsactie voor(.*?)van(.*?):(.*)',
         ['benefiter', 'organizer', 'campaign_title']),
    ]
    rpatterns = [pattern for pattern, _ in pattern_fields]
    rtypes = [fields for _, fields in pattern_fields]
    return pd.Series(rtypes, index=rpatterns, name='rtype')
TITLE_PATTERNS = construct_title_pattern()

# Collapse every run of whitespace (newlines included) into a single space
_remove_newline = lambda x: ' '.join(x.split()).strip()
def parse_title(x):
    """Split a raw page title into benefiter, organizer and campaign title.

    Returns a dict with the keys ``benefiter``, ``organizer``,
    ``campaign_title`` and ``campaign_title_type``; fields that cannot be
    determined stay NaN.

    * ``'none'`` returns all-NaN fields.
    * A title recognised by ``detect_weird_title_type`` only gets that type in
      ``campaign_title_type``.
    * Any other title gets the type ``'campaign'`` and is split with the first
      matching entry of ``TITLE_PATTERNS``; when none matches, the whole
      whitespace-normalised title becomes ``campaign_title``.
    """
    out = {'benefiter': np.nan,'organizer':np.nan,'campaign_title':np.nan,'campaign_title_type':np.nan}
    if x == 'none': return out
    x = _remove_newline(x)
    out['campaign_title_type'] = detect_weird_title_type(x)['type']
    if not pd.isnull(out['campaign_title_type']):
        return out
    out['campaign_title_type'] = 'campaign'
    # Series.items() instead of iteritems(): the latter was removed in pandas 2.0
    for rpattern, rtype in TITLE_PATTERNS.items():
        results = re.findall(rpattern, x)
        if results:
            # findall yields one tuple of groups per match; only the first match is used
            for k, v in zip(rtype, results[0]):
                out[k] = v.strip()
            return out
    out['campaign_title'] = x
    return out

def _clean_whitespace(x):
    """Replace every run of whitespace in `x` with one space and trim both ends."""
    return re.sub(r'\s+', ' ', x).strip()

def contruct_date_pattern():
    """Build the lookup of regex patterns used to parse the "created" text.

    Returns a Series named ``'rtype'`` whose index holds the regex patterns (in
    the order they are tried) and whose values list the field name
    (``'day'``, ``'month'``, ``'year'``) of each capture group, in group order.
    """
    day_month_year = ['day', 'month', 'year']
    month_day_year = ['month', 'day', 'year']
    pattern_fields = [
        (r'Created ([a-zA-Z]+) (\d+), (\d+)', month_day_year),
        # [a-zA-Z], not [a-zA-z]: the range A-z also covers the punctuation
        # characters that sit between 'Z' and 'a'
        (r'Created (\d+) ([a-zA-Z]+) (\d+)', day_month_year),
        (r'Created by .*?on ([a-zA-Z]+) (\d+), (\d+)', month_day_year),
        # the dot after the day is a literal dot, so it is escaped
        (r'Erstellt am (\d+)\. (\S+) (\d+)', day_month_year),
        (r'Date de création : (\d+) (\S+) (\d+)', day_month_year),
        (r'Fecha de creación: (\d+) de (\S+) de (\d+)', day_month_year),
        (r'Creata il (\d+) (\S+) (\d+)', day_month_year),
        (r'Gemaakt op (\d+) (\S+) (\d+)', day_month_year),
        (r'Criada em (\d+) de (\S+) de (\d+)', day_month_year),
        # special case: put this in day field for later processing with archive_timestamp
        # double parenthesis to match regex findall output as other patterns
        (r'Created ((\d+ days ago))', ['day', 'day']),
    ]
    rpatterns = [pattern for pattern, _ in pattern_fields]
    # fresh list per row so no two entries share one list object
    rtypes = [list(fields) for _, fields in pattern_fields]
    return pd.Series(rtypes, index=rpatterns, name='rtype')

DATE_PATTERNS = contruct_date_pattern()

def parse_created_date(x):
    """Extract day, month and year from the raw "created" text of a campaign.

    Returns a dict with the keys ``day``, ``month`` and ``year``. The values
    are the strings captured by the first matching entry of ``DATE_PATTERNS``
    and stay NaN when nothing can be extracted. ``'none'``, texts containing
    ``'Invalid date'`` and the bare word ``'Created'`` return all-NaN without
    further parsing; any other text that matches no pattern is reported with a
    printed message.
    """
    out = {"day": np.nan, "month": np.nan, "year": np.nan}
    if x == 'none': return out
    x = _clean_whitespace(x)
    if 'Invalid date' in x: return out
    if x == 'Created': return out
    # Series.items() instead of iteritems(): the latter was removed in pandas 2.0
    for rpattern, rtype in DATE_PATTERNS.items():
        results = re.findall(rpattern, x)
        if results:
            # findall yields one tuple of groups per match; only the first match is used
            for k, v in zip(rtype, results[0]):
                out[k] = v
            break
    if pd.isna([*out.values()]).all(): print(f'failed to parse {x}')
    return out

#### Parse out title for benefiter, organizer, and campaign_title

In [None]:
# Parse every title, with a progress bar when use_tqdm is on
if use_tqdm:
    tqdm.pandas(desc='Parsing title')
    apply_to_titles = dfs_all_decent.title.progress_apply
else:
    apply_to_titles = dfs_all_decent.title.apply
title_parsed_dicts = apply_to_titles(parse_title)
# One column per parsed field, aligned with the rows of dfs_all_decent
title_parsed_df = pd.DataFrame.from_records(title_parsed_dicts, index=title_parsed_dicts.index)
title_parsed_df.head()

Join parsed results into df

In [None]:
parsed_cols = ['benefiter','organizer','campaign_title','campaign_title_type']
# Drop columns left over from an earlier run before joining the fresh ones on the index
dfs_all_decent = (
    dfs_all_decent
    .drop(columns=parsed_cols, errors='ignore')
    .merge(title_parsed_df[parsed_cols], right_index=True, left_index=True,
           how='left', indicator=True)
)
# See merge results
print(dfs_all_decent._merge.value_counts())
dfs_all_decent = dfs_all_decent.drop(columns='_merge')

#### Parse day,month & year from created_date

In [None]:
# Parse every created_date, with a progress bar when use_tqdm is on
if use_tqdm:
    tqdm.pandas(desc='Parsing date')
    apply_to_dates = dfs_all_decent.created_date.progress_apply
else:
    apply_to_dates = dfs_all_decent.created_date.apply
date_parsed_dicts = apply_to_dates(parse_created_date)
# One column per parsed field, aligned with the rows of dfs_all_decent
date_parsed_df = pd.DataFrame.from_records(date_parsed_dicts, index=date_parsed_dicts.index)

Fix the "days ago" issue in some rows

In [None]:
# Fix the "days ago rows": turn "N days ago" into a calendar date by counting
# back from the archive timestamp.
days_ago_m = date_parsed_df.day.str.find('days ago') > -1
days_ago_rows = date_parsed_df[days_ago_m]
print(date_parsed_df[days_ago_m].head())
# The first 8 characters of archive_timestamp are read as YYYYMMDD (the same
# character positions that were sliced before). Timestamps that do not parse
# become NaT and those rows are left unchanged.
archived_on = pd.to_datetime(
    dfs_all_decent.loc[days_ago_rows.index, 'archive_timestamp'].str[:8],
    format='%Y%m%d', errors='coerce')
days_back = days_ago_rows.day.apply(lambda x: int(x.replace('days ago','')))
# Real date arithmetic: subtracting only the day numbers gave zero or negative
# days whenever the span crossed a month boundary, and never adjusted month/year.
created_on = (archived_on - pd.to_timedelta(days_back, unit='D')).dropna()
# Same string formats as before: unpadded day, two-digit month, four-digit year
archived_ts = pd.DataFrame({
    'day': created_on.dt.day.astype(str),
    'month': created_on.dt.strftime('%m'),
    'year': created_on.dt.strftime('%Y'),
}, index=created_on.index)
# Update day month year fields
date_parsed_df.update(archived_ts,overwrite=True)
print(date_parsed_df[days_ago_m].head())

Map non-English months to standard English months

In [None]:
# Lower-cased month spellings seen in the data -> English month name.
# Grouped by target month; the values keep their original capitalisation.
month_map_dict = {
    # January
    'januar': 'january', 'gennaio': 'january', 'janvier': 'january',
    'enero': 'january', 'januari': 'january',
    # February
    'février': 'february', 'februar': 'february', 'febbraio': 'february',
    'febrero': 'february', 'februari': 'february', 'fevereiro': 'february',
    # March
    'março': 'march', 'mars': 'march', 'märz': 'march', 'marzo': 'march',
    # April
    'abril': 'april', 'avril': 'april', 'nisan': 'April', 'aprile': 'april',
    # May
    'mai': 'may', 'mayo': 'may', 'maggio': 'may', 'maio': 'may', 'mei': 'may',
    # June
    'junho': 'june', 'juin': 'june', 'juni': 'June', 'junio': 'june', 'giugno': 'june',
    # July
    'juli': 'july', 'juillet': 'july',
    # August
    'augustus': 'august', 'agosto': 'august',
    # September ('09' is a numeric month)
    '09': 'september', 'settembre': 'september', 'septiembre': 'september',
    'septembre': 'september',
    # October
    'octobre': 'october', 'oktober': 'october',
    # November
    'novembro': 'november', 'novembre': 'november', 'noviembre': 'november',
    # December
    'décembre': 'december', 'dezember': 'december',
}

In [None]:
# map values
mapped = date_parsed_df.month.str.lower().map(month_map_dict).str.capitalize()
# if value was mapped keep the value, else keep original 
print(date_parsed_df[mapped.notna()].head())
date_parsed_df.month=mapped.where(mapped.notna(),date_parsed_df.month)
print(date_parsed_df[mapped.notna()].head())

Join parsed results into df

In [None]:
date_cols = ['day','month','year']
# Drop date columns left over from an earlier run first (as the title merge
# does); otherwise re-running this cell yields day_x/day_y style columns and
# the column selection below fails.
dfs_all_decent = dfs_all_decent.drop(columns=date_cols,errors='ignore').merge(
    date_parsed_df[date_cols],right_index=True,left_index=True,how='left',indicator=True)
# See merge results
print(dfs_all_decent._merge.value_counts())
dfs_all_decent.drop(columns='_merge',inplace=True)
dfs_all_decent.loc[:,['created_date','day','month','year']].head()

#### Keep best duplicate by campaign_title, organizer, date created, and location

In [None]:
# Lower-cased, trimmed location so that formatting differences do not hide duplicates
dfs_all_decent = dfs_all_decent.assign(cleaned_location=dfs_all_decent.location.apply(lambda x: x.lower().strip()))
# Follow the notebook-wide use_tqdm switch instead of forcing progress bars on:
# tqdm is only imported when that switch is set.
dfs_unique = keep_best_duplicate(dfs_all_decent,subset=['campaign_title','organizer','day','month','year',
                                                        'cleaned_location'],use_tqdm=use_tqdm)
print(dfs_all_decent.shape,dfs_unique.shape)

In [None]:
# Shapes before and after de-duplication
print(dfs_all_decent.shape,dfs_unique.shape)

Check whether these are good criteria for identifying duplicates. We need to do this because the same campaign sometimes has different gfm_urls (when GFM moves campaigns around or posters post twice). 

In [None]:
# Group on the same key that was used for de-duplication
dedup_key_cols = ['campaign_title', 'organizer', 'day', 'month', 'year', 'cleaned_location']
dfs_all_decent_gb = dfs_all_decent.groupby(dedup_key_cols)
# Group sizes, largest first
gb_size = dfs_all_decent_gb.size().sort_values(ascending=False)
# Keys that occur on more than one row
any_nonu = gb_size > 1
nonu_indexes = gb_size.index[any_nonu]

These duplicates should be the same campaign

In [None]:
gb_size[any_nonu] # sizes of the groups that are not unique, largest first

In [None]:
sample_dup=dfs_all_decent_gb.get_group(nonu_indexes[8]) # example of one group: the 9th-largest duplicated key (IndexError if fewer than 9 exist)

In [None]:
# same campaign but diff campaign_id
sample_cols = ['campaign_title', 'organizer', 'day', 'month', 'year',
               'cleaned_location', 'story', 'goal', 'status']
sample_dup[sample_cols]

Optional: sort by year and campaign_id

In [None]:
# Order rows by parsed year, then by campaign_id
dfs_unique = dfs_unique.sort_values(by=['year', 'campaign_id'])

In [None]:
# cleaned_location was only needed for de-duplication; drop it before saving
dfs_unique = dfs_unique.drop(columns='cleaned_location')
dfs_unique.to_csv(
    data_io.input_raw / 'gfm' / 'master' / 'rescrape_no_duplicate.csv',
    encoding='utf-8',
    sep='|',
)

In [None]:
# review: read the saved file back in, every column as string
dfs_unique_in = pd.read_csv(
    data_io.input_raw / 'gfm' / 'master' / 'rescrape_no_duplicate.csv',
    encoding='utf-8',
    sep='|',
    dtype=str,
    index_col=[0],
)