# Analyze GiveSendGo data

In [69]:
%load_ext lab_black

In [6]:
import pandas as pd
import json
import numpy as np
import altair as alt
import os
import glob

In [7]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

### Read and concatenate the most recent json files

In [49]:
path_to_json = 'increments/new' 

json_pattern = os.path.join(path_to_json,'*.json')
file_list = glob.glob(json_pattern)

In [50]:
dfs = []
for file in file_list:
    with open(file) as f:
        json_data = (json.loads(f.read()))
        dfs.append(json_data)

In [51]:
all_data = []

for d in dfs:
    df = pd.DataFrame(d["returnData"]['donations'])
    all_data.append(df)

new_df = pd.concat(all_data)

In [52]:
len(new_df)

26750

### Get the previously scraped data

In [55]:
with open("increments/data_increment_536789.json") as f:
    data_backup = json.load(f)

In [56]:
all_data_old = []

for d in data_backup:
    df = pd.DataFrame(d["returnData"]["donations"])
    all_data_old.append(df)

old_df = pd.concat(all_data_old)

### Combine the two dataframes

In [57]:
df = pd.concat([old_df, new_df])

In [60]:
len(df)

83033

In [61]:
df = df.drop_duplicates().copy()

In [62]:
df.donation_amount = df.donation_amount.astype(float)

In [63]:
df.donation_amount.sum()

7495548.5

In [64]:
df.donation_amount.mean().round(2)

90.27

In [65]:
df.donation_name.count()

83033

In [66]:
df.sort_values("donation_id").head()

Unnamed: 0,donation_id,campaign_id,donation_amount,donation_comment,donation_conversion_rate,donation_name,donation_anonymous,donation_date,lovecount,likes
0,463549,49000,100.0,We support the Truckers standing up for our freedom.,1.0,Jeff Brain,0,6 days ago,0,0
0,464203,49000,100.0,God Bless you and Keep you safe! Freedom for All!,1.0,Edwards,0,6 days ago,0,0
2,464220,49000,25.0,Thank you! God bless you all and keep you safe. Godspeed you on your mission! ‚ù§Ô∏èüòÄ‚ù§Ô∏è,1.0,MaryEllen Stevens,0,6 days ago,0,0
3,464221,49000,10.0,Glory to God.,1.0,Our turn.,0,6 days ago,0,0
1,464223,49000,20.0,Go Truckers!,1.0,,0,6 days ago,0,0


### Dates

In [67]:
df.donation_date = (
    df.donation_date.str.replace("6 days ago", "2022-02-02", regex=False)
    .str.replace("5 days ago", "2022-02-03", regex=False)
    .str.replace("4 days ago", "2022-02-04", regex=False)
    .str.replace("3 days ago", "2022-02-05", regex=False)
    .str.replace("2 days ago", "2022-02-06", regex=False)
    .str.replace("1 days ago", "2022-02-07", regex=False)
)

In [68]:
df.donation_date

0    2022-02-02
0    2022-02-02
0    2022-02-02
1    2022-02-02
2    2022-02-02
        ...    
3     8 hrs ago
4     8 hrs ago
5     8 hrs ago
6     8 hrs ago
7     8 hrs ago
Name: donation_date, Length: 83033, dtype: object

In [69]:
# df["donation_date"] = pd.to_datetime(df["donation_date"])

In [70]:
# df["date_clean"] = df["donation_date"].dt.date

In [71]:
df.dtypes

donation_id                   int64
campaign_id                   int64
donation_amount             float64
donation_comment             object
donation_conversion_rate     object
donation_name                object
donation_anonymous            int64
donation_date                object
lovecount                     int64
likes                         int64
dtype: object

In [72]:
dates = (
    df.groupby(["donation_date"])
    .agg({"donation_id": "count", "donation_amount": sum})
    .reset_index()
)

In [73]:
dates.rename(columns={"donation_id": "count", "donation_amount": "sum"}, inplace=True)

In [74]:
dates

Unnamed: 0,donation_date,count,sum
0,1 hrs ago,848,146879.0
1,10 hrs ago,2183,173818.0
2,10 mins ago,9,522.0
3,11 hrs ago,977,69699.0
4,11 mins ago,33,1441.0
5,12 hrs ago,452,28422.0
6,12 mins ago,23,1890.0
7,13 hrs ago,224,15820.0
8,13 mins ago,27,2182.0
9,14 hrs ago,171,10368.0


In [75]:
alt.Chart(dates).mark_bar(width=20).encode(
    x=alt.X(
        "donation_date:T",
        axis=alt.Axis(format="%b. %d", tickCount=6),
        title="Donation date",
    ),
    y=alt.Y("sum", axis=alt.Axis(tickCount=6), title="Dollars raised"),
)

---

In [76]:
df["donation_name"] = df["donation_name"].str.strip().str.lower()

In [77]:
df.loc[df["donation_anonymous"] > 1, "donation_name"] = "anonymous"

### Group to count and sum donations by donor name

In [78]:
df.groupby(["donation_name"]).agg(
    {"donation_amount": [np.size, np.sum]}
).reset_index().sort_values(("donation_amount", "sum"), ascending=False).head(20)

Unnamed: 0_level_0,donation_name,donation_amount,donation_amount
Unnamed: 0_level_1,Unnamed: 1_level_1,size,sum
0,,42011,3479244.5
24204,processed but not recorded.,1,215000.0
8650,easy kleen pressure systems ltd,1,75000.0
1586,anonymous,499,42931.0
10057,freedom,324,25906.0
12345,holden and carey rhodes,1,25000.0
32249,www the range langley com,1,18000.0
993,american cryptocurrency compatriot,1,17760.0
10232,freedom lover,82,14856.0
10095,freedom convoy,92,9778.0


In [79]:
df.groupby(["donation_name"]).agg(
    {"donation_amount": [np.size, np.sum]}
).reset_index().sort_values(("donation_amount", "size"), ascending=False).head(20)

Unnamed: 0_level_0,donation_name,donation_amount,donation_amount
Unnamed: 0_level_1,Unnamed: 1_level_1,size,sum
0,,42011,3479244.5
1586,anonymous,499,42931.0
10057,freedom,324,25906.0
16092,justin trudeau,104,7625.0
10095,freedom convoy,92,9778.0
10232,freedom lover,82,14856.0
28041,steve,73,4498.0
21155,mike,60,4233.0
6874,david,59,3899.0
15004,john,57,5749.0


In [80]:
# 100 biggest donations
# 100 first donations

In [81]:
df.sort_values("donation_amount", ascending=False).head(100).to_csv(
    "data/processed/100_largest_donations.csv", index=False
)

In [82]:
df.sort_values("donation_date", ascending=True).head(100).to_csv(
    "data/processed/100_earliest_donations.csv", index=False
)