# Analyze GiveSendGo data

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import json
import numpy as np
import altair as alt
import os
import glob

In [3]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

## Read data

#### Find, read and concatenate the most recent json files

In [4]:
path_to_json = "increments/new"

json_pattern = os.path.join(path_to_json, "*.json")
file_list = glob.glob(json_pattern)

In [5]:
dfs = []
for file in file_list:
    with open(file) as f:
        json_data = json.loads(f.read())
        dfs.append(json_data)

In [6]:
all_data = []

for d in dfs:
    df = pd.DataFrame(d["returnData"]["donations"])
    all_data.append(df)

new_df = pd.concat(all_data)

In [7]:
len(new_df)

47380

#### Get the previously scraped data

In [8]:
with open("increments/data_increment_536789.json") as f:
    data_backup = json.load(f)

In [9]:
all_data_old = []

for d in data_backup:
    df = pd.DataFrame(d["returnData"]["donations"])
    all_data_old.append(df)

old_df = pd.concat(all_data_old)

#### Combine the two dataframes

In [10]:
df = pd.concat([old_df, new_df]).reset_index(drop=True)

#### Lose columns we don't need and clean up the frame

In [11]:
df.drop(
    [
        "donation_conversion_rate",
        "campaign_id",
        "lovecount",
        "likes",
        "donation_anonymous",
    ],
    axis=1,
    inplace=True,
)

In [12]:
df = df.drop_duplicates().copy()
df.donation_amount = df.donation_amount.astype(float)

---

## Toplines

#### Donations total

In [13]:
df.donation_amount.sum()

8712456.5

#### Average donation

In [14]:
df.donation_amount.mean().round(2)

89.76

#### How many donations?

In [15]:
df.donation_name.count()

97062

#### PCT of the goal? 

In [16]:
((df.donation_amount.sum() / 16000000) * 100).round(2)

54.45

## What's in the donation memo text?

#### Donations with specific amounts

In [18]:
df["mentions_14"] = df["donation_amount"] == 14
df["mentions_16"] = df["donation_amount"] == 16
df["mentions_88"] = df["donation_amount"] == 88

In [19]:
len(df[df["donation_amount"] == 14])

47

#### Donations comments with coded language or specific keywords?

In [20]:
df_w_comment = df[df["donation_comment"] != ""].copy()

In [21]:
df_w_comment["mentions_mandate"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("mandate")
)
df_w_comment["mentions_holdtheline"] = (
    df["donation_comment"].str.lower().str.contains("hold the line")
)
df_w_comment["mentions_brandon"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("brandon")
)
df_w_comment["mentions_wwg1wga"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("wwg1wga")
)
df_w_comment["mentions_wwg1wga"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("wwg1wga")
)
df_w_comment["mentions_trump"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("trump")
)
df_w_comment["mentions_honk"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("honk honk")
)
df_w_comment["mentions_freedom"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("freedom")
)
df_w_comment["mentions_tyranny"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("tyranny")
)
df_w_comment["mentions_trudeau"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("trudeau")
)
df_w_comment["mentions_biden"] = (
    df_w_comment["donation_comment"].str.lower().str.contains("biden")
)

#### List of mention categories

In [22]:
boolean = [
    "mentions_mandate",
    "mentions_holdtheline",
    "mentions_brandon",
    "mentions_14",
    "mentions_16",
    "mentions_88",
    "mentions_wwg1wga",
    "mentions_trump",
    "mentions_trudeau",
    "mentions_biden",
    "mentions_honk",
    "mentions_freedom",
    "mentions_tyranny",
]

#### PCT mentioning a keyword

In [23]:
for b in boolean:
    print((df[b].value_counts()))

KeyError: 'mentions_mandate'

In [None]:
df_w_comment

---

## Dates

#### The dates are vague so categorize them

In [None]:
df.donation_date = (
    df.donation_date.str.replace("6 days ago", "2022-02-02", regex=False)
    .str.replace("5 days ago", "2022-02-03", regex=False)
    .str.replace("4 days ago", "2022-02-04", regex=False)
    .str.replace("3 days ago", "2022-02-05", regex=False)
    .str.replace("2 days ago", "2022-02-06", regex=False)
    .str.replace("1 days ago", "2022-02-07", regex=False)
)

#### The data have also been scraped incrementally, so we have to lump the more recent stuff in a larger bucket

In [None]:
df["donation_date"] = np.where(
    ~df["donation_date"].str.contains("2022", na=False),
    "2022-08-08 or later",
    df["donation_date"],
)

#### Summarize donations by our categories

In [None]:
dates = (
    df.groupby(["donation_date"])
    .agg({"donation_id": "count", "donation_amount": sum})
    .reset_index()
).rename(columns={"donation_id": "count", "donation_amount": "sum"})

In [None]:
dates

#### Export the dates

In [None]:
dates.to_csv("data/processed/dates.csv", index=False)

---

## Distribution

In [None]:
len(df[df["donation_amount"] >= 5000])

In [None]:
df[df["donation_amount"] >= 5000].sort_values("donation_amount").head()

---

In [None]:
df["donation_name"] = df["donation_name"].str.strip().str.lower()

In [None]:
df[df["donation_name"].str.contains("processed but not")].iloc[0]

### Group to count and sum donations by donor name

In [None]:
df.groupby(["donation_name"]).agg(
    {"donation_amount": [np.size, np.sum]}
).reset_index().sort_values(("donation_amount", "sum"), ascending=False).head(40)

In [None]:
### How does this compare to other campaigns

In [None]:
df.groupby(["donation_name"]).agg(
    {"donation_amount": [np.size, np.sum]}
).reset_index().sort_values(("donation_amount", "size"), ascending=False).head(20)

In [None]:
# 100 biggest donations
# 100 first donations

In [None]:
df.sort_values("donation_amount", ascending=False).head(100).to_csv(
    "data/processed/100_largest_donations.csv", index=False
)

In [None]:
df.sort_values("donation_date", ascending=True).head(100).to_csv(
    "data/processed/100_earliest_donations.csv", index=False
)

In [None]:
df.sort_values("donation_date", ascending=True).to_csv(
    "data/processed/all_donations.csv", index=False
)

In [None]:
weird_entries = [
    481566,
    474249,
    492432,
    471986,
    507120,
    499386,
    498521,
    484448,
    493519,
    476071,
    521675,
    516745,
]

In [None]:
df[df["donation_id"].isin(weird_entries)]

In [None]:
df[df["donation_name"].str.contains("processed but not recorded")].iloc[0]

---

In [None]:
df.head()

### Get a dataframe listing all the words mentioned in the 'comment' column and clean it up

In [None]:
words = (
    pd.DataFrame(
        df["donation_comment"]
        .str.replace(",", "", regex=False)
        .str.replace("|", "", regex=False)
        .str.replace("&", "", regex=False)
        .str.replace("-", "", regex=False)
        .str.strip()
        .str.split(expand=True)
        .stack()
        .value_counts(),
        columns=["count"],
    )
    .reset_index()
    .rename(columns={"index": "word"})
)

### Lowercase words

In [None]:
words["word"] = words["word"].str.lower()

### Group by animal and sum

In [None]:
words_grouped = (
    words.groupby("word")
    .agg({"count": sum})
    .reset_index()
    .sort_values("count", ascending=False)
)

### Share for each animal out of all the animals

In [None]:
words_grouped["share"] = (
    (words_grouped["count"] / words_grouped["count"].sum()) * 100
).round(2)

In [None]:
words_grouped.head(20)

In [None]:
df.head()