# Scraping former President Trump's 'desk'

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime as dt

In [3]:
import altair as alt
import altair_latimes as lat
import matplotlib.pyplot as plt

In [4]:
alt.themes.register("latimes", lat.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('latimes')

In [5]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.options.display.max_colwidth = None

---

### XyXy

In [47]:
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}

soups = []

for i in range(0, 110, 10):
    r = requests.get("https://www.donaldjtrump.com/desk/P" + str(i), headers=header)
    soups.append(BeautifulSoup(r.text, "html.parser"))

In [62]:
pages = []

for s in soups:
    pages.append(s.find("div", class_="ftdli-main ftd-d"))

In [63]:
for r in pages:
    print(r)

<div class="ftdli-main ftd-d">
<div class="ftdli-main-top ftd-d">
<div class="title ftd-d" onclick="location.href='/desk/desk-893mckjcbm/';">
<h2>Donald J. Trump</h2>
</div>
<div class="date ftd-d">
<p>6:13pm May 28, 2021</p>
</div>
</div>
<div class="ftdli-main-content ftd-d">
<p class="ftd-post-text" id="ftd-post-text-231" onclick="location.href='/desk/desk-893mckjcbm/';">Why are the Radical Left Democrats in Georgia fighting so hard that there not be a Forensic Audit of 150,000 absentee ballots in Fulton County? There can be only one reason, and that is because they know the vote was corrupt and the audit will show it. Republicans must fight hard and win!</p>
</div>
</div>
<div class="ftdli-main ftd-d">
<div class="ftdli-main-top ftd-d">
<div class="title ftd-d" onclick="location.href='/desk/desk-zznr3fcqjn/';">
<h2>Donald J. Trump</h2>
</div>
<div class="date ftd-d">
<p>9:54am May 25, 2021</p>
</div>
</div>
<div class="ftdli-main-content ftd-d">
<p class="ftd-post-text" id="ftd-pos

### Grab everything from each post div

In [57]:
data = []
for r in pages:
    if r.find("img") is not None:
        image = r.find("img")["src"]
    else:
        image = ""
    post_url = r.find("div", class_="title ftd-d").get("onclick")
    post = r.find("p", class_="ftd-post-text").text
    author = r.find("h2").text
    date = r.find("div", class_="date ftd-d").text
    raw = r.find("p", class_="ftd-post-text").text
    data.append(
        dict(
            date=date,
            url=post_url,
            author=author,
            post=post,
            image=image,
        )
    )

AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

### First item from the dictionary

In [46]:
len(data)

11

### Clean up before importing as a dataframe

In [None]:
for d in data:
    d["date"] = d["date"].replace("\n", "")
    d["url"] = (
        d["url"]
        .replace("location.href='", "https://www.donaldjtrump.com")
        .replace("/';", "")
    )

In [None]:
src = pd.DataFrame(data)

In [None]:
len(src)

---

### Pull in early posts

In [None]:
archive_df = pd.read_csv("input/archive.csv")

In [None]:
archive_df.drop(["video"], axis=1, inplace=True)

In [None]:
df = pd.concat([src, archive_df]).drop_duplicates(subset="url", keep="first")

### How many posts total? 

In [None]:
len(archive_df)

### Last five posts

In [None]:
df.head()

### How many mention 'election'?

In [None]:
df["election"] = df["post"].str.contains("election") | df["post"].str.contains(
    "Election"
)

In [None]:
len(df[df["election"] == True])

### Clean up the dates

In [None]:
df["fulldate"] = pd.to_datetime(df["date"])
df["date"] = df["fulldate"].dt.date
df["time"] = df["fulldate"].dt.time

In [None]:
post_urls = list(df["url"])

---

### Posts per day 

In [None]:
election = df.groupby(["date", "election"]).agg({"author": "size"}).reset_index()

In [None]:
election.head()

In [None]:
daily = df.groupby(["date"])["author"].count().reset_index(name="count")

In [None]:
daily.rename(columns={"author": "count"}, inplace=True)

In [None]:
daily["seven-day-avg"] = daily["count"].rolling(7).mean()

In [None]:
daily["date"] = pd.to_datetime(daily["date"])

In [None]:
daily.sort_values("count", ascending=False).head()

### Chart it!

In [None]:
bars = (
    alt.Chart(
        daily,
        title="Trump posts to the 'desk' since it launched",
    )
    .mark_bar(size=10)
    .encode(
        x=alt.X(
            "date:T",
            axis=alt.Axis(grid=False, title="", tickCount=5, format=("%B %-d")),
        ),
        y=alt.Y(
            "count:Q",
            scale=alt.Scale(domain=(0, len("count"))),
            axis=alt.Axis(
                gridColor="#dddddd",
                offset=6,
                tickSize=0,
                domainOpacity=0,
                tickCount=3,
                title="Daily post count and seven-day average",
            ),
        ),
    )
)

rolling = (
    alt.Chart(daily)
    .mark_line(color="red")
    .encode(
        y="seven-day-avg",
        x=alt.X(
            "date:T",
            axis=alt.Axis(grid=False, title="", tickCount=5, format=("%B %-d")),
        ),
    )
)

(bars + rolling).properties(height=350, width=600).configure_view(strokeOpacity=0)

In [None]:
election["date"] = pd.to_datetime(election["date"])

In [None]:
bars_elex = (
    alt.Chart(
        election,
        title="Trump posts to the 'desk' re: election",
    )
    .mark_bar(size=10)
    .encode(
        x=alt.X(
            "date:T",
            axis=alt.Axis(grid=False, title="", tickCount=5, format=("%B %-d")),
        ),
        y=alt.Y(
            "author:Q",
            scale=alt.Scale(domain=(0, len("author:Q"))),
            axis=alt.Axis(
                gridColor="#dddddd",
                offset=6,
                tickSize=0,
                domainOpacity=0,
                tickCount=3,
                title="Daily post count",
            ),
        ),
        color=alt.Color(
            "election",
            title="About election?",
            scale=alt.Scale(domain=["true", "false"], range=["#f1a340", "#998ec3"]),
        ),
    )
)

(bars_elex).properties(height=350, width=600).configure_view(strokeOpacity=0)

In [None]:
(bars + rolling).properties(height=350, width=600).configure_view(strokeOpacity=0).save(
    "visuals/daily_posts.png"
)

In [None]:
(bars_elex).properties(height=350, width=600).configure_view(strokeOpacity=0).save(
    "visuals/daily_posts_re_election.png"
)

---

### Exports

In [None]:
today = dt.date.today().strftime("%m-%d-%Y")

In [None]:
df.to_csv("archive/posts_" + str(today) + ".csv", index=False)
df.to_csv("output/allposts.csv", index=False)