# Scraping former President Trump's 'desk'

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime as dt
import tweepy

In [3]:
import altair as alt
import altair_latimes as lat
import matplotlib.pyplot as plt

In [4]:
# Register the theme shipped by altair_latimes under the name "latimes",
# then make it the active theme for every chart rendered below.
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [5]:
# Turn off Altair's max-rows check so larger frames can be charted.
# (Kept off the last line so the cell produces no output.)
alt.data_transformers.disable_max_rows()

# pandas display: show up to 100 columns / 1000 rows and never truncate cell text,
# so full post bodies are readable in the rendered tables.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

---

### Read the page

In [6]:
# Fetch the page. The timeout keeps a hung connection from blocking the notebook
# forever, and raise_for_status() stops the run on an HTTP error instead of
# silently parsing an error page into zero posts.
r = requests.get("https://www.donaldjtrump.com/desk", timeout=30)
r.raise_for_status()

# Parse the returned HTML with the standard-library parser backend.
soup = BeautifulSoup(r.text, "html.parser")

### Grab everything from each post div

In [7]:
# One element per post: every <div> whose class attribute is "ftdli-main ftd-d".
rows = soup.find_all("div", class_="ftdli-main ftd-d")

In [8]:
# Build one record (dict) per post div. The fields are still raw at this point:
# "date" carries surrounding newlines and "url" is the div's onclick JavaScript.
data = []
for row in rows:
    # Look the image up once; text-only posts have no <img>, stored as "".
    img = row.find("img")
    post_text = row.find("p", class_="ftd-post-text").text
    data.append(
        dict(
            date=row.find("div", class_="date ftd-d").text,
            url=row.find("div", class_="title ftd-d").get("onclick"),
            author=row.find("h2").text,
            post=post_text,
            image=img["src"] if img is not None else "",
        )
    )

### First item from the list of post dictionaries

In [9]:
# Spot-check the first scraped record while its fields are still raw.
data[0]

{'date': '\n9:48am May 6, 2021\n',
 'url': "location.href='/desk/desk-dqvrd5gscw/';",
 'author': 'Donald J. Trump',
 'post': 'Congratulations to the great Patriots of Windham, New Hampshire for their incredible fight to seek out the truth on the massive Election Fraud which took place in New Hampshire and the 2020 Presidential Election. The spirit for transparency and justice is being displayed all over the Country by media outlets which do not represent Fake News. People are watching in droves as these Patriots work tirelessly to reveal the real facts of the most tainted and corrupt Election in American history. Congratulations Windham—look forward to seeing the results.',
 'image': ''}

### Clean up before importing as a dataframe

In [10]:
# Normalise each record in place before it goes into a DataFrame.
for record in data:
    # The scraped "url" is an onclick handler such as
    # "location.href='/desk/desk-xxxx/';" -- swap the JS prefix for the site
    # root, then trim the trailing "/';".
    onclick = record["url"]
    absolute = onclick.replace("location.href='", "https://www.donaldjtrump.com")
    record["url"] = absolute.replace("/';", "")

    # The date text comes wrapped in newlines.
    record["date"] = record["date"].replace("\n", "")

In [11]:
# Freshly scraped posts as a DataFrame: one row per record, one column per dict key.
src = pd.DataFrame(data)

---

### Pull in early posts

In [12]:
# Load the archive of earlier posts from disk.
archive_df = pd.read_csv("input/archive.csv")

In [13]:
# Drop the archive-only "video" column so the archive can be concatenated with
# the scraped frame. Reassigning (instead of inplace=True) with errors="ignore"
# keeps this cell safe to re-run: a second run no longer raises KeyError
# because the column is already gone.
archive_df = archive_df.drop(columns=["video"], errors="ignore")

In [14]:
# Stack the fresh scrape on top of the archive, then keep the first row seen for
# each URL -- so when a post appears in both, the scraped copy wins.
combined = pd.concat([src, archive_df])
df = combined.drop_duplicates(subset="url", keep="first")

In [15]:
# Total posts after merging the scrape with the archive and de-duplicating by URL.
len(df)

45

### Five most recent posts

In [16]:
# First five rows of the combined frame (scraped rows come before archive rows).
df.head(5)

Unnamed: 0,date,url,author,post,image
0,"9:48am May 6, 2021",https://www.donaldjtrump.com/desk/desk-dqvrd5gscw,Donald J. Trump,"Congratulations to the great Patriots of Windham, New Hampshire for their incredible fight to seek out the truth on the massive Election Fraud which took place in New Hampshire and the 2020 Presidential Election. The spirit for transparency and justice is being displayed all over the Country by media outlets which do not represent Fake News. People are watching in droves as these Patriots work tirelessly to reveal the real facts of the most tainted and corrupt Election in American history. Congratulations Windham—look forward to seeing the results.",
1,"12:27pm May 5, 2021",https://www.donaldjtrump.com/desk/desk-sg5yzrmuvr,Donald J. Trump,,https://cdn.donaldjtrump.com/djtweb/general/SA_EliseStefanik_EndorsementGraphic_Twitter.jpg
2,"11:45am May 5, 2021",https://www.donaldjtrump.com/desk/desk-s9g4x7x8zk,Donald J. Trump,"Liz Cheney is a warmongering fool who has no business in Republican Party Leadership. We want leaders who believe in the Make America Great Again movement, and prioritize the values of America First. Elise Stefanik is a far superior choice, and she has my COMPLETE and TOTAL endorsement for GOP Conference Chair. Elise is a tough and smart communicator!",
3,"11:21am May 5, 2021",https://www.donaldjtrump.com/desk/desk-htrykas6u6,Donald J. Trump,"What Facebook, Twitter, and Google have done is a total disgrace and an embarrassment to our Country. Free Speech has been taken away from the President of the United States because the Radical Left Lunatics are afraid of the truth, but the truth will come out anyway, bigger and stronger than ever before. The People of our Country will not stand for it! These corrupt social media companies must pay a political price, and must never again be allowed to destroy and decimate our Electoral Process.",
4,"9:51am May 5, 2021",https://www.donaldjtrump.com/desk/desk-w5tycmjzr6,Donald J. Trump,"Warmonger Liz Cheney, who has virtually no support left in the Great State of Wyoming, continues to unknowingly and foolishly say that there was no Election Fraud in the 2020 Presidential Election when in fact, the evidence, including no Legislative approvals as demanded by the U.S. Constitution, shows the exact opposite. Had Mike Pence referred the information on six states (only need two) back to State Legislatures, and had gutless and clueless MINORITY Leader Mitch McConnell (he blew two seats in Georgia that should have never been lost) fought to expose all of the corruption that was presented at the time, with more found since, we would have had a far different Presidential result, and our Country would not be turning into a socialist nightmare! Never give up!",


### How many mention 'election'?

In [17]:
# Posts whose text contains "election" or "Election" (same two spellings as
# before, now matched in a single regex pass). na=False makes posts with no
# text -- e.g. image-only posts -- an explicit non-match instead of leaving NaN
# values in the boolean mask.
election = df[df["post"].str.contains("[Ee]lection", regex=True, na=False)]

In [18]:
# Number of posts that mention the election.
len(election)

25

### Clean up the dates

In [19]:
# Parse the timestamp once, then split it into date and time columns.
# This cell overwrites "date" with date-only values, so on a re-run we must
# parse from the already-computed "fulldate"; re-parsing "date" would silently
# reset every time to midnight.
timestamp_source = df["fulldate"] if "fulldate" in df.columns else df["date"]
timestamps = pd.to_datetime(timestamp_source)

df["fulldate"] = timestamps
df["date"] = timestamps.dt.date
df["time"] = timestamps.dt.time

In [20]:
# Plain Python list of every post URL in the combined frame.
post_urls = df["url"].tolist()

---

### Posts per day 

In [21]:
# Posts per calendar day: count the non-null "author" values in each date group
# and expose the result as a two-column frame (date, count).
daily = df.groupby("date")["author"].count().rename("count").reset_index()

In [22]:
# Convert the date objects back to pandas datetimes so the chart's temporal
# ("date:T") axis gets a proper datetime column.
daily = daily.assign(date=pd.to_datetime(daily["date"]))

In [23]:
# Busiest days first.
daily.sort_values("count", ascending=False).head()

Unnamed: 0,date,count
25,2021-05-05,4
23,2021-05-03,4
8,2021-04-07,3
19,2021-04-27,3
9,2021-04-08,3


### Chart it!

In [24]:
# Axis styling, pulled out so the encodings below stay readable.
x_axis = alt.Axis(grid=False, title="", tickCount=5, format="%B %-d")
y_axis = alt.Axis(
    gridColor="#dddddd",
    offset=6,
    tickSize=0,
    domainOpacity=0,
    tickCount=3,
    title="Daily post count and mean",
)

# Bars: one per day, height = number of posts. The y domain is fixed at 0-5.
lines = (
    alt.Chart(daily, title="Trump posts to the 'desk' since it launched")
    .mark_bar(size=10)
    .encode(
        x=alt.X("date:T", axis=x_axis),
        y=alt.Y("count:Q", scale=alt.Scale(domain=(0, 5)), axis=y_axis),
    )
)

# Horizontal red rule at the mean daily post count.
rule = alt.Chart(daily).mark_rule(color="red").encode(y="mean(count):Q")

# Numeric label for the rule, nudged right (dx) and down (dy) from its anchor.
# TODO: would like to prefix the value with an "Average: " annotation.
text = rule.mark_text(
    align="center", baseline="middle", dx=220, dy=10, fontWeight="bold"
).encode(text=alt.Text("mean(count):Q", format=".2"))

# Layer bars + rule + label; hide the view border. Last expression => rendered.
(lines + rule + text).properties(height=350, width=600).configure_view(strokeOpacity=0)

In [28]:
# Rebuild the same layered chart that is displayed above and write it out as a PNG.
chart = (
    (lines + rule + text)
    .properties(height=350, width=600)
    .configure_view(strokeOpacity=0)
)
chart.save("visuals/daily_posts.png")

---

### Exports

In [26]:
# Today's date as MM-DD-YYYY, used to stamp the dated archive filename.
today = f"{dt.date.today():%m-%d-%Y}"

In [27]:
# Write two copies of the combined posts: a date-stamped snapshot under archive/
# and the rolling "latest" file under output/. `today` is already a string.
dated_path = "archive/posts_" + today + ".csv"
latest_path = "output/allposts.csv"

for csv_path in (dated_path, latest_path):
    df.to_csv(csv_path, index=False)