# How often does @realDonaldTrump tweet about California?

In [1]:
# https://github.com/dnanhkhoa/nb_black
%load_ext lab_black

In [2]:
import altair as alt
import pandas as pd
import matplotlib as mpl
import numpy as np
import json
import pytz
from datetime import datetime
import altair_latimes as lat

alt.themes.register("latimes", lat.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('latimes')

In [3]:
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

### Read dump downloaded from trumptweetarchive.com

In [4]:
%run '00-trump-tweets-processing.ipynb'

In [5]:
src = pd.read_csv(
    "/Users/mhustiles/data/github/notebooks/\
trump-tweets/output/realdonaldtrump.csv",
    dtype={"id": str},
    low_memory=False,
)

In [6]:
src["user"] = "realDonaldTrump"

### How many tweets since May 2009?

In [7]:
len(src)

56118

### Clean up dates

In [8]:
# src['eastern_created_at'] = src['date'].dt.time
# src['eastern_created_at'] = src['date'].dt.tz_localize("US/Eastern")
# src['date'] = pd.to_datetime(src['eastern_created_at']).dt.strftime('%m/%d/%Y')
# src['year'] = src['eastern_created_at'].dt.year
# src['month'] = src['eastern_created_at'].dt.month
# src['day'] = src['eastern_created_at'].dt.day
# src['hour'] = src['eastern_created_at'].dt.hour
# src['minute'] = src['eastern_created_at'].dt.minute
# src['time'] = src['eastern_created_at'].dt.time
src["id"] = src["id"].astype(str)
src["year"] = src["year"].astype(str)
src["month"] = src["month"].astype(str)
src["day"] = src["day"].astype(str)
src["hour"] = src["hour"].astype(str)
src["date"] = pd.to_datetime(src["date"])

---

### Since he took office

In [9]:
trumppres = pd.DataFrame(src[src.date >= "01/20/2017"]).sort_values(
    "date", ascending=True
)

---

### Before we look at California, how often did he use specific words

In [10]:
patriots = trumppres[trumppres["text"].str.contains("patriots")]

In [11]:
len(patriots)

21

In [12]:
sad = trumppres[trumppres["text"].str.contains("sad")]

In [13]:
len(sad)

205

In [14]:
rigged = trumppres[trumppres["text"].str.contains("rigged")]

In [15]:
len(rigged)

34

### Let's remove common stopwords from the text variable

In [16]:
stopwords = (
    "i",
    "me",
    "my",
    "myself",
    "we",
    "our",
    "ours",
    "ourselves",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
    "he",
    "him",
    "his",
    "himself",
    "she",
    "her",
    "hers",
    "herself",
    "it",
    "its",
    "itself",
    "they",
    "them",
    "their",
    "theirs",
    "themselves",
    "what",
    "which",
    "who",
    "whom",
    "this",
    "that",
    "these",
    "those",
    "am",
    "is",
    "are",
    "was",
    "were",
    "be",
    "been",
    "being",
    "have",
    "has",
    "had",
    "having",
    "do",
    "does",
    "did",
    "doing",
    "a",
    "an",
    "the",
    "and",
    "but",
    "if",
    "or",
    "because",
    "as",
    "until",
    "while",
    "of",
    "at",
    "by",
    "for",
    "with",
    "about",
    "against",
    "between",
    "into",
    "through",
    "during",
    "before",
    "after",
    "above",
    "below",
    "to",
    "from",
    "up",
    "down",
    "in",
    "out",
    "on",
    "off",
    "over",
    "under",
    "again",
    "further",
    "then",
    "once",
    "here",
    "there",
    "when",
    "where",
    "why",
    "how",
    "all",
    "any",
    "both",
    "each",
    "few",
    "more",
    "most",
    "other",
    "some",
    "such",
    "no",
    "nor",
    "not",
    "only",
    "own",
    "same",
    "so",
    "than",
    "too",
    "very",
    "s",
    "t",
    "can",
    "will",
    "just",
    "don",
    "should",
    "now",
)

---

### What about the tweets related to California

In [17]:
caliwords = [
    "California",
    "Cali",
    "Newsom",
    "Los Angeles",
    "San Diego",
    "earthquake",
    "wildfire",
    "fires",
    "Pelosi",
    "Crazy Nancy",
    "garcetti",
    "Feinstein",
    "Nunes",
    "Schiff",
    "schiff",
    "San Francisco",
    "homeless",
]

### Select only those tweets in the dataframe that mention our CA words

In [18]:
df_cali = src[src["text"].str.contains("|".join(caliwords))]

In [19]:
df_cali[["date", "text"]].sort_values("date", ascending=False).head(10)

Unnamed: 0,date,text
217,2020-12-26,"The lockdowns in Democrat run states are absolutely ruining the lives of so many people - Far more than the damage that would be caused by the China Virus. Cases in California have risen despite the lockdown, yet Florida &amp; others are open &amp; doing well. Common sense please!"
348,2020-12-18,RT @BuckSexton: How much worse off would California be if it had ordered *none* of the covid lockdowns and just told people to be cautious…
643,2020-12-05,"The answer to the Democrat voter fraud is not to stay at home - that’s what Pelosi and Schumer want you to do. If you want revenge on the Democrats for their efforts to steal the Presidential election, where we are fighting hard, you have to show up and vote in RECORD numbers! https://t.co/XAJ0F2JmeL"
854,2020-11-27,"RT @newsmax: Newsom, 'Bumbling Biden' and more - who are the Top 5 BIGGEST TURKEYS of the week? 🦃 - Newsmax TV's Grant Stinchfield @stinchf…"
855,2020-11-27,"RT @newsmax: ""They got caught."" - President Trump fires off on the perpetrators of 'fraud,' and applauds Rudy Giuliani for his efforts. ht…"
55662,2020-11-21,RT @LindseyGrahamSC: Republican control of the Senate is the only thing that stands in the way of Nancy Pelosi’s radical leftist agenda fro…
1779,2020-10-31,"Vote for @NancyMace! Her opponent, Joe Cunningham, is a puppet for Nancy Pelosi and the Radical Left! #SC01 https://t.co/XCtZNrbrT6"
1545,2020-10-28,"Maria is badly needed in Washington. She is an outstanding person who truly loves her Country and her State. Her opponent, @DonnaShalala, is a political hack who is a puppet of Nancy Pelosi. She does nothing for Florida. Maria has my Complete &amp; Total Endorsement! https://t.co/4GumAgOA3J"
1352,2020-10-27,"Rose is a disaster for New York. Not listened to, or respected, in Washington. A puppet for Pelosi! https://t.co/lvs7UnC2bI"
1406,2020-10-27,"Big GDP projected. Pelosi only looking to Bail Out badly run Democrat Cities. Tap, Tap, Taping us along. She has little interest in helping out the “people”."


### Set the data as an index for resampling

In [20]:
df_cali_dt = df_cali.set_index("date")

### Group and count Cali tweets since Trump took office

In [21]:
g = df_cali_dt[df_cali_dt.index > "01-20-2017"].groupby(pd.Grouper(freq="M"))

In [22]:
cali_months = g.count().reset_index()

### Export those counts for graphics

In [23]:
cali_months[["date", "index"]].sort_values("date", ascending=False).to_csv(
    "output/cali-timeline.csv", index=False
)

### How many times did he mention Cali words? 

In [24]:
len(df_cali[df_cali["isRetweet"] == "f"])

669

In [25]:
df_cali.to_csv("output/cali-tweets.csv", index=False)

---

### Let's look at the words in his Cali tweets

In [26]:
text_analysis = df_cali.copy()

In [27]:
text_analysis = text_analysis[text_analysis["isRetweet"] != False]

### Lower case the tweets for easier clustering

In [28]:
text_analysis["text"] = text_analysis["text"].str.lower()

### Remove stopwords from the text field

In [29]:
text_analysis["tweet_without_stopwords"] = text_analysis["text"].apply(
    lambda x: " ".join([word for word in x.split() if word not in (stopwords)])
)

### Split the words, count them and create a dataframe

In [30]:
from collections import Counter

words = Counter(" ".join(text_analysis["tweet_without_stopwords"]).split()).most_common(
    100
)

In [31]:
word_count = pd.DataFrame(words)

In [32]:
word_count.rename(columns={0: "word", 1: "count"}, inplace=True)

In [33]:
word_count.head(20)

Unnamed: 0,word,count
0,rt,548
1,pelosi,273
2,schiff,242
3,nancy,236
4,adam,184
5,"&amp,",150
6,impeachment,123
7,great,116
8,california,115
9,people,94


---

### Let's count specific state mentions

In [34]:
state_names = [
    "Alaska",
    "Alabama",
    "Arkansas",
    "American Samoa",
    "Arizona",
    "California",
    "Colorado",
    "Connecticut",
    "District ",
    "of Columbia",
    "Delaware",
    "Florida",
    "Georgia",
    "Guam",
    "Hawaii",
    "Iowa",
    "Idaho",
    "Illinois",
    "Indiana",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Massachusetts",
    "Maryland",
    "Maine",
    "Michigan",
    "Minnesota",
    "Missouri",
    "Mississippi",
    "Montana",
    "North Carolina",
    "North Dakota",
    "Nebraska",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "Nevada",
    "New York",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Puerto Rico",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Virginia",
    "Virgin Islands",
    "Vermont",
    "Washington",
    "Wisconsin",
    "West Virginia",
    "Wyoming",
]

In [35]:
for s in state_names:
    print(s + "|" + str(src.text.str.count(s).sum()))

Alaska|31
Alabama|130
Arkansas|22
American Samoa|1
Arizona|183
California|221
Colorado|93
Connecticut|25
District |34
of Columbia|0
Delaware|15
Florida|449
Georgia|236
Guam|6
Hawaii|44
Iowa|372
Idaho|11
Illinois|37
Indiana|92
Kansas|40
Kentucky|87
Louisiana|114
Massachusetts|18
Maryland|34
Maine|45
Michigan|230
Minnesota|97
Missouri|52
Mississippi|57
Montana|50
North Carolina|204
North Dakota|14
Nebraska|29
New Hampshire|179
New Jersey|47
New Mexico|25
Nevada|129
New York|543
Ohio|248
Oklahoma|53
Oregon|18
Pennsylvania|351
Puerto Rico|70
Rhode Island|7
South Carolina|148
South Dakota|10
Tennessee|72
Texas|300
Utah|49
Virginia|212
Virgin Islands|3
Vermont|6
Washington|509
Wisconsin|193
West Virginia|55
Wyoming|5


### We don't have Twitter users by state, but use voting pop to normalize

In [36]:
pop = pd.read_csv("input/vap.csv", skiprows=1)

In [37]:
pop = pop.drop([0, 52], axis=0).rename(
    columns={"Voting-Age Population (VAP)": "vap", "Unnamed: 0": "state"}
)

In [38]:
poptrim = pd.DataFrame(pop[["state", "vap"]])

In [39]:
poptrim["vap"] = poptrim["vap"].str.replace(",", "", regex=False).astype(int)
poptrim["state"] = poptrim["state"].str.replace("*", "", regex=False)

### Read our state mentions

In [40]:
statementions = pd.read_csv("input/trump_state_mentions.csv")

### Merge with pop data

In [41]:
mentions_norm = pd.merge(poptrim, statementions, on="state")

### Create a rate, by state, per 100,000 population

In [42]:
mentions_norm["rate_per_100k"] = (mentions_norm["mentions"] * 100000) / mentions_norm[
    "vap"
]

In [43]:
mentions_norm.sort_values("rate_per_100k", ascending=False).head()

Unnamed: 0,state,vap,mentions,rate_per_100k
29,New Hampshire,1115916,179,16.040634
15,Iowa,2439743,372,15.247508
47,Washington,6070046,509,8.385439
26,Montana,851663,50,5.870867
8,District of Columbia,582065,34,5.841272
