# Scraping Twitter mentions of "bejesus'

### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import jenkspy
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

In [3]:
%matplotlib inline
import json
import numpy as np
import altair as alt
import altair_latimes as lat

In [4]:
alt.themes.register("latimes", lat.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('latimes')

In [5]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_colwidth", None)

In [6]:
# https://medium.com/better-programming/how-to-scrape-tweets-with-snscrape-90124ed006af

In [7]:
# https://github.com/JustAnotherArchivist/snscrape

### Use snscrape to grab tweets mentioning a search term, in this case "doomscrolling"

In [8]:
# use --progress to see results in 100 tweet intervals

In [9]:
# !snscrape --jsonl twitter-search "bejesus until:2021-02-13 since:2010-01-01" > input/search/bejesus-tweets.json

In [10]:
src = pd.read_json("input/search/bejesus-tweets.json", lines=True)

In [11]:
len(src)

214904

### Process dates

In [12]:
src["date"] = pd.to_datetime(src["date"], utc=True)
src["pacific_created_at"] = pd.to_datetime(src["date"], utc=True).dt.tz_convert(
    "America/Los_Angeles"
)

In [13]:
src["date"] = pd.to_datetime(src["pacific_created_at"]).dt.strftime("%m/%d/%Y")
src["date"] = pd.to_datetime(src["date"])
src["year"] = src["pacific_created_at"].dt.year
src["month"] = src["pacific_created_at"].dt.month
src["day"] = src["pacific_created_at"].dt.day
src["hour"] = src["pacific_created_at"].dt.hour
src["minute"] = src["pacific_created_at"].dt.minute
src["time"] = src["pacific_created_at"].dt.time

In [14]:
src["id"] = src["id"].astype(str)
src["year"] = src["year"].astype(str)
src["month"] = src["month"].astype(str)
src["day"] = src["day"].astype(str)
src["hour"] = src["hour"].astype(str)

### Make a copy of the dataframe and ensure the text of the tweet contains 'doomscrolling'

In [15]:
bejesus = src[src["content"].str.contains("bejesus")].copy()

### Unpack the nested json

In [16]:
data_dict = bejesus.to_dict("records")

In [18]:
bejesus["user_name"] = pd.json_normalize(bejesus["user"])["username"]
bejesus["user_displayname"] = pd.json_normalize(bejesus["user"])["displayname"]
bejesus["user_verified"] = pd.json_normalize(bejesus["user"])["verified"]
bejesus["user_description"] = pd.json_normalize(bejesus["user"])["description"]
bejesus["user_followersCount"] = pd.json_normalize(bejesus["user"])["followersCount"]
bejesus["user_location"] = pd.json_normalize(bejesus["user"])["location"]

### Slim down the dataframe

In [22]:
bejesus_slim = bejesus[
    [
        "date",
        "content",
        "url",
        "replyCount",
        "retweetCount",
        "likeCount",
        "quoteCount",
        "user_name",
        "user_displayname",
        "user_verified",
        "user_description",
        "user_followersCount",
        "user_location",
        "month",
        "day",
        "hour",
        "minute",
        "time",
        "year",
    ]
]

In [24]:
bejesus_slim.year.value_counts()

---

In [31]:
daily = bejesus_slim.groupby(["date"]).agg("size").reset_index(name="count")

In [32]:
daily.head()

Unnamed: 0,date,count
0,2009-12-31,6
1,2010-01-01,19
2,2010-01-02,16
3,2010-01-03,20
4,2010-01-04,26


In [37]:
daily.sort_values("count", ascending=False).head(5)

Unnamed: 0,date,count
2617,2017-03-01,291
2018,2015-07-11,259
1030,2012-10-26,230
2823,2017-09-23,215
1035,2012-10-31,196


In [38]:
bejesus_slim.head()

Unnamed: 0,date,content,url,replyCount,retweetCount,likeCount,quoteCount,user_name,user_displayname,user_verified,user_description,user_followersCount,user_location,month,day,hour,minute,time,year
1,2021-02-12,@Rozkez67 The bit where the boat turns over with the charred body....I saw a double bill of the first two in the cinema when I was about 12....that bit scared the bejesus out of me lol,https://twitter.com/Alfie19892/status/1360337605523623941,0,0,1,0,CarrotTheTurkey,CarrotsTheTurkey,False,Fighting for my right to live as a free turkey. Count all legal votes!,8.0,"Washington, DC",2,12,13,18,13:18:53,2021
2,2021-02-12,Bejesus! Ned Ryerson didn’t say much during #ImpeachmentTrial2 #bejesus https://t.co/jE44lhSNjC,https://twitter.com/CarrotTheTurkey/status/1360328159669800962,0,0,0,0,hurdville23,Chris,False,"Professional napper,, Xbox fanatic, artist, cook, movie buff and all around good guy, getting through life one day at a time, barely.",63.0,,2,12,12,41,12:41:21,2021
3,2021-02-12,@majornelson @LittleNights Pyramid head from Silent Hill scared the bejesus out of me😱#freecodefridaycontest,https://twitter.com/hurdville23/status/1360324508163448834,0,0,0,0,raykwong,Ray Kwong,True,"WSJ caption contest winner. HIIT | Space | Clean Energy | Also, US-China, USC. Beer snob. RT/Like ≠ endorsement. #ENDALZ #NASASocial",44650.0,Pacific Rim,2,12,12,26,12:26:51,2021
6,2021-02-12,"@educated_guest In short, so far Trump’s defense showed a video with scary music, presented old conspiracy theories, created new conspiracy theories, and used the word “bejesus.”",https://twitter.com/raykwong/status/1360321274438799360,3,0,5,0,LoriLouHoo1974,LoriAnn💙💙💙💙💞🌊⭐🇺🇸,False,💞LiveLifeToTheFullest🦋🌺🌷🌹Rebuilding after I lost account for hurting trumps feelings🥰❤💙❤💙🇺🇸🤣😂😆😍🤪,848.0,Floriduh😂🤣,2,12,12,14,12:14:00,2021
7,2021-02-12,@majornelson @LittleNights #FreeCodeFridayContest @majornelson @LittleNights spiders scare the bejesus out of me,https://twitter.com/WalkinDude69/status/1360319882664030212,0,0,0,0,Jab_law,"Joshua A. Benjamin, Esq.",False,Your friendly neighborhood Coptic Orthodox Lawyer. Owner of @jab_law_offices,252.0,"Cherry Hill, NJ",2,12,12,8,12:08:28,2021


---

### Export

In [21]:
bejesus_slim.to_csv("output/search/bejesus_slim.csv", index=False)