# Scraping Twitter data with search terms using snscrape

### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import jenkspy
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

In [3]:
%matplotlib inline
import json
import numpy as np
import altair as alt
import altair_latimes as lat

In [4]:
# Register the Los Angeles Times Altair theme and make it the active theme.
# The name passed to enable() must be the same one given to register().
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [5]:
# Notebook display settings: turn off Altair's row limit for chart data, then
# let pandas show up to 100 columns, up to 1000 rows, and untruncated cell text.
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [6]:
# https://medium.com/better-programming/how-to-scrape-tweets-with-snscrape-90124ed006af

In [7]:
# https://github.com/JustAnotherArchivist/snscrape

### Use snscrape to grab tweets mentioning a search term, in this case "doomscrolling"

In [8]:
# Tip: pass --progress to snscrape to log a progress update every 100 tweets

In [14]:
# Print snscrape's command-line usage, including the list of available scrapers
!snscrape --help

usage: snscrape [-h] [--version] [-v] [--dump-locals] [--retry N] [-n N]
                [-f FORMAT] [--since DATETIME]
                {telegram-channel,vkontakte-user,googleplus-user,facebook-user,facebook-group,gab-user,gab-user-comments,gab-user-media,twitter-search,twitter-thread,twitter-list-members,instagram-user,instagram-hashtag,instagram-location,twitter-user,twitter-hashtag,twitter-list-posts}
                ...

positional arguments:
  {telegram-channel,vkontakte-user,googleplus-user,facebook-user,facebook-group,gab-user,gab-user-comments,gab-user-media,twitter-search,twitter-thread,twitter-list-members,instagram-user,instagram-hashtag,instagram-location,twitter-user,twitter-hashtag,twitter-list-posts}
                        The scraper you want to use

optional arguments:
  -h, --help            show this help message and exit
  --version             show program's version number and exit
  -v, --verbose, --verbosity
                        Increase output verbosity (def

In [17]:
!snscrape twitter-search "doomscrolling until:2021-02-25 since:2020-01-01" > input/search/doomscrolling-tweets.json

usage: snscrape [-h] [--version] [-v] [--dump-locals] [--retry N] [-n N]
                [-f FORMAT] [--since DATETIME]
                {telegram-channel,vkontakte-user,googleplus-user,facebook-user,facebook-group,gab-user,gab-user-comments,gab-user-media,twitter-search,twitter-thread,twitter-list-members,instagram-user,instagram-hashtag,instagram-location,twitter-user,twitter-hashtag,twitter-list-posts}
                ...
snscrape: error: unrecognized arguments: -f json


In [10]:
# Load the scraped tweets, reading the file as line-delimited JSON
src = pd.read_json(path_or_buf="input/search/doomscrolling-tweets.json", lines=True)

### Process dates

In [11]:
# Parse the tweet timestamps as timezone-aware UTC, then derive a copy
# converted to Pacific time ("America/Los_Angeles").
src["date"] = pd.to_datetime(src["date"], utc=True)
src["pacific_created_at"] = src["date"].dt.tz_convert("America/Los_Angeles")

KeyError: 'date'

In [None]:
# Replace "date" with the Pacific calendar day by formatting the timestamp as
# MM/DD/YYYY text and parsing that text back into a datetime, then split the
# Pacific timestamp into its individual components.
pacific_time = src["pacific_created_at"]
date_text = pd.to_datetime(pacific_time).dt.strftime("%m/%d/%Y")
src["date"] = pd.to_datetime(date_text)
for part in ("year", "month", "day", "hour", "minute", "time"):
    src[part] = getattr(pacific_time.dt, part)

In [None]:
# Store the tweet id and these date parts as text rather than numbers
for column in ("id", "year", "month", "day", "hour"):
    src[column] = src[column].astype(str)

### Make a copy of the dataframe and ensure the text of the tweet contains 'doomscrolling'

In [None]:
# Keep only tweets whose text mentions the term, as an independent copy of src.
# The match is a literal, case-insensitive substring test, so "Doomscrolling"
# and "DOOMSCROLLING" are kept too; rows with missing text count as
# non-matches instead of breaking the boolean mask.
mentions_term = src["content"].str.contains(
    "doomscrolling", case=False, na=False, regex=False
)
doomscrolling = src[mentions_term].copy()

### Unpack the nested JSON

In [None]:
# Snapshot the filtered tweets as a list with one dictionary per row
data_dict = doomscrolling.to_dict(orient="records")

In [None]:
# Flatten the nested "user" dictionaries once instead of once per field.
# The flattened frame is given doomscrolling's own index explicitly: column
# assignment aligns on index labels, and doomscrolling keeps the row labels it
# had in src, so a frame numbered 0..n-1 would attach user details to the
# wrong tweets (or leave NaN) whenever the filter dropped rows.
user_details = pd.json_normalize(doomscrolling["user"].tolist())
user_details.index = doomscrolling.index

# Source field inside the user record -> flat column added to doomscrolling
user_field_to_column = {
    "username": "user_name",
    "displayname": "user_displayname",
    "verified": "user_verified",
    "description": "user_description",
    "followersCount": "user_followersCount",
    "location": "user_location",
}
for user_field, column in user_field_to_column.items():
    doomscrolling[column] = user_details[user_field]

### Slim down the dataframe

In [None]:
# Columns kept in the slim frame, grouped by what they describe.
tweet_columns = [
    "date",
    "content",
    "url",
    "replyCount",
    "retweetCount",
    "likeCount",
    "quoteCount",
]
user_columns = [
    "user_name",
    "user_displayname",
    "user_verified",
    "user_description",
    "user_followersCount",
    "user_location",
]
time_columns = ["month", "day", "hour", "minute", "time"]

# Select the columns in that order: tweet fields, user fields, then time parts
doomscrolling_slim = doomscrolling[tweet_columns + user_columns + time_columns]

### How many 'doomscrolling' tweets?

In [None]:
# Number of rows (tweets) in the slim frame
doomscrolling_slim.shape[0]

### How many by @karenkho?

In [None]:
# Count the rows whose user_name is exactly "karenkho"
is_karenkho = doomscrolling_slim["user_name"] == "karenkho"
int(is_karenkho.sum())

In [None]:
# Preview the first five rows of the slim frame
doomscrolling_slim.head(n=5)

---

### Export

In [None]:
# Write the slim frame to CSV, leaving the index out of the file
doomscrolling_slim.to_csv(
    path_or_buf="output/search/doomscrolling_slim.csv",
    index=False,
)