# Scraping Twitter users' data with snscrape

### Import Python tools

In [1]:
import glob
import os
import subprocess

import geopandas as gpd
import jenkspy
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
from datetime import datetime

In [3]:
%matplotlib inline
import json
import numpy as np
import altair as alt
import altair_latimes as lat

In [4]:
# Register the Los Angeles Times Altair theme and make it the active theme.
# The name passed to enable() must be the same name used in register();
# enabling an unregistered name does not activate this theme.
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

ThemeRegistry.enable('latimes')

In [5]:
# Notebook display settings: render up to 100 columns and 1,000 rows of a
# frame, and never truncate long cell text.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
# Remove Altair's cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()
pd.set_option("display.max_colwidth", None)

In [6]:
# !pip install --upgrade git+https://github.com/JustAnotherArchivist/snscrape@master

### Get today's date as a string for archiving data

In [7]:
# Date stamp (YYYY-MM-DD, local time) used in the archived file names.
today = f"{datetime.today():%Y-%m-%d}"

In [8]:
# https://medium.com/better-programming/how-to-scrape-tweets-with-snscrape-90124ed006af

In [9]:
# https://github.com/JustAnotherArchivist/snscrape

### Use snscrape to grab users' timelines

In [10]:
# use --progress to see results in 100 tweet intervals

### Define our users

In [11]:
# https://docs.google.com/spreadsheets/d/1PnPD2CLGvQEF9HOpcy8vB86xjiy0ywDLBzgQ_kozUT8/edit

In [12]:
# CSV export endpoint for the Google Sheet that lists the users to track.
users_url = (
    "https://docs.google.com/spreadsheets/d/"
    "1PnPD2CLGvQEF9HOpcy8vB86xjiy0ywDLBzgQ_kozUT8"
    "/gviz/tq?tqx=out:csv"
)

In [13]:
# Download the sheet's CSV export straight into a DataFrame.
users_df = pd.read_csv(filepath_or_buffer=users_url)

In [14]:
# Normalize the header row to lowercase so columns can be referenced consistently.
users_df.columns = [str.lower(column_name) for column_name in users_df.columns]

In [15]:
# Preview the first five rows of the user list.
users_df.head(n=5)

Unnamed: 0,name,organization,notes,handle,account
0,Aaron Bate,Three Percenter/Proud Boy,Recall Gavin/El Dorado County,aj.boots.3,Facebook
1,Aniko Bordelon,anti-vaxx,,aniko.bordelon,Facebook
2,Aniko Bordelon,anti-vaxx,,ani.gulagle,Facebook
3,Denise Aguilar,Freedom Angels 2.0,,DenisenRae,Facebook
4,Jason Kraus,former LE,violent manifesto,jkraus.freedom,Facebook


### Get those users from Paige's list who are on Twitter

In [16]:
# Keep only the rows whose account type is Twitter and pull out their handles.
# Missing handles are dropped (they would otherwise be scraped as "from:nan"),
# and repeated handles are collapsed so each timeline is fetched only once.
is_twitter = users_df["account"] == "Twitter"
handles = users_df.loc[is_twitter, "handle"].dropna().drop_duplicates().tolist()

In [17]:
# !snscrape --jsonl --progress --max-results 100 twitter-search "from:jack" > user-tweets.json

### Loop through users and fetch their timelines

In [18]:
# Fetch each account's tweets with the snscrape CLI and archive the raw JSONL
# output to one date-stamped file per handle.
raw_dir = "output/users/paige-raw"
os.makedirs(raw_dir, exist_ok=True)  # the output folder must exist before writing

for h in handles:
    out_path = os.path.join(raw_dir, f"{h}-tweets-{today}.json")
    # Handles come from an externally edited spreadsheet, so they are passed as
    # an argument list instead of being interpolated into a shell command line.
    with open(out_path, "wb") as out_file:
        subprocess.run(
            ["snscrape", "--jsonl", "twitter-search", f"from:{h}"],
            stdout=out_file,
            check=False,  # a failed scrape for one handle should not stop the rest
        )

### Grab and concatenate all the users' json files

In [21]:
# Collect every raw JSON file written by the scrape step.
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of the combined frame (and of the exports) reproducible.
path = "output/users/paige-raw/"
all_files = sorted(glob.glob(os.path.join(path, "*.json")))

In [22]:
# Read each line-delimited JSON file lazily and stack the results into one
# frame with a fresh 0..n-1 index.
concatenated_df = pd.concat(
    (pd.read_json(json_file, lines=True) for json_file in all_files),
    ignore_index=True,
)

In [23]:
# Number of rows loaded across all files.
concatenated_df.shape[0]

24460

---

### Process dates (all times converted to Pacific from UTC)

In [24]:
# Parse the raw timestamps as timezone-aware UTC, then derive a Pacific-time
# copy of the same instants.
utc_timestamps = pd.to_datetime(concatenated_df["date"], utc=True)
concatenated_df["date"] = utc_timestamps
concatenated_df["pacific_created_at"] = pd.to_datetime(
    utc_timestamps, utc=True
).dt.tz_convert("America/Los_Angeles")

In [25]:
# Derive calendar fields from the Pacific-time timestamp.
pacific = concatenated_df["pacific_created_at"]

# "date" becomes the Pacific calendar day as a timezone-naive midnight
# timestamp. Dropping the timezone first keeps the local wall-clock time, so
# normalize() truncates to the Pacific day. This replaces a round trip through
# "%m/%d/%Y" strings that relied on to_datetime inferring the format back.
concatenated_df["date"] = pacific.dt.tz_localize(None).dt.normalize()

concatenated_df["year"] = pacific.dt.year
concatenated_df["month"] = pacific.dt.month
concatenated_df["day"] = pacific.dt.day
concatenated_df["hour"] = pacific.dt.hour
concatenated_df["minute"] = pacific.dt.minute
concatenated_df["time"] = pacific.dt.time

In [26]:
# Store the identifier and the calendar parts as strings rather than numbers.
for text_column in ("id", "year", "month", "day", "hour"):
    concatenated_df[text_column] = concatenated_df[text_column].astype(str)

---

### Unpack the nested user info

In [27]:
# Flatten the nested "user" records once and copy the fields of interest onto
# the main frame. Previously the whole column was normalized again for every
# field, repeating the same work six times.
user_info = pd.json_normalize(concatenated_df["user"])

# New column name -> key inside the nested user record.
user_fields = {
    "user_name": "username",
    "user_displayname": "displayname",
    "user_verified": "verified",
    "user_description": "description",
    "user_followersCount": "followersCount",
    "user_location": "location",
}
for new_column, user_key in user_fields.items():
    concatenated_df[new_column] = user_info[user_key]

### Need to figure out how to deal with mentioned users

In [28]:
# concatenated_df["mentioned_user"] = pd.json_normalize(
#     concatenated_df["mentionedUsers"][0]
# )["username"]

### And quoted tweets

**TODO:** work out how to unpack quoted tweets.

### Make a clean copy of the dataframe for export

In [29]:
# Columns kept in the exported dataset, in output order.
export_columns = [
    "id",
    "url",
    "user_name",
    "user_displayname",
    "user_verified",
    "user_description",
    "user_followersCount",
    "user_location",
    "date",
    "time",
    "content",
    "replyCount",
    "retweetCount",
    "likeCount",
    "quoteCount",
    "sourceLabel",
    "retweetedTweet",
]

# Take an independent copy so later changes do not touch concatenated_df.
df = concatenated_df.loc[:, export_columns].copy()

In [30]:
# Remove rows that are exact duplicates across every exported column.
df = df.drop_duplicates()

### Export

In [31]:
# Write the cleaned timelines to CSV without the row index.
csv_path = "output/users/paige-processed/all_twitter_user_timelines.csv"
df.to_csv(csv_path, index=False)

In [32]:
# Write the same data as a pretty-printed JSON array with one object per row.
json_path = "output/users/paige-processed/all_twitter_user_timelines.json"
df.to_json(json_path, orient="records", indent=2)

In [33]:
# Number of rows in the exported dataset.
df.shape[0]

16355