# Members of Congress on Twitter

In [1]:
import pandas as pd
import glob
import json

In [2]:
# Loosen pandas' display limits so wide frames and long tweet text render in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)  # None = never truncate cell contents

#### Get congressional account details from GovTrack

In [3]:
# Download the social media file; the nested `social` column it yields is expanded in later cells.
gov_track_social = pd.read_json(
    path_or_buf='https://theunitedstates.io/congress-legislators/legislators-social-media.json'
)

In [4]:
# Expand each row's nested `social` dict into flat columns on the same frame.
# The new columns are named after the keys json_normalize actually found rather
# than a hand-written list: assigning a DataFrame to a list of labels pairs the
# columns up by position, so a different key order would silently mislabel the
# data and an added platform key would raise a length mismatch.
social_accounts = pd.json_normalize(gov_track_social['social'])
gov_track_social[social_accounts.columns.tolist()] = social_accounts

In [5]:
# Download the current-legislators table; later cells read its
# `type`, `party`, `twitter` and `gender` columns.
gov_track_df = pd.read_csv(
    filepath_or_buffer='https://theunitedstates.io/congress-legislators/legislators-current.csv'
)

In [6]:
# Number of rows in the current-legislators table.
gov_track_df.shape[0]

536

In [7]:
# Row count per value of the `gender` column.
gov_track_df['gender'].value_counts()

M    386
F    150
Name: gender, dtype: int64

In [8]:
# Same breakdown expressed as a share of all rows.
gov_track_df['gender'].value_counts(normalize=True)

M    0.720149
F    0.279851
Name: gender, dtype: float64

In [9]:
# Collect every row's `social` dict into a plain list, in row order.
twitter_list = list(gov_track_social['social'])

In [10]:
# One row per legislator, one column per key found in the `social` dicts.
social_df = pd.DataFrame(data=twitter_list)

In [11]:
# Write one Twitter user id per line, with no header and no index; the twarc2
# command recorded further down reads a file of this name.
# Rows with no `twitter_id` are dropped first: to_csv writes a missing value as
# an empty string, which would otherwise leave blank lines in the id list.
social_df['twitter_id'].dropna().to_csv('/Users/stiles/twarc2/data/reference/congress_twitter.txt', index=False, header=False)

In [12]:
# Handle <-> numeric id lookup, used later to attach a handle to each tweet's author_id.
twitter_user_lookup = social_df.loc[:, ['twitter', 'twitter_id']]

---

#### Last update: June 27

In [13]:
# Shell command that fetches timelines for the ids in congress_twitter.txt into
# members_congress.jsonl. Left commented out, presumably so that re-running the
# notebook does not re-download everything. The paths are relative, so it
# appears to be run from the twarc2 project directory (TODO confirm).
# !twarc2 timelines --start-time '2020-01-01' --no-context-annotations data/reference/congress_twitter.txt data/raw/members_congress.jsonl

----

#### Get the path to each raw tweet file (currently one combined JSONL file for all members)

In [14]:
# Directory holding the raw twarc2 output.
file_path = "/Users/stiles/twarc2/data/raw/"
# glob returns matches in arbitrary order; sort them so files are always read in
# the same order (which also decides the row kept when duplicates are dropped later).
json_files = sorted(glob.glob(file_path + "*.jsonl"))

In [15]:
# How many raw .jsonl files were found.
len(json_files)

1

In [16]:
# Show the matched file paths.
json_files

['/Users/stiles/twarc2/data/raw/members_congress.jsonl']

#### Read all the JSONL files, then loop through them and pull out the values we need

In [17]:
%%time

jsons = []      # one parsed JSON object per line of the raw files
data_list = []  # flattened tweet records; populated by the extraction cell below

for path in json_files:
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, since tweet text is full of non-ASCII characters.
    with open(path, encoding="utf-8") as handle:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        jsons.extend(json.loads(line) for line in handle if line.strip())

CPU times: user 26.4 s, sys: 45.5 s, total: 1min 11s
Wall time: 2min 47s


In [18]:
%%time

# Start from an empty list so re-running this cell rebuilds the records instead
# of appending a second copy of every tweet.
data_list = []

for page in jsons:
    # Each parsed line holds a list of tweet objects under `data`.
    for tweet in page['data']:
        # Read the counters straight from the nested dict; building a one-row
        # DataFrame with json_normalize for each value is needlessly slow.
        metrics = tweet['public_metrics']
        data_list.append({
            "author_id": tweet['author_id'],
            "conversation_id": tweet['conversation_id'],
            "tweet_id": tweet['id'],
            "tweet_text": tweet['text'],
            "source": tweet['source'],
            # NOTE(review): key is spelled "uct" (likely meant "utc"); kept
            # as-is because it becomes an exported column name.
            "created_date_uct": tweet['created_at'],
            "retweets": metrics['retweet_count'],
            "likes": metrics['like_count'],
            "quotes": metrics['quote_count'],
        })

#### Convert the list of dicts to a pandas dataframe

In [19]:
# One row per extracted tweet record.
src = pd.DataFrame(data=data_list)

#### How many Twitter users did we capture? 

In [20]:
# Number of distinct author ids among the collected tweets.
len(pd.unique(src['author_id']))

526

#### Drop duplicate tweets, since the timeline requests may overlap

In [21]:
# Keep the first occurrence of each tweet id and discard repeats.
src = src.drop_duplicates(subset=['tweet_id'], keep='first')

#### Merge tweets with user info

In [22]:
# Inner join: attach the Twitter handle to every tweet whose author_id appears
# in the lookup; tweets from ids not in the lookup are dropped.
df = src.merge(
    twitter_user_lookup,
    how='inner',
    left_on='author_id',
    right_on='twitter_id',
)

#### Merge df with member bio info

In [23]:
# Attach the `type`, `party` and `gender` columns to each tweet, joining on the
# Twitter handle (inner join, as before).
# pandas matches null join keys to each other, so legislators with no handle are
# removed from the lookup first; otherwise any tweet row with a null handle would
# be paired with every one of them.
member_bio = gov_track_df[['type', 'party', 'twitter', 'gender']].dropna(subset=['twitter'])
df_bio = pd.merge(df, member_bio, on='twitter')

#### How many tweets? 

In [24]:
# Number of tweet rows after both merges.
df_bio.shape[0]

1109209

---

## Export

In [25]:
# Save the merged tweets as CSV, leaving out the row index.
df_bio.to_csv(
    path_or_buf='/Users/stiles/twarc2/data/processed/congress_tweets.csv',
    index=False,
)

In [26]:
from sqlalchemy import create_engine

# SQLite engine for the processed database. The URL has four slashes:
# `sqlite:///` followed by an absolute path that itself starts with `/`.
engine = create_engine(
    'sqlite:////Users/stiles/twarc2/data/processed/congress_tweets.db',
    echo=False,
)

In [27]:
# Write the merged tweets to the `tweets` table, replacing any existing table.
# The DataFrame index is written too (to_sql's default), unlike the CSV export.
df_bio.to_sql(name='tweets', con=engine, if_exists='replace')

1109209

In [28]:
# Tweets that mention the @January6thCmte account.
# Twitter handles are case-insensitive, so match regardless of letter case, and
# treat the handle as a literal string rather than a regular expression.
# na=False keeps the mask strictly boolean if any text value is missing.
mentions_jan6 = df['tweet_text'].str.contains('@January6thCmte', case=False, regex=False, na=False)
January6thCmte = df[mentions_jan6]

In [29]:
# Number of tweets that mention the account.
January6thCmte.shape[0]

996

In [30]:
# Save the mention subset as CSV. index=False matches the main tweets export:
# the index here is only the leftover row position from the filtered frame, and
# writing it would add an unnamed first column to the file.
January6thCmte.to_csv('/Users/stiles/twarc2/data/processed/congress_mentions_January6thCmte.csv', index=False)