In [6]:
import requests
import os
import json
import pandas as pd
import pickle as pkl
import TwitterUtils as TU

## Exploratory Space For Developing Twitter API Workflow
Our first step is to sample some arbitrary number of twitter users. We have done this in the TwitterUtils.py file and are using this document to develop the workflow.

In [7]:
user_seed = TU.TwitterClient()
rules = user_seed.get_rules() # Prints Rule for current Stream; Need to determine if we can repurpose the get_stream to another endpoint
user_sample = user_seed.get_stream(rules, sample_size = 100)

{"data": [{"id": "1637510334255710210", "value": "(a OR the) has:geo lang:en tweets_count:50", "tag": "active_user"}], "meta": {"sent": "2023-03-20T16:35:00.945Z", "result_count": 1}}
{"meta": {"sent": "2023-03-20T16:35:02.392Z", "summary": {"deleted": 1, "not_deleted": 0}}}
{"data": [{"value": "(a OR the) has:geo lang:en tweets_count:50", "tag": "active_user", "id": "1637855276945534976"}], "meta": {"sent": "2023-03-20T16:35:04.252Z", "summary": {"created": 1, "not_created": 0, "valid": 1, "invalid": 0}}}
{"data": [{"id": "1637855276945534976", "value": "(a OR the) has:geo lang:en tweets_count:50", "tag": "active_user"}], "meta": {"sent": "2023-03-20T16:35:04.419Z", "result_count": 1}}
200


In [4]:
user_ids = [tweet['data']['author_id'] for tweet in user_sample]
user_ids

In [9]:
with open("places.pkl", "rb") as file:
    places = pkl.load(file)

In [10]:
places_unpacked = [item  for item in places.values()]
def unpack_place(place):
    return (place.id, place.name, place.full_name, place.country, place.country_code, place.place_type)

unpacked_places = [unpack_place(place) for place in places_unpacked]
place_df = pd.DataFrame(unpacked_places, columns = ("id", "name", "full_name", "country", "country_code", "type"))


In [12]:
len(unpacked_places)

7369

In [13]:
# Load Users
len(place_df['country'].unique())

174

In [14]:
place_df

Unnamed: 0,id,name,full_name,country,country_code,type
0,67b98f17fdcf20be,Boston,"Boston, MA",United States,US,city
1,5c62ffb0f0f3479d,Phoenix,"Phoenix, AZ",United States,US,city
2,8a927a7056322151,Botswana,Botswana,Botswana,BW,country
3,06168d1feda43857,South East,"South East, England",United Kingdom,GB,admin
4,94d47cc557aa35f4,Nicholasville,"Nicholasville, KY",United States,US,city
...,...,...,...,...,...,...
7364,486bc865d347cd73,Florin,"Florin, CA",United States,US,city
7365,35f73f0698fa5f98,Athens,"Athens, TN",United States,US,city
7366,00ace66c41e736ad,Mubi,"Mubi, Nigeria",Nigeria,NG,city
7367,fb4d9b85c6daedc1,Nigel,"Nigel, South Africa",South Africa,ZA,city


## Creating The Dataframe

In [15]:
with open("users.json", "r") as file:
    user_json = file.read()

In [16]:
test = '{"total": [' + user_json.replace("}{", "},{") + "]}"

In [17]:
len(test)

12409475

In [18]:
user_data = json.loads(test)

In [19]:
users = [u['data'] for u in user_data["total"]]

In [20]:
flat_list = [user_id for user in users for user_id in user]

In [22]:
len(flat_list)

51000

In [46]:
len(flat_list)

1700

In [24]:
users_df = pd.DataFrame(flat_list)

In [25]:
users_df

Unnamed: 0,username,description,id,name,location,withheld
0,iam_rousey,🌷I BELIEVE IN GOD✊💪||💐PSALM 51 🙏💐||UDSM Alumni...,1293183874386731008,🦹‍♀️𝐇𝐄𝐑𝐎𝐈𝐍𝐄𝐑𝐎𝐒𝐄 🦋,"Arusha,Tanzania",
1,Pavanasoonu,Global SAP Tech. #Jyotishkatti. Spiritual cou...,1051074201082388480,Pavanasoonu,"Bengaluru, India",
2,JohnCam14394418,,1625506362515152898,John Campbell,,
3,realsast,"14, 🇸🇪, ASD, ADHD, He/Him, #gdtwt, #daputwt, B...",1300118462329769986,Simon,"Helsingborg, Sverige",
4,AmaBoukman1804,I am a moderate; the world is radically unjust...,2586324829,Ama Boukman,United States,
...,...,...,...,...,...,...
50995,Sal_of_Lourdes,An NFP Sanctuary for all. Live by Fr. Bob’s 5 ...,1381214332114010114,Fr. Bob’s Lourdes Camel & Cobber Sanctuary🐪❤️,"Mt. Lonarch, Vic, Aus",
50996,loydhogan2,😉,1508540417821720581,Loyd Hogan,the Disneyland on the Duwamish,
50997,karenfthompson,little village oike from Goffs Oak ..blonde an...,56322343,karen thompson,suffolk/London,
50998,drsandeepdu,Professor of English in Delhi University. Auth...,770513549043064832,Dr Sandeep Yadav,"New Delhi, India",


In [28]:
# users_df = users_df.drop(['withheld'], axis = 1)
len(users_df)

51000

In [51]:
users_df.shape

(1700, 5)

In [54]:
users_list = [users_df['username']]

In [29]:
with open("tweets-with-place.json", "r") as f:
    tweet_json = f.read()

In [31]:
len(tweet_json)

101563

In [32]:
tweets_data = json.loads(tweet_json)

In [33]:
len(tweets_data)

385

In [34]:
tweets_df = pd.DataFrame(tweets_data)

In [35]:
tweets_df.head()

Unnamed: 0,user_id,tweet_id,tweet_text,place_id
0,1051074201082388480,1637850668214960128,@Gajendr70729189 @amitsharma2704 @1shankarshar...,5f55bb82cf16ac81
1,1051074201082388480,1637818231863001090,@JatinPandyaVBNJ @1shankarsharma Indeed. Nadi...,5f55bb82cf16ac81
2,1051074201082388480,1637817169626148864,@Skay7yay @1shankarsharma Namaste. It's OK to...,5f55bb82cf16ac81
3,2586324829,1637854183956045838,@desounds Black people united in pursuit of po...,9902fe95fc7596a7
4,2586324829,1637852746597425164,@desounds What's good for the goose is good fo...,9902fe95fc7596a7


In [66]:
tweets_df.head(20)

Unnamed: 0,user_id,tweet_id,tweet_text,place_id
0,1293183874386731008,1636425265797648385,@Adventure_36 Ulikuwa hujui wakili wangu,003f4a527524b7ee
1,1051074201082388480,1636441725240766464,The unknown gives you unlimited freedom. \n\n...,5f55bb82cf16ac81
2,1051074201082388480,1636438839924498432,@patsing10 Wonderful Sir.,5f55bb82cf16ac81
3,1051074201082388480,1636436439327186944,Ok. Jaya Shree Rama https://t.co/sdW2J4OXvb,5f55bb82cf16ac81
4,1051074201082388480,1636435567197171712,@nallanhara Excellent. Go ahead please,5f55bb82cf16ac81
5,1625506362515152898,1636419324675264514,@mariewalsh18 Hahaha 🤣😂,0079932b106eb4c9
6,1625506362515152898,1636413400808034304,@mariewalsh18 Same.. 3 days is minimal...,0079932b106eb4c9
7,1300118462329769986,1636465093947826176,@cywrId You’re not the only person in the worl...,2406f69310767a43
8,2586324829,1636463062587678727,Get ready for increases in child poverty and t...,7af0fb6f137530df
9,2586324829,1636461534225354752,I hate Republicans. 😒 https://t.co/lXKrejMODb,7af0fb6f137530df


In [37]:
with open("user_data_temp.pkl", "rb") as file:
    tweet_samples = pkl.load(file)

In [41]:
pd.DataFrame(tweet_samples).groupby(['tweet_id']).agg('count')
# 15K Users sampled so far

Unnamed: 0_level_0,user_id,tweet_text,place_id
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1001883984320593920,1,1,1
1005529958263844864,1,1,1
1005586840274038784,1,1,1
1005944998439813121,1,1,1
1008541699688226816,1,1,1
...,...,...,...
996769355441598465,2,2,2
996942546377981952,1,1,1
997705777035919360,1,1,1
998449513843970048,1,1,1
