In [1]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so edits to
# imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# Load the dotenv extension and run its %dotenv magic.
# NOTE(review): presumably this reads a local .env file that provides MONGO_CONN,
# which is looked up in os.environ further down — confirm.
%load_ext dotenv
%dotenv 

In [3]:
from pymongo import MongoClient
from dataloaders import *
import os
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff


In [4]:
import datasets

# Turn off the datasets library's progress bars for the rest of the session so
# they do not clutter the cell outputs.
datasets.disable_progress_bar()

In [5]:
# Subject whose data is loaded and plotted in the cells below.
user_id = "1308026329"
# The connection string is read from the MONGO_CONN environment variable rather
# than being hard-coded; a missing variable raises KeyError here.
client = MongoClient(os.environ["MONGO_CONN"])

In [13]:
# Load the subject's splits with SubjectDataLoader against the `twitter` database.
# NOTE(review): SubjectDataLoader is not imported by name; presumably it comes
# from the `from dataloaders import *` wildcard import — confirm.
dl = SubjectDataLoader(user_id, db=client.twitter)
dataset = dl.load_data()

Subject has 12 peers.
Function '_load_user_dataset' executed in 0.0141s
Function '_peer_context_loader' executed in 0.0183s
Function '_random_context_loader' executed in 33.0617s


In [14]:
# Show the splits, their columns and row counts for the SubjectDataLoader result.
dataset

DatasetDict({
    eval: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    user_context: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 398
    })
    peer_context: Dataset({
        features: ['context_annotations', 'edit_controls', 'public_metrics', 'conversation_id', 'author_id', 'possibly_sensitive', 'created_at', 'reply_settings', 'id', 'lang', 'text', 'edit_history_tweet_ids', 'queried_at'],
        num_rows: 398
    })
    random_context: Dataset({
        features: ['edit_history_tweet_ids', 'in_

In [15]:
def _timelines(dataset, title=""):
    """Overlay one ``created_at`` histogram per split in a single figure.

    Args:
        dataset: Mapping of split name to a dataset that exposes a
            ``created_at`` column.
        title: Text used as the figure title.

    Returns:
        None. The figure is rendered with ``show()``.
    """
    histogram_fig = go.Figure()
    for split_name in dataset.keys():
        timestamps = dataset[split_name]["created_at"]
        histogram_fig.add_trace(go.Histogram(x=timestamps, name=split_name))

    # The histograms share one axis; overlay mode plus partial opacity keeps
    # every split visible where the distributions overlap.
    histogram_fig.update_layout(barmode='overlay', title_text=title)
    histogram_fig.update_traces(opacity=0.75)
    histogram_fig.show()

def timelines(dataset, title=""):
    """Plot when the rows of every split were created.

    The figure has two stacked rows that share the x axis. The top row draws
    every ``created_at`` value of a split as an individual tick marker (a box
    trace whose fill and outline are fully transparent, so only the points
    are visible). The bottom row overlays one histogram per split. Both
    traces of a split use the same colour.

    Args:
        dataset: Mapping of split name to a dataset that exposes a
            ``created_at`` column (e.g. the result of a loader's
            ``load_data()``).
        title: Text used as the figure title.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    from itertools import cycle

    fig = make_subplots(rows=2, cols=1, row_heights=[0.3, 0.7], shared_xaxes=True)

    # zip() stops at its shortest input, so pairing the splits directly with
    # the finite palette would silently drop every split past the palette's
    # length. Cycling the palette guarantees each split is drawn.
    palette = cycle(px.colors.qualitative.Plotly)
    for split, color in zip(dataset.keys(), palette):
        created_at = dataset[split]["created_at"]
        fig.add_trace(
            go.Box(
                x=created_at,
                marker_symbol='line-ns-open',
                boxpoints='all',
                jitter=0,
                hoveron="points",
                hovertext=created_at,
                name=split,
                fillcolor='rgba(255,255,255,0)',
                marker_color=color,
                line_color='rgba(255,255,255,0)',
                showlegend=False,  # the histogram below already provides the legend entry
            ),
            row=1,
            col=1,
        )
        fig.add_trace(go.Histogram(x=created_at, name=split, marker_color=color), row=2, col=1)

    fig.update_layout(barmode='overlay', title_text=title)
    fig.update_traces(opacity=0.75)
    fig.show()

def __timelines(dataset, title=""):
    """Draw a plotly distplot with one group per split of ``dataset``.

    Args:
        dataset: Mapping of split name to a dataset that exposes a
            ``created_at`` column.
        title: Accepted for signature parity with the other timeline helpers;
            it is not applied to the figure.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # create_distplot looks labels up by position, so the dict keys view must
    # be materialised into a list (a keys view is not indexable).
    group_labels = list(dataset.keys())

    # One plain list of values per split, in the same order as the labels.
    # NOTE(review): the notebook outputs show `created_at` as ISO-8601 strings;
    # create_distplot presumably needs numeric samples for its histogram/KDE,
    # so the values likely have to be converted first — confirm.
    hist_data = [list(dataset[split]["created_at"]) for split in group_labels]

    # Create the distplot with the default bin size.
    fig = ff.create_distplot(hist_data, group_labels)
    fig.show()


def column_color_timelines(data, column_name, title=""):
    """Plot ``created_at`` timelines of one dataset, coloured by a column.

    For every distinct value of ``column_name`` the matching rows are
    selected and drawn twice with the same colour: as individual tick markers
    in the top row (a box trace whose fill and outline are fully transparent)
    and as a histogram in the bottom row. Both rows share the x axis.

    Args:
        data: Dataset exposing ``unique(column)``, ``filter(fn)`` and a
            ``created_at`` column.
        column_name: Column whose distinct values define the groups.
        title: Text used as the figure title.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    from itertools import cycle

    categories = data.unique(column_name)
    fig = make_subplots(rows=2, cols=1, row_heights=[0.3, 0.7], shared_xaxes=True)

    # zip() stops at its shortest input, so pairing the categories directly
    # with the finite palette silently dropped every category past the
    # palette's length (e.g. a column with more distinct authors than
    # colours). Cycling the palette guarantees each category is drawn.
    palette = cycle(px.colors.qualitative.Plotly_r)
    for val, color in zip(categories, palette):
        # The lambda is evaluated immediately by filter(), so capturing the
        # loop variable `val` is safe here.
        created_at = data.filter(lambda example: example[column_name] == val)["created_at"]
        fig.add_trace(
            go.Box(
                x=created_at,
                marker_symbol='line-ns-open',
                boxpoints='all',
                jitter=0,
                hoveron="points",
                hovertext=created_at,
                name=val,
                fillcolor='rgba(255,255,255,0)',
                marker_color=color,
                line_color='rgba(255,255,255,0)',
                showlegend=False,  # the histogram below already provides the legend entry
            ),
            row=1,
            col=1,
        )
        fig.add_trace(go.Histogram(x=created_at, name=val, marker_color=color), row=2, col=1)

    fig.update_layout(barmode='overlay', title_text=title)
    fig.update_traces(opacity=0.75)
    fig.show()


In [16]:
# Plot the created_at distribution of every split produced by SubjectDataLoader.
timelines(dataset, title="SubjectDataLoader")

In [17]:
# Same subject and database, loaded with TemporallyConsistentSubjectDataLoader
# so its splits can be compared with the SubjectDataLoader result above.
dl2 = TemporallyConsistentSubjectDataLoader(user_id, db=client.twitter)
dataset2 = dl2.load_data()

Subject has 12 peers.
Function '_load_user_dataset' executed in 0.0148s
Function '_peer_context_loader' executed in 0.0529s
Function '_random_context_loader' executed in 0.6180s


In [18]:
# Show the splits, their columns and row counts for the
# TemporallyConsistentSubjectDataLoader result.
dataset2

DatasetDict({
    eval: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    user_context: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 398
    })
    peer_context: Dataset({
        features: ['possibly_sensitive', 'lang', 'author_id', 'id', 'referenced_tweets', 'created_at', 'edit_history_tweet_ids', 'public_metrics', 'reply_settings', 'in_reply_to_user_id', 'text', 'edit_controls', 'conversation_id', 'queried_at'],
        num_rows: 398
    })
    random_context: Dataset({
        features: ['possibly

In [19]:
# Plot the created_at distribution of every split produced by
# TemporallyConsistentSubjectDataLoader.
timelines(dataset2, title="TemporallyConsistentSubjectDataLoader")

In [20]:
# Same subject and database, loaded with PeerAdjustedSubjectDataLoader.
dl3 = PeerAdjustedSubjectDataLoader(user_id, db=client.twitter)
dataset3 = dl3.load_data()

Subject has 12 peers.
Function '_load_user_dataset' executed in 0.0143s
Function '_peer_context_loader' executed in 0.0470s
Function '_random_context_loader' executed in 0.0658s


In [21]:
# Show the splits, their columns and row counts for the
# PeerAdjustedSubjectDataLoader result.
dataset3

DatasetDict({
    eval: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    user_context: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 30
    })
    peer_context: Dataset({
        features: ['possibly_sensitive', 'lang', 'author_id', 'id', 'referenced_tweets', 'created_at', 'edit_history_tweet_ids', 'public_metrics', 'reply_settings', 'in_reply_to_user_id', 'text', 'edit_controls', 'conversation_id', 'queried_at'],
        num_rows: 398
    })
    random_context: Dataset({
        features: ['text', 'p

In [22]:
# Plot the created_at distribution of every split produced by
# PeerAdjustedSubjectDataLoader.
timelines(dataset3, title="PeerAdjustedSubjectDataLoader")

In [23]:
# Break the peer_context split down by author: one colour per distinct author_id.
column_color_timelines(dataset3["peer_context"], column_name="author_id", title="Peer context")

### Random user

In [24]:
def plot_user_dataset(id):
    """Load one user's splits with the peer-adjusted loader and plot them.

    Args:
        id: User id handed to ``PeerAdjustedSubjectDataLoader``; it is also
            appended to the figure title.

    Returns:
        The object returned by the loader's ``load_data()``.
    """
    loader = PeerAdjustedSubjectDataLoader(id, db=client.twitter)
    user_splits = loader.load_data()
    figure_title = f"PeerAdjustedSubjectDataLoader - {id}"
    timelines(user_splits, title=figure_title)
    return user_splits


def plot_random_user_dataset(min_tweets=500):
    """Pick a random subject with enough timeline tweets and plot its dataset.

    One document is sampled from ``client.twitter.subjects_collection`` among
    those whose ``timeline_tweets_count`` is at least ``min_tweets``; its
    ``id`` is then passed to ``plot_user_dataset``.

    Args:
        min_tweets: Minimum ``timeline_tweets_count`` a subject must have to
            be eligible. Defaults to 500, the previously hard-coded threshold.

    Returns:
        The dataset returned by ``plot_user_dataset`` for the sampled subject.

    Raises:
        LookupError: If no subject satisfies the ``min_tweets`` threshold.
    """
    cursor = client.twitter.subjects_collection.aggregate([
        {"$match": {"timeline_tweets_count": {"$gte": min_tweets}}},
        {"$sample": {"size": 1}},
    ])

    # An empty result makes the cursor raise StopIteration; surface that as a
    # descriptive error instead of letting the bare StopIteration escape.
    try:
        rand_subject = cursor.next()
    except StopIteration:
        raise LookupError(
            f"No subject with timeline_tweets_count >= {min_tweets} was found."
        ) from None

    return plot_user_dataset(rand_subject["id"])

In [26]:
# Sample a random subject and plot it; because the call is the last expression
# of the cell, the returned dataset is also echoed as the cell output.
plot_random_user_dataset()

Subject has 14 peers.
Function '_load_user_dataset' executed in 0.0223s
Function '_peer_context_loader' executed in 0.0577s
Function '_random_context_loader' executed in 22.4042s


DatasetDict({
    eval: Dataset({
        features: ['public_metrics', 'context_annotations', 'referenced_tweets', 'entities', 'id', 'possibly_sensitive', 'text', 'conversation_id', 'author_id', 'reply_settings', 'lang', 'edit_history_tweet_ids', 'edit_controls', 'created_at', 'queried_at'],
        num_rows: 250
    })
    user_context: Dataset({
        features: ['public_metrics', 'context_annotations', 'referenced_tweets', 'entities', 'id', 'possibly_sensitive', 'text', 'conversation_id', 'author_id', 'reply_settings', 'lang', 'edit_history_tweet_ids', 'edit_controls', 'created_at', 'queried_at'],
        num_rows: 371
    })
    peer_context: Dataset({
        features: ['text', 'created_at', 'edit_controls', 'possibly_sensitive', 'context_annotations', 'conversation_id', 'lang', 'reply_settings', 'entities', 'author_id', 'id', 'edit_history_tweet_ids', 'public_metrics', 'queried_at'],
        num_rows: 271
    })
    random_context: Dataset({
        features: ['lang', 'edit_cont

### Scratchpad

In [194]:
# NOTE(review): in the visible notebook `rand_subject` is only assigned inside
# plot_random_user_dataset(), i.e. as a local variable. This cell presumably
# relied on an earlier interactive assignment and would raise NameError after
# Restart & Run All — confirm.
# NOTE(review): the output below is a real user profile document; consider
# clearing it before sharing the notebook.
rand_subject

{'_id': ObjectId('63cb0657d7f30964c3fea812'),
 'created_at': '2022-07-10T20:52:53.000Z',
 'description': 'Mother, cook, cleaner, nurse, teacher, taxi service, referee, judge and jury',
 'id': '1546235909041012737',
 'location': 'Boyertown, PA',
 'name': 'Denise Charles',
 'profile_image_url': 'https://pbs.twimg.com/profile_images/1598900431194841094/2DKntvns_normal.jpg',
 'protected': False,
 'public_metrics': {'followers_count': 73,
  'following_count': 247,
  'tweet_count': 649,
  'listed_count': 2},
 'username': '11charlesdenise',
 'verified': False,
 'verified_type': 'none',
 'timeline_tweets_count': 506}

In [29]:
# Distinct author ids present in the peer_context split.
dataset["peer_context"].unique("author_id")

['1060512485319438336',
 '1073702576418304000',
 '1381015116',
 '701771998075162625',
 '857918136640692225',
 '1262376529218150402',
 '543350286',
 '1171694238221918211',
 '389514735',
 '2839485486',
 '365177257',
 '242532537']

In [46]:
# Order the peer tweets chronologically.
# NOTE(review): the second sort re-orders everything by created_at, so the
# preceding author_id sort can only matter for rows with identical timestamps,
# and only if sort() is stable — confirm whether the first sort is needed.
sorted_peer_dataset = dataset["peer_context"].sort("author_id").sort("created_at")

In [44]:
# Spot-check a single peer: keep only the rows written by this author id
# (one of the ids listed by unique("author_id") above).
dataset["peer_context"].filter(lambda example: example["author_id"] == "2839485486")

Dataset({
    features: ['context_annotations', 'edit_controls', 'public_metrics', 'conversation_id', 'author_id', 'possibly_sensitive', 'created_at', 'reply_settings', 'id', 'lang', 'text', 'edit_history_tweet_ids', 'queried_at'],
    num_rows: 186
})

In [45]:
# Report how many peer_context rows each distinct author contributes.
peer_context = dataset["peer_context"]
for author in peer_context.unique("author_id"):
    # filter() runs right away, so capturing the loop variable is safe.
    authored_rows = peer_context.filter(lambda row: row["author_id"] == author)
    n_tweets = len(authored_rows)
    print(f"{author} has {n_tweets} tweets.")

1060512485319438336 has 195 tweets.
1073702576418304000 has 191 tweets.
1381015116 has 154 tweets.
701771998075162625 has 150 tweets.
857918136640692225 has 123 tweets.
1262376529218150402 has 98 tweets.
543350286 has 144 tweets.
1171694238221918211 has 100 tweets.
389514735 has 194 tweets.
2839485486 has 186 tweets.
365177257 has 183 tweets.
242532537 has 191 tweets.


In [52]:
# Authors of rows 100..174 of the sorted peer tweets, to see how the authors
# interleave over time.
sorted_peer_dataset[100:175]["author_id"]

['857918136640692225',
 '857918136640692225',
 '365177257',
 '365177257',
 '2839485486',
 '365177257',
 '2839485486',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '2839485486',
 '2839485486',
 '2839485486',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '857918136640692225',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '2839485486',
 '365177257',
 '365177257',
 '857918136640692225',
 '365177257',
 '365177257',
 '365177257',
 '2839485486',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '2839485486',
 '365177257',
 '365177257',
 '365177257',
 '365177257',
 '2839485486',
 '2839485486',
 '2839485486',
 '2839485486',
 '2839485486',
 '2839485486',
 '365177257',
 '2839485486',
 '365177257',
 '2839485486',
 '2839485486',
 '365177257',
 '365177257',
 '2839485486',
 '365177257',
 '2839485486',
 '365177257',
 '365177257',
 '1073702576418304000',
 '107370257

In [53]:
# Timestamps of the same rows (100..174), to read alongside the author ids above.
sorted_peer_dataset[100:175]["created_at"]

['2018-12-14T02:21:46.000Z',
 '2018-12-16T20:31:04.000Z',
 '2018-12-19T13:34:18.000Z',
 '2018-12-19T13:59:27.000Z',
 '2018-12-20T01:41:10.000Z',
 '2018-12-20T06:19:00.000Z',
 '2018-12-21T06:00:38.000Z',
 '2018-12-21T14:33:59.000Z',
 '2018-12-21T19:01:14.000Z',
 '2018-12-25T23:11:12.000Z',
 '2018-12-27T19:04:05.000Z',
 '2018-12-27T19:05:14.000Z',
 '2018-12-27T21:23:05.000Z',
 '2018-12-27T23:32:13.000Z',
 '2018-12-27T23:35:48.000Z',
 '2018-12-28T02:32:05.000Z',
 '2018-12-29T11:07:28.000Z',
 '2018-12-29T11:09:31.000Z',
 '2018-12-30T14:04:36.000Z',
 '2018-12-30T17:37:40.000Z',
 '2018-12-31T08:44:14.000Z',
 '2019-01-01T01:44:33.000Z',
 '2019-01-01T12:20:23.000Z',
 '2019-01-03T10:20:19.000Z',
 '2019-01-03T12:24:53.000Z',
 '2019-01-05T13:34:01.000Z',
 '2019-01-07T07:08:31.000Z',
 '2019-01-07T15:37:46.000Z',
 '2019-01-10T07:37:10.000Z',
 '2019-01-10T07:54:02.000Z',
 '2019-01-10T08:33:13.000Z',
 '2019-01-11T10:18:47.000Z',
 '2019-01-11T14:25:14.000Z',
 '2019-01-13T03:29:14.000Z',
 '2019-01-13T1