In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
%load_ext dotenv
%dotenv 

In [3]:
from pymongo import MongoClient
from loaders import *
import os
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff


In [4]:
import datasets

datasets.disable_progress_bar()

In [5]:
# Id of the subject that the loader cells below are built for.
user_id = "1308026329"
# The connection string comes from the MONGO_CONN environment variable
# (presumably populated by the %dotenv cell above -- confirm), so no credentials live in the notebook.
client = MongoClient(os.environ["MONGO_CONN"])

In [6]:
# Load the subject's splits with temporally_consistent_context explicitly switched off.
# `dataset` is the baseline that the later loader variants are compared against.
dl = SubjectDataLoader(user_id, db=client.twitter, temporally_consistent_context=False)
dataset = dl.load_data()

In [7]:
dataset

DatasetDict({
    eval: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    user_context: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    peer_context: Dataset({
        features: ['context_annotations', 'edit_controls', 'public_metrics', 'conversation_id', 'author_id', 'possibly_sensitive', 'created_at', 'reply_settings', 'id', 'lang', 'text', 'edit_history_tweet_ids', 'queried_at'],
        num_rows: 250
    })
    random_context: Dataset({
        features: ['entities', 'reply_settings', 

In [8]:
def _timelines(dataset, title=""):
    """Overlay one ``created_at`` histogram per split on a single figure.

    Args:
        dataset: Mapping of split name to a split whose ``"created_at"``
            column can be read with ``dataset[split]["created_at"]``.
        title: Text used as the figure title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()
    for split_name in dataset.keys():
        timestamps = dataset[split_name]["created_at"]
        fig.add_trace(go.Histogram(x=timestamps, name=split_name))

    # Bars are drawn on top of each other and made translucent so that no
    # split completely hides another.
    fig.update_layout(barmode='overlay', title_text=title)
    fig.update_traces(opacity=0.75)
    fig.show()

def timelines(dataset, title="", bin_start='2007-01-01', bin_end=None, bin_size=86_400_000):
    """Plot when the tweets of every split were created.

    The figure has two stacked rows sharing one x axis: a rug of the
    individual ``created_at`` values of each split (top) and one overlaid
    histogram per split (bottom). The legend entry of each histogram shows
    the split name and its number of rows.

    Args:
        dataset: Mapping of split name to a split whose ``"created_at"``
            column can be read with ``dataset[split]["created_at"]``.
        title: Text used as the figure title.
        bin_start: Left edge of the first histogram bin. ``None`` leaves the
            choice to plotly.
        bin_end: Right edge of the last histogram bin. ``None`` (default)
            leaves the choice to plotly, so recent tweets are never cut off
            by a stale hard-coded date.
        bin_size: Histogram bin width. For date data plotly documents
            milliseconds or ``"M<n>"`` (n months); the default is one day
            expressed in milliseconds.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    from itertools import cycle

    # Only forward the bin settings that were actually provided.
    xbins = {
        key: value
        for key, value in (("start", bin_start), ("end", bin_end), ("size", bin_size))
        if value is not None
    }

    fig = make_subplots(rows=2, cols=1, row_heights=[0.3, 0.7], shared_xaxes=True)
    # The palette is cycled so that splits beyond its length are still drawn
    # (a plain zip would silently stop at the last palette colour).
    for split, color in zip(dataset.keys(), cycle(px.colors.qualitative.Plotly)):
        created_at = dataset[split]["created_at"]
        total = len(created_at)
        # Rug row: a box trace whose box is fully transparent, leaving only
        # the individual points, rendered as vertical tick marks.
        fig.add_trace(
            go.Box(
                x=created_at,
                marker_symbol='line-ns-open',
                boxpoints='all',
                jitter=0,
                hoveron="points",
                hovertext=created_at,
                name=split,
                fillcolor='rgba(255,255,255,0)',
                marker_color=color,
                line_color='rgba(255,255,255,0)',
                showlegend=False,
            ),
            row=1,
            col=1,
        )
        fig.add_trace(
            go.Histogram(
                x=created_at,
                name=f"{split} ({total})",
                marker_color=color,
                xbins=dict(xbins),
            ),
            row=2,
            col=1,
        )

    fig.update_layout(barmode='overlay', title_text=title)
    fig.update_traces(opacity=0.75)
    fig.show()

def __timelines(dataset, title=""):
    """Draw a figure-factory distribution plot with one group per split.

    Args:
        dataset: Mapping of split name to a split whose ``"created_at"``
            column can be read with ``dataset[split]["created_at"]``.
        title: Text used as the figure title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # ``create_distplot`` is documented to take a list of labels; a dict keys
    # view cannot be indexed, so it is materialised here.
    group_labels = list(dataset.keys())
    hist_data = [dataset[split]["created_at"] for split in group_labels]

    # NOTE(review): create_distplot presumably needs numeric samples; if
    # "created_at" holds ISO-8601 strings they must be converted before this
    # call -- TODO confirm against the loader output.
    fig = ff.create_distplot(hist_data, group_labels)
    fig.update_layout(title_text=title)
    fig.show()


def column_color_timelines(data, column_name, title="", bin_start='2007-01-01', bin_end=None, bin_size=86_400_000):
    """Plot the ``created_at`` timeline of one split, coloured by a column.

    Rows are grouped by the distinct values of ``column_name``. Each group
    gets a rug of its individual timestamps (top row) and an overlaid
    histogram (bottom row); both rows share one x axis.

    Args:
        data: A single split exposing ``unique(column)``, ``filter(fn)`` and
            column access via ``data["created_at"]``.
        column_name: Column whose distinct values define the groups.
        title: Text used as the figure title.
        bin_start: Left edge of the first histogram bin. ``None`` leaves the
            choice to plotly.
        bin_end: Right edge of the last histogram bin. ``None`` (default)
            leaves the choice to plotly.
        bin_size: Histogram bin width. For date data plotly documents
            milliseconds or ``"M<n>"`` (n months); the default is one day
            expressed in milliseconds.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    from itertools import cycle

    # Only forward the bin settings that were actually provided.
    xbins = {
        key: value
        for key, value in (("start", bin_start), ("end", bin_end), ("size", bin_size))
        if value is not None
    }

    categories = data.unique(column_name)
    fig = make_subplots(rows=2, cols=1, row_heights=[0.3, 0.7], shared_xaxes=True)
    # Cycle the palette: a column such as an author id can easily have more
    # distinct values than the palette has colours, and a plain zip would
    # silently leave those extra groups out of the figure.
    for val, color in zip(categories, cycle(px.colors.qualitative.Plotly_r)):
        created_at = data.filter(lambda example: example[column_name] == val)["created_at"]
        total = len(created_at)
        # str() keeps the padded legend label working for non-string values
        # such as None, which do not accept the "*<20" format spec.
        label = str(val)
        # Rug row: a box trace whose box is fully transparent, leaving only
        # the individual points, rendered as vertical tick marks.
        fig.add_trace(
            go.Box(
                x=created_at,
                marker_symbol='line-ns-open',
                boxpoints='all',
                jitter=0,
                hoveron="points",
                hovertext=created_at,
                name=label,
                fillcolor='rgba(255,255,255,0)',
                marker_color=color,
                line_color='rgba(255,255,255,0)',
                showlegend=False,
            ),
            row=1,
            col=1,
        )
        fig.add_trace(
            go.Histogram(
                x=created_at,
                name=f"{label :*<20} ({total})",
                marker_color=color,
                xbins=dict(xbins),
            ),
            row=2,
            col=1,
        )

    fig.update_layout(barmode='overlay', title_text=title)
    fig.update_traces(opacity=0.75)
    fig.show()


In [9]:
timelines(dataset, title="SubjectDataLoader")

In [10]:
# Same subject, but without overriding temporally_consistent_context, so the loader's default applies.
# NOTE(review): the plot of this dataset is titled "TemporallyConsistent", so the default is presumably True -- confirm in loaders.
dl2 = SubjectDataLoader(user_id, db=client.twitter)
dataset2 = dl2.load_data()

In [11]:
dataset2

DatasetDict({
    eval: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    user_context: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    peer_context: Dataset({
        features: ['possibly_sensitive', 'lang', 'author_id', 'id', 'referenced_tweets', 'created_at', 'edit_history_tweet_ids', 'public_metrics', 'reply_settings', 'in_reply_to_user_id', 'text', 'edit_controls', 'conversation_id', 'queried_at'],
        num_rows: 250
    })
    random_context: Dataset({
        features: ['edit_his

In [12]:
timelines(dataset2, title="TemporallyConsistent SubjectDataLoader")

In [13]:
# Same subject loaded through the multi-control loader; the printed result below
# shows a random_user_context split in addition to eval / user_context / peer_context.
dl3 = MultiControlSubjectDataLoader(user_id, db=client.twitter)
dataset3 = dl3.load_data()

In [14]:
dataset3

DatasetDict({
    eval: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    user_context: Dataset({
        features: ['created_at', 'in_reply_to_user_id', 'entities', 'public_metrics', 'referenced_tweets', 'possibly_sensitive', 'lang', 'author_id', 'edit_history_tweet_ids', 'edit_controls', 'conversation_id', 'text', 'id', 'reply_settings', 'queried_at'],
        num_rows: 250
    })
    peer_context: Dataset({
        features: ['possibly_sensitive', 'lang', 'author_id', 'id', 'referenced_tweets', 'created_at', 'edit_history_tweet_ids', 'public_metrics', 'reply_settings', 'in_reply_to_user_id', 'text', 'edit_controls', 'conversation_id', 'queried_at'],
        num_rows: 250
    })
    random_user_context: Dataset({
        features: ['tex

In [15]:
timelines(dataset3, title="Multi Control (Random User and Random Tweets)")

In [16]:
column_color_timelines(dataset3["peer_context"], column_name="author_id", title="Peer context")

In [17]:
# Switch dataset3 to the pandas output format so that slicing a split yields a DataFrame.
# NOTE: set_format changes dataset3 in place, so any cell run after this one that
# indexes dataset3 also receives pandas objects.
dataset3.set_format("pandas")
# [:] materialises the whole peer_context split as one DataFrame.
df = dataset3["peer_context"][:]
df.head()

Unnamed: 0,possibly_sensitive,lang,author_id,id,referenced_tweets,created_at,edit_history_tweet_ids,public_metrics,reply_settings,in_reply_to_user_id,text,edit_controls,conversation_id,queried_at
0,False,en,1381015116,1135779578582949890,"[{'id': '1135744173963960322', 'type': 'replie...",2019-06-04T05:25:27.000Z,[1135779578582949890],"{'impression_count': 0, 'like_count': 3, 'quot...",everyone,3256083990.0,@graysonromance I'm glad 😂💛,"{'editable_until': 2019-06-04 05:55:27, 'edits...",1135526026342125569,2023-05-20T02:52:05.831245
1,False,en,389514735,1135762377255743488,,2019-06-04T04:17:06.000Z,[1135762377255743488],"{'impression_count': 0, 'like_count': 15, 'quo...",everyone,,Hi my name is Taneika and my talents include d...,"{'editable_until': 2019-06-04 04:47:06, 'edits...",1135762377255743488,2023-05-20T02:52:16.540806
2,False,en,389514735,1135741250580037632,"[{'id': '1135620875863121920', 'type': 'replie...",2019-06-04T02:53:09.000Z,[1135741250580037632],"{'impression_count': 0, 'like_count': 0, 'quot...",everyone,1.0324185919776847e+18,@BowtiesBooks My stepmum is my BEST FRIEND. Sh...,"{'editable_until': 2019-06-04 03:23:09, 'edits...",1135620875863121920,2023-05-20T02:52:16.540841
3,False,en,2839485486,1135726583778385922,"[{'id': '1135726491843481600', 'type': 'quoted'}]",2019-06-04T01:54:53.000Z,[1135726583778385922],"{'impression_count': 0, 'like_count': 0, 'quot...",everyone,,“planetary and human systems [are] reaching a ...,"{'editable_until': 2019-06-04 02:24:53, 'edits...",1135726583778385922,2023-05-20T02:52:17.684211
4,False,en,242532537,1135720322529816576,,2019-06-04T01:30:00.000Z,[1135720322529816576],"{'impression_count': 0, 'like_count': 5, 'quot...",everyone,,When an author kills off your fave character: ...,"{'editable_until': 2019-06-04 02:00:00, 'edits...",1135720322529816576,2023-05-20T02:52:19.837416


### Random user

In [18]:
def plot_user_dataset(id):
    """Load the multi-control splits of one subject, plot them and return them.

    Args:
        id: Subject id handed to ``MultiControlSubjectDataLoader``; it is also
            embedded in the figure title.

    Returns:
        Whatever ``load_data()`` of the loader returns for this subject.
    """
    loader = MultiControlSubjectDataLoader(id, db=client.twitter)
    subject_splits = loader.load_data()
    figure_title = f"Multi Context - {id}"
    timelines(subject_splits, title=figure_title)
    return subject_splits


def plot_random_user_dataset(subject_id=None, min_tweets=500):
    """Plot the multi-control splits of a given or randomly drawn subject.

    Args:
        subject_id: Subject to plot. When ``None``, one subject is sampled at
            random from ``client.twitter.subjects_collection``.
        min_tweets: Minimum ``timeline_tweets_count`` a subject must have to
            be eligible for random sampling. Ignored when ``subject_id`` is
            given.

    Returns:
        The dataset returned by ``plot_user_dataset`` for the chosen subject.

    Raises:
        LookupError: If random sampling is requested but no subject satisfies
            ``min_tweets``.
    """
    if subject_id is None:
        cursor = client.twitter.subjects_collection.aggregate([
            {"$match": {"timeline_tweets_count": {"$gte": min_tweets}}},
            {"$sample": {"size": 1}},
        ])
        # An empty result would otherwise leak a bare StopIteration out of
        # this function; report the actual problem instead.
        rand_subject = next(cursor, None)
        if rand_subject is None:
            raise LookupError(
                f"No subject with timeline_tweets_count >= {min_tweets} found"
            )
        subject_id = rand_subject["id"]
    # Echo the id so that a randomly drawn subject can be plotted again later.
    print(subject_id)
    return plot_user_dataset(subject_id)

In [19]:
plot_random_user_dataset()

220154868


DatasetDict({
    eval: Dataset({
        features: ['entities', 'edit_history_tweet_ids', 'lang', 'in_reply_to_user_id', 'id', 'public_metrics', 'possibly_sensitive', 'text', 'conversation_id', 'edit_controls', 'created_at', 'author_id', 'reply_settings', 'context_annotations', 'referenced_tweets', 'queried_at'],
        num_rows: 250
    })
    user_context: Dataset({
        features: ['entities', 'edit_history_tweet_ids', 'lang', 'in_reply_to_user_id', 'id', 'public_metrics', 'possibly_sensitive', 'text', 'conversation_id', 'edit_controls', 'created_at', 'author_id', 'reply_settings', 'context_annotations', 'referenced_tweets', 'queried_at'],
        num_rows: 250
    })
    peer_context: Dataset({
        features: ['edit_history_tweet_ids', 'reply_settings', 'edit_controls', 'conversation_id', 'context_annotations', 'entities', 'text', 'id', 'lang', 'created_at', 'possibly_sensitive', 'author_id', 'public_metrics', 'attachments', 'queried_at'],
        num_rows: 250
    })
    ra

In [20]:
from loaders.disk_loaders import SubjectDataLoaderFromDisk
import pyprojroot

# Resolve the project root as the directory that contains a ".git" folder,
# so the data path below does not depend on a hard-coded absolute location.
base_path = pyprojroot.find_root(pyprojroot.has_dir(".git"))

In [21]:
# Every entry under <root>/out/data/subject_data is used as a subject id by the cells below.
# NOTE(review): os.listdir returns every directory entry in arbitrary order;
# stray files in that folder would be treated as subjects as well.
data_path = base_path.joinpath("out", "data", "subject_data")
all_subjects = os.listdir(data_path)
all_subjects

['1599012307',
 '1461333578764230657',
 '60141720',
 '26194850',
 '996189990',
 '1726234849',
 '36945210',
 '1594142802547441665',
 '1499514153978843150',
 '1266220812006117376']

In [22]:
# Sanity check for every subject stored on disk: load its splits, compare row
# count against the number of distinct tweet ids for the peer and random-tweet
# contexts, show which conversation ids the two contexts share, and plot the
# per-split timelines.
# NOTE: each iteration rebinds the notebook-level names `dl` and `ds`.
for s in all_subjects:
    dl = SubjectDataLoaderFromDisk(user_id=s, data_path=data_path)
    ds = dl.load_data()
    print(s)
    # Equal numbers on both sides of the "/" mean no tweet id is repeated within the split.
    print(f"Total tweets vs unique (peer context): {ds['peer_context'].num_rows} / {len(ds['peer_context'].unique('id'))}")
    print(f"Total tweets vs unique (rand. tweet context): {ds['random_tweet_context'].num_rows} / {len(ds['random_tweet_context'].unique('id'))}")
    # Distinct conversation ids per context; a non-empty intersection means the
    # two contexts contain tweets from the same conversation.
    peer_ids, rand_tweets_ids = set(ds['peer_context'].unique('conversation_id')), set(ds['random_tweet_context'].unique('conversation_id'))
    print(f"Conversation intersection of peer vs random tweets: {peer_ids.intersection(rand_tweets_ids)}")
    timelines(ds, title=s)

1599012307
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: set()


1461333578764230657
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: set()


60141720
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: {'1598822959866683394'}


26194850
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: set()


996189990
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: set()


1726234849
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: set()


36945210
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: {'1576212379553402880'}


1594142802547441665
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: set()


1499514153978843150
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: set()


1266220812006117376
Total tweets vs unique (peer context): 250 / 250
Total tweets vs unique (rand. tweet context): 250 / 250
Conversation intersection of peer vs random tweets: set()


In [23]:
# Reload a single subject from disk for a closer look at its conversation ids.
dl = SubjectDataLoaderFromDisk(user_id="1266220812006117376", data_path=data_path)
ds = dl.load_data()
# In-place switch to the pandas output format: indexing `ds` in the cells
# below yields pandas objects.
ds.set_format("pandas")

In [24]:
# df = ds["peer_context"][:]
# df[df["conversation_id"] == "1598822959866683394"]

In [25]:
# df2 = ds["random_tweet_context"][:]
# df2[df2["conversation_id"] == "1598822959866683394"]


In [26]:
# One DataFrame per context split ([:] materialises the full split; `ds` is in
# pandas format at this point).
dfs = {}
dfs["peer_context"] = ds["peer_context"][:]
dfs["user_context"] = ds["user_context"][:]
dfs["random_tweet_context"] = ds["random_tweet_context"][:]
dfs["random_user_context"] = ds["random_user_context"][:]

# Split name -> set of distinct conversation ids occurring in that split.
convs = {}
for name, data in dfs.items():
    convs[name] = set(data["conversation_id"].unique())

In [27]:
# Conversation ids that occur in both the peer context and the subject's own context.
convs["peer_context"].intersection(convs["user_context"])

{'1579166707280551936',
 '1583332034570616832',
 '1584785047525027841',
 '1585463188732055552',
 '1585814340271120385',
 '1589318904886988800',
 '1589433019383025664',
 '1591934871219625984',
 '1592063256075866114',
 '1592783479666016257',
 '1593361206064173061',
 '1593372276514119680',
 '1593436729465135105',
 '1593698735870988288',
 '1593740500229836800',
 '1594113200160546816',
 '1594162740733431809',
 '1594166162656370688',
 '1594244735027523584'}

In [28]:
from itertools import combinations

# Iterating a dict yields its keys, so this enumerates every unordered pair of split names.
combos = combinations(convs, 2)
print("Common conversations")
for c1, c2 in combos:
    # Number of conversation ids shared by the two splits of this pair.
    common = convs[c1].intersection(convs[c2])
    print(f"{c1} and {c2}: {len(common)}")

Common conversations
peer_context and user_context: 19
peer_context and random_tweet_context: 0
peer_context and random_user_context: 0
user_context and random_tweet_context: 0
user_context and random_user_context: 0
random_tweet_context and random_user_context: 0
