## Find the shared **Tweet** network among the stream-seeded accounts

* how much overlap is there in 
    * the tweets that seed accounts interact with
    * the authors of tweets that seed accounts interact with


In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
from twarc import Twarc2, expansions
from tweet_processing import StreamTweetProcessor, get_time_interval, lookup_users_by_username

# Load variables from a local .env file into the process environment.
load_dotenv()

# The four credential values handed to Twarc2; each is read from os.environ under
# the same name as the keyword argument, so a missing one raises KeyError.
credential_names = (
    "consumer_key",
    "consumer_secret",
    "access_token",
    "access_token_secret",
)
twarc_client = Twarc2(**{name: os.environ[name] for name in credential_names})

# Data directory (relative path) passed to the stream tweet processor.
data_dir = "../stream-seeding/data"

tp = StreamTweetProcessor(twarc_client=twarc_client, data_dir=data_dir)

# Seed group to analyse. Alternative group tried previously: "longevity-pranab".
group_name = "CA-Abundance-Economy"

# Three frames for the group: following, tweets, and referenced tweets.
df_following, df_tweets, df_ref_tweets = tp.load_stream_seed_data(group_name)


  df_tweets = pd.read_csv(f"{dir_}/tweets.csv")


In [2]:
# df_ref_tweets["id"].value_counts()

In [3]:
# Show which columns are available on df_tweets before filtering on them below.
df_tweets.columns

Index(['Unnamed: 0', 'id', 'conversation_id',
       'referenced_tweets.replied_to.id', 'referenced_tweets.retweeted.id',
       'referenced_tweets.quoted.id', 'author_id', 'in_reply_to_user_id',
       'retweeted_user_id', 'quoted_user_id', 'created_at', 'text', 'lang',
       'source', 'public_metrics.like_count', 'public_metrics.quote_count',
       'public_metrics.reply_count', 'public_metrics.retweet_count',
       'reply_settings', 'possibly_sensitive', 'withheld.scope',
       'withheld.copyright', 'withheld.country_codes', 'entities.annotations',
       'entities.cashtags', 'entities.hashtags', 'entities.mentions',
       'entities.urls', 'context_annotations', 'attachments.media',
       'attachments.media_keys', 'attachments.poll.duration_minutes',
       'attachments.poll.end_datetime', 'attachments.poll.id',
       'attachments.poll.options', 'attachments.poll.voting_status',
       'attachments.poll_ids', 'author.id', 'author.created_at',
       'author.username', 'author.

# Question: Which tweets have multiple seed accounts replied to in the last X weeks?

This doesn't seem to surface much activity, so the next section looks at overlap among the accounts the seed accounts interact with, instead of specific tweets.


In [4]:
# Deliberately huge look-back window: smaller timespans did not show enough
# shared replies, and with this value the date filter keeps every row.
X_weeks = 100000

end_time, start_time = get_time_interval(hours=24 * 7 * X_weeks)

# Restrict both frames to rows created after the start of the window.
tweets_in_window = df_tweets["created_at"] > start_time
df_x_tweets = df_tweets.loc[tweets_in_window]
ref_tweets_in_window = df_ref_tweets["created_at"] > start_time
df_x_ref_tweets = df_ref_tweets.loc[ref_tweets_in_window]

# Shapes before and after the window filter.
print(df_tweets.shape, df_x_tweets.shape)
print(df_ref_tweets.shape, df_x_ref_tweets.shape)

# Keep replies only, then drop those whose tweet_type contains "self-reply".
is_reply = df_x_tweets["referenced_tweets.replied_to.id"].notna()
reply_tweets = df_x_tweets.loc[is_reply]
is_self_reply = reply_tweets["tweet_type"].str.contains("self-reply")
no_self_replies = reply_tweets.loc[~is_self_reply]

# Number of distinct replying authors per replied-to tweet id, sorted ascending.
# NOTE(review): the replied-to ids come out as floats, which cannot represent every
# 64-bit tweet id exactly — confirm how this column is loaded.
grouped = no_self_replies.groupby("referenced_tweets.replied_to.id")
repliers_per_tweet = grouped["author.username"].nunique().sort_values()

# Ids of tweets that more than one distinct author replied to.
multiple_reply_ids = repliers_per_tweet[repliers_per_tweet > 1].index.tolist()
multiple_reply_ids

(9607, 81) (9607, 81)
(8152, 82) (8152, 82)


[1.4080697399816643e+18,
 1.5190991782485524e+18,
 1.4230538316051292e+18,
 1.426329837912789e+18,
 1.4770479424479928e+18]

In [5]:
# For every tweet that more than one seed account replied to, print a link to the
# replied-to tweet (when it is present in df_ref_tweets) followed by a link to
# each reply found in the current window.
for id_ in multiple_reply_ids:
    print("--------")
    print(f"tweet link for id_ {id_}")

    # Look the replied-to tweet up once instead of re-filtering for every field.
    matching_ref_tweets = df_ref_tweets[df_ref_tweets["id"] == id_]
    if matching_ref_tweets.empty:
        print(f"nothing found for id {int(id_)}")
    else:
        original_tweet = matching_ref_tweets.iloc[0]
        print(original_tweet.tweet_link, "tweeted at ", original_tweet.created_at)

    # Links of the replies to this tweet.
    replies = df_x_tweets[df_x_tweets["referenced_tweets.replied_to.id"] == id_]
    for reply_link in replies.tweet_link.tolist():
        print(reply_link)

--------
tweet link for id_ 1.4080697399816643e+18
https://twitter.com/kookie13/status/1408069739981664259 tweeted at  2021-06-24 14:29:21+00:00
https://twitter.com/hanlonbt/status/1408071225990029323
https://twitter.com/anniefryman/status/1408135987734089728
--------
tweet link for id_ 1.5190991782485524e+18
https://twitter.com/MarketUrbanism/status/1519099178248552449 tweeted at  2022-04-26 23:40:42+00:00
https://twitter.com/hanlonbt/status/1519114588465229824
https://twitter.com/anniefryman/status/1519115464802291712
--------
tweet link for id_ 1.4230538316051292e+18
https://twitter.com/kimmaicutler/status/1423053831605129216 tweeted at  2021-08-04 22:50:47+00:00
https://twitter.com/hanlonbt/status/1423077939541774339
https://twitter.com/anniefryman/status/1423054184476119042
--------
tweet link for id_ 1.426329837912789e+18
https://twitter.com/Muhammad_Speaks/status/1426329837912788995 tweeted at  2021-08-13 23:48:27+00:00
https://twitter.com/hanlonbt/status/1426330890469183491
htt

# Question: Which accounts have multiple of the seed accounts interacted with in the last X weeks?

This is likely to reveal more than looking at a single type of interaction or at single tweets; in addition, the goal is to add more accounts, not to filter down to specific tweets.

* interaction: reply, qt, rt 
* doing accounts instead of tw

In [9]:
# Look-back window, in weeks, for counting interactions.
X_weeks = 4

end_time, start_time = get_time_interval(hours=24*7*X_weeks)

# Restrict both frames to rows created after the start of the window.
df_x_tweets = df_tweets[df_tweets["created_at"]>start_time]
df_x_ref_tweets = df_ref_tweets[df_ref_tweets["created_at"]>start_time]

print(df_tweets.shape, df_x_tweets.shape)
print(df_ref_tweets.shape, df_x_ref_tweets.shape)

# Replies with self-replies removed. The overlap loop below does not use these;
# it iterates over every windowed tweet.
reply_tweets = df_x_tweets[df_x_tweets["referenced_tweets.replied_to.id"].notna()]
no_self_replies = reply_tweets[~reply_tweets["tweet_type"].str.contains("self-reply")]

# The three kinds of interaction considered: reply, quote tweet, retweet.
interaction_columns = ["referenced_tweets.replied_to.id", "referenced_tweets.quoted.id", "referenced_tweets.retweeted.id"]

# Maps the username of the account that was interacted with to
#   {"stream_users": set of usernames from df_x_tweets that interacted with it,
#    "interaction_ids": list of ids of the tweets that carried the interaction}.
interaction_overlap = {}
for i_column in interaction_columns:
    # A row with no reference of this kind can never match a referenced tweet,
    # so drop those rows up front instead of scanning df_ref_tweets for each one.
    interacting_tweets = df_x_tweets[df_x_tweets[i_column].notna()]
    for author_username, referenced_id_, tweet_id in zip(
        interacting_tweets["author.username"].tolist(),
        interacting_tweets[i_column].tolist(),
        interacting_tweets["id"].tolist(),
    ):
        # The lookup runs against the full referenced-tweet frame, not the windowed one.
        ref_ = df_ref_tweets[df_ref_tweets["id"] == referenced_id_]
        if ref_.empty:
            continue

        interacted_username = ref_.iloc[0]["author.username"]
        overlap_entry = interaction_overlap.setdefault(
            interacted_username, {"stream_users": set(), "interaction_ids": []}
        )
        overlap_entry["stream_users"].add(author_username)
        overlap_entry["interaction_ids"].append(tweet_id)

(9607, 81) (307, 81)
(8152, 82) (240, 82)


In [11]:
# One row per interacted-with account, in the iteration order of interaction_overlap:
# username, the set of stream users, its size, the interaction tweet ids, profile URL.
df_data = [
    [
        interacted_user,
        interaction_data["stream_users"],
        len(interaction_data["stream_users"]),
        interaction_data["interaction_ids"],
        f"https://twitter.com/{interacted_user}",
    ]
    for interacted_user, interaction_data in interaction_overlap.items()
]

overlap_columns = [
    "interacted.username",
    "stream_users",
    "num_stream_users",
    "interaction_ids",
    "profile_link",
]
overlap_df = pd.DataFrame(df_data, columns=overlap_columns)

# How many accounts were interacted with by 1, 2, ... distinct stream users.
overlap_df["num_stream_users"].value_counts()

1    160
2      4
Name: num_stream_users, dtype: int64

In [13]:
# Minimum number of distinct stream users that must have interacted with an account
# for it to be kept.
NUM_SEED_ACCOUNTS = 2

# .copy() makes this an independent frame, so adding a column below no longer
# raises pandas' SettingWithCopyWarning.
filtered_overlap_df = overlap_df[overlap_df["num_stream_users"] >= NUM_SEED_ACCOUNTS].copy()

# Fetch the profiles of the remaining accounts, indexed by username for lookup.
usernames = filtered_overlap_df["interacted.username"].tolist()
user_df = lookup_users_by_username(twarc_client, usernames).set_index("username")

# Follower count per interacted-with account, in row order; a username missing
# from user_df raises KeyError.
filtered_overlap_df["num_followers_of_interacted"] = [
    user_df.loc[username, "public_metrics.followers_count"] for username in usernames
]
filtered_overlap_df.sort_values("num_followers_of_interacted")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_overlap_df["num_followers_of_interacted"] = [user_df.loc[username]["public_metrics.followers_count"] for username in filtered_overlap_df["interacted.username"].tolist()]


Unnamed: 0,interacted.username,stream_users,num_stream_users,interaction_ids,profile_link,num_followers_of_interacted
9,JohnGonzalesLA1,"{anniefryman, hanlonbt}",2,"[1545514686552186882, 1541946786637443072]",https://twitter.com/JohnGonzalesLA1,190
27,thuddwhirr,"{rklau, hanlonbt}",2,"[1542763705259028482, 1545539157333647360]",https://twitter.com/thuddwhirr,1791
33,opinonhaver,"{anniefryman, hanlonbt}",2,"[1542010706039435264, 1542347062221500416]",https://twitter.com/opinonhaver,4079
28,daguilarcanabal,"{anniefryman, hanlonbt}",2,"[1542743976763043841, 1542737208855052288, 154...",https://twitter.com/daguilarcanabal,4166


### Show Interactions for Particular User


In [20]:
from tweet_processing.display_tweets import display_feed 

# Account whose interactions should be rendered. It has to be present in
# filtered_overlap_df, otherwise .iloc[0] below raises IndexError.
username = "thuddwhirr"
matches_username = filtered_overlap_df["interacted.username"] == username
row = filtered_overlap_df.loc[matches_username].iloc[0]

# Resolve every recorded interaction id to the link of the matching windowed tweet.
tweet_links = [
    df_x_tweets.loc[df_x_tweets["id"] == id_].iloc[0].tweet_link
    for id_ in row["interaction_ids"]
]

display_feed(tweet_links)

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Check out the Fuck Buttons’ Sweet Love for Planet Earth. <br><br>(It’s better if you play it loud and don’t bother trying to understand the lyrics)</p>&mdash; Brian Hanlon (@hanlonbt) <a href="https://twitter.com/hanlonbt/status/1542763705259028482?ref_src=twsrc%5Etfw">July 1, 2022</a></blockquote>
<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
<blockquote class="twitter-tweet"><p lang="qme" dir="ltr"> <a href="https://t.co/psoGnHLIjY">pic.twitter.com/psoGnHLIjY</a></p>&mdash; Rick Klau (@rklau) <a href="https://twitter.com/rklau/status/1545539157333647360?ref_src=twsrc%5Etfw">July 8, 2022</a></blockquote>
<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>


## Question: Who are these accounts replying to in the last X weeks? 

* removing all self-replies!

In [None]:
from tweet_processing import get_time_interval

In [None]:
# Look-back window, in weeks.
X_weeks = 4

window_hours = 24 * 7 * X_weeks
end_time, start_time = get_time_interval(hours=window_hours)

# Keep only rows created after the start of the window.
tweets_in_window = df_tweets["created_at"] > start_time
df_x_tweets = df_tweets.loc[tweets_in_window]
ref_tweets_in_window = df_ref_tweets["created_at"] > start_time
df_x_ref_tweets = df_ref_tweets.loc[ref_tweets_in_window]

# Shapes before and after the window filter, for each frame.
for full_frame, windowed_frame in (
    (df_tweets, df_x_tweets),
    (df_ref_tweets, df_x_ref_tweets),
):
    print(full_frame.shape, windowed_frame.shape)

In [None]:
# Windowed tweets that are replies to some tweet.
is_reply = df_x_tweets["referenced_tweets.replied_to.id"].notna()
reply_tweets = df_x_tweets.loc[is_reply]

# Drop replies whose tweet_type contains "self-reply".
is_self_reply = reply_tweets["tweet_type"].str.contains("self-reply")
no_self_replies = reply_tweets.loc[~is_self_reply]

print(reply_tweets.shape)
print(no_self_replies.shape)

# Windowed referenced tweets that received one of the remaining replies.
replied_to_ids = no_self_replies["referenced_tweets.replied_to.id"]
was_replied_to = df_x_ref_tweets["id"].isin(replied_to_ids)
replied_to_tweets = df_x_ref_tweets.loc[was_replied_to]
replied_to_tweets.shape

In [None]:
# Breakdown of the replied-to tweets by their tweet_type ...
print(replied_to_tweets["tweet_type"].value_counts())
# ... and how many replied-to tweets each author wrote (cell output).
replied_to_tweets["author.username"].value_counts()

In [None]:
# Number of replies per replied-to tweet id. This uses reply_tweets, so
# self-replies are still included here.
reply_tweets["referenced_tweets.replied_to.id"].value_counts()