In [21]:
import pandas as pd
import numpy as np
import re
import json

In [2]:
# Comprehensive regex including figures, organizations, and variants
# Grouped logically for maintenance, compiled once for performance.
covid_expanded_pattern = re.compile(
    r'\b('
    # 1. Core Virus Identifiers
    r'covid(-?19)?|corona(virus)?|sars-?cov-?2|n?cov(-?19|2019)?|'
    
    # 2. Key Variants
    r'omicron|delta|alpha|beta|ba\.\d+|xbb|'
    
    # 3. Medical & Vaccine Manufacturers
    r'pfizer|moderna|astrazeneca|biontech|j&j|johnson & johnson|'
    r'novavax|sinovac|sputnik v|'
    
    # 4. Slang & Colloquial
    r'the\s?rona|miss\s?rona|covidiot|vax(xed)?|antivax(xer)?|'
    
    # 5. High-Signal Context Specifics
    r'quarantine|lockdown|pandemic|epidemic|'
    r'social distanc(ing|e)|herd immunity|'
    r'wuhan (lab|market)|'
    r'super-?spreader|long covid'
    r')\b',
    re.IGNORECASE
)

def is_covid_relevant(tweet_text):
    """
    Returns True if the tweet contains any major COVID-19 keywords, 
    key figures, or organizations associated with the pandemic.
    """
    if not tweet_text:
        return False
    return bool(covid_expanded_pattern.search(tweet_text))

In [3]:
df = pd.read_csv("../data/FolloweeIDs2_tweets_df_AugustPull.csv")

  df = pd.read_csv("../data/FolloweeIDs2_tweets_df_AugustPull.csv")


In [4]:
df = df[df["reply_to_id"].astype(str) == "nan"]
df = df[df["retweeted_user_ID"].astype(str) == "nan"]
df = df[df["reply_to_user"].astype(str) == "nan"]
df["created_at"] = pd.to_datetime(df["created_at"])
#df = df[(df["created_at"] > "2023-08-13") & (df["created_at"] <= "2023-08-15")]
df = df[df["full_text"].str.contains("http|www") == False]
df["is_covid"] = df["full_text"].apply(is_covid_relevant)
df_covid = df[df["is_covid"] == True]
df_non_covid = df[df["is_covid"] == False]

In [11]:
df_covid.sort_values(by="created_at", ascending=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_covid.sort_values(by="created_at", ascending=False, inplace=True)


In [19]:
df_covid[:2000]

Unnamed: 0,full_text,tweet_id,created_at,screen_name,original_user_id,retweeted_user_ID,collected_at,reply_to_id,reply_to_user,expandedURL,is_covid,created_at_interval,created_at_timestamp
5822389,Meds arrived. Paxlovid by Pfizer. I don't fee...,1.691564e+18,2023-08-15 21:33:58+00:00,DrShayPhD,1.349137e+18,,2023-08-15 21:35:05.762682,,,[],True,129,129
5951257,Genuinely curious: is there a link in the covi...,1.691544e+18,2023-08-15 20:15:52+00:00,cerisecastle,8.988093e+08,,2023-08-15 21:45:27.976949,,,[],True,121,121
6149428,What genius named the latest Covid strain Eris...,1.691538e+18,2023-08-15 19:49:53+00:00,JudiHayesFL,1.423606e+18,,2023-08-15 21:59:24.158286,,,[],True,118,118
2989752,"1/ üåç ""The Great Reset"" is a concept by World E...",1.691514e+18,2023-08-15 18:14:25+00:00,BriticalThinker,4.529736e+09,,2023-08-15 18:34:22.348035,,,[],True,109,109
7489958,Hey @Delta your clubs have turned to üí©with t...,1.691509e+18,2023-08-15 17:54:11+00:00,idontexistTore,1.472968e+18,,2023-08-15 22:46:15.925618,,,[],True,107,107
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297181,Ivermectin is great for some types of respirat...,1.630242e+18,2023-02-27 16:23:52+00:00,eugenegu,6.549748e+07,,2023-08-15 22:05:24.727371,,,[],True,98,98
2977553,They lied to you about COVID.\n\nHowever \n\nT...,1.630241e+18,2023-02-27 16:17:48+00:00,camrynbaylee,3.213176e+08,,2023-08-15 18:33:50.855149,,,[],True,97,97
5594726,Many of us used deductive reasoning and conclu...,1.630238e+18,2023-02-27 16:08:41+00:00,MariaRyanNH,1.088162e+18,,2023-08-15 21:20:24.628684,,,[],True,96,96
5025670,"Repeatedly dismissed as a conspiracy theory, n...",1.630222e+18,2023-02-27 15:02:57+00:00,VernBuchanan,2.046716e+07,,2023-08-15 20:45:24.824186,,,[],True,90,90


In [22]:
# format created_at into 144 10-minute intervals on both days
df_covid["created_at_interval"] = ((df_covid["created_at"].dt.hour * 60 + df_covid["created_at"].dt.minute) // 10).astype(int)

df_covid["created_at_timestamp"] = df_covid["created_at_interval"] + np.where(
    df_covid["created_at"].dt.date == pd.to_datetime("2023-08-14").date(), 
    144, 
    0
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_covid["created_at_interval"] = ((df_covid["created_at"].dt.hour * 60 + df_covid["created_at"].dt.minute) // 10).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_covid["created_at_timestamp"] = df_covid["created_at_interval"] + np.where(


In [25]:
df_covid = df_covid[:2000]

In [26]:
covid_feed = []

for idx, row in df_covid.iterrows():
    covid_feed.append({
        "user": {"id": row["screen_name"]},
        "content": row["full_text"],
        "id": idx,
        "timestamp": int(row["created_at_timestamp"]),
        "reads": [],
        "likes": []
    })

In [27]:
with open("../data/snapshot_covid_large/feed.json", "w") as f:
    json.dump(covid_feed, f)