In [4]:
import pandas as pd
import numpy as np
import re

In [5]:
# Comprehensive regex including figures, organizations, and variants
# Grouped logically for maintenance, compiled once for performance.
covid_expanded_pattern = re.compile(
    r'\b('
    # 1. Core Virus Identifiers
    r'covid(-?19)?|corona(virus)?|sars-?cov-?2|n?cov(-?19|2019)?|'
    
    # 2. Key Variants
    r'omicron|delta|alpha|beta|ba\.\d+|xbb|'
    
    # 3. Medical & Vaccine Manufacturers
    r'pfizer|moderna|astrazeneca|biontech|j&j|johnson & johnson|'
    r'novavax|sinovac|sputnik v|'
    
    # 4. Slang & Colloquial
    r'the\s?rona|miss\s?rona|covidiot|vax(xed)?|antivax(xer)?|'
    
    # 5. High-Signal Context Specifics
    r'quarantine|lockdown|pandemic|epidemic|'
    r'social distanc(ing|e)|herd immunity|'
    r'wuhan (lab|market)|'
    r'super-?spreader|long covid'
    r')\b',
    re.IGNORECASE
)

def is_covid_relevant(tweet_text):
    """
    Returns True if the tweet contains any major COVID-19 keywords, 
    key figures, or organizations associated with the pandemic.
    """
    if not tweet_text:
        return False
    return bool(covid_expanded_pattern.search(tweet_text))

In [6]:
df = pd.read_csv("../data/FolloweeIDs2_tweets_df_AugustPull.csv")

  df = pd.read_csv("../data/FolloweeIDs2_tweets_df_AugustPull.csv")


In [7]:
df = df[df["reply_to_id"].astype(str) == "nan"]
df = df[df["retweeted_user_ID"].astype(str) == "nan"]
df = df[df["reply_to_user"].astype(str) == "nan"]
df["created_at"] = pd.to_datetime(df["created_at"])
df = df[(df["created_at"] > "2023-08-13") & (df["created_at"] <= "2023-08-15")]
df = df[df["full_text"].str.contains("http|www") == False]
df["is_covid"] = df["full_text"].apply(is_covid_relevant)
df_covid = df[df["is_covid"] == True]
df_non_covid = df[df["is_covid"] == False]

In [8]:
# format created_at into 144 10-minute intervals on both days
df_non_covid["created_at_interval"] = ((df_non_covid["created_at"].dt.hour * 60 + df_non_covid["created_at"].dt.minute) // 10).astype(int)

df_non_covid["created_at_timestamp"] = df_non_covid["created_at_interval"] + np.where(
    df_non_covid["created_at"].dt.date == pd.to_datetime("2023-08-14").date(), 
    144, 
    0
)


# format created_at into 144 10-minute intervals on both days
df_covid["created_at_interval"] = ((df_covid["created_at"].dt.hour * 60 + df_covid["created_at"].dt.minute) // 10).astype(int)

df_covid["created_at_timestamp"] = df_covid["created_at_interval"] + np.where(
    df_covid["created_at"].dt.date == pd.to_datetime("2023-08-14").date(), 
    144, 
    0
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_covid["created_at_interval"] = ((df_non_covid["created_at"].dt.hour * 60 + df_non_covid["created_at"].dt.minute) // 10).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_covid["created_at_timestamp"] = df_non_covid["created_at_interval"] + np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

In [9]:
non_covid_feed = []

for idx, row in df_non_covid.iterrows():
    non_covid_feed.append({
        "user": {"id": row["screen_name"]},
        "content": row["full_text"],
        "id": idx,
        "timestamp": int(row["created_at_timestamp"]),
        "reads": [],
        "likes": []
    })

covid_feed = []

for idx, row in df_covid.iterrows():
    covid_feed.append({
        "user": {"id": row["screen_name"]},
        "content": row["full_text"],
        "id": idx,
        "timestamp": int(row["created_at_timestamp"]),
        "reads": [],
        "likes": []
    })

In [10]:
import json
with open("../data/snapshot_non_covid/feed.json", "w") as f:
    json.dump(non_covid_feed, f)

with open("../data/snapshot_covid/feed.json", "w") as f:
    json.dump(covid_feed, f)