In [9]:
import pandas as pd

# Default file locations (assuming the notebooks and data directories are siblings).
input_file_path = "../data/x_bot_data.csv"
output_file_path = "../data/x_bot_data_phase1.csv"

# Columns removed from the raw export (including fact_check_verdict and the
# additional unwanted columns). Names absent from the input are ignored.
cols_to_drop = [
    "misinformation_topic_tags", "meme_explains_verdict", "meme_humor_rating", "meme_relevance_rating",
    "reply_texts_disclaimer_post", "reply_texts_meme_post",
    "reposted_by_user", "liked_by_user", "replied_by_user", "blocked_by_user", "followed_by_user",
    "engagement_growth_rate_24h", "engagement_growth_rate_7d", "engagement_growth_rate_1m", "engagement_growth_rate_3m",
    "views_7d", "likes_7d", "reposts_7d", "replies_7d", "quote_tweets_7d",
    "views_1m", "likes_1m", "reposts_1m", "replies_1m", "quote_tweets_1m",
    "views_3m", "likes_3m", "reposts_3m", "replies_3m", "quote_tweets_3m",
    "views_24h", "likes_24h", "reposts_24h", "replies_24h", "quote_tweets_24h",
    "fact_check_verdict", "error",
]

# The 1-hour metric columns get temporary, suffix-free names for later processing.
rename_dict = {
    "views_1h": "views",
    "likes_1h": "likes",
    "reposts_1h": "reposts",
    "replies_1h": "replies",
    "quote_tweets_1h": "quote_tweets",
}

# Metric columns that are summed (rather than coalesced) when duplicates are merged.
numeric_cols = ["views", "likes", "reposts", "replies", "quote_tweets"]


def _require_columns(df: pd.DataFrame, required) -> None:
    """Raise ``KeyError`` naming every column of *required* that *df* lacks."""
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Input data is missing required column(s): {missing}")


def clean_raw_frame(
    df: pd.DataFrame,
    old_handle: str = "pinto_serg4991",
    new_handle: str = "Meme__Fact",
) -> pd.DataFrame:
    """Return a cleaned copy of the raw export; *df* itself is not modified.

    The substring *old_handle* is replaced by *new_handle* (literal match, not
    a regex) in the ``meme_post_url`` and ``disclaimer_url`` columns, the
    columns listed in ``cols_to_drop`` are removed, and the 1-hour metric
    columns are renamed according to ``rename_dict``.

    Args:
        df: Raw data as read from the semicolon-delimited CSV.
        old_handle: Substring to replace inside both URL columns.
        new_handle: Replacement substring.

    Returns:
        A new DataFrame with rewritten URLs, unwanted columns removed and the
        1-hour metrics renamed.

    Raises:
        KeyError: If ``meme_post_url`` or ``disclaimer_url`` is missing.
    """
    url_cols = ("meme_post_url", "disclaimer_url")
    _require_columns(df, url_cols)

    # assign() returns a new frame and overwrites the URL columns in their
    # existing positions, so the column order is unchanged.
    rewritten = df.assign(
        **{col: df[col].str.replace(old_handle, new_handle, regex=False) for col in url_cols}
    )
    return rewritten.drop(columns=cols_to_drop, errors="ignore").rename(columns=rename_dict)


def merge_duplicate_fact_checks(df: pd.DataFrame) -> pd.DataFrame:
    """Merge rows sharing a ``fact_check_timestamp`` and shape the phase-1 table.

    Columns listed in ``numeric_cols`` are summed per timestamp; for every
    other column the group's first non-missing value is kept (pandas'
    ``"first"`` aggregation). ``reposts``, ``replies`` and ``quote_tweets``
    are then dropped, ``views``/``likes`` become ``views_meme_post``/
    ``likes_meme_post``, and an all-missing ``views_disclaimer_post`` /
    ``likes_disclaimer_post`` column is inserted directly after each of them.

    NOTE: ``groupby`` drops NA keys by default, so rows without a
    ``fact_check_timestamp`` do not appear in the result.

    Args:
        df: Cleaned data, e.g. the result of :func:`clean_raw_frame`.

    Returns:
        One row per timestamp, sorted ascending by ``fact_check_timestamp``
        with a fresh 0..n-1 index.

    Raises:
        KeyError: If ``fact_check_timestamp``, ``views`` or ``likes`` is
            missing. Checked up front so the error names the real gap instead
            of a derived column such as ``views_meme_post``.
    """
    _require_columns(df, ("fact_check_timestamp", "views", "likes"))

    agg_methods = {
        col: ("sum" if col in numeric_cols else "first")
        for col in df.columns
        if col != "fact_check_timestamp"
    }
    grouped = df.groupby("fact_check_timestamp", as_index=False).agg(agg_methods)

    # Only views and likes are kept as meme-post metrics.
    grouped = grouped.drop(columns=["reposts", "replies", "quote_tweets"], errors="ignore")
    grouped = grouped.rename(columns={"views": "views_meme_post", "likes": "likes_meme_post"})

    # Placeholder columns for the disclaimer post, initialised with pd.NA.
    # Each position is looked up just before inserting, because the first
    # insert shifts the columns to its right.
    for metric in ("views", "likes"):
        position = grouped.columns.get_loc(f"{metric}_meme_post") + 1
        grouped.insert(position, f"{metric}_disclaimer_post", pd.NA)

    return grouped.sort_values(by="fact_check_timestamp").reset_index(drop=True)


# Script section. The guard is true when run as a notebook cell or a script,
# and false on import, so the functions above can be reused without file I/O.
if __name__ == "__main__":
    # Read the CSV file with semicolon delimiter
    df = pd.read_csv(input_file_path, sep=";")

    df = clean_raw_frame(df)
    df_grouped = merge_duplicate_fact_checks(df)

    # Save the processed DataFrame to the new CSV file in the data directory
    df_grouped.to_csv(output_file_path, index=False)
    print(f"Processed data has been saved to {output_file_path}")

Processed data has been saved to ../data/x_bot_data_phase1.csv
