# Debunks per month

In [1]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Convert the 'fact_check_timestamp' column to datetime
df['fact_check_timestamp'] = pd.to_datetime(df['fact_check_timestamp'])

# Group by month (year-month period) and count the number of debunks per month
debunks_per_month = df.groupby(df['fact_check_timestamp'].dt.to_period("M")).size()

# Print output text with counts per month
print("Debunks performed per month:")
print(debunks_per_month)

Debunks performed per month:
fact_check_timestamp
2024-10    15
2024-11    40
2024-12    15
2025-01    26
2025-02    22
2025-03     1
Freq: M, dtype: int64


# Was the debunked tweet already community noted?

In [2]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Check the unique values in the 'is_original_tweet_community_noted' column 
# to understand its structure (e.g., boolean values True/False, strings, or other)
print("Unique values in 'is_original_tweet_community_noted':", df["is_original_tweet_community_noted"].unique())

# Calculate value counts and percentages for the community noted column
value_counts = df["is_original_tweet_community_noted"].value_counts(dropna=False)
total_count = df.shape[0]
percentages = (value_counts / total_count) * 100

# Print out the plots in a human-friendly way
print("\nPercentage of debunked tweets that were already community noted:")
for noted_status, percent in percentages.items():
    print(f"{noted_status}: {percent:.2f}%")


Unique values in 'is_original_tweet_community_noted': [False  True]

Percentage of debunked tweets that were already community noted:
False: 59.66%
True: 40.34%


# How many articles per org

In [3]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Filter to rows that have a non-null fact_check_article_url
df_articles = df.dropna(subset=["fact_check_article_url"]).copy()

# Count the number of fact-check articles that contain each substring (case-insensitive)
fullfact_count = df_articles["fact_check_article_url"].str.contains("fullfact", case=False, na=False).sum()
politifact_count = df_articles["fact_check_article_url"].str.contains("politifact", case=False, na=False).sum()
factcheck_count = df_articles["fact_check_article_url"].str.contains("factcheck", case=False, na=False).sum()

# Total number of fact-check article URLs present
total_articles = len(df_articles)

# Calculate percentages for each source
percent_fullfact = (fullfact_count / total_articles) * 100
percent_politifact = (politifact_count / total_articles) * 100
percent_factcheck = (factcheck_count / total_articles) * 100

# Print out the plots
print("Fact-check articles percentage by source:")
print(f"Fullfact: {percent_fullfact:.2f}%")
print(f"Politifact: {percent_politifact:.2f}%")
print(f"Factcheck.org: {percent_factcheck:.2f}%")

Fact-check articles percentage by source:
Fullfact: 62.18%
Politifact: 24.37%
Factcheck.org: 37.82%


# How many tweets debunked per user

In [27]:
import pandas as pd

# Define the input file path (processed CSV file)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Group by user_id and count the number of debunks per user
user_debunk_counts = df.groupby("user_id").size()

# Identify users with multiple debunks (more than one tweet debunked)
multiple_debunks = user_debunk_counts[user_debunk_counts > 1]

# Identify users with a single debunk for percentage calculation
single_debunks = user_debunk_counts[user_debunk_counts == 1]

# Total unique users
total_users = user_debunk_counts.shape[0]
multiple_count = multiple_debunks.shape[0]
single_count = single_debunks.shape[0]

# Calculate percentages
percentage_multiple = (multiple_count / total_users) * 100
percentage_single = (single_count / total_users) * 100

# Print the user IDs with multiple debunks along with the count of debunks per user,
# ensuring that the user IDs are printed as full numbers (not in scientific notation).
print("User IDs with multiple debunked tweets:")
for user_id, debunk_count in multiple_debunks.items():
    # Convert user_id to a full integer string for printing if it is numeric.
    try:
        # This handles cases where user_id may be read as a float
        user_id_str = str(int(float(user_id)))
    except Exception:
        # If user_id is already a string or conversion fails, just use it as-is.
        user_id_str = str(user_id)
    print(f"User ID: {user_id_str}, Debunks: {debunk_count}")

# Print summary percentages
print("\nSummary:")
print(f"Total unique users: {total_users}")
print(f"Users with multiple debunks: {multiple_count} ({percentage_multiple:.2f}%)")
print(f"Users with a single debunk: {single_count} ({percentage_single:.2f}%)")


User IDs with multiple debunked tweets:
User ID: 629698642, Debunks: 3
User ID: 1214407152896369920, Debunks: 2
User ID: 1231314387119429888, Debunks: 3
User ID: 1247525230404219904, Debunks: 3
User ID: 1257985162572420096, Debunks: 2
User ID: 1711926993065860096, Debunks: 2
User ID: 1731773353915040000, Debunks: 4

Summary:
Total unique users: 107
Users with multiple debunks: 7 (6.54%)
Users with a single debunk: 100 (93.46%)


# Statistics user followers 

In [9]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure the 'user_followers_count' column is numeric
df['user_followers_count'] = pd.to_numeric(df['user_followers_count'], errors='coerce')

# Compute descriptive statistics for the 'user_followers_count' column
followers_stats = df['user_followers_count'].describe()

# Print out the statistics with formatted values (using commas and fixed decimals)
print("Statistics for 'user_followers_count':")
print(f"Count: {followers_stats['count']:,.0f}")
print(f"Mean: {followers_stats['mean']:,.2f}")
print(f"Std Dev: {followers_stats['std']:,.2f}")
print(f"Min: {followers_stats['min']:,.0f}")
print(f"25%: {followers_stats['25%']:,.0f}")
print(f"50% (Median): {followers_stats['50%']:,.0f}")
print(f"75%: {followers_stats['75%']:,.0f}")
print(f"Max: {followers_stats['max']:,.0f}")

Statistics for 'user_followers_count':
Count: 119
Mean: 3,104,319.49
Std Dev: 21,924,432.55
Min: 165
25%: 22,954
50% (Median): 82,846
75%: 364,754
Max: 218,001,509


# Is user blue verfied

In [10]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# --- Option 1: Calculation per tweet ---

# Calculate the percentages of blue verified status per tweet (assuming the column is boolean or strings like 'True'/'False')
tweet_verification_percentages = df["is_user_blue_verified"].value_counts(normalize=True) * 100

print("Percentage of debunks by blue verified status (per tweet):")
for status, percentage in tweet_verification_percentages.items():
    print(f"{status}: {percentage:.2f}%")

# --- Option 2: Calculation for unique users ---

# Remove duplicate users based on the 'user_id' column.
unique_users = df.drop_duplicates(subset=["user_id"])

# Calculate the percentages among unique users
user_verification_percentages = unique_users["is_user_blue_verified"].value_counts(normalize=True) * 100

print("\nPercentage of unique users by blue verified status:")
for status, percentage in user_verification_percentages.items():
    print(f"{status}: {percentage:.2f}%")


Percentage of debunks by blue verified status (per tweet):
True: 87.39%
False: 12.61%

Percentage of unique users by blue verified status:
True: 87.85%
False: 12.15%


# Time between misinformation post and meme tweet correction

In [12]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Convert the 'time_difference_hours' column to numeric values
df['time_difference_hours'] = pd.to_numeric(df['time_difference_hours'], errors='coerce')

# Convert hours to days by dividing by 24
df['time_difference_days'] = df['time_difference_hours'] / 24

# Compute descriptive statistics for the 'time_difference_days' column
time_stats_days = df['time_difference_days'].describe()

# Print the statistics using fixed decimal formatting for clarity (in days)
print("Statistics for 'time_difference_days' (in days):")
print(f"Count: {time_stats_days['count']:,.0f}")
print(f"Mean: {time_stats_days['mean']:,.2f} days")
print(f"Std Dev: {time_stats_days['std']:,.2f} days")
print(f"Min: {time_stats_days['min']:,.2f} days")
print(f"25%: {time_stats_days['25%']:,.2f} days")
print(f"50% (Median): {time_stats_days['50%']:,.2f} days")
print(f"75%: {time_stats_days['75%']:,.2f} days")
print(f"Max: {time_stats_days['max']:,.2f} days")

Statistics for 'time_difference_days' (in days):
Count: 119
Mean: 14.87 days
Std Dev: 30.97 days
Min: 0.75 days
25%: 4.08 days
50% (Median): 7.00 days
75%: 14.04 days
Max: 259.54 days


# Statistics views correction posts

In [13]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure the views columns are treated as numeric (coerce errors to NaN if necessary)
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')

# Compute the average views per debunk correction.
# The average is computed row-wise, ignoring NaN values (if one value is missing, it'll use the other)
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Compute descriptive statistics for the average views column
avg_views_stats = df['avg_views'].describe()

# Print the descriptive statistics with clear formatting
print("Statistics for Average Views of Correction Posts (per debunk):")
print(f"Count: {avg_views_stats['count']:,.0f}")
print(f"Mean: {avg_views_stats['mean']:,.2f}")
print(f"Std Dev: {avg_views_stats['std']:,.2f}")
print(f"Min: {avg_views_stats['min']:,.0f}")
print(f"25%: {avg_views_stats['25%']:,.0f}")
print(f"50% (Median): {avg_views_stats['50%']:,.0f}")
print(f"75%: {avg_views_stats['75%']:,.0f}")
print(f"Max: {avg_views_stats['max']:,.0f}")

Statistics for Average Views of Correction Posts (per debunk):
Count: 119
Mean: 12.36
Std Dev: 11.16
Min: 4
25%: 7
50% (Median): 10
75%: 14
Max: 108


# Percentages of misinformation post deleted by user

In [14]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Calculate the percentage of misinformation posts deleted by user
# This assumes that the column contains boolean values or a similar indicator (e.g., "True"/"False")
deletion_percentages = df["misinformation_post_deleted_by_user"].value_counts(normalize=True) * 100

# Print the plots in a human-friendly format
print("Percentage of misinformation posts deleted by user:")
for deletion_status, percentage in deletion_percentages.items():
    print(f"{deletion_status}: {percentage:.2f}%")

Percentage of misinformation posts deleted by user:
no: 92.44%
yes: 7.56%


# Debunks per tone and ai disclosure

In [15]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# --- Calculate percentage breakdown for x_post_tone ---
tone_pct = df["x_post_tone"].value_counts(normalize=True) * 100
print("Percentage of debunks by x_post_tone:")
for tone, pct in tone_pct.items():
    print(f"{tone}: {pct:.2f}%")

# --- Calculate percentage breakdown for x_post_ai_gen_disclosure ---
ai_pct = df["x_post_ai_gen_disclosure"].value_counts(normalize=True) * 100
print("\nPercentage of debunks by x_post_ai_gen_disclosure:")
for disclosure, pct in ai_pct.items():
    print(f"{disclosure}: {pct:.2f}%")

# --- Optional: Joint distribution (cross-tab) ---
joint_pct = pd.crosstab(df["x_post_tone"], df["x_post_ai_gen_disclosure"], normalize="all") * 100
print("\nJoint distribution percentages between x_post_tone and x_post_ai_gen_disclosure:")
print(joint_pct)

Percentage of debunks by x_post_tone:
factual: 57.14%
humorous: 42.86%

Percentage of debunks by x_post_ai_gen_disclosure:
no_ai: 52.94%
ai: 47.06%

Joint distribution percentages between x_post_tone and x_post_ai_gen_disclosure:
x_post_ai_gen_disclosure         ai      no_ai
x_post_tone                                   
factual                   31.092437  26.050420
humorous                  15.966387  26.890756


# Corr views community noted

In [17]:
import pandas as pd
from scipy.stats import pearsonr

# Define the input file path (processed CSV file)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Convert the view columns to numeric values (coercing errors to NaN)
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')

# Compute the average views per debunk correction by averaging meme and disclaimer views
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Helper function to convert a value to a binary numeric value (1 for true, 0 for false)
def to_binary(val):
    if isinstance(val, bool):
        return int(val)
    val_str = str(val).strip().lower()
    return 1 if val_str in ['true', 'yes', '1'] else 0

# Convert 'is_original_tweet_community_noted' to a numeric binary column
df['is_original_tweet_community_noted_numeric'] = df['is_original_tweet_community_noted'].apply(to_binary)

# Drop rows with missing values in 'avg_views' or 'is_original_tweet_community_noted_numeric'
df_corr = df.dropna(subset=['avg_views', 'is_original_tweet_community_noted_numeric'])

# Determine the number of samples taken into account
n_samples = df_corr.shape[0]

# Calculate the Pearson correlation coefficient and the corresponding p-value
corr_coeff, p_value = pearsonr(df_corr['avg_views'], df_corr['is_original_tweet_community_noted_numeric'])

# Print the plots
print("Correlation Type: Pearson correlation")
print(f"Number of samples: {n_samples}")
print(f"Correlation coefficient (r): {corr_coeff:.4f}")
print(f"P-value: {p_value:.4e}")

Correlation Type: Pearson correlation
Number of samples: 119
Correlation coefficient (r): 0.0583
P-value: 5.2848e-01


# Corr views followers count

In [18]:
import pandas as pd
from scipy.stats import pearsonr

# Define file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure the relevant columns are numeric
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')
df['user_followers_count'] = pd.to_numeric(df['user_followers_count'], errors='coerce')

# Compute the average views per debunk correction (row-wise average of meme and disclaimer views)
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Drop rows with missing values in either 'avg_views' or 'user_followers_count'
df_corr = df.dropna(subset=['avg_views', 'user_followers_count'])

# Determine the number of samples used in the correlation
n_samples = df_corr.shape[0]

# Calculate the Pearson correlation coefficient and the p-value
corr_coeff, p_value = pearsonr(df_corr['avg_views'], df_corr['user_followers_count'])

# Print the plots
print("Correlation Type: Pearson correlation")
print(f"Number of samples: {n_samples}")
print(f"Correlation coefficient (r): {corr_coeff:.4f}")
print(f"P-value: {p_value:.4e}")

Correlation Type: Pearson correlation
Number of samples: 119
Correlation coefficient (r): -0.0763
P-value: 4.0940e-01


# Corr views user blue verified

In [19]:
import pandas as pd
from scipy.stats import pearsonr

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure the view columns are numeric (coerce any errors to NaN)
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')

# Compute the average views per debunk by averaging meme and disclaimer views
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Helper function to convert the 'is_user_blue_verified' column to a binary numeric variable
def to_binary(val):
    if isinstance(val, bool):
        return int(val)
    val_str = str(val).strip().lower()
    return 1 if val_str in ['true', 'yes', '1'] else 0

df['is_user_blue_verified_numeric'] = df['is_user_blue_verified'].apply(to_binary)

# Drop rows with missing values for avg_views or the verified status
df_corr = df.dropna(subset=['avg_views', 'is_user_blue_verified_numeric'])

# Determine the number of samples used in the correlation
n_samples = df_corr.shape[0]

# Calculate the Pearson correlation coefficient and the corresponding p-value
corr_coeff, p_value = pearsonr(df_corr['avg_views'], df_corr['is_user_blue_verified_numeric'])

# Print the plots
print("Correlation Type: Pearson correlation")
print(f"Number of samples: {n_samples}")
print(f"Correlation coefficient (r): {corr_coeff:.4f}")
print(f"P-value: {p_value:.4e}")

Correlation Type: Pearson correlation
Number of samples: 119
Correlation coefficient (r): -0.0664
P-value: 4.7314e-01


# Corr views time difference between misinformation creation and correction

In [21]:
import pandas as pd
from scipy.stats import pearsonr

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure the view columns are numeric (coercing errors to NaN)
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')

# Compute the average views per debunk (row-wise average of meme and disclaimer views)
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Ensure the time_difference_hours column is numeric
df['time_difference_hours'] = pd.to_numeric(df['time_difference_hours'], errors='coerce')

# Drop rows with missing values in avg_views or time_difference_hours
df_corr = df.dropna(subset=['avg_views', 'time_difference_hours'])

# Determine the number of samples included in the correlation analysis
n_samples = df_corr.shape[0]

# Calculate the Pearson correlation coefficient and the corresponding p-value
corr_coeff, p_value = pearsonr(df_corr['avg_views'], df_corr['time_difference_hours'])

# Print the plots in a human-friendly format
print("Correlation Type: Pearson correlation")
print(f"Number of samples: {n_samples}")
print(f"Correlation coefficient (r): {corr_coeff:.4f}")
print(f"P-value: {p_value:.4e}")

Correlation Type: Pearson correlation
Number of samples: 119
Correlation coefficient (r): -0.0198
P-value: 8.3043e-01


# Corr views fact check org

In [22]:
import pandas as pd
from scipy.stats import pearsonr

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure the view columns are numeric
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')

# Compute the average views per debunk by averaging the meme and disclaimer views
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Define a helper function to extract the fact-checking organization from the article URL
def extract_org(url):
    url = str(url).lower()
    if "fullfact" in url:
        return "fullfact"
    elif "politifact" in url:
        return "politifact"
    elif "factcheck" in url:
        return "factcheck"
    else:
        return None

# Create a new column for the fact-checking organization
df['fact_check_org'] = df['fact_check_article_url'].apply(extract_org)

# Filter rows to only include those with a recognized organization
df = df[df['fact_check_org'].isin(["fullfact", "politifact", "factcheck"])].copy()

# Drop rows with missing avg_views values
df_corr = df.dropna(subset=['avg_views'])

print("Correlation between average views and fact-checking organizations:\n")

# For each organization, create a binary flag and compute the correlation.
for org in ['fullfact', 'politifact', 'factcheck']:
    # Create a binary indicator (1 if the row's organization equals the current org, 0 otherwise)
    df_corr[org + '_flag'] = (df_corr['fact_check_org'] == org).astype(int)
    
    # Determine the number of samples used in the correlation calculation
    n_samples = df_corr.shape[0]
    
    # Compute Pearson (point-biserial) correlation between avg_views and the binary flag
    r, p = pearsonr(df_corr['avg_views'], df_corr[org + '_flag'])
    
    # Print the plots
    print(f"Organization: {org}")
    print("Correlation Type: Point-Biserial (via Pearson correlation)")
    print(f"Number of samples: {n_samples}")
    print(f"Correlation coefficient (r): {r:.4f}")
    print(f"P-value: {p:.4e}\n")

Correlation between average views and fact-checking organizations:

Organization: fullfact
Correlation Type: Point-Biserial (via Pearson correlation)
Number of samples: 119
Correlation coefficient (r): 0.0235
P-value: 7.9974e-01

Organization: politifact
Correlation Type: Point-Biserial (via Pearson correlation)
Number of samples: 119
Correlation coefficient (r): -0.0209
P-value: 8.2165e-01

Organization: factcheck
Correlation Type: Point-Biserial (via Pearson correlation)
Number of samples: 119
Correlation coefficient (r): -0.0071
P-value: 9.3871e-01



# Corr views post deleted by user

In [23]:
import pandas as pd
from scipy.stats import pearsonr

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file
df = pd.read_csv(input_file_path)

# Ensure the view columns are numeric (coercing errors to NaN)
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')

# Compute the average views per debunk correction (row-wise average of meme and disclaimer views)
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Helper function to convert the deletion status into a numeric binary variable
def to_binary(val):
    if isinstance(val, bool):
        return int(val)
    val_str = str(val).strip().lower()
    # Interpret common truthy values as 1, else 0.
    return 1 if val_str in ['true', 'yes', '1'] else 0

# Convert 'misinformation_post_deleted_by_user' to a numeric binary variable
df['misinformation_post_deleted_by_user_numeric'] = df['misinformation_post_deleted_by_user'].apply(to_binary)

# Drop rows with missing values in either 'avg_views' or 'misinformation_post_deleted_by_user_numeric'
df_corr = df.dropna(subset=['avg_views', 'misinformation_post_deleted_by_user_numeric'])

# Determine the number of samples used in the correlation calculation
n_samples = df_corr.shape[0]

# Calculate the Pearson correlation (point-biserial correlation) between avg_views and the deletion binary variable
corr_coeff, p_value = pearsonr(df_corr['avg_views'], df_corr['misinformation_post_deleted_by_user_numeric'])

# Print the plots
print("Correlation Type: Pearson (Point-Biserial) correlation")
print(f"Number of samples: {n_samples}")
print(f"Correlation coefficient (r): {corr_coeff:.4f}")
print(f"P-value: {p_value:.4e}")


Correlation Type: Pearson (Point-Biserial) correlation
Number of samples: 119
Correlation coefficient (r): 0.3082
P-value: 6.4876e-04


# Corr views tone and ai disclosure

In [24]:
import pandas as pd
from scipy.stats import pearsonr

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure the view columns are numeric (coerce errors to NaN)
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')

# Compute the average views per debunk as the row-wise average
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Drop rows with missing values in avg_views, x_post_tone, or x_post_ai_gen_disclosure
df_corr = df.dropna(subset=['avg_views', 'x_post_tone', 'x_post_ai_gen_disclosure'])

# Create a composite column combining tone and AI gen disclosure (values become lowercase)
df_corr['tone_ai_combo'] = df_corr['x_post_tone'].str.lower() + "_" + df_corr['x_post_ai_gen_disclosure'].str.lower()

# Get unique combinations
unique_combos = df_corr['tone_ai_combo'].unique()

print("Correlation (Point-Biserial via Pearson) between average views and combinations of tone and AI gen disclosure:\n")

# For each combination, create a binary flag and calculate the correlation with avg_views.
for combo in unique_combos:
    # Create binary indicator: 1 if this row has the given combo, 0 otherwise
    dummy = (df_corr['tone_ai_combo'] == combo).astype(int)
    
    # Number of samples included in the analysis
    n_samples = df_corr.shape[0]
    
    # Compute the Pearson (point-biserial) correlation coefficient and p-value
    r, p = pearsonr(df_corr['avg_views'], dummy)
    
    print(f"Combination: {combo}")
    print("Correlation Type: Point-Biserial (via Pearson correlation)")
    print(f"Number of samples: {n_samples}")
    print(f"Correlation coefficient (r): {r:.4f}")
    print(f"P-value: {p:.4e}\n")

Correlation (Point-Biserial via Pearson) between average views and combinations of tone and AI gen disclosure:

Combination: humorous_no_ai
Correlation Type: Point-Biserial (via Pearson correlation)
Number of samples: 119
Correlation coefficient (r): -0.0544
P-value: 5.5645e-01

Combination: humorous_ai
Correlation Type: Point-Biserial (via Pearson correlation)
Number of samples: 119
Correlation coefficient (r): -0.0027
P-value: 9.7715e-01

Combination: factual_no_ai
Correlation Type: Point-Biserial (via Pearson correlation)
Number of samples: 119
Correlation coefficient (r): 0.0679
P-value: 4.6295e-01

Combination: factual_ai
Correlation Type: Point-Biserial (via Pearson correlation)
Number of samples: 119
Correlation coefficient (r): -0.0102
P-value: 9.1275e-01



# Views per tone and ai disclosure

In [28]:
import pandas as pd

# Define the input file path (processed CSV file)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Convert the view columns to numeric values (coercing errors to NaN)
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')

# Compute the average views per debunk correction (row-wise average of meme and disclaimer views)
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Drop rows missing tone or AI disclosure information
df = df.dropna(subset=['x_post_tone', 'x_post_ai_gen_disclosure'])

# Create a composite column combining tone and AI disclosure (e.g., 'humorous_no_ai', 'factual_ai', etc.)
df['tone_ai_combo'] = df['x_post_tone'].str.lower() + "_" + df['x_post_ai_gen_disclosure'].str.lower()

# Group by the composite column and calculate aggregated view statistics
grouped = df.groupby('tone_ai_combo')['avg_views'].agg(
    mean_views = 'mean', 
    median_views = 'median', 
    total_views = 'sum',
    count_debunks = 'count'
).reset_index()

# Print the aggregated views per combination of tone and AI disclosure
print("Aggregated Views per Combination of Tone and AI Disclosure:")
print(grouped)


Aggregated Views per Combination of Tone and AI Disclosure:
    tone_ai_combo  mean_views  median_views  total_views  count_debunks
0      factual_ai   12.189189         11.00        451.0             37
1   factual_no_ai   13.629032          9.00        422.5             31
2     humorous_ai   12.289474         10.50        233.5             19
3  humorous_no_ai   11.359375         10.75        363.5             32


# Urls for the tweets of user that were debunked more than one time

In [29]:
import pandas as pd

# Define the input file path (assuming the processed file is in the data directory)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Group the data by the username column
user_groups = df.groupby("misinformation_user_username")

print("URLs of debunked tweets for users that were debunked more than once:\n")
# Iterate over each group (each user)
for username, group in user_groups:
    if len(group) > 1:  # Only consider users with more than one debunk
        # Get the unique URLs for the original misinformation tweets from that group
        urls = group["original_misinformation_post_url"].dropna().unique()
        count_urls = len(urls)
        print(f"Username: {username}, Count of URLs: {count_urls}")
        for url in urls:
            print(f"  {url}")
        print()

URLs of debunked tweets for users that were debunked more than once:

Username: Concerned Citizen, Count of URLs: 3
  https://x.com/BGatesIsaPyscho/status/1855705055522881611
  https://x.com/BGatesIsaPyscho/status/1870069733359304853
  https://x.com/BGatesIsaPyscho/status/1887011190078844973

Username: HustleBitch, Count of URLs: 3
  https://x.com/HustleBitch_/status/1853795691362951677
  https://x.com/HustleBitch_/status/1853882904474157337
  https://x.com/HustleBitch_/status/1877770103229485090

Username: Jack, Count of URLs: 4
  https://x.com/jackunheard/status/1848931111147036830
  https://x.com/jackunheard/status/1841549601188483269
  https://x.com/jackunheard/status/1850243828424597550
  https://x.com/jackunheard/status/1854374044524568822

Username: Pugh Himple, Count of URLs: 2
  https://x.com/gbullstein/status/1859847926752661710
  https://x.com/GBullstein/status/1873653486476579109

Username: SilencedSirs◼️, Count of URLs: 2
  https://x.com/SilentlySirs/status/185648096499140

# Characteristics user that deleted post

In [30]:
import pandas as pd

# Define the input file path (processed CSV file)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure numeric conversion for the view columns and followers count
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')
df['views_disclaimer_post'] = pd.to_numeric(df['views_disclaimer_post'], errors='coerce')
df['user_followers_count'] = pd.to_numeric(df['user_followers_count'], errors='coerce')

# Compute the average views per debunk correction
df['avg_views'] = df[['views_meme_post', 'views_disclaimer_post']].mean(axis=1, skipna=True)

# Create a composite column for tone and AI disclosure (e.g., "humorous_no_ai", "factual_ai", etc.)
df['tone_ai_combo'] = df['x_post_tone'].str.lower() + "_" + df['x_post_ai_gen_disclosure'].str.lower()

# Filter for rows where the misinformation post was deleted by the user.
# We handle different possible formats by converting the value to a string and then checking for common "truthy" values.
df_deleted = df[df['misinformation_post_deleted_by_user']
                .astype(str)
                .str.lower()
                .isin(['true', 'yes', '1'])]

print("Debunks where the user deleted the misinformation post:\n")

# For each debunk, print the requested information
for index, row in df_deleted.iterrows():
    print(f"Misinformation Post URL: {row['original_misinformation_post_url']}")
    print(f"Debunking Org Article: {row['fact_check_article_url']}")
    print(f"User Followers: {row['user_followers_count']}")
    print(f"Blue Verified: {row['is_user_blue_verified']}")
    print(f"Average Views: {row['avg_views']}")
    print(f"Tone and AI Disclosure Combo: {row['tone_ai_combo']}")
    print("-" * 50)

Debunks where the user deleted the misinformation post:

Misinformation Post URL: https://x.com/ZAINABALI_72/status/1841182994595565706
Debunking Org Article: https://www.politifact.com/factchecks/2024/oct/03/tweets/no-this-video-does-not-show-iran-missile-attack/
User Followers: 79593
Blue Verified: False
Average Views: 6.5
Tone and AI Disclosure Combo: factual_no_ai
--------------------------------------------------
Misinformation Post URL: https://x.com/AmericaPapaBear/status/1852123157802684476
Debunking Org Article: https://www.politifact.com/factchecks/2024/nov/01/tweets/ohio-voting-officials-say-they-have-not-received-r/
User Followers: 103713
Blue Verified: True
Average Views: 6.0
Tone and AI Disclosure Combo: humorous_no_ai
--------------------------------------------------
Misinformation Post URL: https://x.com/TheRoyalSerf/status/1853162645425447265
Debunking Org Article: https://www.politifact.com/factchecks/2024/nov/04/tweets/no-michael-jordan-hadnt-endorsed-donald-trump-a

# Analysis of top engaging memes 

In [31]:
import pandas as pd

# Define the input file path (processed CSV file)
input_file_path = "../data/x_bot_data_phase1.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(input_file_path)

# Ensure the 'views_meme_post' column is numeric
df['views_meme_post'] = pd.to_numeric(df['views_meme_post'], errors='coerce')

# Sort the DataFrame by 'views_meme_post' in descending order and take the top 10 rows
top10_memes = df.sort_values(by="views_meme_post", ascending=False).head(10)

print("Top 10 Memes with the Most Views:")
for index, row in top10_memes.iterrows():
    print("Meme Upload URL:", row['meme_upload_url'])
    print("Meme Image URL:", row['meme_image_url'])
    print("-" * 50)


Top 10 Memes with the Most Views:
Meme Upload URL: https://i.imgflip.com/9df62c.jpg
Meme Image URL: https://i.imgflip.com/2odckz.jpg
--------------------------------------------------
Meme Upload URL: https://i.imgflip.com/9md54c.jpg
Meme Image URL: https://i.imgflip.com/2m20oc.jpg
--------------------------------------------------
Meme Upload URL: https://i.imgflip.com/9g4y34.jpg
Meme Image URL: https://i.imgflip.com/dzrtk.jpg
--------------------------------------------------
Meme Upload URL: https://i.imgflip.com/9gnuvd.jpg
Meme Image URL: https://i.imgflip.com/3xog.jpg
--------------------------------------------------
Meme Upload URL: https://i.imgflip.com/9cgmvq.jpg
Meme Image URL: https://i.imgflip.com/5312.jpg
--------------------------------------------------
Meme Upload URL: https://i.imgflip.com/9awdpc.jpg
Meme Image URL: https://i.imgflip.com/4acd7j.png
--------------------------------------------------
Meme Upload URL: https://i.imgflip.com/9hp6wf.jpg
Meme Image URL: https