In [1]:
# ===== 1. SETUP =====
import pandas as pd
import os
from tqdm.auto import tqdm
import requests
from newspaper import Article
import urllib.parse
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ===== 2. CLONE DATASET IF NEEDED =====
if not os.path.exists('FakeNewsNet'):
    !git clone https://github.com/KaiDMML/FakeNewsNet.git
os.chdir('FakeNewsNet')

# Download individual files if missing
required_files = [
    'dataset/politifact_fake.csv',
    'dataset/politifact_real.csv',
    'dataset/gossipcop_fake.csv',
    'dataset/gossipcop_real.csv'
]

base_url = "https://raw.githubusercontent.com/KaiDMML/FakeNewsNet/master/dataset/"
missing_files = [f for f in required_files if not os.path.exists(f)]


In [3]:
# ===== 3. LOAD, CLEAN, AND REPORT =====
def load_all_datasets():
    datasets = []
    base_path = 'dataset'

    files = [
        ('politifact', 'fake'),
        ('politifact', 'real'),
        ('gossipcop', 'fake'),
        ('gossipcop', 'real')
    ]

    for source, label in files:
        try:
            df = pd.read_csv(f'{base_path}/{source}_{label}.csv')
            df['source'] = source
            df['label'] = label
            datasets.append(df)
        except Exception as e:
            print(f"Error loading {source}_{label}: {e}")

    return pd.concat(datasets, ignore_index=True)

full_df = load_all_datasets()

# Print TRUE original distribution (before URL filtering)
df_raw = load_all_datasets()  # reload original
print("True original distribution (including rows with missing URLs):")
print(df_raw.groupby(['source', 'label']).size())

# Filter for valid URLs only
full_df = full_df[full_df['news_url'].notna()].copy()
full_df['domain'] = full_df['news_url'].apply(lambda x: urllib.parse.urlparse(x).netloc)

# Print full distribution
print("Original distribution (all rows with non-null URL):")
print(full_df.groupby(['source', 'label']).size())


True original distribution (including rows with missing URLs):
source      label
gossipcop   fake      5323
            real     16817
politifact  fake       432
            real       624
dtype: int64
Original distribution (all rows with non-null URL):
source      label
gossipcop   fake      5067
            real     16804
politifact  fake       428
            real       567
dtype: int64


In [4]:
display(full_df)

Unnamed: 0,id,news_url,title,tweet_ids,source,label,domain
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,politifact,fake,
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,politifact,fake,
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,politifact,fake,
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,politifact,fake,howafrica.com
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,politifact,fake,washingtonsources.org
...,...,...,...,...,...,...,...
23191,gossipcop-891749,https://omgcheckitout.com/2017-hollywood-film-...,2017 Hollywood Film Awards: The Complete List ...,927385466357260288\t927386601034010625\t927387...,gossipcop,real,omgcheckitout.com
23192,gossipcop-941486,https://pagesix.com/2018/06/04/jada-pinkett-sm...,Jada Pinkett Smith explains why son Jaden move...,1004044947006386178\t1004045964401889285\t1004...,gossipcop,real,pagesix.com
23193,gossipcop-953143,https://www.etonline.com/tinsley-mortimer-reac...,Tinsley Mortimer Reacts to Luann de Lesseps' R...,1019924845889572864\t1019925702676709377\t1019...,gossipcop,real,www.etonline.com
23194,gossipcop-954366,https://www.healthbreakingnews.net/2018/07/pri...,Prince Harry Carries on Princess Diana’s Legac...,1021766291139584000\t1021772054599802880\t1021...,gossipcop,real,www.healthbreakingnews.net


In [6]:
# ===== 4. CREATE CUSTOM SUBSET BEFORE SCRAPING =====
def sample_custom_subset(df):
    sampling_plan = {
        ('politifact', 'fake'): 428,
        ('politifact', 'real'): 567,
        ('gossipcop', 'fake'): 1002,
        ('gossipcop', 'real'): 1003
    }

    sampled_frames = []

    for (source, label), count in sampling_plan.items():
        group_df = df[(df['source'] == source) & (df['label'] == label)]

        available = group_df.shape[0]
        if available < count:
            raise ValueError(f"Not enough data for {source} - {label}: requested {count}, available {available}")

        sampled = group_df.sample(n=count, random_state=42)
        sampled_frames.append(sampled)

    return pd.concat(sampled_frames, ignore_index=True)

# Create the subset and override full_df
subset_df = sample_custom_subset(full_df)

# Report distribution
print("Custom-sampled subset distribution (before scraping):")
print(subset_df.groupby(['source', 'label']).size())


Custom-sampled subset distribution (before scraping):
source      label
gossipcop   fake     1002
            real     1003
politifact  fake      428
            real      567
dtype: int64


In [7]:
subset_df

Unnamed: 0,id,news_url,title,tweet_ids,source,label,domain
0,politifact15356,https://miamipost.co/2018/05/06/breaking-3-lib...,BREAKING: 3 Liberal Celebrities Arrested For C...,848177172330237952\t848184488072970240\t848185...,politifact,fake,miamipost.co
1,politifact15109,https://www.nyfoxnews.co/michelle-obama-just-r...,Michelle Obama Just Received LIFE-SHATTERING N...,970400155357798400\t970400532723478528\t970401...,politifact,fake,www.nyfoxnews.co
2,politifact15189,http://viralactions.com/80-of-black-men-in-atl...,"80% of black men in Atlanta are Homosexuals, s...",700371289886707712\t700381413758103553\t700388...,politifact,fake,viralactions.com
3,politifact14544,www.cnn.com/2017/09/13/politics/susan-rice-hou...,Rice told investigators why she unmasked Trump...,909955253365235712\t909965037275537409\t910024...,politifact,fake,
4,politifact13987,vote.us.org/memo/thread/15671/obama%E2%80%99s-...,Obama’s Tax- Skipping – Audit Shows Millions I...,876981802942050305\t877368875435778052\t877750...,politifact,fake,
...,...,...,...,...,...,...,...
2995,gossipcop-836336,https://www.thrillist.com/entertainment/nation...,"The 13 Most Tear-Jerking 'This Is Us' Moments,...",842289491813502976\t912490167830585344,gossipcop,real,www.thrillist.com
2996,gossipcop-874392,https://deadline.com/2017/08/lin-manuel-mirand...,Lin-Manuel Miranda & James Corden Strip Down T...,,gossipcop,real,deadline.com
2997,gossipcop-888391,https://www.thewrap.com/julia-louis-dreyfus-th...,"Julia Louis-Dreyfus Thanks Katy Perry, Wraps 2...",921439956681351168\t921442734103285761\t921442...,gossipcop,real,www.thewrap.com
2998,gossipcop-859228,https://www.thewrap.com/jimmy-kimmel-learns-se...,Jimmy Kimmel Learns Secret of the ‘Whaboom’ Fr...,872085950142095361\t872086266350882816\t872087...,gossipcop,real,www.thewrap.com


In [10]:
# ===== 5. FILTER AND COLLECT ONLY NECESSARY 6 COLUMNS =====
target_n = 3000
scan_results = []
final_cols = ["domain", "title", "text", "publish_date", "image_path", "score", "label"]
SAVE_DIR = 'FakeNewsNet/'
os.makedirs(f"{SAVE_DIR}/images", exist_ok=True)
os.makedirs(f"{SAVE_DIR}/metadata", exist_ok=True)

for idx, row in tqdm(subset_df.iterrows(), total=len(subset_df), desc="Collecting usable articles"):
    try:
        article = Article(row['news_url'])
        article.download()
        article.parse()

        # Ensure both text and top_image are present
        if not article.text.strip() or not article.top_image:
            continue

        # === Compute score using favorite_count ===
        tweet_ids = row['tweet_ids'].split('\t') if isinstance(row['tweet_ids'], str) else []
        total_fav_count = 0
        for tid in tweet_ids:
            tweet_path = f'tweets/{tid}.json'
            try:
                with open(tweet_path, 'r') as f:
                    tweet = json.load(f)
                    total_fav_count += tweet.get('favorite_count', 0)
            except Exception:
                continue  # missing or unreadable tweet file

        # Save image to disk
        image_url = article.top_image
        ext = image_url.split('.')[-1][:3].lower()
        img_path = f"{SAVE_DIR}/images/{row['id']}.{ext}"

        response = requests.get(image_url, timeout=10)
        with open(img_path, 'wb') as f:
            f.write(response.content)

        scan_results.append({
            'domain': urllib.parse.urlparse(row['news_url']).netloc,
            'title': row['title'],
            'text': article.text,
            'publish_date': article.publish_date,
            'image_path': img_path,
            'score': total_fav_count,
            'label': 1 if row['label'] == 'fake' else 0,  # 1 = fake, 0 = real
            'source': row['source'],    # <-- Add this
            'real_fake_label': row['label']
        })

        # Autosave every 100 successful entries
        if len(scan_results) % 100 == 0:
            temp_df = pd.DataFrame(scan_results)[final_cols]
            temp_df.to_pickle(f'{SAVE_DIR}/metadata/partial_progress.pkl')
            print(f"Autosaved {len(scan_results)} articles.")

        if len(scan_results) >= target_n:
            break

    except Exception:
        continue  # skip on error

print(f"\n Collected {len(scan_results)} usable articles with top image + text")

final_df = pd.DataFrame(scan_results)
print("\n Final scraped dataset breakdown:")
print(final_df.groupby(['source', 'real_fake_label']).size())


Collecting usable articles:  10%|█         | 315/3000 [07:58<1:26:34,  1.93s/it]

Autosaved 100 articles.


Collecting usable articles:  21%|██▏       | 638/3000 [20:11<1:07:56,  1.73s/it]

Autosaved 200 articles.


Collecting usable articles:  31%|███       | 933/3000 [29:55<27:30,  1.25it/s]  

Autosaved 300 articles.


Collecting usable articles:  70%|██████▉   | 2090/3000 [34:25<13:13,  1.15it/s] 

Autosaved 400 articles.


Collecting usable articles:  74%|███████▍  | 2222/3000 [37:25<24:22,  1.88s/it]

Autosaved 500 articles.


Collecting usable articles:  79%|███████▊  | 2362/3000 [48:25<39:39,  3.73s/it]   

Autosaved 600 articles.


Collecting usable articles:  83%|████████▎ | 2494/3000 [52:03<37:53,  4.49s/it]

Autosaved 700 articles.


Collecting usable articles:  88%|████████▊ | 2626/3000 [54:53<09:18,  1.49s/it]

Autosaved 800 articles.


Collecting usable articles:  92%|█████████▏| 2767/3000 [57:44<13:33,  3.49s/it]

Autosaved 900 articles.


Collecting usable articles:  97%|█████████▋| 2902/3000 [1:00:17<01:27,  1.12it/s]

Autosaved 1000 articles.


Collecting usable articles: 100%|██████████| 3000/3000 [1:02:52<00:00,  1.26s/it]


 Collected 1079 usable articles with top image + text

 Final scraped dataset breakdown:
source      real_fake_label
gossipcop   real               755
politifact  fake               132
            real               192
dtype: int64





In [7]:
# ===== 6. SAVE FINAL DATASET (ONLY 6 COLUMNS) =====
# final_df = pd.DataFrame(scan_results)
# final_df = final_df[final_cols]

# Save to metadata folder
SAVE_DIR = 'FakeNewsNet/FakeNewsNet/'
csv_out_path = f"{SAVE_DIR}/metadata/fakenewsnet_processed4.csv"
pkl_out_path = f"{SAVE_DIR}/metadata/fakenewsnet_processed4.pkl"

# final_df.to_csv(csv_out_path, index=False)
# final_df.to_pickle(pkl_out_path)

# # Also save to Google Drive
# final_df.to_csv('/content/drive/MyDrive/fakenewsnet_processed4.csv', index=False)
# final_df.to_pickle('/content/drive/MyDrive/fakenewsnet_processed4.pkl')

# print(f"Final dataset saved with shape: {final_df.shape}")
# print(final_df.head())


In [8]:
final_df = pd.read_csv(csv_out_path)

In [9]:
final_df

Unnamed: 0,domain,title,text,publish_date,image_path,score,label
0,yournewswire.com,Bill Gates’ Former Doctor Says Billionaire ‘Re...,Massachusetts Urged to Pass Sports Betting Bil...,2022-06-01 09:18:59+00:00,FakeNewsNet//images/politifact15342.jpe,0,1
1,web.archive.org,TRUMP WANTS TO DEPORT AMERICAN INDIANS TO INDIA,Washington (dpo) – As part of his plan to impr...,2017-04-25 00:00:00,FakeNewsNet//images/politifact14043.jpg,0,1
2,beforeitsnews.com,Target to Discontinue Sale of Holy Bible,\n\n\n\nTarget CEO Brian Cornell announced tod...,2016-05-18 05:57:35,FakeNewsNet//images/politifact13775.jpg,0,1
3,www.wilmingtonfilm.com,Wilmington Regional Film Commission,LOCATION. LOCATION. LOCATION.\n\nSince the 198...,,FakeNewsNet//images/politifact13943.ico,0,1
4,actionnews3.com,"Legendary Actor Kirk Douglas Dead, 4 Days Befo...",News reports have confirmed that actor Kirk Do...,2017-12-06 03:56:22+00:00,FakeNewsNet//images/politifact15095.jpg,0,1
...,...,...,...,...,...,...,...
1074,www.thrillist.com,"The 13 Most Tear-Jerking 'This Is Us' Moments,...",NBC NBC\n\nThis Is Us wrapped up its powerful ...,2018-03-14 21:33:52+00:00,FakeNewsNet//images/gossipcop-836336.jpg,0,0
1075,deadline.com,Lin-Manuel Miranda & James Corden Strip Down T...,Lin-Manuel Miranda and James Corden shed their...,2017-08-18 17:04:16+00:00,FakeNewsNet//images/gossipcop-874392.jpg,0,0
1076,www.thewrap.com,"Julia Louis-Dreyfus Thanks Katy Perry, Wraps 2...",Julia Louis-Dreyfus just wrapped her second ro...,2017-10-20 14:25:41+00:00,FakeNewsNet//images/gossipcop-888391.jpg,0,0
1077,www.thewrap.com,Jimmy Kimmel Learns Secret of the ‘Whaboom’ Fr...,(Spoiler alert: Please do not read on if you h...,2017-06-06 05:21:09+00:00,FakeNewsNet//images/gossipcop-859228.jpg,0,0


In [10]:
final_df.isna().sum()

domain            0
title             0
text              0
publish_date    435
image_path        0
score             0
label             0
dtype: int64

In [16]:
final_df
final_df["image_path"] = final_df["image_path"].apply(lambda x : x.split('/')[-1])

In [20]:
import os

# Define the source folder where images are currently stored
source_folder = 'image_dump'  # Folder where images are currently stored

# final_df["image_path"] = final_df['image_path'].str.replace("image_dump\\", "", regex=False)

# Ensure the source folder exists
if not os.path.exists(source_folder):
    print(f"The source folder {source_folder} does not exist!")

else:
    # Assuming your dataframe is named `weibo_df` and the 'valid_image' column contains the image names/paths
    exist = final_df['image_path'].apply(
        lambda image_filename: os.path.exists(os.path.join(source_folder, image_filename)) if image_filename else False)


In [21]:
exist.sum()

np.int64(1079)

In [None]:
import json
import os

# Check favorite_count for a few entries in the original sampled subset
print("🔍 Checking favorite_count in tweet JSONs for first 5 articles:")

for i in range(5):  # Adjust the range to see more
    row = subset_df.iloc[i]
    tweet_ids = row['tweet_ids'].split('\t') if isinstance(row['tweet_ids'], str) else []

    print(f"\n--- Article {i+1} ({row['source']} - {row['label']}) ---")
    print(f"URL: {row['news_url']}")
    print(f"Tweet IDs: {tweet_ids}")

    for tid in tweet_ids[:3]:  # Show first 3 tweet IDs per article
        tweet_path = f'tweets/{tid}.json'
        if os.path.exists(tweet_path):
            try:
                with open(tweet_path, 'r') as f:
                    tweet = json.load(f)
                    fav = tweet.get('favorite_count', 'MISSING')
                    print(f"  Tweet ID: {tid} | favorite_count: {fav}")
            except Exception as e:
                print(f"  Tweet ID: {tid} | ERROR: {e}")
        else:
            print(f"  Tweet ID: {tid} | FILE NOT FOUND")


🔍 Checking favorite_count in tweet JSONs for first 5 articles:

--- Article 1 (politifact - fake) ---
URL: https://miamipost.co/2018/05/06/breaking-3-liberal-celebrities-arrested-for-conspiracy-to-assassinate-president-trump/
Tweet IDs: ['848177172330237952', '848184488072970240', '848185917592526848', '848191022186323968', '848194550061383680', '848194809256755201', '848197447402668033', '848197448539361280', '848198059964063744', '848198093187170304', '848198094369898497', '848214672381693952', '848214670372618242', '848214670137638913', '848214675988766720', '848228253198200832', '848228252879511552', '848228258898366464', '848228268138418177', '848232779812753408', '848234521602523136', '848240775779561475', '848249708216799232', '848257873335521280', '848259893379321856', '848261163473403908', '848262341498212353', '848266257099096064', '848269545487958018', '848280568349556737', '848282291260366848', '848287208234766336', '848287510837026818', '848287646786957313', '8482896601407