# Extracting JSON content

In [None]:
import pandas as pd
import numpy as np

In [3]:
full_fakeddit = pd.read_csv("./fakeddit_collated.csv")

In [4]:
full_fakeddit["2_way_label"].value_counts()

2_way_label
1    48504
0     4955
Name: count, dtype: int64

In [5]:
label_0 = full_fakeddit[full_fakeddit['2_way_label'] == 0]
label_1 = full_fakeddit[full_fakeddit['2_way_label'] == 1]

sampled_label_0 = label_0.sample(n=4955, random_state=42)  # random_state ensures reproducibility
sampled_label_1 = label_1.sample(n=4955, random_state=42)

# Concatenate both sampled DataFrames
sampled_df = pd.concat([sampled_label_0, sampled_label_1])

In [6]:
sampled_df

Unnamed: 0.1,Unnamed: 0,index,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,...,upvote_ratio,2_way_label,3_way_label,6_way_label,reddit_url,external_url,is_link_post,has_valid_url,scraped,article_name
5576,5576,12354,residentchubbychaser,there are no good options in syria sighs man w...,1492109533,theonion.com,True,657h33,https://external-preview.redd.it/QGOLuH4_e7M5f...,\N,...,0.99,0,2,1,https://www.reddit.com/r/TheOnion/comments/657...,,False,False,False,usanews_theonion_scraped_articles_12354.json
3962,3962,8782,dwaxe,not very scary honestly this little girl is po...,1568059431,clickhole.com,True,d1w7ur,https://external-preview.redd.it/nkiyXncogSDx5...,\N,...,0.91,0,2,1,https://www.reddit.com/r/TheOnion/comments/d1w...,https://www.clickhole.com/not-very-scary-hones...,True,True,False,usanews_theonion_scraped_articles_8782.json
5841,5841,12927,residentchubbychaser,god deploys more mosquitoes to us,1497908290,theonion.com,True,6i9l51,https://external-preview.redd.it/mjKGy8mJI-JO4...,\N,...,0.87,0,2,1,https://www.reddit.com/r/TheOnion/comments/6i9...,,False,False,False,usanews_theonion_scraped_articles_12927.json
4162,4162,9210,dwaxe,god sick of new angels annoying fucking voice,1488207252,theonion.com,True,5wh4p2,https://external-preview.redd.it/4THAR36V7kNI-...,\N,...,0.95,0,2,1,https://www.reddit.com/r/TheOnion/comments/5wh...,http://www.theonion.com/article/god-sick-new-a...,True,True,False,usanews_theonion_scraped_articles_9210.json
357,357,801,dwaxe,gordon ramsay said what,1553191320,clickhole.com,True,b3tree,https://external-preview.redd.it/5W3TOi9qCR1Ko...,\N,...,0.88,0,2,1,https://www.reddit.com/r/TheOnion/comments/b3t...,https://www.clickhole.com/gordon-ramsay-said-w...,True,True,False,usanews_theonion_scraped_articles_801.json
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39265,6515,9032,Amamazing,babies with bubble boy disease have had it fix...,1555543127,cbc.ca,True,beehfa,https://external-preview.redd.it/l4xGaaf-tOWIH...,\N,...,0.95,1,0,0,https://www.reddit.com/r/UpliftingNews/comment...,https://www.cbc.ca/news/health/bubble-boy-1.51...,True,True,False,upliftingnews_usnews_scraped_articles_9032.json
14875,8938,13218,CarolinianRevolution,germany considers recruiting foreign eu nation...,1533311838,thelocal.de,True,94asa3,https://external-preview.redd.it/a-FNG7KA2TuSU...,\N,...,0.80,1,0,0,https://www.reddit.com/r/neutralnews/comments/...,https://www.thelocal.de/20180802/german-army-c...,True,True,True,neutralnews_nottheonion_scraped_articles_13218...
39034,6284,8707,tanukisuit,local mother is reunited with the three teens ...,1426714887,m.kirotv.com,True,2zikv9,https://external-preview.redd.it/ki_Ojx1tt42-z...,\N,...,0.89,1,0,0,https://www.reddit.com/r/UpliftingNews/comment...,http://m.kirotv.com/news/news/national/local-m...,True,True,True,upliftingnews_usnews_scraped_articles_8707.json
12852,6915,10246,gen54,cat mistaken for burglar gets detained by police,1567638036,foxnews.com,True,czsa3q,https://external-preview.redd.it/repNDBxtgjB3o...,\N,...,0.88,1,0,0,https://www.reddit.com/r/nottheonion/comments/...,https://www.foxnews.com/lifestyle/cat-burglar-...,True,True,False,neutralnews_nottheonion_scraped_articles_10246...


# Extracting json information

In [7]:
import os
import json
import requests
import pandas as pd
from urllib.parse import urlparse
from tqdm import tqdm

In [8]:
# Assuming 'sampled_df' is your sampled DataFrame containing the 'article_name' column
json_folder = 'article_json_dump'
image_folder = 'image_dump'

# Create the image folder if it doesn't exist
if not os.path.exists(image_folder):
    os.makedirs(image_folder)


# Function to extract data from JSON and combine it with the DataFrame
def extract_json_data(sampled_df, json_folder, image_folder):
    extracted_data = []
    
    for index, row in tqdm(sampled_df.iterrows(), total=sampled_df.shape[0]):
        article_name = row['article_name']  # Get the article name (JSON filename)
        json_path = os.path.join(json_folder, article_name)
        
        if os.path.exists(json_path):
            # Read the JSON file
            with open(json_path, 'r') as f:
                article_data = json.load(f)
            
            # Extract the necessary fields from the JSON data
            url = article_data.get('url', '')
            title = article_data.get('title', '')
            text = article_data.get('text', '')
            publish_date = article_data.get('publish_date', '')
            authors = article_data.get('authors', '')
            top_image = article_data.get('top_image', '')
            images = article_data.get('images', [])
            source = article_data.get('source', '')

            # Append the extracted data to the list
            extracted_data.append({
                'article_name': article_name,
                'url': url,
                'title': title,
                'text': text,
                'publish_date': publish_date,
                'authors': authors,
                'top_image': top_image,
                'images': images,
                'source': source
            })
    
    # Create a DataFrame from the extracted data
    result_df = pd.DataFrame(extracted_data)
    return result_df

# Extract and merge the data
result_df = extract_json_data(sampled_df, json_folder, image_folder)

# Check the result
display(result_df)

# Optionally, save the DataFrame to a CSV
result_df.to_csv('extracted_data.csv', index=False)


100%|██████████| 9910/9910 [03:08<00:00, 52.65it/s]


Unnamed: 0,article_name,url,title,text,publish_date,authors,top_image,images,source
0,usanews_theonion_scraped_articles_12354.json,https://www.texastribune.org/2016/10/28/uttt-p...,UT/TT Poll: Texas voters feeling high anxiety ...,Voters in the party that has not lost a statew...,2016-10-28,"[Ross Ramsey, Oct., Am Central]",https://thumbnails.texastribune.org/YS7n_7vGhg...,[https://thumbnails.texastribune.org/YS7n_7vGh...,https://www.texastribune.org
1,usanews_theonion_scraped_articles_8782.json,http://www.nj.com/politics/index.ssf/2017/11/m...,Phil Murphy beats Kim Guadagno to succeed Chri...,"TRENTON -- Phil Murphy, a former Wall Street e...",2017-11-08,"[Brent Johnson, Bjohnson Njadvancemedia.Com, N...",http://www.nj.com/pf/resources/images/nj/favic...,[http://www.nj.com/pf/resources/images/common/...,http://www.nj.com
2,usanews_theonion_scraped_articles_12927.json,https://www.clickhole.com/devastating-blow-to-...,Devastating Blow To Feminism: This Woman Sucks,Devastating Blow To Feminism: This Woman Sucks...,2019-06-12,"[Clickhole Editorial, June]",https://clickhole.com/wp-content/uploads/2019/...,[https://clickhole.com/wp-content/uploads/2025...,https://www.clickhole.com
3,usanews_theonion_scraped_articles_9210.json,http://www.latimes.com/local/lanow/la-me-ln-sa...,Feds probe possible terrorism links in San Ber...,Evacuated workers join in a circle to pray on ...,2015-12-03,[Jack Dolan Is An Investigative Reporter For T...,https://ca-times.brightspotcdn.com/dims4/defau...,[https://ca-times.brightspotcdn.com/dims4/defa...,http://www.latimes.com
4,usanews_theonion_scraped_articles_801.json,http://www.theverge.com/2017/1/29/14430082/fou...,"In the last 24 hours, four federal courts have...","Last night, a Federal District court issued an...",2017-01-29,[Andrew Liptak],https://platform.theverge.com/wp-content/uploa...,[https://platform.theverge.com/wp-content/uplo...,http://www.theverge.com
...,...,...,...,...,...,...,...,...,...
9905,upliftingnews_usnews_scraped_articles_9032.json,https://www.cbc.ca/news/health/bubble-boy-1.51...,Doctors use gene therapy to fix 'bubble boy' d...,Relying on the trickery used by HIV to infect ...,,[Thomson Reuters],https://i.cbc.ca/1.5102836.1555543003!/httpIma...,"[https://www.cbc.ca/a/assets/nojsimg.gif, http...",https://www.cbc.ca
9906,neutralnews_nottheonion_scraped_articles_13218...,https://www.thelocal.de/20180802/german-army-c...,Germany considers recruiting foreign EU nation...,"Lagging behind other NATO powers in personnel,...",2018-08-02,[],https://assets.thelocal.com/cdn-cgi/rs:fit:120...,[https://assets.thelocal.com/cdn-cgi/rs:fit:12...,https://www.thelocal.de
9907,upliftingnews_usnews_scraped_articles_8707.json,http://m.kirotv.com/news/news/national/local-m...,This website is unavailable in your location. ...,We’re Sorry!\n\nThis website is unavailable in...,,[],https://www.kiro7.com/resizer/lbgSvgYuvw9xc9F7...,[https://www.kiro7.com/resizer/lbgSvgYuvw9xc9F...,http://m.kirotv.com
9908,neutralnews_nottheonion_scraped_articles_10246...,https://www.foxnews.com/lifestyle/cat-burglar-...,"Cat mistaken for burglar, gets 'detained' by p...",Police in Florida caught a very unusual suspec...,,[Michael Hollan],https://static.foxnews.com/foxnews.com/content...,[https://a57.foxnews.com/cf-images.us-east-1.p...,https://www.foxnews.com


## Fetching Images

In [None]:
import requests
import os
from urllib.parse import urlparse

# Image folder where images will be stored
image_folder = 'image_dump'

# Create the image folder if it doesn't exist
if not os.path.exists(image_folder):
    os.makedirs(image_folder)

# Function to download and save images
def download_image(url, image_folder):
    try:
        # Get the image content
        img_data = requests.get(url, timeout=10).content
        # Extract the image file name from the URL
        img_name = os.path.basename(urlparse(url).path)
        img_path = os.path.join(image_folder, img_name)
        # Save the image
        with open(img_path, 'wb') as f:
            f.write(img_data)
        return img_path  # Return the path of the saved image
    except Exception as e:
        print(f"Failed to download image from {url}: {e}")
        return None

# Function to download images for each article
def fetch_images_for_articles(result_df):
    image_paths = []
    for index, row in tqdm(result_df.iterrows(), total=result_df.shape[0]):
        top_image_url = row['top_image']
        
        if top_image_url:
            image_path = download_image(top_image_url, image_folder)
            image_paths.append(image_path)
        else:
            image_paths.append(None)
    
    # Add image paths to the result dataframe
    result_df['image_path'] = image_paths
    return result_df

# Fetch images after extracting the JSON data
result_df = fetch_images_for_articles(result_df)

# Check the result
print(result_df.head())

# Optionally, save the updated DataFrame with image paths
result_df.to_csv('final_data_with_images.csv', index=False)


In [13]:
result_df

Unnamed: 0,article_name,url,title,text,publish_date,authors,top_image,images,source,image_path
0,usanews_theonion_scraped_articles_12354.json,https://www.texastribune.org/2016/10/28/uttt-p...,UT/TT Poll: Texas voters feeling high anxiety ...,Voters in the party that has not lost a statew...,2016-10-28,"[Ross Ramsey, Oct., Am Central]",https://thumbnails.texastribune.org/YS7n_7vGhg...,[https://thumbnails.texastribune.org/YS7n_7vGh...,https://www.texastribune.org,image_dump\TT-RossCharts_FRI-01.jpeg
1,usanews_theonion_scraped_articles_8782.json,http://www.nj.com/politics/index.ssf/2017/11/m...,Phil Murphy beats Kim Guadagno to succeed Chri...,"TRENTON -- Phil Murphy, a former Wall Street e...",2017-11-08,"[Brent Johnson, Bjohnson Njadvancemedia.Com, N...",http://www.nj.com/pf/resources/images/nj/favic...,[http://www.nj.com/pf/resources/images/common/...,http://www.nj.com,image_dump\favicon.ico
2,usanews_theonion_scraped_articles_12927.json,https://www.clickhole.com/devastating-blow-to-...,Devastating Blow To Feminism: This Woman Sucks,Devastating Blow To Feminism: This Woman Sucks...,2019-06-12,"[Clickhole Editorial, June]",https://clickhole.com/wp-content/uploads/2019/...,[https://clickhole.com/wp-content/uploads/2025...,https://www.clickhole.com,image_dump\sng0fvtlhsq8btayt79t.jpg
3,usanews_theonion_scraped_articles_9210.json,http://www.latimes.com/local/lanow/la-me-ln-sa...,Feds probe possible terrorism links in San Ber...,Evacuated workers join in a circle to pray on ...,2015-12-03,[Jack Dolan Is An Investigative Reporter For T...,https://ca-times.brightspotcdn.com/dims4/defau...,[https://ca-times.brightspotcdn.com/dims4/defa...,http://www.latimes.com,
4,usanews_theonion_scraped_articles_801.json,http://www.theverge.com/2017/1/29/14430082/fou...,"In the last 24 hours, four federal courts have...","Last night, a Federal District court issued an...",2017-01-29,[Andrew Liptak],https://platform.theverge.com/wp-content/uploa...,[https://platform.theverge.com/wp-content/uplo...,http://www.theverge.com,image_dump\DSC_0178.0.jpg
...,...,...,...,...,...,...,...,...,...,...
9905,upliftingnews_usnews_scraped_articles_9032.json,https://www.cbc.ca/news/health/bubble-boy-1.51...,Doctors use gene therapy to fix 'bubble boy' d...,Relying on the trickery used by HIV to infect ...,,[Thomson Reuters],https://i.cbc.ca/1.5102836.1555543003!/httpIma...,"[https://www.cbc.ca/a/assets/nojsimg.gif, http...",https://www.cbc.ca,
9906,neutralnews_nottheonion_scraped_articles_13218...,https://www.thelocal.de/20180802/german-army-c...,Germany considers recruiting foreign EU nation...,"Lagging behind other NATO powers in personnel,...",2018-08-02,[],https://assets.thelocal.com/cdn-cgi/rs:fit:120...,[https://assets.thelocal.com/cdn-cgi/rs:fit:12...,https://www.thelocal.de,image_dump\watermarks-logo-458dceef88e6a632a7a...
9907,upliftingnews_usnews_scraped_articles_8707.json,http://m.kirotv.com/news/news/national/local-m...,This website is unavailable in your location. ...,We’re Sorry!\n\nThis website is unavailable in...,,[],https://www.kiro7.com/resizer/lbgSvgYuvw9xc9F7...,[https://www.kiro7.com/resizer/lbgSvgYuvw9xc9F...,http://m.kirotv.com,image_dump\5NCQNQR6OZD3TCMK4FRDOF7UFA.jpeg
9908,neutralnews_nottheonion_scraped_articles_10246...,https://www.foxnews.com/lifestyle/cat-burglar-...,"Cat mistaken for burglar, gets 'detained' by p...",Police in Florida caught a very unusual suspec...,,[Michael Hollan],https://static.foxnews.com/foxnews.com/content...,[https://a57.foxnews.com/cf-images.us-east-1.p...,https://www.foxnews.com,image_dump\cat-burglar.jpg


In [None]:
result_df = result_df.dropna(subset=["image_path", "text"])

In [None]:
result_df.to_csv("./fakeddit_sampled.csv")

In [33]:
import os
from PIL import Image

def clean_invalid_images(folder_path):
    total = 0
    deleted = 0

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        total += 1

        try:
            # Try opening the image
            with Image.open(file_path) as img:
                img.verify()  # Verify image integrity (does not load into memory fully)
        except Exception as e:
            print(f"Deleting invalid image: {filename} ({e})")
            os.remove(file_path)
            deleted += 1

    print(f"Checked {total} images. Deleted {deleted} invalid images.")

# Example usage
clean_invalid_images('image_dump')


Deleting invalid image: 05-05-2015Albinism_NotGhost12-e1450368605635.jpg (cannot identify image file 'image_dump\\05-05-2015Albinism_NotGhost12-e1450368605635.jpg')
Deleting invalid image: 0515FEXONN-ADAMS2300px.jpg (cannot identify image file 'image_dump\\0515FEXONN-ADAMS2300px.jpg')
Deleting invalid image: 100196100-162747689.jpg (cannot identify image file 'image_dump\\100196100-162747689.jpg')
Deleting invalid image: 100532117-161627831.jpg (cannot identify image file 'image_dump\\100532117-161627831.jpg')
Deleting invalid image: 101281627-heels.jpg (cannot identify image file 'image_dump\\101281627-heels.jpg')
Deleting invalid image: 101621444-485932943.jpg (cannot identify image file 'image_dump\\101621444-485932943.jpg')
Deleting invalid image: 101809472-463012707.jpg (cannot identify image file 'image_dump\\101809472-463012707.jpg')
Deleting invalid image: 101850685-156854964.jpg (cannot identify image file 'image_dump\\101850685-156854964.jpg')
Deleting invalid image: 10192781

In [None]:
import os

# Keep only rows where the image file exists
result_df = result_df[result_df['image_path'].apply(os.path.exists)]


In [None]:
result_df["article_name"].nunique()

7423

In [None]:
result_df.to_csv("fakeddit_sample.csv")