In [1]:
import pandas as pd
import os
from PIL import Image
import numpy as np

# Load the filtered dataset and print a summary of its columns.
# NOTE: only unpickle files that come from a trusted source.
df = pd.read_pickle('filtered_data.pkl')
df.info()

# Input folder holding the original images and output folder for the processed copies
folder_path, output_folder = 'images', 'processed_images'

# Size every processed image is resized to (kept small to save storage space)
target_size = (128, 128)

# Create the output folder if it is missing; an existing folder is not an error
os.makedirs(output_folder, exist_ok=True)

# Function to process and save images
def process_image(file_path, output_path, size=None):
    """Write a resized RGB JPEG copy of an image and return its pixel data.

    Args:
        file_path: Path of the image to read.
        output_path: Path the JPEG copy is written to.
        size: Optional size tuple passed to ``Image.resize``. When omitted,
            the module-level ``target_size`` is used.

    Returns:
        The resized RGB image as a numpy array, or ``None`` if any step
        failed. Errors are printed instead of raised so that one bad file
        does not stop a batch run.
    """
    try:
        if size is None:
            size = target_size

        # The context manager closes the source file even if decoding fails part-way
        with Image.open(file_path) as src:
            # Convert to RGB (if not already in RGB), then resize
            img = src.convert('RGB').resize(size)

        # Save the processed image as JPEG with reduced quality to reduce size
        img.save(output_path, format='JPEG', quality=75, optimize=True)  # Quality = 75 for compression

        return np.array(img)  # Return image as array for further processing
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Process each image in the folder
for file_name in os.listdir(folder_path):
    # Split once so the extension check and the output name always agree,
    # including for upper-case extensions such as 'A.PNG'.
    stem, ext = os.path.splitext(file_name)

    # Only process files with specific extensions
    if ext.lower() not in ('.jpg', '.jpeg', '.png'):
        continue

    file_path = os.path.join(folder_path, file_name)

    # Every output is JPEG data, so it always gets a '.jpg' suffix.
    # NOTE: inputs that differ only by extension (a.png / a.jpg) map to the
    # same output file; the one processed last wins.
    output_path = os.path.join(output_folder, stem + '.jpg')

    # Process the image and save or get the processed result
    processed_image = process_image(file_path, output_path)

    # You can now work with `processed_image` (which is a numpy array,
    # or None when processing failed)
    # For example, you can add it to a list or store it in a DataFrame

# From here on, read from the folder that holds the resized JPEG copies
# (this rebinds folder_path, which previously pointed at the original images)
folder_path = output_folder  # Make sure you have processed images here

# The DataFrame loaded earlier is expected to provide the text and label columns
# For example:
# headlines = df['headline']  # Your headlines column
# is_satire = df['is_satire']  # Your is_satire column

# Function to load images as numpy arrays
def load_image_as_numpy(file_path, dtype=np.float64):
    """Load an image as an RGB numpy array scaled to the range [0, 1].

    Args:
        file_path: Path of the image to read.
        dtype: Floating-point dtype of the returned array. The float64
            default gives the same result as dividing the raw 8-bit pixel
            array by 255.0; pass ``np.float32`` to halve memory use.

    Returns:
        The normalized pixel array, or ``None`` if the image could not be
        loaded (the error is printed rather than raised).
    """
    try:
        # The context manager closes the file even if decoding fails part-way
        with Image.open(file_path) as src:
            rgb = src.convert('RGB')  # Ensure it's in RGB format
        return np.asarray(rgb, dtype=dtype) / 255.0  # Normalize to [0, 1]
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

# List the processed folder once; sorted so the fallback order below is deterministic
image_files = sorted(os.listdir(folder_path))  # List all image files and sort them
jpeg_files = [name for name in image_files if name.lower().endswith(('.jpg', '.jpeg'))]

# Pair each DataFrame row with its own image by file name instead of by position
# in the sorted listing: lexicographic order ('image_10.jpg' < 'image_2.jpg') is not
# guaranteed to match the row order.
# NOTE(review): assumes the base name of df['image_path'] (without extension)
# matches the processed file name - confirm against how the images were saved.
jpeg_by_stem = {os.path.splitext(name)[0]: name for name in jpeg_files}
row_stems = []
if 'image_path' in df.columns:
    row_stems = [
        os.path.splitext(os.path.basename(str(path).replace('\\', '/')))[0]
        for path in df['image_path']
    ]

if row_stems and all(stem in jpeg_by_stem for stem in row_stems):
    ordered_files = [jpeg_by_stem[stem] for stem in row_stems]
else:
    # Keep the positional behaviour when the rows cannot all be matched by name
    print("Warning: could not match df['image_path'] to processed files; using sorted file order")
    ordered_files = jpeg_files

# Create a list of images as numpy arrays (None for files that failed to load)
images = []
for file_name in ordered_files:
    file_path = os.path.join(folder_path, file_name)
    img_array = load_image_as_numpy(file_path)
    images.append(img_array)

# Destination of the pickled result
pickle_file_path = 'processed_data.pkl'

# Combine the headline and label columns with the per-row image arrays
df_processed = pd.DataFrame(
    dict(
        headline=df['headline'],
        images=images,
        is_satire=df['is_satire'],
    )
)

# Persist the frame, image arrays included, and report where it was written
df_processed.to_pickle(pickle_file_path)
print(f"Dataframe saved to {pickle_file_path}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7971 entries, 0 to 7970
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   headline    7971 non-null   object
 1   image_path  7971 non-null   object
 2   is_satire   7971 non-null   int32 
dtypes: int32(1), object(2)
memory usage: 155.8+ KB




Dataframe saved to processed_data.pkl
