In [2]:
#part1 - Image Preprocessing 
import os
from PIL import Image
from tqdm.notebook import tqdm
import numpy as np

# Directory where your original images are stored
image_dir = 'flickr30k_images/flickr30k_images'

# Directory where we want to save the preprocessed images
processed_image_dir = 'flickr30k_images/processed_images'
os.makedirs(processed_image_dir, exist_ok=True)  

# List to keep track of processed files
processed_files = []
not_processed_files = []

# Function to preprocess images
def transform(image): 
    return image

# Process and save images as NumPy arrays
for filename in tqdm(os.listdir(image_dir)):
    if filename.lower().endswith('.jpg'):  # Check if the file is an image
        try:
            file_path = os.path.join(image_dir, filename)
            with Image.open(file_path) as img:
                transformed_img = transform(img)
                npy_filename = os.path.splitext(filename)[0] + '.npy'
                save_path = os.path.join(processed_image_dir, npy_filename)
                np.save(save_path, transformed_img)
                processed_files.append(filename)  # Add to the list of processed files
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            not_processed_files.append(filename)  # Add to the list if not processed

# Print the results
print(f"Total number of images processed: {len(processed_files)}")
print(f"Total number of images not processed: {len(not_processed_files)}")

# Optionally, print the filenames of images not processed
if not_processed_files:
    print("Images not processed:")
    for filename in not_processed_files:
        print(filename)

        
num_images_to_print = 5  # Adjust as needed
for filename in processed_files[:num_images_to_print]:
    npy_filename = os.path.splitext(filename)[0] + '.npy'
    load_path = os.path.join(processed_image_dir, npy_filename)
    image_array = np.load(load_path)
    print(f"Array for {filename}:")
    print(image_array)


  0%|          | 0/31785 [00:00<?, ?it/s]

Error processing 4904808403.jpg: cannot identify image file 'flickr30k_images/flickr30k_images/4904808403.jpg'
Total number of images processed: 31782
Total number of images not processed: 1
Images not processed:
4904808403.jpg
Array for 2609797461.jpg:
[[[103 123 148]
  [105 125 150]
  [109 126 152]
  ...
  [198 203 206]
  [196 201 204]
  [196 201 204]]

 [[103 123 148]
  [105 125 150]
  [109 126 152]
  ...
  [201 205 206]
  [200 204 205]
  [199 204 207]]

 [[103 123 148]
  [105 125 150]
  [106 126 151]
  ...
  [203 207 208]
  [202 206 207]
  [201 205 206]]

 ...

 [[ 95  76  46]
  [ 99  80  50]
  [ 97  78  48]
  ...
  [ 35  21   8]
  [ 39  26  10]
  [ 39  26   9]]

 [[ 94  73  42]
  [105  84  53]
  [104  85  52]
  ...
  [ 31  21   9]
  [ 21  11   1]
  [ 16   7   0]]

 [[ 83  62  33]
  [101  80  53]
  [ 85  65  40]
  ...
  [ 13  10   5]
  [  9   6   1]
  [  9   9   1]]]
Array for 1788892671.jpg:
[[[236 245 252]
  [230 234 243]
  [231 240 249]
  ...
  [232 220 206]
  [232 221 203]
  [2

In [4]:
import pandas as pd
import re

# Path to your original captions file
captions_file_path = 'flickr30k_images/results.csv'

# Read the CSV file
captions_df = pd.read_csv(captions_file_path, delimiter='|')

# Rename the columns to remove leading/trailing spaces
captions_df.columns = captions_df.columns.str.strip()

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and numbers
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply the cleaning function to the comment column
captions_df['comment'] = captions_df['comment'].astype(str).apply(clean_text)

# Save the cleaned DataFrame to a new CSV file
#processed_captions_file_path = 'flickr30k_images/processed_results.csv'
#captions_df.to_csv(processed_captions_file_path, index=False)


In [13]:
captions_df

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,two young guys with shaggy hair look at their ...
1,1000092795.jpg,1,two young white males are outside near many bu...
2,1000092795.jpg,2,two men in green shirts are standing in a yard
3,1000092795.jpg,3,a man in a blue shirt standing in a garden
4,1000092795.jpg,4,two friends enjoy time spent together
...,...,...,...
158910,998845445.jpg,0,a man in shorts and a hawaiian shirt leans ove...
158911,998845445.jpg,1,a young man hanging over the side of a boat wh...
158912,998845445.jpg,2,a man is leaning off of the side of a blue and...
158913,998845445.jpg,3,a man riding a small boat in a harbor with fog...


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization and vectorization
MAX_NUM_WORDS = 10000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(captions_df['comment'])
sequences = tokenizer.texts_to_sequences(captions_df['comment'])
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=100)  

images = [np.load(os.path.join(processed_image_dir, img_file)) for img_file in os.listdir(processed_image_dir) if img_file.endswith('.npy')]
combined_dataset = list(zip(images, data))


In [10]:
# Print datatypes of the first few elements in the combined_dataset
for i, (image_data, text_data) in enumerate(combined_dataset[:5]):  
    print(f"Element {i}:")
    print(f"Image data type: {type(image_data)}")
    print(f"Text data type: {type(text_data)}")
    print("---")


Element 0:
Image data type: <class 'numpy.ndarray'>
Text data type: <class 'numpy.ndarray'>
---
Element 1:
Image data type: <class 'numpy.ndarray'>
Text data type: <class 'numpy.ndarray'>
---
Element 2:
Image data type: <class 'numpy.ndarray'>
Text data type: <class 'numpy.ndarray'>
---
Element 3:
Image data type: <class 'numpy.ndarray'>
Text data type: <class 'numpy.ndarray'>
---
Element 4:
Image data type: <class 'numpy.ndarray'>
Text data type: <class 'numpy.ndarray'>
---


In [12]:
image_filenames = [os.path.splitext(filename)[0] for filename in os.listdir(processed_image_dir) if filename.endswith('.npy')]

export_df = pd.DataFrame({
    'image_filename': image_filenames,
    'text_data': [text for _, text in combined_dataset]
})

# Export to CSV
export_df.to_csv('flickr30k_images/combined_dataset.csv', index=False)

