This Jupyter notebook presents the preprocessing steps for text-to-image GANs. The steps can be applied to any dataset of images with captions. Preprocessing is essential when the dataset is not already represented numerically, i.e. as pixel arrays for the images and numerical encodings for the text, as is the case for Fashion-MNIST. Here the preprocessing is applied to the Flickr8k dataset.

# Import Required Modules

In [5]:
# Import glob for matching file patterns
from glob import glob
# Import Path for file system operations
from pathlib import Path
# Import string module for processing text
import string

# Import matplotlib for plotting
import matplotlib.pyplot as plt
# Import the Natural Language Toolkit (nltk) for text processing
import nltk
# Import stopwords from nltk
from nltk.corpus import stopwords
# Import lemmatizer from nltk
from nltk.stem.wordnet import WordNetLemmatizer
# Import word_tokenize to split captions into word tokens
from nltk.tokenize import word_tokenize
# Import pandas for loading and manipulating the captions table
import pandas as pd
# Import Image from PIL for image processing
from PIL import Image
# Import image preprocessing functions from Keras
from tensorflow.keras.preprocessing.image import img_to_array, load_img
# Import pad_sequences to make all numerical input to same size
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Import Tokenizer module to transform text data to numerical format
from tensorflow.keras.preprocessing.text import Tokenizer

# Download the required datasets from nltk
nltk.download('wordnet')
nltk.download('stopwords')
# Tokenizer models used by word_tokenize; recent NLTK releases look for
# 'punkt_tab' while older ones look for 'punkt', so request both
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Download the dataset from kaggle

In [4]:
#steps found in "https://www.kaggle.com/general/74235"
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download adityajn105/flickr8k

# Unzip Dataset

In [None]:
# Extract the downloaded archive into /content; -q suppresses the per-file listing
!unzip -q "/content/flickr8k.zip" -d "/content/"

# Load Images and Captions

## Load Images

In [None]:
# Pixel arrays and image ids, kept in parallel (same index = same image)
all_image_data = []
image_id_all = []
# Target size every image is resized to
width, height = 224, 224
# Iterate over all JPEG files of the dataset
for image_file in glob('/content/Images/*.jpg'):
  # The image id is the file name up to its first "."
  image_id_all.append(Path(image_file).name.split(".")[0])
  # Open inside a context manager so the file handle is always released
  with Image.open(image_file) as single_image:
    # Force three channels so every array has the same shape, then
    # resize to width x height
    resized_image = single_image.convert("RGB").resize((width, height))
    # Transform the image to a numerical array of pixels
    img = img_to_array(resized_image)
  # Normalize pixel values from 0..255 to 0..1
  img = img / 255
  # Save the image features in the list
  all_image_data.append(img)

# Map each image id to its normalized pixel array
mapped = dict(zip(image_id_all, all_image_data))

## Load Captions

In [None]:
# Load corresponding text labels file
# Load the comma-separated captions file; the code below and later cells
# read its "image" (file name) and "caption" columns
text = pd.read_csv("captions.txt", sep=",")

# Derive the image id as the part of the file name before the first ".".
# A vectorised string split avoids building one pd.Series per row and does
# not fail when a file name contains no dot or more than one dot.
text['image_id'] = text['image'].astype(str).str.split(".", n=1).str[0]
# The original file-name column is no longer needed
text = text.drop(['image'], axis=1)

## Map each label with its corresponding image

In [None]:
# Define an empty list to store ordered image
# Build the list of images in caption order: one entry per caption row.
# Captions that share an image id reference the same array object (no copy).
# A comprehension keeps the loop variable out of notebook scope, so the
# builtin `id` is no longer shadowed.
# Raises KeyError if a caption refers to an image id that was not loaded.
real_images = [mapped[image_id] for image_id in text['image_id']]

# Preprocess Captions

## Clean Text

In [None]:
# English stop words as a set for fast membership tests
stop_words = set(stopwords.words("english"))
# Translation table that deletes every character in string.punctuation;
# built once instead of on every call
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# One shared lemmatizer instead of a new instance per caption
_lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
  """Clean one caption and wrap it in sequence markers.

  Steps: delete punctuation, lowercase, tokenize with ``word_tokenize``,
  drop English stop words, and lemmatize each remaining token with
  ``WordNetLemmatizer`` (no stemming is applied).

  Args:
    text: The raw caption string.

  Returns:
    The cleaned tokens joined by single spaces, prefixed with
    ``'startseq '`` and suffixed with ``' endseq'``.
  """
  # Remove punctuation from the text
  text = text.translate(_PUNCTUATION_TABLE)
  # Convert to lowercase so the stop-word comparison matches
  text = text.lower()
  # Tokenize the text
  tokens = word_tokenize(text)
  # Remove stop words and lemmatize what is left
  lemmatized_tokens = [
      _lemmatizer.lemmatize(token)
      for token in tokens
      if token not in stop_words
  ]
  return 'startseq ' + " ".join(lemmatized_tokens) + ' endseq'

In [None]:
# Apply the processing function to captions column in the DataFrame
# Clean every caption and keep the result in a new DataFrame column
text["preprocessed_captions"] = text["caption"].map(preprocess_text)
# Show the first ten rows to inspect the result
text.head(10)

Unnamed: 0,caption,image_id,preprocessed_captions
0,A child in a pink dress is climbing up a set o...,1000268201_693b08cb0e,startseq child pink dress climbing set stair e...
1,A girl going into a wooden building .,1000268201_693b08cb0e,startseq girl going wooden building endseq
2,A little girl climbing into a wooden playhouse .,1000268201_693b08cb0e,startseq little girl climbing wooden playhouse...
3,A little girl climbing the stairs to her playh...,1000268201_693b08cb0e,startseq little girl climbing stair playhouse ...
4,A little girl in a pink dress going into a woo...,1000268201_693b08cb0e,startseq little girl pink dress going wooden c...
5,A black dog and a spotted dog are fighting,1001773457_577c3a7d70,startseq black dog spotted dog fighting endseq
6,A black dog and a tri-colored dog playing with...,1001773457_577c3a7d70,startseq black dog tricolored dog playing road...
7,A black dog and a white dog with brown spots a...,1001773457_577c3a7d70,startseq black dog white dog brown spot starin...
8,Two dogs of different breeds looking at each o...,1001773457_577c3a7d70,startseq two dog different breed looking road ...
9,Two dogs on pavement moving toward each other .,1001773457_577c3a7d70,startseq two dog pavement moving toward endseq


## Transform text into numerical representations

In [None]:
def text_preprocessing(data):
  """Encode sentences as post-padded integer sequences of equal length.

  Args:
    data: Iterable of sentence strings.

  Returns:
    A tuple ``(padded_data, num_words, max_length, tokenizer)``:
      padded_data: array of token ids, one row per sentence, padded at the
        end up to ``max_length``.
      num_words: number of distinct words found by the tokenizer, plus one.
      max_length: length of the longest tokenized sentence (0 for empty
        input).
      tokenizer: the fitted ``Tokenizer``.
  """
  # Learn the vocabulary from the sentences
  tokenizer = Tokenizer(num_words=None, lower=True, split=' ')
  tokenizer.fit_on_texts(data)
  # +1 because Keras never assigns index 0 to a word
  num_words = len(tokenizer.word_index) + 1
  # Turn every sentence into its sequence of token ids
  tokenized_data = tokenizer.texts_to_sequences(data)
  # Measure the longest sentence on the tokenizer's own output. The
  # tokenizer also splits on its filter characters, so a whitespace split
  # can under-count and pad_sequences would then cut tokens off the front.
  # default=0 keeps max() from raising on empty input.
  max_length = max((len(sequence) for sequence in tokenized_data), default=0)
  # Pad the shorter sequences at the end
  padded_data = pad_sequences(tokenized_data, maxlen=max_length, padding='post')
  return padded_data, num_words, max_length, tokenizer

In [None]:
# Apply the function to a dataframe column
# Take the cleaned captions out of the DataFrame as an array
data = text["preprocessed_captions"].values
# Encode them and keep the vocabulary size, sequence length and fitted tokenizer
(
    padded_data,
    num_words,
    max_length,
    tokenizer,
) = text_preprocessing(data)