In [2]:
import os
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from sklearn.decomposition import PCA
import joblib
import numpy as np


In [4]:
# Load the CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define the path to the Flickr8k dataset
images_path = "C:/Users/Pratik Senapati/Downloads/Flickr8k_Dataset/Flicker8k_Dataset"
captions_path = "C:/Users/Pratik Senapati/Downloads/Flickr8k_text/Flickr8k.token.txt"

# Load the captions
captions = {}
with open(captions_path, 'r') as f:
    for line in f:
        tokens = line.strip().split('\t')
        image_id, caption = tokens[0], tokens[1]
        image_id = image_id.split('#')[0]
        if image_id not in captions:
            captions[image_id] = []
        captions[image_id].append(caption)

# Extract features
image_features_list = []
text_features_list = []

for image_id, caption_list in captions.items():
    image_path = os.path.join(images_path, image_id)
    image = Image.open(image_path).convert("RGB")
    
    # Resize and normalize the image
    image = image.resize((224, 224))
    image_np = np.array(image).astype(np.float32) / 255.0
    
    # Ensure the image is a 3D array (height, width, channels)
    if image_np.ndim == 2:
        image_np = np.stack([image_np] * 3, axis=-1)
    elif image_np.shape[2] == 4:
        image_np = image_np[..., :3]
    
    # Convert numpy array back to PIL Image
    image = Image.fromarray((image_np * 255).astype(np.uint8))
    
    for caption in caption_list:
        inputs = clip_processor(text=[caption], images=image, return_tensors="pt", padding=True)
        
        with torch.no_grad():
            outputs = clip_model(**inputs)
            image_features = outputs.image_embeds
            text_features = outputs.text_embeds
        
        # Normalize features
        image_features = torch.nn.functional.normalize(image_features, p=2, dim=1)
        text_features = torch.nn.functional.normalize(text_features, p=2, dim=1)
        
        image_features_list.append(image_features.cpu().numpy())
        text_features_list.append(text_features.cpu().numpy())

# Convert lists to numpy arrays
image_features_array = np.vstack(image_features_list)
text_features_array = np.vstack(text_features_list)

# Combine features
combined_features = np.hstack((image_features_array, text_features_array))

# Apply PCA
pca = PCA(n_components=100)  # Adjust the number of components as needed
reduced_features = pca.fit_transform(combined_features)

# Save the PCA model
joblib.dump(pca, "pca_model_flickr8k.pkl")

print("PCA model saved successfully.")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Pratik Senapati\\Downloads\\Flickr8k_Dataset\\Flicker8k_Dataset\\2258277193_586949ec62.jpg.1'