# Import Libraries

In [9]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from transformers import BlipProcessor, BlipForConditionalGeneration
from torch.utils.data import Dataset
import os
import json
from torch.utils.data import DataLoader
from transformers import AdamW, CLIPProcessor, CLIPModel, GPT2LMHeadModel, GPT2Tokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from PIL import Image
import torch

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# Load the BLIP model and processor
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")



In [7]:
# Define a function to generate captions
def generate_caption_with_blip(image_path):
    # Load and process the image
    image = Image.open(image_path).convert("RGB")
    inputs = blip_processor(image, return_tensors="pt").to(device)

    # Generate caption using BLIP model
    with torch.no_grad():
        output = blip_model.generate(**inputs)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)
    return caption

In [8]:
# Test the function on a set of images
image_folder = 'Dataset/SSID_Images/'
image_paths = [f"{image_folder}/{i}.jpg" for i in range(1, 11)]

captions = {image_path: generate_caption_with_blip(image_path) for image_path in image_paths}

for img, caption in captions.items():
    print(f"{img}: {caption}")



Dataset/SSID_Images//1.jpg: a group of people walking up a snowy slope
Dataset/SSID_Images//2.jpg: a person on a snowboard on a mountain
Dataset/SSID_Images//3.jpg: a man climbing up a snowy mountain
Dataset/SSID_Images//4.jpg: a man standing on top of a mountain
Dataset/SSID_Images//5.jpg: a man sitting on top of a snowy mountain
Dataset/SSID_Images//6.jpg: a man climbing up a mountain with a helmet on
Dataset/SSID_Images//7.jpg: a field with a fence and mountains in the background
Dataset/SSID_Images//8.jpg: a man with a backpack on a trail
Dataset/SSID_Images//9.jpg: the summit of the mountain is covered in snow
Dataset/SSID_Images//10.jpg: a man wearing a blue shirt


In [10]:
# Load the Vision Encoder Decoder model and necessary processors
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

In [11]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [12]:
# Define a function to generate captions
def generate_caption_with_vit_gpt2(image_path):
    # Load and process the image
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

    # Generate caption using the model
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=50, num_beams=4, do_sample=False)

    # Decode the output tokens to a string
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

In [14]:
# Test the function on a set of images
image_folder = 'Dataset/SSID_Images/'
image_paths = [f"{image_folder}/{i}.jpg" for i in range(1, 100)]

captions = {image_path: generate_caption_with_vit_gpt2(image_path) for image_path in image_paths}

for img, caption in captions.items():
    print(f"{img}: {caption}")

Dataset/SSID_Images//1.jpg: people walking on top of a snow covered slope 
Dataset/SSID_Images//2.jpg: a mountain range with snow capped mountains 
Dataset/SSID_Images//3.jpg: a person riding skis on top of a snow covered slope 
Dataset/SSID_Images//4.jpg: a person standing on top of a snow covered mountain 
Dataset/SSID_Images//5.jpg: a man sitting in the snow next to a pile of snow 
Dataset/SSID_Images//6.jpg: a person on a snowboard in the snow 
Dataset/SSID_Images//7.jpg: a field with a fence and a mountain range 
Dataset/SSID_Images//8.jpg: a man riding skis down a snow covered slope 
Dataset/SSID_Images//9.jpg: a mountain range with snow capped mountains 
Dataset/SSID_Images//10.jpg: a man in a hat talking on a cell phone 
Dataset/SSID_Images//11.jpg: a woman standing next to a parking meter 
Dataset/SSID_Images//12.jpg: two people riding skis on top of a dirt road 
Dataset/SSID_Images//13.jpg: a man on skis standing on top of a snow covered slope 
Dataset/SSID_Images//14.jpg: a 

In [18]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Load the BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Define the image paths
image_paths = [
    'Dataset/SSID_Images//1.jpg',
    'Dataset/SSID_Images//2.jpg',
    'Dataset/SSID_Images//3.jpg',
    'Dataset/SSID_Images//4.jpg',
    'Dataset/SSID_Images//5.jpg'
]

# Function to generate a caption (concept) for each image
def generate_concept(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")

    # Generate a caption for the image
    caption_ids = model.generate(**inputs)
    caption = processor.decode(caption_ids[0], skip_special_tokens=True)
    return caption

# Generate concepts for each image and store them
all_concepts = []
for path in image_paths:
    concept = generate_concept(path)
    all_concepts.append(concept)

# Deduplicate the concepts if needed
def remove_noise(concepts):
    return list(set(concepts))  # Deduplicate concepts

# Get the final concepts for all images
final_concepts = remove_noise(all_concepts)
print("Final concepts:", final_concepts)


Final concepts: ['a man standing on top of a mountain', 'a group of people walking up a snowy slope', 'a person on a snowboard on a mountain', 'a man sitting on top of a snowy mountain', 'a man climbing up a snowy mountain']


# Image to feature vector

In [5]:
from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the BLIP model for caption generation
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load the CLIP model for feature extraction
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def generate_overall_feature_vector(image_paths):
    """
    Generates an overall feature vector for a list of images by averaging the CLIP feature vectors
    of BLIP-generated captions for each image.

    Args:
        image_paths (list): List of image file paths.

    Returns:
        torch.Tensor: Averaged feature vector representing the input images.
    """
    # Function to generate a caption (concept) for each image
    def generate_concept(image_path):
        image = Image.open(image_path).convert("RGB")
        inputs = blip_processor(images=image, return_tensors="pt")
        caption_ids = blip_model.generate(**inputs)
        caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
        return caption

    # Function to encode captions into feature vectors using CLIP
    def encode_concept(concept):
        text_inputs = clip_processor(text=[concept], return_tensors="pt", padding=True)
        text_features = clip_model.get_text_features(**text_inputs)
        return text_features

    # Generate concepts for each image and encode them
    concept_vectors = []
    for path in image_paths:
        concept = generate_concept(path)
        vector = encode_concept(concept)
        concept_vectors.append(vector)

    # Calculate the final feature vector by averaging
    overall_feature_vector = torch.mean(torch.stack(concept_vectors), dim=0)
    
    return overall_feature_vector

# Example usage
image_paths = [
    'Dataset/SSID_Images//1.jpg',
    'Dataset/SSID_Images//2.jpg',
    'Dataset/SSID_Images//3.jpg',
    'Dataset/SSID_Images//4.jpg',
    'Dataset/SSID_Images//5.jpg'
]

# Call the function and get the overall feature vector
overall_feature_vector_image = generate_overall_feature_vector(image_paths)
print("Overall feature vector:", overall_feature_vector_image)




Overall feature vector: tensor([[-2.6171e-01,  1.1769e-01, -5.0845e-02,  1.1002e-01, -2.2947e-01,
         -8.2764e-02, -8.0033e-02, -3.9292e-01, -5.5811e-02,  1.0190e-01,
          1.3146e-01, -7.7623e-02,  4.3106e-01,  2.9742e-02,  1.7267e-01,
          3.0587e-02, -1.0321e-01,  3.0394e-01, -1.0508e-01,  6.1711e-02,
         -1.0026e-01,  3.1109e-01,  5.8501e-02, -4.9589e-01,  2.5234e-02,
          3.2454e-01,  7.1826e-02,  1.4119e-02, -9.1528e-02, -1.3554e-01,
          2.4361e-01,  9.9473e-04, -1.6967e-02, -2.2359e-01, -1.2834e-01,
         -7.6284e-02,  1.0232e-01,  1.9060e-01, -1.1656e-01,  8.8653e-03,
          9.0522e-05, -1.0759e-01, -3.0686e-01,  2.3899e-01,  3.8309e-01,
          1.5262e-01, -2.4723e-01, -1.4980e-01, -2.0201e-02, -3.8295e-02,
          4.3377e-01, -7.6373e-02, -7.5963e-02,  2.3580e-01, -2.0909e-02,
         -3.5132e-01,  1.3786e-01,  2.5395e-01,  3.5936e-02, -1.3418e-01,
          1.4532e-01, -1.2905e-01,  7.3284e-03,  1.2222e-01,  1.0340e-01,
         -2.98

# Story to feature vector

In [6]:
from transformers import CLIPProcessor, CLIPModel
import torch
from rake_nltk import Rake

# Load the CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_story_feature_vector(story_text):
    """
    Generates an overall feature vector for a story text by averaging the CLIP feature vectors
    of keywords extracted using RAKE.

    Args:
        story_text (str): The story text.

    Returns:
        torch.Tensor: Averaged feature vector representing the story text.
    """
    # Initialize RAKE for keyword extraction
    rake = Rake()
    rake.extract_keywords_from_text(story_text)
    keywords = rake.get_ranked_phrases()  # Get the keywords extracted by RAKE

    # Function to encode a keyword into a feature vector using CLIP
    def encode_keyword(keyword):
        text_inputs = clip_processor(text=[keyword], return_tensors="pt", padding=True)
        text_features = clip_model.get_text_features(**text_inputs)
        return text_features

    # Encode each keyword and store the vectors
    keyword_vectors = []
    for keyword in keywords:
        vector = encode_keyword(keyword)
        keyword_vectors.append(vector)

    # Calculate the final feature vector by averaging the keyword vectors
    overall_feature_vector = torch.mean(torch.stack(keyword_vectors), dim=0)
    
    return overall_feature_vector

# Example usage
story_text = """
Once upon a time, in the heart of the mountains, the group was stopped by a man. "He's a member of this group. He's called a soldier of an army. You can tell he's an enemy of mine. So, why don't you get a look at him?" he stared at the snowman, who had no idea where he was. The man had a strange look on his face, and he seemed to be a bit of a weirdo. However, he didn't say anything to the other snow he climbed up to a mountain, his right hand being on the mountain's top. A snowflake-shaped, red-colored mountain that was very steep, like a cave. It was about three meters tall, with a peak that could be reached he looked at his snow-covered face. There was a faint smile on its face as it looked down at a white-haired man, wearing a snow mask. After a moment, it began to grow a little taller. His face was dark red the man's face grew even more, making it seem as though he had been a very powerful and powerful person. As he walked, a small voice spoke from behind him, "I heard you're coming. Why are you here?"
"""

# Call the function and get the overall feature vector
overall_feature_vector_story = get_story_feature_vector(story_text)
print("Overall feature vector for the story text based on RAKE keywords:", overall_feature_vector_story)




Overall feature vector for the story text based on RAKE keywords: tensor([[-5.4812e-02, -2.7397e-02,  1.1542e-02,  1.0613e-01, -8.0487e-02,
         -1.7179e-01, -9.4563e-02, -1.2755e+00, -2.9355e-02,  2.1567e-01,
          2.7565e-02, -3.3601e-02, -8.7047e-02, -7.9168e-02,  1.7152e-01,
          3.9473e-02,  2.7349e-01,  6.0431e-02, -9.6591e-02, -1.4670e-01,
          2.7230e-01,  3.9014e-02,  6.8657e-02, -6.6495e-02, -1.2100e-01,
          2.5792e-02, -2.6740e-02,  2.0362e-02, -4.2674e-02,  8.4710e-02,
         -2.3487e-02, -1.4950e-01,  6.3557e-03, -5.1198e-03, -3.2215e-02,
          2.4282e-01,  6.3096e-02,  8.8220e-03, -4.3158e-02, -2.0463e-02,
         -5.5950e-02, -9.6385e-02, -6.9654e-02,  1.0052e-01,  9.4145e-02,
          3.0459e-01, -6.6751e-02,  7.0225e-02,  1.0621e-01,  3.9043e-02,
         -1.6095e-02, -6.2083e-02,  1.0096e-01, -1.5872e-01,  3.8540e-02,
          7.2946e-03, -2.3128e-02,  8.1000e-03, -6.9978e-02, -3.0512e-02,
          2.2941e-01, -6.3825e-02,  1.3822e-01

# Ground truths to feature vector

In [7]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Load the CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_caption_feature_vector(captions):
    """
    Generates an overall feature vector for a list of captions by averaging the CLIP feature vectors.

    Args:
        captions (list): List of caption strings.

    Returns:
        torch.Tensor: Averaged feature vector representing the captions.
    """
    # Function to encode a caption into a feature vector using CLIP
    def encode_caption(caption):
        text_inputs = clip_processor(text=[caption], return_tensors="pt", padding=True)
        text_features = clip_model.get_text_features(**text_inputs)
        return text_features

    # Encode each caption and store the vectors
    caption_vectors = [encode_caption(caption) for caption in captions]

    # Calculate the final feature vector by averaging the caption vectors
    overall_feature_vector = torch.mean(torch.stack(caption_vectors), dim=0)
    
    return overall_feature_vector

# Example usage
captions = [
    "3 mountain climbers are on their way to the mountain.",
    "A steep mountainside covered with snow in patches.",
    "A mountain climber climbing a snow-covered mountain slope.",
    "A mountain climber standing and resting near the top of the mountain.",
    "A view from the campground while climbing the mountain peak."
]

# Call the function and get the overall feature vector
overall_feature_vector_ground_truth = get_caption_feature_vector(captions)
print("Overall feature vector for the ground truths:", overall_feature_vector_ground_truth)


Overall feature vector for the ground truths: tensor([[-2.0492e-01, -4.7512e-02,  1.5831e-01,  1.0831e-01, -9.8873e-02,
         -2.2198e-01, -6.6798e-02, -6.0664e-01, -3.7203e-02,  1.7869e-01,
          2.3644e-01, -1.1605e-01,  1.6149e-01, -8.7494e-02,  1.5276e-01,
          8.5210e-02, -2.7141e-02,  1.9447e-01, -1.0456e-01,  2.0103e-01,
         -2.5094e-01,  4.0982e-04,  1.1917e-02, -7.1495e-01, -1.1327e-01,
          2.9432e-01,  1.7969e-02, -8.1345e-02, -1.3319e-02, -6.7199e-02,
          2.0573e-01, -2.7942e-02, -9.4614e-03, -2.2664e-01, -2.3932e-01,
         -6.3323e-02,  1.3956e-01,  8.6716e-02, -9.1183e-02,  1.1044e-01,
         -3.2266e-03, -1.2013e-01, -2.7474e-01,  2.5388e-01,  1.6143e-01,
         -9.4454e-02, -1.3122e-01, -1.6551e-01, -7.3807e-02, -2.6146e-01,
          2.7608e-01, -1.5043e-01,  2.3500e-02,  1.5725e-01,  2.1333e-01,
         -7.3675e-02,  6.6722e-03, -1.1314e-01, -1.3811e-01, -2.4468e-01,
          8.8673e-02, -2.3037e-01, -6.2919e-03,  1.0825e-01,  1.15

# Cosine similarity between image feature vector and the story feature vector

In [15]:
import torch

# Assuming overall_feature_vector_image and overall_feature_vector_story are your two feature vectors
def calculate_cosine_similarity(vector1, vector2):
    """
    Calculates the cosine similarity between two feature vectors.

    Args:
        vector1 (torch.Tensor): First feature vector.
        vector2 (torch.Tensor): Second feature vector.

    Returns:
        float: Cosine similarity score between -1 and 1.
    """
    # Flatten to ensure 1D tensors
    vector1 = vector1.flatten()
    vector2 = vector2.flatten()
    
    # Normalize each vector
    vector1 = vector1 / vector1.norm()
    vector2 = vector2 / vector2.norm()
    
    # Calculate the cosine similarity using dot product
    similarity = torch.dot(vector1, vector2)
    return similarity.item()

# Example usage:
cosine_similarity_score1 = calculate_cosine_similarity(overall_feature_vector_image, overall_feature_vector_story)
print("Cosine similarity score between image and story feature vectors:", cosine_similarity_score1)


Cosine similarity score between image and story feature vectors: 0.7565784454345703


# cosine similarity between the image feature vector and the ground truth feature vector

In [19]:
import torch

# Assuming overall_feature_vector_image and overall_feature_vector_ground_truth are your two feature vectors
def calculate_cosine_similarity(vector1, vector2):
    """
    Calculates the cosine similarity between two feature vectors.

    Args:
        vector1 (torch.Tensor): First feature vector.
        vector2 (torch.Tensor): Second feature vector.

    Returns:
        float: Cosine similarity score between -1 and 1.
    """
    # Flatten to ensure 1D tensors
    vector1 = vector1.flatten()
    vector2 = vector2.flatten()
    
    # Normalize each vector
    vector1 = vector1 / vector1.norm()
    vector2 = vector2 / vector2.norm()
    
    # Calculate the cosine similarity using dot product
    similarity = torch.dot(vector1, vector2)
    return similarity.item()

# Example usage:
cosine_similarity_score2 = calculate_cosine_similarity(overall_feature_vector_ground_truth, overall_feature_vector_story)
print("Cosine similarity score between image and ground truth feature vectors:", cosine_similarity_score2)


Cosine similarity score between image and ground truth feature vectors: 0.8385794162750244


# Calculating the rouge score between the ground truths and the final story

In [21]:
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize
ground_truths = [
    "3 mountain climbers are on their way to the mountain.",
    "A steep mountainside covered with snow in patches.",
    "a mountain climber climbing a snow-covered mountain slope",
    "a mountain climber standing and resting near the top of the mountain",
    "A view from the campground while climbing the mountain peak."
]

# Define the generated story (candidate text)
story_text = """
Once upon a time, in the heart of the mountains, the group was stopped by a man. "He's a member of this group. He's called a soldier of an army. You can tell he's an enemy of mine. So, why don't you get a look at him?" he stared at the snowman, who had no idea where he was. The man had a strange look on his face, and he seemed to be a bit of a weirdo. However, he didn't say anything to the other snow he climbed up to a mountain, his right hand being on the mountain's top. A snowflake-shaped, red-colored mountain that was very steep, like a cave. It was about three meters tall, with a peak that could be reached he looked at his snow-covered face. There was a faint smile on its face as it looked down at a white-haired man, wearing a snow mask. After a moment, it began to grow a little taller. His face was dark red the man's face grew even more, making it seem as though he had been a very powerful and powerful person. As he walked, a small voice spoke from behind him, "I heard you're coming. Why are you here?"
"""

# Concatenate all captions into a single reference text for comparison
reference_text = " ".join(ground_truths)

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE score
scores = scorer.score(reference_text, story_text)

# Display the results
print("ROUGE-1 (Unigram) Score:", scores['rouge1'])
print("ROUGE-2 (Bigram) Score:", scores['rouge2'])
print("ROUGE-L (Longest Common Subsequence) Score:", scores['rougeL'])


ROUGE-1 (Unigram) Score: Score(precision=0.13425925925925927, recall=0.5918367346938775, fmeasure=0.2188679245283019)
ROUGE-2 (Bigram) Score: Score(precision=0.03255813953488372, recall=0.14583333333333334, fmeasure=0.0532319391634981)
ROUGE-L (Longest Common Subsequence) Score: Score(precision=0.07407407407407407, recall=0.32653061224489793, fmeasure=0.12075471698113208)


##### The low precision suggests that the generated story includes many new words, while the high recall indicates it captures a decent portion of the important words from the ground truths.

# below this is bullshit have to change

In [22]:
def compute_reward(r_score, r_topic_cv, r_topic_cl, lambda_val, gamma_val, eta_val):
    """
    Compute the reward function r(y_i) using ROUGE F1 score and cosine similarities.

    Parameters:
    - r_score (float): ROUGE F1 score (or BLEU score, as applicable).
    - r_topic_cv (float): Cosine similarity for vision-based topics.
    - r_topic_cl (float): Cosine similarity for language-based topics.
    - lambda_val (float): Weight for ROUGE F1 score.
    - gamma_val (float): Weight for topic-cv similarity.
    - eta_val (float): Weight for topic-cl similarity.

    Returns:
    - float: Computed reward r(y_i).
    """
    reward = (lambda_val * r_score) + (gamma_val * r_topic_cv) + (eta_val * r_topic_cl)
    return reward

# Example ROUGE F1 score and cosine similarities
rouge_f1 = 0.2189  # ROUGE F1 score
r_topic_cv = 0.8385794162750244  # Example cosine similarity for topic-cv
r_topic_cl = 0.7565784454345703  # Example cosine similarity for topic-cl

# Assign weights
lambda_val = 1.0  # Weight for ROUGE F1 score
gamma_val = 0.5   # Weight for topic-cv similarity
eta_val = 0.5     # Weight for topic-cl similarity

# Compute reward
reward = compute_reward(rouge_f1, r_topic_cv, r_topic_cl, lambda_val, gamma_val, eta_val)
print("Computed Reward:", reward)


Computed Reward: 1.0164789308547975


In [26]:
from collections import Counter
from nltk.tokenize import word_tokenize

# Ground truths and story text
ground_truths = [
    "3 mountain climbers are on their way to the mountain.",
    "A steep mountainside covered with snow in patches.",
    "a mountain climber climbing a snow-covered mountain slope",
    "a mountain climber standing and resting near the top of the mountain",
    "A view from the campground while climbing the mountain peak."
]
story_text = """
Once upon a time, in the heart of the mountains, the group was stopped by a man. 
"He's a member of this group. He's called a soldier of an army. You can tell he's an enemy of mine. 
So, why don't you get a look at him?" he stared at the snowman, who had no idea where he was. 
The man had a strange look on his face, and he seemed to be a bit of a weirdo. 
However, he didn't say anything to the other snow he climbed up to a mountain, his right hand being on the mountain's top. 
"""

# Combine ground truths and story text
all_texts = " ".join(ground_truths) + " " + story_text

# Tokenize and count unique words
tokens = word_tokenize(all_texts)
vocab = Counter(tokens)
vocab_size = len(vocab)

print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 96


In [27]:
class Seq2SeqModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Seq2SeqModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        embedded = self.embedding(x)  # Embedding layer
        rnn_out, _ = self.rnn(embedded)  # LSTM layer
        output = self.fc(rnn_out)  # Fully connected layer
        return output

# Example initialization of the model
input_size = vocab_size  # Vocabulary size for embedding
hidden_size = 256        # Size of the hidden layers
output_size = vocab_size # Output vocabulary size
model = Seq2SeqModel(input_size, hidden_size, output_size)


In [30]:
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

# Data
ground_truths = [
    "3 mountain climbers are on their way to the mountain.",
    "A steep mountainside covered with snow in patches.",
    "a mountain climber climbing a snow-covered mountain slope",
    "a mountain climber standing and resting near the top of the mountain",
    "A view from the campground while climbing the mountain peak."
]
story_text = """
Once upon a time, in the heart of the mountains, the group was stopped by a man. 
"He's a member of this group. He's called a soldier of an army. You can tell he's an enemy of mine. 
So, why don't you get a look at him?" he stared at the snowman, who had no idea where he was. 
The man had a strange look on his face, and he seemed to be a bit of a weirdo. 
However, he didn't say anything to the other snow he climbed up to a mountain, his right hand being on the mountain's top. 
"""

# Combine all ground truths into a single string for vocabulary building
all_text = " ".join(ground_truths) + " " + story_text

# Tokenize and build vocabulary
tokens = word_tokenize(all_text.lower())  # Lowercase and tokenize
vocab_counter = Counter(tokens)  # Count occurrences of each token
vocab = {word: idx for idx, (word, _) in enumerate(vocab_counter.items(), start=1)}  # Assign index starting from 1
vocab["<PAD>"] = 0  # Add <PAD> token for padding

# Convert tokens to indices
def text_to_indices(text, vocab):
    tokens = word_tokenize(text.lower())  # Tokenize
    return [vocab[token] for token in tokens if token in vocab]  # Convert to indices

# Convert ground truths and story text to indices
input_sequences = [text_to_indices(gt, vocab) for gt in ground_truths]
target_sequence = text_to_indices(story_text, vocab)

# Dataset Definition
class Seq2SeqDataset(Dataset):
    def __init__(self, input_sequences, target_sequence, vocab):
        self.input_sequences = input_sequences  # List of input sequences
        self.target_sequence = target_sequence  # Single target sequence
        self.vocab = vocab
    
    def __len__(self):
        return len(self.input_sequences)
    
    def __getitem__(self, idx):
        input_tensor = torch.tensor(self.input_sequences[idx], dtype=torch.long)
        target_tensor = torch.tensor(self.target_sequence, dtype=torch.long)
        return input_tensor, target_tensor

# Collate Function for Padding
def collate_fn(batch):
    inputs, targets = zip(*batch)
    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=vocab["<PAD>"])
    padded_targets = pad_sequence([targets[0]] * len(inputs), batch_first=True, padding_value=vocab["<PAD>"])
    return padded_inputs, padded_targets

# Create Dataset and DataLoader
dataset = Seq2SeqDataset(input_sequences, target_sequence, vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Check the dataloader output
for batch_idx, (inputs, targets) in enumerate(dataloader):
    print(f"Batch {batch_idx + 1}:")
    print("Inputs:", inputs)
    print("Targets:", targets)
    break


Batch 1:
Inputs: tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9,  2, 10],
        [11,  2, 19, 20, 11, 21,  2, 22,  0,  0,  0]])
Targets: tensor([[34, 35, 11, 36, 37, 17,  9, 38, 28,  9, 39, 37,  9, 40, 41, 42, 43, 11,
         44, 10, 45, 46, 47, 11, 48, 28, 49, 40, 10, 46, 47, 50, 11, 51, 28, 52,
         53, 10, 54, 55, 56, 46, 47, 52, 57, 28, 58, 10, 59, 37, 60, 61, 62, 54,
         63, 11, 64, 65, 66, 67, 68, 46, 69, 65,  9, 70, 37, 71, 72, 73, 74, 75,
         46, 41, 10,  9, 44, 72, 11, 76, 64,  5, 77, 78, 37, 24, 46, 79,  8, 80,
         11, 81, 28, 11, 82, 10, 83, 37, 46, 84, 62, 85, 86,  8,  9, 87, 16, 46,
         88, 89,  8, 11,  2, 37, 77, 90, 91, 92,  5,  9,  2, 47, 27, 10],
        [34, 35, 11, 36, 37, 17,  9, 38, 28,  9, 39, 37,  9, 40, 41, 42, 43, 11,
         44, 10, 45, 46, 47, 11, 48, 28, 49, 40, 10, 46, 47, 50, 11, 51, 28, 52,
         53, 10, 54, 55, 56, 46, 47, 52, 57, 28, 58, 10, 59, 37, 60, 61, 62, 54,
         63, 11, 64, 65, 66, 67, 68, 46, 69, 65,  9, 70, 37, 7

In [37]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define Seq2Seq Model
class Seq2SeqModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Seq2SeqModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)  # (batch_size, seq_length, hidden_size)
        output, _ = self.rnn(x)  # (batch_size, seq_length, hidden_size)
        logits = self.fc(output)  # (batch_size, seq_length, vocab_size)
        return logits

# Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<PAD>"])  # Ignore padding tokens
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Collate function to pad both inputs and targets
def collate_fn(batch):
    inputs, targets = zip(*batch)
    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=vocab["<PAD>"])
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=vocab["<PAD>"])  # Pad targets too
    return padded_inputs, padded_targets

# Pretraining with MLE
num_epochs = 10  # Number of epochs
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in dataloader:  # Use the defined dataloader
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)  # Shape: (batch_size, seq_length, vocab_size)
        
        # Get batch size and sequence length from outputs
        batch_size, seq_len, vocab_size = outputs.shape
        
        # Flatten outputs and targets correctly
        outputs = outputs.view(-1, vocab_size)  # Flatten to (batch_size * seq_len, vocab_size)
        targets = targets.view(-1)  # Flatten to (batch_size * seq_len,)
        
        # Ensure that both outputs and targets have the same length after reshaping
        if outputs.shape[0] != targets.shape[0]:
            print(f"Shape mismatch: Outputs ({outputs.shape}) vs Targets ({targets.shape})")
            continue  # Skip this batch if shapes are not aligned
        
        # Compute MLE loss (CrossEntropyLoss)
        loss = criterion(outputs, targets)  # Apply loss function
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, MLE Loss: {total_loss}")


Shape mismatch: Outputs (torch.Size([24, 93])) vs Targets (torch.Size([248]))
Shape mismatch: Outputs (torch.Size([22, 93])) vs Targets (torch.Size([248]))
Shape mismatch: Outputs (torch.Size([9, 93])) vs Targets (torch.Size([124]))
Epoch 1, MLE Loss: 0
Shape mismatch: Outputs (torch.Size([22, 93])) vs Targets (torch.Size([248]))
Shape mismatch: Outputs (torch.Size([24, 93])) vs Targets (torch.Size([248]))
Shape mismatch: Outputs (torch.Size([8, 93])) vs Targets (torch.Size([124]))
Epoch 2, MLE Loss: 0
Shape mismatch: Outputs (torch.Size([22, 93])) vs Targets (torch.Size([248]))
Shape mismatch: Outputs (torch.Size([24, 93])) vs Targets (torch.Size([248]))
Shape mismatch: Outputs (torch.Size([9, 93])) vs Targets (torch.Size([124]))
Epoch 3, MLE Loss: 0
Shape mismatch: Outputs (torch.Size([24, 93])) vs Targets (torch.Size([248]))
Shape mismatch: Outputs (torch.Size([18, 93])) vs Targets (torch.Size([248]))
Shape mismatch: Outputs (torch.Size([11, 93])) vs Targets (torch.Size([124]))
Epoc