In [2]:
%store -r label_dict

In [3]:
# Model loading 
from transformers import BertForSequenceClassification

classification_model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config

def get_tokenier(special_tokens=None):
    tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2") #GPT2Tokenizer
    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)
        print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):

    #GPT2LMHeadModel
    if special_tokens:
        config = GPT2Config.from_pretrained("gpt2", 
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else: 
        config = GPT2Config.from_pretrained("gpt2",                                     
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)    

    #----------------------------------------------------------------#
    model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.from_pretrained(load_model_path)

    model.cuda()
    return model

In [5]:
from transformers import BertTokenizer
from torch.utils.data import Dataset, TensorDataset
import torch 

class_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [6]:
SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}

MAXLEN          = 128  #{768, 1024, 1280, 1600}

In [7]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)


Special tokens added


In [9]:
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='../models/final_model_2/')

In [11]:
classification_model.load_state_dict(torch.load('../models/finetuned_BERT2_epoch_5.model'))

<All keys matched successfully>

In [12]:
def classify_text(text):
    encoded_input = class_tokenizer.encode_plus(
        text,
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=256, 
        return_tensors='pt'
    )

    with torch.no_grad():
        output = classification_model(**encoded_input)

    logits = output.logits
    predicted_class = torch.argmax(logits, dim=1).item()  # Get the predicted class for the single input sequence
    return predicted_class

# Example usage
input_sentence = "AI models are great for meme generation."
predicted_class = classify_text(input_sentence)
predicted_class = list(label_dict.keys())[predicted_class]
print(f"Predicted class: {predicted_class}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Predicted class: Steve Jobs Says


# Top-p (Nucleus) Text Generation (10 Samples)

The provided code snippet demonstrates text generation using the GPT-2 model with the top-p (nucleus) sampling strategy to generate 10 text samples (`num_return_sequences=10`).

- **Model Generation (`model.generate()`):**
  - Generates text based on the input `generated` prompt using the specified model.
  - Utilizes top-p (nucleus) sampling (`do_sample=True`) to select tokens based on the cumulative probability distribution, controlling diversity and avoiding repetition in generated text.
  - Specifies minimum (`min_length`) and maximum (`max_length`) length constraints for generated text.
  - Configures parameters such as `top_k` (top k tokens to consider), `top_p` (cumulative probability threshold for nucleus sampling), `temperature` (sampling temperature), and `repetition_penalty` (penalization factor for token repetition).

- **Text Decoding and Output:**
  - Decodes the generated token sequences into readable text using the tokenizer (`tokenizer.decode()`), skipping special tokens.
  - Computes the length of the predicted class and joined keywords to adjust the starting point (`a`) for displaying generated text.
  - Prints each generated text sample (`text[a:]`) along with its corresponding index (`i+1`).

This text generation process leverages advanced sampling techniques and model settings to generate diverse and contextually relevant text outputs based on the provided input prompt.


In [15]:
from rake_nltk import Rake
rake = Rake()

def get_keywords(text):
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

In [17]:
# Custom caption dataset 
from torch.utils.data import Dataset

class CaptionDataset(Dataset):
    def __init__(self, data, tokenizer, randomize=False):
        template, caption, keywords  = [], [], []
        for temp, cap, kws in data:
            template.append(temp)
            caption.append(cap)
            keywords.append(kws)
        
        self.randomize = randomize    
        self.tokenizer = tokenizer
        self.template = template
        self.keywords = keywords
        self.caption = caption
    
    @staticmethod
    def join_keywords(keywords, randomize=False):
        N = len(keywords)

        #random sampling and shuffle
        if randomize: 
            M = random.choice(range(N+1))
            keywords = keywords[:M]
            random.shuffle(keywords)

        return ','.join(keywords)
        
    def __len__(self):
        return len(self.caption)

    def __getitem__(self, i):
        keywords = self.keywords[i].copy()
        kw = self.join_keywords(keywords)
        input = SPECIAL_TOKENS['bos_token'] + self.template[i] + \
                SPECIAL_TOKENS['sep_token'] + kw + SPECIAL_TOKENS['sep_token'] + \
                self.caption[i] + SPECIAL_TOKENS['eos_token']
        
        encodings_dict = self.tokenizer(input,
                                   truncation = True,
                                   max_length = MAX_LENGTH,
                                   padding="max_length")
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        return {
            "label": torch.tensor(input_ids),
            "input_ids": torch.tensor(input_ids),            
            "attention_mask": torch.tensor(attention_mask),
        }

In [24]:
from transformers import pipeline

keywords = get_keywords(input_sentence)
kw = CaptionDataset.join_keywords(keywords, randomize=False)

prompt = SPECIAL_TOKENS['bos_token'] + predicted_class + \
         SPECIAL_TOKENS['sep_token'] + kw + SPECIAL_TOKENS['sep_token']

generator = pipeline(task="text-generation", model = "../models/final_model_2/")
output = generator(prompt)[0]['generated_text']

# Extract the desired output
desired_output = output.split('<|SEP|>')[2].strip()
predicted_class = predicted_class.lower().replace(" ", "-")
print(desired_output)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I HAVE TO DO MEME GENERATION <sep> AI MODELS ARE GREAT


In [26]:
from PIL import Image, ImageDraw, ImageFont

def textsize(text, font):
    im = Image.new(mode="P", size=(0, 0))
    draw = ImageDraw.Draw(im)
    _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
    return width, height

# Load the image
image_file = f"../memes900k/images/{predicted_class}.jpg"
image = Image.open(image_file)

# Create a drawing object
draw = ImageDraw.Draw(image)

# Set the font and size for the text
font = ImageFont.truetype("impact.ttf", size=30)

# Define text and calculate sizes
text_parts = desired_output.split("<sep>")
top_text = text_parts[0].strip()
bottom_text = text_parts[1].strip()
padding = 3  # Padding around the text for the background
print(top_text)

top_text_width, top_text_height = textsize(top_text, font)
bottom_text_width, bottom_text_height = textsize(bottom_text, font)

lines = []

# Handle the top text
if top_text_width > image.width:
    current_line = ""
    for word in top_text.split():
        if textsize(current_line + word, font)[0] <= image.width:
            current_line += word + " "
        else:
            lines.append(current_line.strip())
            current_line = word + " "
    lines.append(current_line.strip())
    top_text_height = len(lines) * top_text_height
    top_x =  padding
else:
    top_x = (image.width - top_text_width) / 2 - padding


top_y = 5 - padding
# Draw the background rectangle for each line of the bottom text
y = top_y
for line in lines:
    line_width, line_height = textsize(line, font)
    draw.rectangle([top_x, y, top_x + line_width + 2 * padding, y + line_height + 2 * padding], fill="black")
    draw.text((top_x + padding, y + padding), line, font=font, fill=(255, 255, 255))
    y += line_height + padding

lines = []

# Handle the bottom text
if bottom_text_width > image.width:
    current_line = ""
    for word in bottom_text.split():
        if textsize(current_line + word, font)[0] <= image.width:
            current_line += word + " "
        else:
            lines.append(current_line.strip())
            current_line = word + " "
    lines.append(current_line.strip())
    bottom_text_height = len(lines) * bottom_text_height
    bottom_x = padding
else:
    bottom_x = padding

bottom_y = image.height - bottom_text_height - 20 - padding


# Draw the background rectangle for each line of the bottom text
y = bottom_y
for line in lines:
    line_width, line_height = textsize(line, font)
    draw.rectangle([bottom_x, y, bottom_x + line_width + 2 * padding, y + line_height + 2 * padding], fill="black")
    draw.text((bottom_x + padding, y + padding), line, font=font, fill=(255, 255, 255))
    y += line_height + padding

# Save the image
image.save(f"../generated_memes/{predicted_class}.jpg")


I HAVE TO DO MEME GENERATION
