The dataset zip file path present in google drive in zip_file_path variable. Output directory path in output_directory variable.

In [2]:
zip_file_path = "/content/drive/MyDrive/GenAI/harrison.zip"
output_directory = "/content/dataset"

### 1. Mount Google Drive

First, you need to mount your Google Drive to make its contents accessible in Colab. This will prompt you to authorize Colab to access your Google Drive files.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re

Extract the dataset

In [3]:
import os
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Unzip the file using the shell command
# You can also use Python's 'zipfile' module if preferred, as shown in previous examples.
!unzip -o {zip_file_path} -d {output_directory}

print(f"Attempted to unzip '{zip_file_path}' to '{output_directory}'")

Archive:  /content/drive/MyDrive/GenAI/harrison.zip
  inflating: /content/dataset/harrison_features.npz  
  inflating: /content/dataset/tag_list.txt  
Attempted to unzip '/content/drive/MyDrive/GenAI/harrison.zip' to '/content/dataset'


## Load the dataset in a dataframe

In [7]:
import numpy as np

npz_file_path = output_directory + '/harrison_features.npz'
# Load the .npz file
with np.load(npz_file_path) as data:
    # Use 'imagenet_fc_layers' as the image features based on inspection of available keys
    if 'imagenet_fc_layers' in data.files:
        image_features = data['imagenet_fc_layers']
        print(f"Successfully loaded image features with shape: {image_features.shape}")
    else:
        print(f"Error: 'imagenet_fc_layers' not found in {npz_file_path}. Available keys: {data.files}")
        image_features = None

Successfully loaded image features with shape: (57383, 2048)


In [8]:
tag_list_file_path = output_directory + '/tag_list.txt'

# Load hashtags from the text file
with open(tag_list_file_path, 'r') as f:
    hashtags_list = [line.strip() for line in f if line.strip()]

print(f"Successfully loaded {len(hashtags_list)} hashtags.")
# Display the first 5 hashtags to verify
print("First 5 hashtags:", hashtags_list[:5])

Successfully loaded 57383 hashtags.
First 5 hashtags: ['sea instapic instagram trip travel', 'sea', 'sea love', 'beach sea trip island japan', 'sun sand sea sky friend beach thailand trip adventure']


In [9]:
import pandas as pd

# Create a DataFrame from the loaded features and hashtags
df_combined = pd.DataFrame({
    'features': list(image_features),
    'hashtags': hashtags_list
})

# Limit the DataFrame to the first 10000 samples to save training time
df = df_combined.head(10000)

print(f"Combined DataFrame created with shape: {df_combined.shape}")
print(f"Limited DataFrame created with shape: {df.shape}")
print("Limited DataFrame Head:")
print(df.head())

Combined DataFrame created with shape: (57383, 2)
Limited DataFrame created with shape: (10000, 2)
Limited DataFrame Head:
                                            features  \
0  [0.8123088, 1.5523993, 0.0, 2.5534768, 1.76293...   
1  [0.12911989, 0.0, 0.16080879, 0.18392688, 0.09...   
2  [0.42463303, 0.15550624, 0.014838985, 0.086087...   
3  [0.05410369, 0.32260185, 0.16083369, 0.3008105...   
4  [0.027175432, 1.572316, 0.23366477, 1.017643, ...   

                                            hashtags  
0                 sea instapic instagram trip travel  
1                                                sea  
2                                           sea love  
3                        beach sea trip island japan  
4  sun sand sea sky friend beach thailand trip ad...  


In [10]:
# Clean hashtags
def clean_hashtags(text):
    text = text.lower()
    text = re.sub(r'#', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.strip()

df['hashtags'] = df['hashtags'].apply(clean_hashtags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtags'] = df['hashtags'].apply(clean_hashtags)


In [11]:
# Convert string features to numpy arrays if stored as text
if isinstance(df.iloc[0]['features'], str):
    df['features'] = df['features'].apply(lambda x: np.fromstring(x.strip("[]"), sep=','))

In [12]:
# ===============================================================
# 2️⃣ Preprocess Text and Build Vocabulary
# ===============================================================

from collections import Counter

hashtags_list = [h.split() for h in df['hashtags']]
all_tags = [t for tags in hashtags_list for t in tags]
counter = Counter(all_tags)

# Keep top-K hashtags (reduce vocab size)
TOP_K = 5000
most_common = counter.most_common(TOP_K)
vocab = ['<pad>', '<start>', '<end>', '<unk>'] + [w for w, _ in most_common]
word2idx = {w:i for i,w in enumerate(vocab)}
idx2word = {i:w for w,i in word2idx.items()}

In [13]:
def encode_tags(tags):
    ids = [word2idx.get(w, word2idx['<unk>']) for w in tags]
    return [word2idx['<start>']] + ids + [word2idx['<end>']]

def pad_sequence(seq, max_len=10):
    seq = seq[:max_len]
    seq += [word2idx['<pad>']] * (max_len - len(seq))
    return seq

# Prepare sequences
# First, split the strings in 'hashtags' column into lists of tags
df['hashtags_split'] = df['hashtags'].apply(lambda x: x.split())

# Now apply encode_tags to the newly created 'hashtags_split' column
df['encoded'] = df['hashtags_split'].apply(encode_tags)

MAX_LEN = 10
df['padded'] = df['encoded'].apply(lambda x: pad_sequence(x, MAX_LEN))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hashtags_split'] = df['hashtags'].apply(lambda x: x.split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['encoded'] = df['hashtags_split'].apply(encode_tags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['padded'] = df['encoded'].apply(lambda x: pad_sequence(x, MAX_LEN))


In [14]:
# ===============================================================
# 3️⃣ Create Dataset Class
# ===============================================================

class HashtagDataset(Dataset):
    def __init__(self, df):
        self.X = np.stack(df['features'].to_numpy())
        self.y = np.stack(df['padded'].to_numpy())

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        feat = torch.tensor(self.X[idx], dtype=torch.float32)
        seq = torch.tensor(self.y[idx], dtype=torch.long)
        return feat, seq

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_data = HashtagDataset(train_df)
val_data = HashtagDataset(val_df)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

In [None]:
df

Unnamed: 0,features,hashtags,hashtags_split,encoded,padded
0,"[0.8123088, 1.5523993, 0.0, 2.5534768, 1.76293...",sea instapic instagram trip travel,"[sea, instapic, instagram, trip, travel]","[1, 10, 73, 37, 101, 43, 2]","[1, 10, 73, 37, 101, 43, 2, 0, 0, 0]"
1,"[0.12911989, 0.0, 0.16080879, 0.18392688, 0.09...",sea,[sea],"[1, 10, 2]","[1, 10, 2, 0, 0, 0, 0, 0, 0, 0]"
2,"[0.42463303, 0.15550624, 0.014838985, 0.086087...",sea love,"[sea, love]","[1, 10, 5, 2]","[1, 10, 5, 2, 0, 0, 0, 0, 0, 0]"
3,"[0.05410369, 0.32260185, 0.16083369, 0.3008105...",beach sea trip island japan,"[beach, sea, trip, island, japan]","[1, 20, 10, 101, 245, 168, 2]","[1, 20, 10, 101, 245, 168, 2, 0, 0, 0]"
4,"[0.027175432, 1.572316, 0.23366477, 1.017643, ...",sun sand sea sky friend beach thailand trip ad...,"[sun, sand, sea, sky, friend, beach, thailand,...","[1, 24, 116, 10, 46, 4, 20, 260, 101, 195, 2]","[1, 24, 116, 10, 46, 4, 20, 260, 101, 195]"
...,...,...,...,...,...
9995,"[1.2312934, 0.01216483, 0.15034737, 0.06016748...",yellow flower,"[yellow, flower]","[1, 8, 17, 2]","[1, 8, 17, 2, 0, 0, 0, 0, 0, 0]"
9996,"[0.97869843, 0.36804786, 0.1355541, 0.5787414,...",yellow green beautiful tagsta fashiondiaries f...,"[yellow, green, beautiful, tagsta, fashiondiar...","[1, 8, 40, 18, 884, 840, 877, 624, 898, 2]","[1, 8, 40, 18, 884, 840, 877, 624, 898, 2]"
9997,"[0.05007997, 0.010740085, 1.0760548, 0.377246,...",summer happy yellow,"[summer, happy, yellow]","[1, 52, 14, 8, 2]","[1, 52, 14, 8, 2, 0, 0, 0, 0, 0]"
9998,"[0.4592604, 0.0, 0.0, 0.0, 0.024754986, 0.2424...",photo nature yellow,"[photo, nature, yellow]","[1, 62, 21, 8, 2]","[1, 62, 21, 8, 2, 0, 0, 0, 0, 0]"


In [16]:
# ===============================================================
# 4️⃣ Define Generative Model
# ===============================================================

class TinyHashtagGenerator(nn.Module):
    def __init__(self, feature_dim, vocab_size, embed_dim=128, hidden_dim=128):
        super().__init__()
        self.feature_proj = nn.Linear(feature_dim, embed_dim)
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, features, captions):
        # project features and treat them as first token
        img_embed = self.feature_proj(features).unsqueeze(1)
        cap_embed = self.embed(captions[:, :-1])
        inputs = torch.cat([img_embed, cap_embed], dim=1)
        output, _ = self.gru(inputs)
        logits = self.fc(output)
        return logits

# Get feature dimension automatically
feature_dim = len(df.iloc[0]['features'])
vocab_size = len(vocab)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TinyHashtagGenerator(feature_dim, vocab_size).to(device)

## Training

In [28]:
# ===============================================================
# 5️⃣ Training Setup
# ===============================================================

criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=1e-3)
EPOCHS = 30

In [29]:
# ===============================================================
# 6️⃣ Train the Model
# ===============================================================

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for feats, seqs in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        feats, seqs = feats.to(device), seqs.to(device)
        logits = model(feats, seqs)
        loss = criterion(logits.reshape(-1, vocab_size), seqs.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Train Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/30: 100%|██████████| 141/141 [00:05<00:00, 25.66it/s]


Epoch 1 Train Loss: 0.9378


Epoch 2/30: 100%|██████████| 141/141 [00:05<00:00, 25.38it/s]


Epoch 2 Train Loss: 0.8907


Epoch 3/30: 100%|██████████| 141/141 [00:06<00:00, 21.73it/s]


Epoch 3 Train Loss: 0.8634


Epoch 4/30: 100%|██████████| 141/141 [00:05<00:00, 26.69it/s]


Epoch 4 Train Loss: 0.8354


Epoch 5/30: 100%|██████████| 141/141 [00:06<00:00, 20.74it/s]


Epoch 5 Train Loss: 0.8220


Epoch 6/30: 100%|██████████| 141/141 [00:05<00:00, 26.83it/s]


Epoch 6 Train Loss: 0.8201


Epoch 7/30: 100%|██████████| 141/141 [00:05<00:00, 23.80it/s]


Epoch 7 Train Loss: 0.8221


Epoch 8/30: 100%|██████████| 141/141 [00:06<00:00, 23.08it/s]


Epoch 8 Train Loss: 0.7995


Epoch 9/30: 100%|██████████| 141/141 [00:05<00:00, 26.51it/s]


Epoch 9 Train Loss: 0.7667


Epoch 10/30: 100%|██████████| 141/141 [00:06<00:00, 20.76it/s]


Epoch 10 Train Loss: 0.7431


Epoch 11/30: 100%|██████████| 141/141 [00:05<00:00, 26.66it/s]


Epoch 11 Train Loss: 0.7336


Epoch 12/30: 100%|██████████| 141/141 [00:06<00:00, 21.47it/s]


Epoch 12 Train Loss: 0.7458


Epoch 13/30: 100%|██████████| 141/141 [00:05<00:00, 24.51it/s]


Epoch 13 Train Loss: 0.7144


Epoch 14/30: 100%|██████████| 141/141 [00:05<00:00, 25.36it/s]


Epoch 14 Train Loss: 0.6880


Epoch 15/30: 100%|██████████| 141/141 [00:06<00:00, 21.04it/s]


Epoch 15 Train Loss: 0.6819


Epoch 16/30: 100%|██████████| 141/141 [00:05<00:00, 26.39it/s]


Epoch 16 Train Loss: 0.7154


Epoch 17/30: 100%|██████████| 141/141 [00:06<00:00, 20.54it/s]


Epoch 17 Train Loss: 0.6741


Epoch 18/30: 100%|██████████| 141/141 [00:05<00:00, 25.03it/s]


Epoch 18 Train Loss: 0.6643


Epoch 19/30: 100%|██████████| 141/141 [00:06<00:00, 21.87it/s]


Epoch 19 Train Loss: 0.6597


Epoch 20/30: 100%|██████████| 141/141 [00:06<00:00, 23.11it/s]


Epoch 20 Train Loss: 0.6583


Epoch 21/30: 100%|██████████| 141/141 [00:05<00:00, 25.16it/s]


Epoch 21 Train Loss: 0.6280


Epoch 22/30: 100%|██████████| 141/141 [00:06<00:00, 20.84it/s]


Epoch 22 Train Loss: 0.6376


Epoch 23/30: 100%|██████████| 141/141 [00:05<00:00, 26.01it/s]


Epoch 23 Train Loss: 0.6011


Epoch 24/30: 100%|██████████| 141/141 [00:06<00:00, 21.01it/s]


Epoch 24 Train Loss: 0.5707


Epoch 25/30: 100%|██████████| 141/141 [00:05<00:00, 26.71it/s]


Epoch 25 Train Loss: 0.5475


Epoch 26/30: 100%|██████████| 141/141 [00:05<00:00, 23.76it/s]


Epoch 26 Train Loss: 0.5344


Epoch 27/30: 100%|██████████| 141/141 [00:06<00:00, 23.06it/s]


Epoch 27 Train Loss: 0.5400


Epoch 28/30: 100%|██████████| 141/141 [00:05<00:00, 26.29it/s]


Epoch 28 Train Loss: 0.5671


Epoch 29/30: 100%|██████████| 141/141 [00:06<00:00, 20.50it/s]


Epoch 29 Train Loss: 0.5814


Epoch 30/30: 100%|██████████| 141/141 [00:05<00:00, 26.16it/s]

Epoch 30 Train Loss: 0.5573





In [21]:
# Save model weights
torch.save(model.state_dict(), "tiny_hashtag_generator.pth")
print("✅ Model weights saved as 'tiny_hashtag_generator.pth'")


✅ Model weights saved as 'tiny_hashtag_generator.pth'


In [22]:
import torch
import torch.nn as nn

# ===============================================================
# 7️⃣ Hashtag Generation Function
# ===============================================================

def generate_hashtags(model, feature, max_len=5, temperature=1.0):
    model.eval()
    with torch.no_grad():
        feature_tensor = torch.tensor(feature, dtype=torch.float32).unsqueeze(0).to(device) # Shape (1, feature_dim)

        # Project the image features once, and ensure it's (1, 1, embed_dim) for concatenation
        initial_img_embed_for_cat = model.feature_proj(feature_tensor).unsqueeze(1) # Shape (1, 1, embed_dim)

        # Start generating with the <start> token
        current_generated_tokens = torch.tensor([[word2idx['<start>']]], dtype=torch.long).to(device) # Shape (1, 1)
        hidden = None # Initial hidden state for GRU
        tags = []

        for i in range(max_len):
            # Get the embedding for the last generated token
            # This will be <start> in the first iteration
            current_token_embed = model.embed(current_generated_tokens[:, -1]).unsqueeze(1) # Shape (1, 1, embed_dim)

            if i == 0: # First step of generation
                # The GRU input for the first step is image context followed by <start> token embedding
                # This mirrors the `torch.cat([img_embed, cap_embed], dim=1)` from `forward`
                gru_input_sequence = torch.cat([initial_img_embed_for_cat, current_token_embed], dim=1) # Shape (1, 2, embed_dim)
            else: # Subsequent steps
                # For autoregressive generation after the first step, feed only the current token embedding
                # and rely on the hidden state.
                gru_input_sequence = current_token_embed # Shape (1, 1, embed_dim)

            # Pass through GRU. The hidden state will be updated and passed to the next iteration.
            output, hidden = model.gru(gru_input_sequence, hidden) # output shape (1, seq_len_of_gru_input, hidden_dim)

            # Get logits from the output corresponding to the *last* token fed to GRU in this step
            logits = model.fc(output[:, -1, :]) / temperature
            probs = torch.softmax(logits, dim=-1)
            next_token_idx = torch.multinomial(probs, 1).item()

            if idx2word[next_token_idx] == '<end>':
                break

            tags.append(idx2word[next_token_idx])
            # Append the newly generated token to `current_generated_tokens` for the next iteration
            current_generated_tokens = torch.cat([current_generated_tokens, torch.tensor([[next_token_idx]]).to(device)], dim=1)

        return ['#' + t for t in tags]

In [33]:
# ===============================================================
# 8️⃣ Test on a Random Sample
# ===============================================================

sample_feat = df.iloc[0]['features']
predicted_tags = generate_hashtags(model, sample_feat)
print("Generated Hashtags:", predicted_tags)
print("Original Hashtags:", df.iloc[0]['hashtags'])

Generated Hashtags: ['#relax', '#friend', '#colour', '#instasize']
Original Hashtags: sea instapic instagram trip travel


### Test on Multiple Samples

This code block will iterate through a few samples from the `df` DataFrame, generate hashtags for each, and print both the original and generated tags for comparison.

In [34]:
# Test on a few more samples
# You can adjust the range (e.g., from 1 to 6 for 5 samples) to test more or fewer entries
for i in range(10200, 10205): # Testing 5 samples (index 1001 through 1005)
    sample_feat = df_combined.iloc[i]['features']
    predicted_tags = generate_hashtags(model, sample_feat)
    print(f"\n--- Sample {i+1} ---")
    print("Generated Hashtags:", predicted_tags)
    print("Original Hashtags:", df_combined.iloc[i]['hashtags'])


--- Sample 10201 ---
Generated Hashtags: ['#blossom', '#pink', '#yellow']
Original Hashtags: flower nature instanature instapic natural picoftheday spring yellow trip travel

--- Sample 10202 ---
Generated Hashtags: ['#family', '#familytime', '#cool']
Original Hashtags: yellow followme smile likeforlike

--- Sample 10203 ---
Generated Hashtags: ['#yellow']
Original Hashtags: nikon yellow

--- Sample 10204 ---
Generated Hashtags: ['#flower', '#yellow', '#mood', '#morning', '#saturday']
Original Hashtags: spring flower nature naturelovers yellow

--- Sample 10205 ---
Generated Hashtags: ['#nyc', '#music', '#handsome', '#boyfriend']
Original Hashtags: black white blue yellow fun
