In [1]:
#import all necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import torch
from torch.utils.data import Dataset,DataLoader
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from typing import List, Dict, Tuple


In [2]:
#Get the data into the workspace
# Single place to edit when the CSVs live somewhere other than this Drive folder.
DATA_DIR = '/content/drive/MyDrive/Few shot learning for categorization and recommendation'

# Product catalogue, indexed by product id.
Products_train = pd.read_csv(f'{DATA_DIR}/products_train.csv').set_index('id')

# Browsing sessions (prev_items / next_item are used further down).
Sessions_train = pd.read_csv(f'{DATA_DIR}/sessions_train.csv')

In [3]:
#drop author column
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed in place.
Products_train.drop(columns='author', inplace=True, errors='ignore')

In [4]:
#this function converts the prev_item in session data to the right format
def parse_list_string(s):
    """Turn the textual form of an item list into a list of item ids.

    Accepts the space-separated form (``"['A' 'B']"``, possibly wrapped over
    several lines) as well as the comma-separated form (``"['A', 'B']"``).

    Args:
        s: The raw cell value. ``None``/NaN yields an empty list; a value that
            is already a list or tuple is returned as a list, so applying the
            parser twice is harmless.

    Returns:
        List of item ids with brackets, quotes and empty tokens removed.
    """
    # Missing value (None or float NaN, which is the only float unequal to itself).
    if s is None or (isinstance(s, float) and s != s):
        return []
    # Already parsed.
    if isinstance(s, (list, tuple)):
        return list(s)

    # Remove the surrounding brackets; commas are treated like whitespace so
    # both separators split the same way.
    cleaned = s.strip().strip('[]').replace(',', ' ')
    # Split on any whitespace and remove the quotes around each id.
    items = (token.strip("'").strip('"') for token in cleaned.split())
    return [item for item in items if item]


In [5]:
#apply this function to the prev_item in the session column
# Only raw strings are parsed: entries that are already lists are left alone,
# so running this cell a second time does not fail on `.strip`.
Sessions_train['prev_items'] = Sessions_train['prev_items'].map(
    lambda raw: parse_list_string(raw) if isinstance(raw, str) else raw
)

The problem has two parts:
1. Categorise low-frequency products (maybe we can rank products from most interacted with to least interacted with, then..).
2. Using the session data, rank the products most likely to be interacted with next.

## Feature Preprocessing

## The class below prepares the data for training; it adds an arbitrary `category` column to the products by clustering them with k-means

In [6]:
class ProductDataset(Dataset):
    """Few-shot prompts for assigning rare products to k-means categories.

    Products seen at least ``min_count`` times in the sessions ("frequent") are
    clustered into pseudo-categories; every other catalogue product ("rare") is
    assigned to the nearest cluster. Each sample is a prompt made of a few
    labelled frequent products followed by one rare product whose category
    value is left blank, plus that category as the target.
    """

    # Number of most common brand / colour values that get their own one-hot
    # column; every other value (and missing values) shares one extra column.
    _TOP_LEVELS = 16
    # Seed shared by the clustering and the shot sampling so runs are repeatable.
    _SEED = 42
    # Rare products are sent through kmeans.predict in slices of this size.
    _PREDICT_CHUNK = 100_000

    def __init__(self,
                 sessions_df: pd.DataFrame,
                 product_df: pd.DataFrame,
                 tokenizer: "GPT2Tokenizer",
                 n_categories: int = 10,
                 max_length: int = 512,
                 n_shots: int = 3):
        """
        Dataset for few-shot product categorization

        Args:
            sessions_df: DataFrame with prev_items (list of ids) and next_item columns
            product_df: DataFrame indexed by product id with title, price,
                brand and color columns. A 'category' column is added to it
                in place.
            tokenizer: GPT2 tokenizer (must expose ``pad_token_id``)
            n_categories: Number of categories to generate
            max_length: Length every encoded sample is padded / cut to
            n_shots: Number of examples per category used in the prompt
        """
        self.sessions_df = sessions_df
        self.product_df = product_df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.n_shots = n_shots
        self.n_categories = n_categories

        # Process data and generate categories
        self._process_data()

    def _process_data(self):
        """Run the whole preparation pipeline and populate the instance."""
        # Count product occurrences
        self.product_counts = self._get_product_counts()
        self.frequent_products = self._get_frequent_products()
        self.rare_products = set(self.product_df.index) - self.frequent_products

        # Generate embeddings for every product, categories from the frequent ones
        self.product_embeddings = self._create_product_embeddings()
        self.product_categories = self._generate_categories()

        # Update product_df with categories (aligned on the product id index)
        self.product_df['category'] = pd.Series(self.product_categories)

        # Prepare few-shot examples
        self.shot_examples = self._prepare_shot_examples()

        # Prepare data for training
        self.data = self._prepare_data()

    def _get_product_counts(self) -> Dict[str, int]:
        """Count how often each product id appears in prev_items or next_item."""
        counts: Dict[str, int] = {}
        # Plain column iteration; iterrows() builds a Series per row and is far slower.
        sessions = zip(self.sessions_df['prev_items'], self.sessions_df['next_item'])
        for prev_items, next_item in sessions:
            for item in (*prev_items, next_item):
                counts[item] = counts.get(item, 0) + 1
        return counts

    def _get_frequent_products(self, min_count: int = 5) -> set:
        """Get the set of products that occur at least ``min_count`` times."""
        return {pid for pid, count in self.product_counts.items()
                if count >= min_count}

    def _one_hot_top_levels(self, column: pd.Series) -> np.ndarray:
        """One-hot encode ``column`` over its most common values.

        Returns an array of shape (len(column), k + 1) where k is at most
        ``_TOP_LEVELS``; the last column marks "any other value or missing".
        """
        top_levels = column.value_counts().index[:self._TOP_LEVELS]
        # Values outside `top_levels` (including NaN) get code -1 ...
        codes = pd.Categorical(column, categories=top_levels).codes
        one_hot = np.zeros((len(column), len(top_levels) + 1), dtype=np.float32)
        # ... and index -1 addresses the last column, i.e. the shared bucket.
        one_hot[np.arange(len(column)), codes] = 1.0
        return one_hot

    def _create_product_embeddings(self) -> Dict[str, np.ndarray]:
        """Create fixed-length embeddings from product attributes and session data.

        Every product gets ``[attributes, context]``:

        * attributes: price z-scored over the whole catalogue plus one-hot
          brand and colour (see ``_one_hot_top_levels``);
        * context: for frequent products, the mean attribute vector of all
          items in the sessions the product appears in; for all other
          products, a copy of their own attributes.

        All vectors share one length, so they can be stacked for clustering.
        The sessions are scanned once, not once per product.
        """
        # A repeated id would make the id -> row mapping ambiguous; keep the first row.
        catalog = self.product_df[~self.product_df.index.duplicated(keep='first')]
        if catalog.empty:
            return {}

        # Normalize the numerical feature across products. (Normalising a
        # single value on its own is 0/0.)
        price = pd.to_numeric(catalog['price'], errors='coerce').astype('float64')
        price = price.replace([np.inf, -np.inf], np.nan)
        spread = price.std(ddof=0)
        if not np.isfinite(spread) or spread == 0:
            spread = 1.0
        # Missing prices end up at the mean, i.e. 0 after scaling.
        price_z = ((price - price.mean()) / spread).fillna(0.0)

        attributes = np.column_stack([
            price_z.to_numpy(dtype=np.float32),
            self._one_hot_top_levels(catalog['brand']),
            self._one_hot_top_levels(catalog['color']),
        ])

        # Accumulate, per product, the attribute sum and item count of the
        # sessions it takes part in.
        row_of = {pid: pos for pos, pid in enumerate(catalog.index)}
        context = np.zeros_like(attributes)
        context_size = np.zeros(len(attributes), dtype=np.float64)
        sessions = zip(self.sessions_df['prev_items'], self.sessions_df['next_item'])
        for prev_items, next_item in sessions:
            rows = [row_of[pid] for pid in (*prev_items, next_item) if pid in row_of]
            if not rows:
                continue
            # Each session counts once per product even if the product repeats in it.
            members = list(set(rows))
            context[members] += attributes[rows].sum(axis=0)
            context_size[members] += len(rows)

        # Session information is only used for frequent products.
        frequent_rows = np.asarray(
            [row_of[pid] for pid in self.frequent_products if pid in row_of],
            dtype=np.intp,
        )
        use_context = np.zeros(len(attributes), dtype=bool)
        use_context[frequent_rows] = True
        use_context &= context_size > 0

        context[use_context] = context[use_context] / context_size[use_context, None]
        context[~use_context] = attributes[~use_context]

        embedding_matrix = np.hstack([attributes, context])
        return dict(zip(catalog.index, embedding_matrix))

    def _generate_categories(self) -> Dict[str, int]:
        """Cluster frequent products and assign rare ones to the nearest cluster.

        Returns an empty mapping when no frequent product has an embedding.
        """
        known = self.product_embeddings
        # Sorted so the clustering input does not depend on set iteration order.
        frequent = sorted((pid for pid in self.frequent_products if pid in known), key=str)
        if not frequent:
            return {}

        # KMeans cannot produce more clusters than it has samples.
        n_clusters = min(self.n_categories, len(frequent))
        kmeans = KMeans(n_clusters=n_clusters, random_state=self._SEED, n_init=10)
        labels = kmeans.fit_predict(np.stack([known[pid] for pid in frequent]))

        # Create category mapping for frequent products
        categories = {pid: int(label) for pid, label in zip(frequent, labels)}

        # Assign rare products to the nearest cluster centre, in slices so the
        # stacked matrix stays small.
        rare = sorted((pid for pid in self.rare_products if pid in known), key=str)
        for start in range(0, len(rare), self._PREDICT_CHUNK):
            chunk = rare[start:start + self._PREDICT_CHUNK]
            chunk_labels = kmeans.predict(np.stack([known[pid] for pid in chunk]))
            categories.update(
                (pid, int(label)) for pid, label in zip(chunk, chunk_labels)
            )

        return categories

    def _prepare_shot_examples(self) -> Dict[int, List[str]]:
        """Pick up to ``n_shots`` labelled frequent products per category.

        Sampling uses a generator seeded with ``_SEED`` over sorted candidates,
        so repeated calls return the same examples. Categories without any
        frequent product are omitted.
        """
        rng = np.random.default_rng(self._SEED)

        # Group the frequent products by category in a single pass.
        pools: Dict[int, list] = {}
        for pid, category in self.product_categories.items():
            if pid in self.frequent_products:
                pools.setdefault(category, []).append(pid)

        examples = {}
        for category in range(self.n_categories):
            pool = sorted(pools.get(category, []), key=str)
            if not pool:
                continue
            # Sample positions rather than ids so the ids keep their original type.
            picks = rng.choice(len(pool),
                               size=min(self.n_shots, len(pool)),
                               replace=False)
            examples[category] = [self._format_product_text(pool[i]) for i in picks]

        return examples

    def _format_product_text(self, product_id: str, include_category: bool = True) -> str:
        """Format product information as one line of prompt text.

        Args:
            product_id: Index label of the product in ``product_df``.
            include_category: When False the line ends with a bare
                ``Category:`` so the label is not revealed.
        """
        row = self.product_df.loc[product_id]
        # A repeated id makes .loc return a frame; use its first row.
        if isinstance(row, pd.DataFrame):
            row = row.iloc[0]
        text = (f"Product: {row['title']}, "
                f"Price: {row['price']}, "
                f"Brand: {row['brand']}, "
                f"Color: {row['color']}, ")
        if include_category:
            return text + f"Category: {self.product_categories[product_id]}"
        return text + "Category:"

    def _prepare_data(self) -> List[Dict]:
        """Build one prompt per rare product that received a category.

        Each record holds the product id, the prompt text (all shot examples
        followed by the unlabelled target line) and the target category.
        """
        # The shot block is identical for every sample, so build it once.
        shot_lines = [line
                      for category in self.shot_examples
                      for line in self.shot_examples[category]]

        data = []
        for product_id in sorted(self.rare_products, key=str):
            if product_id not in self.product_categories:
                continue

            # The target line must not contain the label the model is asked for.
            target_text = self._format_product_text(product_id, include_category=False)

            data.append({
                'product_id': product_id,
                'text': "\n".join(shot_lines + [target_text]),
                'category': self.product_categories[product_id]
            })

        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample ``idx`` to exactly ``max_length`` token ids.

        Over-long prompts are cut from the left: the target product is the
        last line of the prompt and has to survive truncation. Shorter prompts
        are padded on the right with the tokenizer's pad token.

        Raises:
            ValueError: if padding is needed but the tokenizer has no pad token.
        """
        item = self.data[idx]

        token_ids = list(self.tokenizer(item['text'])['input_ids'])
        # Keep the tail (max() also covers max_length == 0, where [-0:] would keep everything).
        token_ids = token_ids[max(len(token_ids) - self.max_length, 0):]

        n_pad = self.max_length - len(token_ids)
        pad_id = self.tokenizer.pad_token_id
        if n_pad > 0 and pad_id is None:
            raise ValueError("tokenizer needs a pad token to pad samples to max_length")

        return {
            'product_id': item['product_id'],
            'input_ids': torch.tensor(token_ids + [pad_id] * n_pad, dtype=torch.long),
            'attention_mask': torch.tensor([1] * len(token_ids) + [0] * n_pad,
                                           dtype=torch.long),
            'category': torch.tensor(item['category'], dtype=torch.long)
        }

In [7]:

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# ProductDataset pads every sample to max_length, which needs a pad token;
# the end-of-sequence token is reused for that.
# NOTE(review): presumably the 'gpt2' tokenizer has no pad token of its own - confirm.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Create dataset
dataset = ProductDataset(Sessions_train, Products_train, tokenizer)

# Create dataloader
# A seeded generator makes the shuffled batch order repeatable across runs.
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    generator=torch.Generator().manual_seed(42)
)

## The above code runs forever; check suggestions.txt