In [None]:
# First upgrade transformers
!pip install --upgrade transformers

# Then modify the import line
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW  # Get AdamW from PyTorch instead



In [None]:

# Install the notebook's third-party dependencies quietly (IPython shell magic).
!pip install -q transformers pandas numpy scikit-learn nltk torch faiss-cpu gradio matplotlib sentence-transformers
import os
# Clone the ESCI dataset repository only when the target directory is missing,
# so re-running the cell does not attempt a second clone.
if not os.path.exists('/content/esci-data'):
    !git clone https://github.com/amazon-science/esci-data.git

# [2] Imports
#  [2] Corrected Imports
import pandas as pd
import numpy as np
import torch
import nltk
import re
import faiss
import gradio as gr
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from transformers import BertTokenizer, BertModel  # Modified import
from torch.optim import AdamW  # Correct import location
from torch.utils.data import Dataset, DataLoader
nltk.download('stopwords')
nltk.download('wordnet')

# [3] Data Preprocessing
class TextPreprocessor:
    """Normalize free text: lowercase, keep letters only, drop stop words, lemmatize."""

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return the cleaned, space-joined tokens of ``text``.

        Args:
            text: Raw text. ``None`` and float NaN (how pandas represents a
                missing string) are treated as empty text; any other
                non-string value is converted with ``str()``.

        Returns:
            A string of lowercase, lemmatized tokens separated by single
            spaces; ``''`` when nothing survives cleaning.
        """
        # `text != text` is true only for NaN. Without this guard a missing
        # product description raises
        # "AttributeError: 'float' object has no attribute 'lower'".
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # str.split() already collapses whitespace runs and trims the ends.
        tokens = [self.lemmatizer.lemmatize(token) for token in text.split()
                  if token not in self.stop_words]
        return ' '.join(tokens)

# Load and preprocess data
def load_data():
    """Load the ESCI examples and products, join them, and add model-ready columns.

    Returns:
        A DataFrame with one row per (query, product) judgment and the extra
        columns ``product_text``, ``processed_query``, ``processed_product``
        and ``relevance`` (E=3, S=2, C=1, I=0).
    """
    dataset_dir = '/content/esci-data/shopping_queries_dataset'
    products = pd.read_parquet(f'{dataset_dir}/shopping_queries_dataset_products.parquet')
    examples = pd.read_parquet(f'{dataset_dir}/shopping_queries_dataset_examples.parquet')

    # Join on locale as well as product id: the dataset's README joins the
    # two tables on both keys, and joining on product_id alone pairs a
    # judgment with the product rows of other locales.
    df = pd.merge(examples, products, on=['product_locale', 'product_id'])

    # A missing title or description is NaN, and `str + NaN` is NaN, which
    # later crashes the text cleaner. Treat missing parts as empty strings.
    df['product_text'] = (
        df['product_title'].fillna('') + ' ' + df['product_description'].fillna('')
    )

    preprocessor = TextPreprocessor()
    df['processed_query'] = df['query'].fillna('').apply(preprocessor.clean_text)
    df['processed_product'] = df['product_text'].apply(preprocessor.clean_text)
    df['relevance'] = df['esci_label'].map({'E':3, 'S':2, 'C':1, 'I':0})

    return df

# Load and preprocess the whole dataset once; later cells reuse `full_data`.
full_data = load_data()

#  [4] Dataset Splitting
# Hold out 30% of the rows, then halve the hold-out into validation and test
# (70/15/15 overall). The split is by row, so rows of one query can land in
# different splits.
train_data, temp_data = train_test_split(full_data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

#  [5] BERT Model Setup
class ProductSearchDataset(Dataset):
    """Torch dataset yielding tokenized (query, product) pairs with a relevance label."""

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text to fixed length and drop the leading batch axis."""
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

    def __getitem__(self, idx):
        """Return the tokenized query, tokenized product and float label of row ``idx``."""
        record = self.data.iloc[idx]
        sample = {}
        sample['query_input'] = self._encode(record['processed_query'])
        sample['product_input'] = self._encode(record['processed_product'])
        sample['label'] = torch.tensor(record['relevance'], dtype=torch.float)
        return sample

class BertRanker(torch.nn.Module):
    """Score a (query, product) pair with one shared BERT encoder.

    The [CLS] vectors of the query and the product are concatenated and
    mapped to a single relevance score by a linear layer.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Read the width from the loaded config rather than hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.classifier = torch.nn.Linear(hidden_size * 2, 1)  # Concatenated features

    def forward(self, query_input, product_input):
        """Return one score per pair as a tensor of shape ``(batch,)``.

        Args:
            query_input: Keyword arguments for the encoder (tokenizer output)
                describing the queries.
            product_input: Same, for the products.
        """
        query_out = self.bert(**query_input).last_hidden_state[:, 0, :]
        product_out = self.bert(**product_input).last_hidden_state[:, 0, :]
        combined = torch.cat((query_out, product_out), dim=1)
        # Squeeze only the feature axis. A bare squeeze() also removes the
        # batch axis when the batch holds a single pair, producing a 0-d
        # tensor that no longer matches the (1,)-shaped labels.
        return self.classifier(combined).squeeze(-1)

#  [6] Training Configuration
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Tokenizer and ranker share the same pretrained checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertRanker().to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training batches: 32 shuffled (query, product) pairs per step.
train_dataset = ProductSearchDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

#  [7] Training Loop with Validation
def train_model(num_epochs=3, val_batch_size=64):
    """Train the global ``model`` and report train/validation MSE per epoch.

    Uses the module-level ``model``, ``optimizer``, ``tokenizer``, ``device``,
    ``train_loader`` and ``val_data``. Plots both loss curves at the end.

    Args:
        num_epochs: Number of passes over ``train_loader``.
        val_batch_size: Batch size used for the validation pass.
    """
    train_losses = []
    val_losses = []

    # Build the loss modules once instead of once per step.
    batch_mse = torch.nn.MSELoss()
    summed_se = torch.nn.MSELoss(reduction='sum')

    # Validate through the same dataset class as training: inputs are
    # truncated to the model's length limit (untruncated long product texts
    # exceed BERT's position embeddings), labels are float tensors like the
    # predictions, and rows are processed in batches rather than one by one.
    val_loader = DataLoader(
        ProductSearchDataset(val_data, tokenizer),
        batch_size=val_batch_size,
        shuffle=False,
    )

    for epoch in range(num_epochs):
        # Training
        model.train()
        epoch_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()

            query_input = {k: v.to(device) for k, v in batch['query_input'].items()}
            product_input = {k: v.to(device) for k, v in batch['product_input'].items()}
            labels = batch['label'].to(device).reshape(-1)

            # reshape(-1) keeps predictions 1-D even for a final batch of one.
            outputs = model(query_input, product_input).reshape(-1)
            loss = batch_mse(outputs, labels)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        avg_train_loss = epoch_loss / max(len(train_loader), 1)
        train_losses.append(avg_train_loss)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                query_input = {k: v.to(device) for k, v in batch['query_input'].items()}
                product_input = {k: v.to(device) for k, v in batch['product_input'].items()}
                labels = batch['label'].to(device).reshape(-1)

                outputs = model(query_input, product_input).reshape(-1)
                # Sum squared errors so the result is a per-row mean.
                val_loss += summed_se(outputs, labels).item()

        avg_val_loss = val_loss / max(len(val_data), 1)
        val_losses.append(avg_val_loss)

        print(f'Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Plot training curves
    plt.plot(train_losses, label='Train')
    plt.plot(val_losses, label='Validation')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    plt.legend()
    plt.show()

train_model()

#  [8] Evaluation Metrics
def evaluate_model(test_df, k=10, max_length=128):
    """Score every row of ``test_df`` and compute per-query ranking metrics.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        test_df: Rows with ``query``, ``processed_query``,
            ``processed_product`` and ``relevance`` columns.
        k: Rank cut-off for NDCG and precision.
        max_length: Token limit applied to each input.

    Returns:
        Dict with keys ``'NDCG@10'`` and ``'Precision@10'`` (both averaged
        over queries, using cut-off ``k``) and ``'Average Precision'``.
        Values are NaN when they cannot be computed.
    """
    model.eval()
    scored_by_query = {}
    all_labels = []

    with torch.no_grad():
        for _, row in test_df.iterrows():
            # Truncate: untruncated long product texts exceed the encoder's
            # maximum sequence length.
            query_input = tokenizer(row['processed_query'], truncation=True,
                                    max_length=max_length, return_tensors='pt').to(device)
            product_input = tokenizer(row['processed_product'], truncation=True,
                                      max_length=max_length, return_tensors='pt').to(device)
            score = model(query_input, product_input).cpu().item()
            scored_by_query.setdefault(row['query'], []).append((score, row['relevance']))
            all_labels.append(row['relevance'])

    # Ranking metrics are defined per query. Ranking every (query, product)
    # pair of the test set in one list compares scores across unrelated
    # queries.
    ndcg_values = []
    precision_values = []
    for pairs in scored_by_query.values():
        if len(pairs) > 1:
            # scikit-learn rejects NDCG for a single candidate document.
            scores = [score for score, _ in pairs]
            labels = [label for _, label in pairs]
            ndcg_values.append(ndcg_score([labels], [scores], k=k))
        top = sorted(pairs, key=lambda pair: pair[0], reverse=True)[:k]
        # Divide by the candidates actually ranked, so a query with fewer
        # than k candidates is not penalized for the missing slots.
        precision_values.append(sum(label >= 2 for _, label in top) / len(top))

    nan = float('nan')
    return {
        'NDCG@10': float(np.mean(ndcg_values)) if ndcg_values else nan,
        'Precision@10': float(np.mean(precision_values)) if precision_values else nan,
        # NOTE(review): despite the key name, this is the mean relevance
        # label of the evaluated rows and does not depend on model scores.
        'Average Precision': sum(all_labels) / len(all_labels) if all_labels else nan
    }

print("Test Metrics:", evaluate_model(test_data))

#  [9] FAISS Indexing for Fast Retrieval
def create_faiss_index(products_df, batch_size=64, max_length=128):
    """Build an inner-product FAISS index over L2-normalized product embeddings.

    Row ``i`` of the index corresponds to row ``i`` of ``products_df``. Uses
    the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        products_df: DataFrame with a ``processed_product`` column.
        batch_size: Number of products encoded per forward pass.
        max_length: Token limit applied to each product text.

    Returns:
        A ``faiss.IndexFlatIP``; empty when ``products_df`` has no rows.
    """
    model.eval()
    index = faiss.IndexFlatIP(model.bert.config.hidden_size)

    texts = products_df['processed_product'].tolist()
    if not texts:
        # np.concatenate raises on an empty list; return the empty index.
        return index

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # Encode in padded batches and truncate, so long texts cannot
            # exceed the encoder's maximum sequence length.
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            ).to(device)
            cls_vectors = model.bert(**inputs).last_hidden_state[:, 0, :]
            chunks.append(cls_vectors.cpu().numpy())

    # FAISS operates on contiguous float32 arrays and normalizes in place.
    product_embeddings = np.ascontiguousarray(np.concatenate(chunks), dtype=np.float32)
    faiss.normalize_L2(product_embeddings)
    index.add(product_embeddings)
    return index

faiss_index = create_faiss_index(full_data)

#  [10] Gradio Interface
def semantic_search(query, top_k=10, max_length=128):
    """Return the ``top_k`` nearest products for a free-text query.

    Uses the module-level ``model``, ``tokenizer``, ``device``,
    ``faiss_index`` and ``full_data``.

    Args:
        query: Raw query text.
        top_k: Maximum number of results.
        max_length: Token limit applied to the query.

    Returns:
        A list of dicts with ``'Title'``, ``'Description'`` and
        ``'Relevance Score'`` keys, best match first.
    """
    # Preprocess query
    preprocessor = TextPreprocessor()
    processed_query = preprocessor.clean_text(query)

    # Get query embedding
    inputs = tokenizer(processed_query, truncation=True, max_length=max_length,
                       return_tensors='pt').to(device)
    with torch.no_grad():
        query_emb = model.bert(**inputs).last_hidden_state[:, 0, :].cpu().numpy()

    # Search FAISS index
    faiss.normalize_L2(query_emb)
    distances, indices = faiss_index.search(query_emb, top_k)

    # Format results
    results = []
    for idx, score in zip(indices[0], distances[0]):
        if idx < 0:
            # FAISS pads with -1 when the index holds fewer than top_k
            # vectors; iloc[-1] would silently return the last row.
            continue
        product = full_data.iloc[idx]
        # Missing fields are NaN floats, which cannot be sliced.
        title = product['product_title']
        description = product['product_description']
        if not isinstance(title, str):
            title = ''
        if not isinstance(description, str):
            description = ''
        results.append({
            'Title': title,
            'Description': description[:200] + '...',
            'Relevance Score': f"{score:.3f}"
        })

    return results

# Launch interface
# One text box in, JSON list of results out; `examples` pre-fills sample queries.
iface = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your product search query"),
    outputs=gr.JSON(label="Top 10 Products"),
    title="Semantic Product Search Engine",
    examples=["Wireless Bluetooth Headphones", "Organic Cotton T-Shirts"]
)

# share=True requests a publicly reachable link in addition to the local one.
iface.launch(share=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


AttributeError: 'float' object has no attribute 'lower'

In [None]:
#  [markdown]
# ## 1. Environment Setup & Data Loading
# !pip install -q transformers datasets sentence-transformers gradio pandas nltk torchmetrics scikit-learn umap-learn plotly
# !apt install git-lfs

# %%
import pandas as pd
import numpy as np
import torch
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import os
import plotly.express as px
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
import gradio as gr
from umap import UMAP

nltk.download('stopwords')
nltk.download('wordnet')

# %%
# Clone dataset repository
!git clone https://github.com/amazon-science/esci-data.git

# Load datasets
df = pd.read_parquet("/content/esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet")
products = pd.read_parquet("/content/esci-data/shopping_queries_dataset/shopping_queries_dataset_products.parquet")

# Merge datasets
data = df.merge(products, on='product_id')

#  [markdown]
# ## 2. Text Preprocessing Pipeline

# %%
class TextPreprocessor:
    """Normalize text: strip punctuation, lowercase, drop stop words, lemmatize."""

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        # Compiled once; removes everything except letters, digits and whitespace.
        self.pattern = re.compile(r'[^a-zA-Z0-9\s]')

    def preprocess(self, text):
        """Return the cleaned, space-joined tokens of ``text``.

        Args:
            text: Raw text. ``None`` and float NaN (how pandas represents a
                missing string) are treated as empty text; any other
                non-string value is converted with ``str()``.

        Returns:
            Lowercase, lemmatized tokens joined by single spaces; ``''`` when
            nothing survives cleaning.
        """
        # `text != text` is true only for NaN. Without this guard a missing
        # product description makes the regex substitution raise TypeError.
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = self.pattern.sub('', str(text))
        text = text.lower().strip()
        tokens = text.split()
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]
        return ' '.join(tokens)

# %%
preprocessor = TextPreprocessor()

# Process product text
# A missing title or description is NaN and `str + NaN` is NaN, which the
# preprocessor cannot handle; treat missing parts as empty strings.
data['product_text'] = data['product_title'].fillna('') + ' ' + data['product_description'].fillna('')
data['processed_product'] = data['product_text'].apply(preprocessor.preprocess)

# Process queries
data['processed_query'] = data['query'].fillna('').apply(preprocessor.preprocess)

#  [markdown]
# ## 3. Dataset & DataLoader

# %%
class ProductRankingDataset(Dataset):
    """Index-aligned (query, product, label) triples exposed as a torch dataset."""

    def __init__(self, queries, products, labels):
        self.queries = queries
        self.products = products
        self.labels = labels

    def __len__(self):
        # The three sequences are parallel; the queries define the length.
        return len(self.queries)

    def __getitem__(self, idx):
        """Return the ``idx``-th triple as a dict keyed 'query'/'product'/'label'."""
        columns = (
            ('query', self.queries),
            ('product', self.products),
            ('label', self.labels),
        )
        return {key: values[idx] for key, values in columns}

# %%
# Split data
# 30% is held out, then halved into validation and test (70/15/15 overall).
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Create datasets
# The three arguments are parallel NumPy arrays taken from the training split.
train_dataset = ProductRankingDataset(
    train_df['processed_query'].values,
    train_df['processed_product'].values,
    train_df['esci_label'].map({'E': 1, 'S': 1, 'C': 0, 'I': 0}).values  # Convert labels to binary
)

#  [markdown]
# ## 4. BERT-based Semantic Search Model

# %%
class SemanticSearchBERT(nn.Module):
    """Text encoder: BERT [CLS] vector projected to a 256-dimensional embedding."""

    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name)
        # Read the encoder width from its config so other checkpoints
        # passed as `model_name` are not forced to be 768 wide.
        self.projector = nn.Linear(self.bert.config.hidden_size, 256)

    def forward(self, texts):
        """Embed ``texts`` and return a ``(len(texts), 256)`` tensor.

        Args:
            texts: A single string or any sequence of strings (list, tuple,
                NumPy array).
        """
        # The tokenizer accepts a str or a list of str; convert other
        # sequences (e.g. a NumPy array slice) to a list.
        texts = [texts] if isinstance(texts, str) else list(texts)
        # Follow the module's own device instead of a global variable, so
        # the model also works when it is the only thing imported or moved.
        target_device = next(self.parameters()).device
        inputs = self.tokenizer(texts, padding=True, truncation=True,
                                max_length=128, return_tensors='pt').to(target_device)
        outputs = self.bert(**inputs).last_hidden_state
        return self.projector(outputs[:, 0, :])  # CLS token projection

    def encode(self, texts, batch_size=32):
        """Embed ``texts`` in batches without tracking gradients.

        Switches the module to eval mode. Returns a ``(len(texts), 256)``
        tensor on the module's device; ``(0, 256)`` for empty input.
        """
        self.eval()
        if len(texts) == 0:
            # torch.cat raises on an empty list of tensors.
            return torch.empty((0, self.projector.out_features),
                               device=next(self.parameters()).device)
        embeddings = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                embeddings.append(self(texts[start:start + batch_size]))
        return torch.cat(embeddings)

# %%
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SemanticSearchBERT().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Triplet loss over (anchor, positive, negative) embeddings with margin 1.0.
loss_fn = nn.TripletMarginLoss(margin=1.0)

#  [markdown]
# ## 5. Training Loop with Negative Sampling

# %%
def train_epoch(model, dataloader):
    """Run one epoch of triplet training and return the mean loss per step.

    Uses the module-level ``loss_fn`` and ``optimizer``.

    Args:
        model: Encoder mapping a list of strings to embeddings.
        dataloader: Yields dicts with ``'query'``, ``'product'`` and
            ``'label'``; its dataset exposes a ``products`` sequence from
            which negatives are drawn.

    Returns:
        Mean loss over the optimisation steps taken; ``0.0`` if none ran.
    """
    model.train()
    total_loss = 0.0
    steps = 0
    catalog = dataloader.dataset.products

    for batch in dataloader:
        # Only pairs labelled relevant (1) may serve as positives; using
        # every pair trains the model to pull irrelevant products toward
        # the query.
        keep = [i for i, label in enumerate(batch['label']) if float(label) == 1.0]
        if not keep:
            continue

        # Anchor: queries
        anchors = model([batch['query'][i] for i in keep])

        # Positive: relevant products
        positives = model([batch['product'][i] for i in keep])

        # Negative: random samples, gathered with plain ints into a list so
        # the tokenizer receives a list of strings rather than an array.
        neg_indices = torch.randint(0, len(catalog), (len(keep),)).tolist()
        negatives = model([catalog[i] for i in neg_indices])

        # Compute loss
        loss = loss_fn(anchors, positives, negatives)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        steps += 1

    return total_loss / steps if steps else 0.0

# %%
# Train model
# Three epochs over shuffled batches of 32; the mean epoch loss is printed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
for epoch in range(3):
    loss = train_epoch(model, train_loader)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

#  [markdown]
# ## 6. Evaluation Metrics

# %%
def evaluate(model, test_df, k=10):
    """Compute per-query MAP and NDCG@k on the judged pairs in ``test_df``.

    Each row is scored by the cosine similarity between its query embedding
    and its product embedding. Rows are grouped by query, and a product
    counts as relevant when its ``esci_label`` is ``'E'`` or ``'S'`` (the
    same binarization used for the training labels).

    Args:
        model: Encoder exposing ``encode(list_of_str) -> tensor``.
        test_df: Rows with ``query``, ``processed_query``,
            ``processed_product`` and ``esci_label`` columns.
        k: Rank cut-off for NDCG.

    Returns:
        Dict with ``'MAP'`` and ``'NDCG@10'`` (computed at cut-off ``k``);
        both NaN for an empty ``test_df``.
    """
    model.eval()
    queries = test_df['processed_query'].tolist()
    products = test_df['processed_product'].tolist()

    if not queries:
        return {'MAP': float('nan'), 'NDCG@10': float('nan')}

    # Encode all queries and products
    query_embs = model.encode(queries)
    product_embs = model.encode(products)

    # Row-wise similarity only. An all-pairs N x N matrix is quadratic in
    # the number of test rows and ignores the relevance judgments.
    similarities = F.cosine_similarity(query_embs, product_embs, dim=-1).cpu()

    targets = torch.tensor(test_df['esci_label'].isin(['E', 'S']).to_numpy()).to(torch.long)
    # One integer id per distinct query; the retrieval metrics rank within
    # each id and average across ids.
    query_ids = torch.tensor(pd.factorize(test_df['query'])[0], dtype=torch.long)

    # Calculate metrics; queries without any relevant product are skipped.
    map_metric = torchmetrics.RetrievalMAP(empty_target_action='skip')
    ndcg_metric = torchmetrics.RetrievalNormalizedDCG(top_k=k, empty_target_action='skip')

    map_metric.update(similarities, targets, indexes=query_ids)
    ndcg_metric.update(similarities, targets, indexes=query_ids)

    return {
        'MAP': map_metric.compute().item(),
        'NDCG@10': ndcg_metric.compute().item()
    }

# %%
metrics = evaluate(model, test_df)
print(f"Test Metrics: {metrics}")

# [markdown]
# ## 7. Visualization of Embeddings

# %%
def visualize_embeddings(texts, embeddings):
    """Project ``embeddings`` to 2-D with UMAP and return a Plotly scatter figure.

    Args:
        texts: One string per embedding; its first 20 characters become the
            point's colour label.
        embeddings: Tensor of embeddings, moved to the CPU before projection.
    """
    coords = UMAP(n_components=2).fit_transform(embeddings.cpu().numpy())
    short_labels = [text[:20] for text in texts]  # Use first 20 chars as label

    return px.scatter(
        x=coords[:, 0],
        y=coords[:, 1],
        color=short_labels,
        title="Product Embedding Space",
        labels={'color': 'Product Text'},
    )

# %%
# Embed a random sample of 100 test products and plot them; as the last
# expression of the cell, the returned figure is what the notebook displays.
sample_texts = test_df['processed_product'].sample(100).tolist()
sample_embs = model.encode(sample_texts)
visualize_embeddings(sample_texts, sample_embs)

# [markdown]
# ## 8. Gradio Interface with FAISS Index

# %%
!pip install -q faiss-cpu

# %%
import faiss

class ProductSearchEngine:
    """Cosine-similarity product search backed by a flat FAISS index."""

    # Dimension used when there are no products to infer it from.
    DEFAULT_DIM = 256

    def __init__(self, model, products):
        """Embed ``products`` with ``model`` and index them.

        Args:
            model: Encoder exposing ``encode(list_of_str) -> tensor``.
            products: Sequence of product texts; result ids refer to
                positions in this sequence.
        """
        self.model = model
        self.products = products
        self.index = faiss.IndexFlatIP(self.DEFAULT_DIM)
        self._build_index()

    def _embed(self, texts):
        """Return L2-normalized float32 embeddings of ``texts`` as a NumPy array."""
        # Use the engine's own model rather than whatever global `model`
        # happens to be bound at call time.
        embs = np.ascontiguousarray(self.model.encode(texts).cpu().numpy(), dtype=np.float32)
        faiss.normalize_L2(embs)
        return embs

    def _build_index(self):
        """Populate the index with every product embedding."""
        if len(self.products) == 0:
            return
        product_embs = self._embed(self.products)
        if product_embs.shape[1] != self.index.d:
            # Match the index to the encoder's real output width.
            self.index = faiss.IndexFlatIP(product_embs.shape[1])
        self.index.add(product_embs)

    def search(self, query, k=10):
        """Return up to ``k`` ``(product_text, similarity)`` pairs, best first."""
        if self.index.ntotal == 0:
            return []
        query_emb = self._embed([query])
        distances, indices = self.index.search(query_emb, k)
        # FAISS pads with id -1 when fewer than k vectors are indexed; a
        # negative id would otherwise index from the end of the list.
        return [(self.products[i], d) for i, d in zip(indices[0], distances[0]) if i >= 0]

# %%
# Index each distinct product text once. `data` has one row per
# (query, product) judgment, so without de-duplication the same product
# occupies several of the top-k slots.
search_engine = ProductSearchEngine(model, data['processed_product'].drop_duplicates().tolist())

# %%
def gradio_search(query):
    """Search for ``query`` and return a ``{product_text: score}`` mapping."""
    cleaned_query = preprocessor.preprocess(query)
    label_scores = {}
    for product_text, similarity in search_engine.search(cleaned_query):
        label_scores[product_text] = float(similarity)
    return label_scores

# %%
# One text box in, a label component out (it renders the returned
# {text: score} mapping); `examples` pre-fills sample queries.
interface = gr.Interface(
    fn=gradio_search,
    inputs=gr.Textbox(label="Enter product search query"),
    outputs=gr.Label(label="Top Matching Products"),
    examples=[
        ["Wireless noise cancelling headphones"],
        ["Organic cotton t-shirt women's medium"],
        ["4K ultra HD smart TV 55 inch"]
    ]
)

# share=True requests a publicly reachable link in addition to the local one.
interface.launch(share=True)

#  [markdown]
# ## 9. Model Saving & Loading

# %%
# Save model
# The checkpoint holds the weights plus the tokenizer object itself.
# NOTE(review): storing the tokenizer this way means the file contains a
# pickled Python object, so it must be loaded with full unpickling enabled;
# consider saving the tokenizer separately - confirm before changing the format.
torch.save({
    'model_state_dict': model.state_dict(),
    'tokenizer': model.tokenizer,
}, 'semantic_search_model.pth')

# Load model
def load_model(path):
    """Rebuild a ``SemanticSearchBERT`` from a checkpoint written by this notebook.

    Args:
        path: Checkpoint file containing ``'model_state_dict'`` and,
            optionally, a pickled ``'tokenizer'``.

    Returns:
        The model on the CPU; the caller moves it to the target device.
    """
    # map_location='cpu' lets a checkpoint saved on a GPU load on a machine
    # without one.
    # SECURITY: weights_only=False runs the full unpickler, which the stored
    # tokenizer object needs (recent PyTorch releases default to
    # weights_only=True and reject it). Unpickling can execute arbitrary
    # code, so only load checkpoints you created yourself.
    checkpoint = torch.load(path, map_location='cpu', weights_only=False)
    model = SemanticSearchBERT()
    model.load_state_dict(checkpoint['model_state_dict'])
    # The constructor already builds a tokenizer; override it only when the
    # checkpoint carries one.
    saved_tokenizer = checkpoint.get('tokenizer')
    if saved_tokenizer is not None:
        model.tokenizer = saved_tokenizer
    return model

# %%
# Example usage after loading
loaded_model = load_model('semantic_search_model.pth')
loaded_model.to(device)