In [1]:
import string
import re
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import nltk
import spacy
from transformers import GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load BAAI/bge-base-en tokenizer and model only once
MODEL_NAME = "BAAI/bge-base-en"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
token_count_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
# Load NLTK stopwords and spaCy English model
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# stop_words.difference_update({'no', 'not'})  # exclude these from removal

nlp = spacy.load("en_core_web_sm")

In [4]:
# Preprocessing functions
def preprocess_text(text):
    text = text.lower().replace('\n', ' ')
    text = text.replace('  ', ' ')
    return text

In [5]:
def remove_punctuation(text):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub('', text)

In [6]:
def remove_stopwords(text):
    tokens = [token.text for token in nlp(text)]
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

In [7]:
def get_token_count(text):
    """Returns token count and token list using GPT2 tokenizer."""
    tokens = token_count_tokenizer.tokenize(text)
    token_count = len(tokens)
    return token_count

In [8]:
# Function to generate embedding
def generate_embedding(text):
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)
    # Mean pooling over token embeddings
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

In [9]:
# Main processing function
def process_data(df):
    for i, row in df.iterrows():
        try:
            title = row.get('title', '')
            overview = row.get('overview', '')
            description = row.get('description', '')


            # Combine fields for embedding
            combined_text = f"{title} {overview} {description}".strip()


            # Preprocess and clean the text
            combined_text = preprocess_text(combined_text)
            combined_text = remove_punctuation(combined_text)
            combined_text = remove_stopwords(combined_text)
            combined_text = combined_text.replace('"', '').replace("'", '')

            if not combined_text:
                print(f"Skipping post_id {title} due to empty combined_text.")
                df.at[i, 'values'] = None
                df.at[i, 'token_count'] = 0
                continue

            # Generate token count and tokens
            token_count = get_token_count(combined_text)


            # Generate embedding
            embedding_vector = generate_embedding(combined_text)

            # print(embedding_vector.shape)
            
            embedding_list = embedding_vector.tolist()
            if not embedding_list or not isinstance(embedding_list, list) or not all(isinstance(x, float) for x in embedding_list):
                raise ValueError("Invalid embedding vector generated.")

            embedding_str = "[" + ", ".join(map(str, embedding_list)) + "]"

            # Store results
            df.at[i, 'values'] = embedding_str
            df.at[i, 'token_count'] = token_count

            print(f"Stored embedding and token count for post_id {title}")

        except Exception as e:
            print(f"Error processing post_id {title}: {e}")
            df.at[i, 'values'] = None
            df.at[i, 'token_count'] = 0
            continue


In [10]:
df1 = pd.read_csv('bestpoolshop_merged.csv')
df1.head()

Unnamed: 0,title,overview,price,stock,description,link
0,22421 Poolife NST Non Stabilized Swimming Pool...,Use only in pools with a skimmer and skimmer b...,$185.76,Only 7 left in stock,Poolife NST Non Stabilized Without Conditioner...,https://bestpoolshop.com/product/poolife-nst-n...
1,22422 Poolife NST Non Stabilized Swimming Pool...,Use only in pools with a skimmer and skimmer b...,,1 in stock,Poolife NST Non Stabilized Without Conditioner...,https://bestpoolshop.com/product/poolife-nst-n...
2,Bulk 50lb 3inch Chlorine Tablets For Swimming ...,Do not allow this product to get damp or wet b...,,2 in stock,Swimming Pool 3 inch Stabilized Chlorine Table...,https://bestpoolshop.com/product/50lb-3inch-ch...
3,CVBR004 Clearview Bromo Bromine Brominating 1i...,"96% Bromine, Slow dissolving, Reduced odor, En...",$71.99,Only 8 left in stock,Oreq Clearview Swimming Pool and Spa Bromo Bro...,https://bestpoolshop.com/product/clearview-bro...
4,CVTLST005 ClearView Swimming Pool Spa Chlorine...,World’s First Low-Odor Chlorine TabsExperience...,$48.99,Only 3 left in stock,CVTLST005 Scent-Trific ClearView Swimming Pool...,https://bestpoolshop.com/product/cvtlst005-cle...


In [11]:
process_data(df1)

Stored embedding and token count for post_id 22421 Poolife NST Non Stabilized Swimming Pool Chlorine Tablet 20.6lb
Stored embedding and token count for post_id 22422 Poolife NST Non Stabilized Swimming Pool Chlorine Tablet 44lb
Stored embedding and token count for post_id Bulk 50lb 3inch Chlorine Tablets For Swimming Pools – 10 Buckets
Stored embedding and token count for post_id CVBR004 Clearview Bromo Bromine Brominating 1in. Tablets 4lb.
Stored embedding and token count for post_id CVTLST005 ClearView Swimming Pool Spa Chlorine Chlorinating 3 inch Tablets Pucks Tabs 5 lbs.
Stored embedding and token count for post_id CVTLST010 ClearView Swimming Pool Spa Chlorine Chlorinating 3 inch Tablets Pucks Tabs 10 lbs.
Stored embedding and token count for post_id CVTS005 ClearView Swimming Pool Spa Chlorine Chlorinating 1 inch Tablets Pucks Tabs 5 lbs.
Stored embedding and token count for post_id Pool Chlorine Tablets 25lb 3 inch Jumbo
Stored embedding and token count for post_id Pool Chlorin

In [12]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        16 non-null     object 
 1   overview     9 non-null      object 
 2   price        12 non-null     object 
 3   stock        16 non-null     object 
 4   description  16 non-null     object 
 5   link         16 non-null     object 
 6   values       16 non-null     object 
 7   token_count  16 non-null     float64
dtypes: float64(1), object(7)
memory usage: 1.1+ KB


In [13]:
df3 = pd.read_csv('epoolsupply_merged.csv')
df3.head()

Unnamed: 0,title,overview,price,stock,description,link
0,"Poolife 3"" Chlorine Tabs (50lbs) | 42118","steady, consistent chlorination | sanitized, c...",$169.95,In stock,"Poolife 3"" Chlorine Tabs arehighly effective c...",https://www.epoolsupply.com/collections/poolif...
1,"Poolife 1"" Cleaning Tablets | 42104 (5LBS)",pool maintenance | 1-inch trichlor tablets | k...,$52.99,In stock,"Poolife 1"" Cleaning Tablets arehighly effectiv...",https://www.epoolsupply.com/collections/poolif...
2,Poolife Bromine Tablets | 25 LBS (62182),slow-dissolving tablets | continuous disinfect...,$299.00,Out of stock,Key Features: manufacturer sku: 62182 brand: P...,https://www.epoolsupply.com/collections/poolif...
3,Poolife NST Prime Tablets | 4.5 LBS,chlorine | oxidizers | stabilizers | residenti...,$41.99,Out of stock,Poolife NST Prime Tablets are designed forpool...,https://www.epoolsupply.com/collections/poolif...
4,Poolife NST Prime Tablets | 9 LBS (22424),slow-dissolving tablets | chlorine sanitizers ...,$72.99,Out of stock,Poolife NST Prime Tablets in the9 LBSsize are ...,https://www.epoolsupply.com/collections/poolif...


In [14]:
process_data(df3)

Stored embedding and token count for post_id Poolife 3" Chlorine Tabs (50lbs) | 42118
Stored embedding and token count for post_id Poolife 1" Cleaning Tablets | 42104 (5LBS)
Stored embedding and token count for post_id Poolife Bromine Tablets | 25 LBS (62182)
Stored embedding and token count for post_id Poolife NST Prime Tablets | 4.5 LBS
Stored embedding and token count for post_id Poolife NST Prime Tablets | 9 LBS (22424)
Stored embedding and token count for post_id Poolife NST Prime Tablets | 20.2 LBS (22425)
Stored embedding and token count for post_id Poolife NST Prime Tablets | 39.4 LBS (22426)
Stored embedding and token count for post_id Poolife 3" Cleaning Tablet | 1, 7 OZ (42130)
Stored embedding and token count for post_id Poolife 3" Cleaning Tablets | 4.81 LBS (42107)
Stored embedding and token count for post_id Sirona Brom Tabs | 82235
Stored embedding and token count for post_id GLB Large 3" Chlorine Tablets | 71228A | 4 LB
Stored embedding and token count for post_id GLB 

In [15]:
df4 = pd.read_csv('poolweb_merged.csv')
df4.head()

Unnamed: 0,title,overview,price,stock,description,link
0,Pool Season Chlorinating Tablets Wrapped - 3 I...,3 inch wrapped tablets | 90% available chlorin...,$277.33,209 in stock,Description The Pool Season Chlorinating Table...,https://www.poolweb.com/products/pool-season-c...
1,Pool Season Chlorinating Tablets Wrapped - 3 I...,3 inch wrapped tablets | 90% available chlorin...,$147.01,225 in stock,Description The Pool Season Chlorinating Table...,https://www.poolweb.com/products/pool-season-c...
2,Pool Season Chlorinating Tablets Wrapped - 3 I...,3 inch wrapped tablets | 90% available chlorin...,$79.82,200 in stock,Description The Pool Season Chlorinating Table...,https://www.poolweb.com/products/pool-season-c...
3,Pool Season Non-Chlorine Shock Oxidizer - 1 Lb...,Non-chlorine oxidizer will not raise chlorine ...,$13.85,248 in stock,Description The Pool Season Non-Chlorine Shock...,https://www.poolweb.com/products/pool-season-n...
4,Pool Season Chlorinating Concentrate - 50 Lb. ...,Stabilized chlorinating concentrate | Totally ...,$334.29,214 in stock,Description This is a 50 pound bucket of Pool ...,https://www.poolweb.com/products/pool-season-c...


In [16]:
process_data(df4)

Stored embedding and token count for post_id Pool Season Chlorinating Tablets Wrapped - 3 Inch -  50 Lb. Bucket
Stored embedding and token count for post_id Pool Season Chlorinating Tablets Wrapped - 3 Inch -  25 Lb. Bucket
Stored embedding and token count for post_id Pool Season Chlorinating Tablets Wrapped - 3 Inch -  8 Lb. Pail
Stored embedding and token count for post_id Pool Season Non-Chlorine Shock Oxidizer - 1 Lb. Pouch
Stored embedding and token count for post_id Pool Season Chlorinating Concentrate - 50 Lb. Bucket
Stored embedding and token count for post_id Pool Season Chlorinating Concentrate - 2 Lb. Jar
Stored embedding and token count for post_id Cal-Hypo Shockwave - 1 Lb. Pouch
Stored embedding and token count for post_id Pool Season Chlorinating Concentrate - 25 Lb. Pail
Stored embedding and token count for post_id Pool Season Chlorinating Concentrate - 1 Lb. Pouch
Stored embedding and token count for post_id Shock 68% Calcium Hypochlorite - 1 Lb. Bag
Stored embedding a

In [22]:
df4['token_count'][0]

np.float64(123.0)

In [18]:
import psycopg2
import ast  # To convert embedding string to list safely

# Database connection
conn = psycopg2.connect(
    host="localhost",
    database="your_db",
    user="your_user",
    password="your_password"
)
cur = conn.cursor()

def insert_product_embeddings(df, table_name):
    for _, row in df.iterrows():
        try:
            embedding = ast.literal_eval(row['values']) if isinstance(row['values'], str) else None

            cur.execute(f"""
                INSERT INTO {table_name} 
                (title, overview, price, stock, description, link, embedding, token_count)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            """, (
                row.get('title', ''),
                row.get('overview', ''),
                row.get('price', ''),
                row.get('stock', ''),
                row.get('description', ''),
                row.get('link', ''),
                embedding,
                int(row.get('token_count', 0))
            ))
        except Exception as e:
            print(f"Error inserting row: {e}")

    conn.commit()
    print(f"Data inserted into {table_name}")

# Example usage:
# insert_product_embeddings(df_site1, 'site_1_products')
# insert_product_embeddings(df_site2, 'site_2_products')
    

OperationalError: connection to server at "localhost" (127.0.0.1), port 5432 failed: could not initiate GSSAPI security context: Unspecified GSS failure.  Minor code may provide more information: Cannot find KDC for realm "BRAINVIRE.COM"
connection to server at "localhost" (127.0.0.1), port 5432 failed: FATAL:  password authentication failed for user "your_user"
connection to server at "localhost" (127.0.0.1), port 5432 failed: FATAL:  password authentication failed for user "your_user"
