In [1]:
# Installing / Verifynng dependencies
#! pip install tf-keras


In [20]:
# Importing Libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Loading the dataset
df = pd.read_csv('/Users/ramyavissapragada/Downloads/pinterest_finalised.csv')  # adjust path if needed
df.head()

Unnamed: 0,id,description,title,repin_count
0,21181060741374444,,,0
1,21181060741360357,,,0
2,21181060741360356,The Toppu Mini Bowl is part of the popular Top...,Toppu Mini Bowl in Caramel / Rose,0
3,21181060741360355,,𐀔,0
4,21181060741360353,,,0


In [22]:
# Look at the structure
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           5000 non-null   int64 
 1   description  4997 non-null   object
 2   title        1506 non-null   object
 3   repin_count  5000 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 156.4+ KB
None


In [23]:
# 1. Drop missing descriptions
df.dropna(subset=['description'], inplace=True)

# 2. Fill missing titles with empty string
df['title'] = df['title'].fillna('')

# 3. Combine title + description into 'text'
df['text'] = df['title'].str.strip() + '. ' + df['description'].str.strip()
df['text'] = df['text'].str.strip().replace(r'^\.', '', regex=True)

# 4. Remove emoji-only / foreign-language / non-English rows
df = df[df['text'].str.contains(r'[a-zA-Z]{3,}', regex=True)]

# 5. Drop very short text
df = df[df['text'].str.len() > 10]

# 6. Clean repin_count column
df['repin_count'] = pd.to_numeric(df['repin_count'], errors='coerce')
df.dropna(subset=['repin_count'], inplace=True)
df['repin_count'] = df['repin_count'].astype(int)

# (Optional) Check final dataset size
print(f"Final number of pins: {len(df)}")


Final number of pins: 2428


In [24]:
from sentence_transformers import SentenceTransformer

# Load a small, fast, high-quality model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [25]:
texts = df['text'].tolist()

print("Encoding pin texts... This may take 1–2 minutes.")
embeddings = model.encode(texts, show_progress_bar=True)


Encoding pin texts... This may take 1–2 minutes.


Batches:   0%|          | 0/76 [00:00<?, ?it/s]

In [26]:
import numpy as np

df['embedding'] = list(embeddings)


In [27]:
df.head()

Unnamed: 0,id,description,title,repin_count,text,embedding
2,21181060741360356,The Toppu Mini Bowl is part of the popular Top...,Toppu Mini Bowl in Caramel / Rose,0,Toppu Mini Bowl in Caramel / Rose. The Toppu M...,"[0.029299932, 0.05243382, 0.007619313, -0.0220..."
5,21181060741360352,,constellation,1,constellation.,"[0.0029528304, 0.02040875, -0.023886636, 0.050..."
7,21181060741360345,Blomb No. 03 50ml Eau de Parfum – Altar PDX,,0,Blomb No. 03 50ml Eau de Parfum – Altar PDX,"[-0.0066533852, 0.040499054, -0.022409916, -0...."
9,21181060741360343,Pyxis: This shimmer eyeshadow shade is named f...,Celestial Sphere Gelée Eye Gloss: Pyxis,0,Celestial Sphere Gelée Eye Gloss: Pyxis. Pyxis...,"[-0.041557647, 0.014049628, 0.01359767, -0.019..."
10,21181060741360342,,Marshmalow flowers @inspirationbyblanca,1,Marshmalow flowers @inspirationbyblanca.,"[0.006069741, -0.013287566, 0.06150761, -0.004..."


In [28]:
def search_pins(query, top_k=5, min_similarity=0.35):
    # Encode query
    query_embedding = model.encode([query])
    
    # Compute similarity
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    
    # Sort indices by descending similarity
    sorted_indices = similarities.argsort()[::-1]
    
    # Filter by threshold
    filtered_indices = [i for i in sorted_indices if similarities[i] >= min_similarity]
    
    # Take top_k
    top_indices = filtered_indices[:top_k]
    
    # Retrieve results
    results = df.iloc[top_indices][['text', 'description', 'repin_count']].copy()
    results['similarity'] = similarities[top_indices].round(3)
    
    return results.reset_index(drop=True)


In [29]:
results = search_pins("kitchen storage hacks", top_k=5, min_similarity=0.40)


In [33]:
if results.empty:
    print("No relevant pins found for this query.")
else:
    print(results)


                                                text  \
0  Shea Checkered Storage Bench. from @urbanoutfi...   
1                   The Curious Shelf – Shop Curious   
2   Ikea hack kitchen design by Norm Architects f...   

                                         description  repin_count  similarity  
0                              from @urbanoutfitters            3       0.448  
1                   The Curious Shelf – Shop Curious            2       0.424  
2  Ikea hack kitchen design by Norm Architects fo...            1       0.408  


In [34]:
results['repin_count'].describe()
results[['text', 'similarity', 'repin_count']]


Unnamed: 0,text,similarity,repin_count
0,Shea Checkered Storage Bench. from @urbanoutfi...,0.448,3
1,The Curious Shelf – Shop Curious,0.424,2
2,Ikea hack kitchen design by Norm Architects f...,0.408,1


In [36]:
#np.save('/Users/ramyavissapragada/Desktop/Pinterest/embeddings.npy', embeddings)
# df.to_csv('/Users/ramyavissapragada/Desktop/Pinterest/cleaned_pins.csv', index=False)

