## Load Data

In [60]:
import pandas as pd
df = pd.read_csv('medium_125k_20190719.csv', nrows=10000)

In [61]:
df.head()

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False
2,lgbtqia,"""CISGENDER?! Is That A Disease?!""","Or, a primer in gender vocabulary for the curi...",False
3,equality,"""Call me Nat Love"" :Black Cowboys and the Fron...",,False
4,artificial-intelligence,"""Can I Train my Model on Your Computer?""",How we waste computational resources and how t...,False


## Data Cleaning

In [62]:
df.isna().sum()
df = df.dropna()
df = df[~df['subtitle_truncated_flag']]
df['subtitle_truncated_flag'].value_counts()

subtitle_truncated_flag
False    6211
Name: count, dtype: int64

In [63]:
df.shape

(6211, 4)

81545 vectors

In [64]:
df['title_extended'] = df['title'] + df['subtitle']
df['category'].nunique()

93

93 Unique categories

## Prep for UpSert

In [33]:
from pinecone import Pinecone, PodSpec
from tqdm.autonotebook import tqdm
api_key = "key"
env = 'env'
pc = Pinecone(api_key= api_key)

In [35]:
pc.create_index(name='medium-data',dimension=384,metric='cosine',spec=PodSpec(
    environment=env,
    pod_type= "s1.x1",
    pods= 1
  ))

In [45]:
from sentence_transformers import SentenceTransformer

In [46]:
import torch

## Creation of an embedding model

In [65]:
model = SentenceTransformer('all-MiniLM-L6-v2',device = 'cpu')

In [66]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [67]:
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect..."


## Embed the sentences in our original file to a list of vectors

In [68]:
df['values'] = df['title_extended'].map(lambda x: (model.encode(x)).tolist())

In [70]:
df['id'] = df.reset_index(drop = 'index').index

In [71]:
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values,id
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.031074441969394684, -0.014303443022072315,...",0
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","[-0.03467032313346863, -0.018165184184908867, ...",1


In [72]:
df['metadata'] = df.apply(lambda x: {
    'title' :x['title'],
    'subtitle' :x['subtitle'],
    'category' :x['category']
}, axis=1)

In [73]:
df.head(2)

Unnamed: 0,category,title,subtitle,subtitle_truncated_flag,title_extended,values,id,metadata
0,work,"""21 Conversations"" - A fun (and easy) game for...",A (new?) Icebreaker game to get your team to s...,False,"""21 Conversations"" - A fun (and easy) game for...","[-0.031074441969394684, -0.014303443022072315,...",0,"{'title': '""21 Conversations"" - A fun (and eas..."
1,spirituality,"""Biblical Porn"" at Mars Hill",Author and UW lecturer Jessica Johnson talks a...,False,"""Biblical Porn"" at Mars HillAuthor and UW lect...","[-0.03467032313346863, -0.018165184184908867, ...",1,"{'title': '""Biblical Porn"" at Mars Hill', 'sub..."


In [74]:
df_upsert = df[['id','values','metadata']]

In [2]:
#df_upsert['id'] = df_upsert['id'].map(lambda x:str(x))

In [78]:
idx = pc.Index('medium-data')

In [80]:
idx.upsert_from_dataframe(df_upsert)

sending upsert requests:   0%|          | 0/6211 [00:00<?, ?it/s]

{'upserted_count': 6211}

Search query

In [106]:
xc = idx.query(
  vector = (model.encode("classic physics")).tolist(), #python list, NOT np.array
  top_k=10,
  include_metadata=True
)

Resulting titles

In [107]:
for result in xc['matches']:
    print(f"{round(result['score'],2)}: {result['metadata']['title']}")

0.42: A Brief History of the Grand Unified Theory of Physics
0.33: America Is No Longer Attracting The Top Minds In Physics
0.31: #ChiStories Podcast: Thomas Friedman and the “New-”
0.3: Actually, There Is a Time Like the Present
0.3: Adjusting your individual resonant frequency.
0.29: A Century of General Relativity. Part I: History and Intuition
0.29: A Great Reckoning in a Little Car
0.28: 3 Important Myths About Science
0.27: A Ball of Space Mud Just Pelted Earth — and Scientists Couldn’t Be Happier
0.27: 25 Things I Thought I Would Have by the Time I Turned 25


In [108]:
for result in xc['matches']:
    print(f"{round(result['score'],2)}: {result['metadata']['subtitle']}")

0.42: It’s the best of times or the worst of times in physics
0.33: International enrollment and applications to American grad schools are plummeting. This is the opposite of what makes America great.
0.31: “Order vs. Disorder.” “Non-Linear Accelerations.” “Mirroring Mother Nature.”
0.3: Think there’s no time like the present? Modern physics begs to differ.
0.3: How to use quantum physics to create reality. Or, what Tesla didn’t say out loud.
0.29: A brief introduction to the idea of relativity and introduce the notion of Space-Time.
0.29: (Part One)
0.28: The hard limits of mankind’s greatest tool
0.27: We haven’t seen a rock like this in 50 years
0.27: beach waves
