In [10]:
pip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org  --upgrade pip

[0mNote: you may need to restart the kernel to use updated packages.


In [11]:
!pip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org  \
pinecone-client \
pandas \
langchain \
sentence-transformers

[0m

In [12]:
from pinecone import Pinecone
import os
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from langchain.document_loaders import TextLoader
from langchain.document_loaders import  DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
# Initialize the connection to Pinecone (get an API key at app.pinecone.io).
# The key is read from the environment; fail fast with a clear message when it
# is missing instead of building a client from a literal placeholder string.
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before starting the notebook kernel."
    )

# Not referenced by the other cells shown in this notebook; the original
# placeholder fallback is kept so the name stays defined for any later use.
environment = os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

# Configure the client used by all index operations below.
pc = Pinecone(api_key=api_key)

# Prepare Data - News Articles

In [14]:
# Load the .txt files found under ./bbc/sport, reading each one with TextLoader
loader = DirectoryLoader(
    "./bbc/sport",
    glob="./*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

In [15]:
# Split the loaded documents into smaller, overlapping chunks
# (chunk_size=1000, chunk_overlap=200)
text_spliter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_spliter.split_documents(documents)

In [23]:
# Number of chunks produced by the text splitter
len(texts)

1505

# Create Embeddings and Vectors

In [24]:
# SECURITY NOTE(review): the two assignments below change process-wide state
# for the rest of the kernel session.
import ssl
# Replaces the factory for default HTTPS contexts with the *unverified* one,
# so HTTPS connections that use the default context skip certificate checks.
ssl._create_default_https_context = ssl._create_unverified_context
# Blanks the CA bundle path — presumably so requests-based downloads (e.g. the
# model download below) also skip certificate verification; TODO confirm.
# Prefer pointing this at a proper CA bundle where the network allows it.
os.environ["CURL_CA_BUNDLE"] = ""

In [25]:
# Embedding model: average_word_embeddings_komninos (dimension = 300)
EMBEDDING_MODEL_NAME = 'average_word_embeddings_komninos'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [26]:
# Check the dimensionality of the embeddings: displaying the model shows its
# module stack, including the embedding layer and pooling configuration.
model

SentenceTransformer(
  (0): WordEmbeddings(
    (emb_layer): Embedding(222305, 300)
  )
  (1): Pooling({'word_embedding_dimension': 300, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
)

In [29]:
# The encoder input must be a plain list of strings, so pull the raw text out
# of every chunk.
news_text = [chunk.page_content for chunk in texts]
# Preview the first chunk
news_text[0]

'Fuming Robinson blasts officials\n\nEngland coach Andy Robinson insisted he was "livid" after his side were denied two tries in Sunday\'s 19-13 Six Nations loss to Ireland in Dublin.\n\nMark Cueto\'s first-half effort was ruled out for offside before the referee spurned TV replays when England crashed over in the dying minutes. "[I\'m] absolutely spitting. I\'m livid. There\'s two tries we\'ve been cost," Robinson told BBC Sport. "We\'ve got to go back to technology. I don\'t know why we didn\'t." South African referee Jonathan Kaplan ruled that Cueto was ahead of Charlie Hodgson when the fly-half hoisted his cross-field kick for the Sale wing to gather.'

In [31]:
# Embed every text chunk, showing a progress bar while batches are processed
encoded_news = model.encode(
    news_text,
    show_progress_bar=True,
)

Batches:   0%|          | 0/48 [00:00<?, ?it/s]

In [34]:
# Length of the first embedding vector, i.e. the embedding dimensionality
len(encoded_news[0])

300

# Set up Pinecone index
	- Make sure the dimensionality of the embeddings matches the dimensionality of the index

In [37]:
# List the indexes currently visible to this client
pc.list_indexes()

{'indexes': [{'dimension': 64,
              'host': 'prices-9atubxs.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'prices',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [38]:
# Remove the pre-existing 'prices' index, but only while it still exists, so
# re-running the notebook does not raise on an index that is already gone.
# NOTE(review): this permanently deletes the index and its data — presumably
# done to make room for the 'news' index; confirm before running against a
# shared project.
if 'prices' in pc.list_indexes().names():
    pc.delete_index('prices')

In [39]:
# List the indexes again to check the result of the deletion
pc.list_indexes()

{'indexes': []}

In [43]:
from pinecone import PodSpec

# Create the 'news' index only when it does not exist yet, so the cell can be
# re-run without failing on an already-existing index.
if 'news' not in pc.list_indexes().names():
    pc.create_index(
        'news',
        # The index dimension must match the embedding size, so take it from
        # the vectors computed above (300 for this model) rather than
        # hard-coding it.
        dimension=len(encoded_news[0]),
        metric='cosine',
        spec=PodSpec(
            environment="gcp-starter",
            pod_type="starter",
            pods=1,
            replicas=1,
            shards=1,
        ),
    )

# Upsert vector data

# Query vector data

# Query with metadata filters

# Cleanup