In [1]:
from google.colab import drive

# Mount Google Drive so files under it are reachable from the notebook.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import pandas as pd

# Load the chunks CSV file (located under the mounted Drive) into a DataFrame.
chunks_csv_path = '/content/drive/MyDrive/INLPT_Project/csv_files/chunks.csv'
df = pd.read_csv(chunks_csv_path)

In [32]:
# Display the DataFrame containing the chunks (one row per chunk, with its
# 'id', 'CELEX number', 'text' and 'extras' columns) as the cell output.
df

Unnamed: 0.1,Unnamed: 0,id,CELEX number,text,extras
0,0,0,21975A1201(01),Avis juridique important Cooperation Agreement...,
1,1,1,21975A1201(01),EUROPEAN ATOMIC ENERGY COMMUNITY AND THE INTER...,
2,2,2,21975A1201(01),Contracting Parties shall consult each other r...,
3,3,3,21975A1201(01),with respect to items on their agenda in which...,
4,4,4,21975A1201(01),confidential nature of certain information and...,
...,...,...,...,...,...
25792,25792,25792,32023R2633,(1) The Annex is subject to the pro rata obli...,Section: 'ANNEX Footnotes'; Section: 'Document'
25793,25793,25793,42009D0913,DECISION TAKEN BY COMMON AGREEMENT BETWEEN THE...,"Section: 'Document', (1); Section: 'Document'"
25794,25794,25794,42009D0913,The location of the seat of this Agency should...,"Section: 'Document', Article 2; Section: 'Docu..."
25795,25795,25795,42010D0349,DECISION TAKEN BY COMMON ACCORD BETWEEN THE RE...,"Section: 'Document', (1); Section: 'Document',..."


In [3]:
%pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [4]:
import os
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load its entries into the process environment.
# The return value of load_dotenv is deliberately discarded.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

In [28]:
%pip install pinecone-client




In [29]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Load the sentence-embedding model from Hugging Face.
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'

# Prefer the first GPU, but fall back to the CPU so this cell does not crash
# on a runtime without CUDA.
device = 'cuda:0' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={'device': device},
    # Texts are encoded on the same device, 32 at a time.
    encode_kwargs={'device': device, 'batch_size': 32}
)

In [30]:
# Embed the text of every chunk; `embeddings` ends up holding one vector per
# row of df, in row order.
chunk_texts = df['text'].tolist()
embeddings = embed_model.embed_documents(chunk_texts)


In [31]:
# Sanity check: how many vectors were produced and how long each one is.
n_chunks = len(embeddings)
embedding_dim = len(embeddings[0])
print("number of chunks:", n_chunks)
print("dimension of docs:", embedding_dim)

number of chunks: 25797
dimension of docs: 384


In [21]:
import os
from pinecone import Pinecone, ServerlessSpec

# Read the Pinecone API key from the environment (None if it is not set).
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# Client object used for all index management calls in this notebook.
pc = Pinecone(api_key=pinecone_api_key)


In [23]:
index_name = 'inlpt-project'

# Create the index only if it does not exist yet, so re-running this cell is
# harmless. Its dimension is taken from the embeddings computed above, which
# keeps it in sync with the embedding model.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=len(embeddings[0]),
        metric='cosine',
        # The cloud/region pair must match: "us-central1" is a GCP region
        # name and is not valid together with cloud="aws".
        # NOTE(review): "us-east-1" is chosen as an AWS serverless region;
        # switch to cloud="gcp", region="us-central1" if GCP was intended.
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [24]:
# Open a handle to the index by name and show its current statistics
# (dimension, fullness, per-namespace and total vector counts).
index_name = 'inlpt-project'
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [26]:
# Push embeddings and metadata to Pinecone in batches.
batch_size = 32

# The vectors for all chunks were already computed into `embeddings`, so they
# are reused here instead of embedding every chunk a second time. Guard
# against the two having drifted apart (e.g. df reloaded without re-embedding).
assert len(embeddings) == len(df), "embeddings and df are out of sync"

for start in range(0, len(df), batch_size):
    end = min(len(df), start + batch_size)
    batch = df.iloc[start:end]

    # Vector ids are the chunk ids rendered as strings.
    ids = [str(chunk_id) for chunk_id in batch['id']]

    # Metadata stored alongside each vector in Pinecone.
    metadata = [
        {'text': text, 'CELEX number': celex_number}
        for text, celex_number in zip(batch['text'], batch['CELEX number'])
    ]

    # Materialise the (id, vector, metadata) tuples as a list rather than
    # handing the client a one-shot zip iterator.
    index.upsert(vectors=list(zip(ids, embeddings[start:end], metadata)))




In [27]:
# Show the index statistics again to confirm how many vectors it now holds.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.25797,
 'namespaces': {'': {'vector_count': 25797}},
 'total_vector_count': 25797}