**Notebook Description: Automated ArXiv Article Update and Pinecone Upsert**

This notebook automates the retrieval, filtering, and upsert of new AI-related articles from the ArXiv dataset into a Pinecone vector index. Starting with a download of the dataset, it filters for recent articles within AI-related categories and encodes each article’s title and abstract into embeddings. The notebook compares each article's update date against the last stored update to avoid duplication. New articles are then upserted to Pinecone in batches for efficient indexing, with relevant metadata included. Finally, it updates the last processed date, ensuring that future runs only consider articles added after the most recent update. This workflow keeps the Pinecone index current with the latest AI research.

In [None]:
# Installations and dataset download (IPython shell escapes, so this cell
# must be run inside a notebook); the Python imports follow below.
# Install the Kaggle CLI, sentence-transformers and the Pinecone client.
!pip install kaggle sentence_transformers pinecone
# Download the Cornell-University/arxiv Kaggle dataset into /content/dataset
# and unzip it there.
# NOTE(review): presumably needs Kaggle API credentials to be configured
# beforehand - confirm for the target environment.
!kaggle datasets download -d Cornell-University/arxiv -p /content/dataset --unzip

import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer


In [None]:
# Initialize Pinecone connection.
# The API key is read from the PINECONE_API_KEY environment variable so that
# the credential is never stored in the notebook itself.
# NOTE: any key that was previously hard-coded here should be treated as
# leaked and rotated.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this notebook."
    )
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("document-embeddings")

# Load the sentence-embedding model used to encode titles and abstracts
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def load_arxiv_AI(file_name='/content/dataset/arxiv-metadata-oai-snapshot.json'):
    """Load the AI-related records of the arXiv metadata snapshot.

    The snapshot is read as one JSON document per line. Records are filtered
    while streaming, so only the matching rows are kept in memory instead of
    the whole dataset.

    Args:
        file_name: Path to the JSON-lines snapshot. Defaults to the location
            the Kaggle download in this notebook writes to.

    Returns:
        A new ``pandas.DataFrame`` (not a view of a larger frame) with the
        columns ``id, submitter, authors, title, doi, abstract, categories,
        update_date``. Row labels are the zero-based positions of the records
        within the snapshot.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``categories``, or a matching record
            lacks one of the expected fields.
    """
    cols = ['id', 'submitter', 'authors', 'title', 'doi', 'abstract', 'categories', 'update_date']
    ai_categories = ('cs.AI', 'cs.LG', 'cs.CL', 'cs.CV', 'stat.ML', 'cs.NE', 'eess.AS', 'stat.TH')

    rows = []
    positions = []
    position = 0
    # NOTE(review): latin-1 never fails to decode but would garble non-ASCII
    # text if the snapshot is really UTF-8 - confirm the file's encoding.
    with open(file_name, encoding='latin-1') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            doc = json.loads(line)
            # Substring test against the raw categories string: a record
            # matches if any AI category code occurs anywhere in it.
            if any(cat in doc['categories'] for cat in ai_categories):
                rows.append([doc[col] for col in cols])
                positions.append(position)
            position += 1

    return pd.DataFrame(data=rows, columns=cols, index=positions)

# Function to load the last update date from a file
def load_last_update_date():
    """Return the date stored in ``last_update_date.json``, if any.

    Returns:
        A ``datetime`` parsed from the ``"last_update_date"`` entry
        (format ``YYYY-MM-DD``), or ``None`` when the file is missing or
        does not contain valid JSON.
    """
    state_path = "last_update_date.json"
    try:
        with open(state_path, 'r') as handle:
            stored = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        # No usable state yet: the caller treats this as "no previous run".
        return None
    return datetime.strptime(stored["last_update_date"], "%Y-%m-%d")

# Function to save the last update date to a file
def save_last_update_date(last_update_date):
    """Persist the most recent update date to ``last_update_date.json``.

    Args:
        last_update_date: A DataFrame whose first row's ``update_date`` value
            is the date to store. An empty DataFrame is a no-op, leaving any
            previously stored date untouched.

    The date string is computed *before* the file is opened for writing, so a
    failure while extracting or formatting the date can no longer leave a
    truncated state file behind.
    """
    last_update_file = "last_update_date.json"

    if last_update_date.empty:
        # Nothing to record; keep the existing state file as it is.
        return

    # Extract the 'update_date' value from the DataFrame
    last_update_date_str = pd.to_datetime(last_update_date['update_date'].iloc[0]).strftime("%Y-%m-%d")

    with open(last_update_file, 'w') as f:
        json.dump({"last_update_date": last_update_date_str}, f)


# Function to filter new articles based on update date
def filter_new_articles(df, last_update_date):
    """Return the articles updated strictly after ``last_update_date``.

    Args:
        df: DataFrame with an ``update_date`` column holding ``YYYY-MM-DD``
            values. It is not modified.
        last_update_date: Cut-off ``datetime``, or ``None`` to keep every row.

    Returns:
        A new DataFrame whose ``update_date`` column is converted to
        datetime, restricted to rows newer than the cut-off when one is given.
    """
    # Build a new frame instead of assigning into the caller's (which is often
    # a filtered slice and would trigger pandas' SettingWithCopyWarning).
    converted = df.assign(
        update_date=pd.to_datetime(df['update_date'], format="%Y-%m-%d")
    )

    if last_update_date is None:
        return converted

    # .copy() so later column assignments on the result are safe.
    return converted[converted['update_date'] > last_update_date].copy()

# Function to encode and upsert new articles to Pinecone
def upsert_new_articles_to_pinecone(df):
    """Encode the given articles and upsert them into the Pinecone index.

    Uses the module-level ``model`` to embed each article and the module-level
    ``index`` to upsert ``(id, vector, metadata)`` tuples in batches of 100.
    The metadata carries ``categories``, ``update_date`` (as a string),
    ``title`` and ``abstract``.

    Args:
        df: DataFrame with ``id``, ``title``, ``abstract``, ``categories`` and
            ``update_date`` columns. It is not modified.

    If several rows share an ``id``, only the last one is upserted.
    """
    batch_size = 100
    articles = df.reset_index(drop=True)
    if articles.empty:
        return

    # The text is kept in a local Series rather than written back as a column,
    # so the caller's frame is left untouched.
    # NOTE(review): ' {title} ' is a literal separator, not a format
    # placeholder. It is kept byte-for-byte so new vectors are built the same
    # way as before - confirm this is the intended text layout.
    prepared_text = articles['title'] + ' {title} ' + articles['abstract']
    embeddings = model.encode(prepared_text.tolist(), batch_size=32, show_progress_bar=True)

    # Map each id to the position of its last occurrence; dict ordering keeps
    # ids in first-seen order.
    position_by_id = {}
    for position, doc_id in enumerate(articles['id']):
        position_by_id[doc_id] = position
    positions = list(position_by_id.values())

    # Upsert in batches; vectors are converted to plain lists one batch at a
    # time to keep memory bounded.
    for batch_number, start in enumerate(range(0, len(positions), batch_size), start=1):
        embeddings_batch = []
        for position in positions[start:start + batch_size]:
            row = articles.iloc[position]
            metadata = {
                # NOTE(review): splits on ';' - confirm this matches the
                # separator actually used in the 'categories' field.
                'categories': row['categories'].split(';'),
                'update_date': str(row['update_date']),
                'title': row['title'],
                'abstract': row['abstract'],
            }
            embeddings_batch.append((row['id'], embeddings[position].tolist(), metadata))

        index.upsert(embeddings_batch)
        print(f"Upserted batch {batch_number} with {len(embeddings_batch)} documents.")

def update_pinecone_with_new_articles():
    """Run the full update: load, filter, upsert, then record the new cut-off.

    Steps:
        1. Load the AI-related arXiv records.
        2. Read the last processed update date from the local JSON state file.
        3. Keep only the articles updated after that date.
        4. Upsert them to Pinecone and store the newest ``update_date`` as the
           cut-off for the next run.

    When there is nothing new, the function returns without upserting and
    without touching the stored date.
    """
    # Load arXiv data downloaded from Kaggle
    df_ai = load_arxiv_AI()

    # Get the last update date recorded by the previous run (local state file)
    last_update_date = load_last_update_date()
    print(f"Last update date in Pinecone: {last_update_date}")

    # Filter by update date
    new_articles_df = filter_new_articles(df_ai, last_update_date)
    print(f"Number of new articles to upsert: {new_articles_df.shape[0]}")

    if new_articles_df.empty:
        # Nothing to upsert and no newer date to record; saving here would
        # fail on the empty frame and clobber the stored date.
        print("No new articles to upsert.")
        return

    # Encode and upsert new articles to Pinecone
    upsert_new_articles_to_pinecone(new_articles_df)

    # Record the newest update date among the processed articles
    # (filter_new_articles has already converted the column to datetime).
    last_article_by_date = new_articles_df.sort_values(by='update_date').tail(1)
    save_last_update_date(last_article_by_date)



In [None]:
# Run the whole pipeline: load the dataset, filter for new articles, upsert
# them to Pinecone and save the new last-update date.
update_pinecone_with_new_articles()