In [None]:
!pip install pinecone

Collecting pinecone
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-6.0.2-py3-none-any.whl (421 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 421.9/421.9 kB 6.1 MB/s eta 0:00:00
Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone
Successfully installed pinecone-6.0.2 pinecone-plugin-interface-0.0.7


In [None]:
import nltk
# Directory that holds the downloaded NLTK packages; it is also registered as
# a search location so NLTK can find them afterwards.
NLTK_DATA_DIR = "/usr/local/nltk_data"
if NLTK_DATA_DIR not in nltk.data.path:
    # Guard so that re-running this cell does not keep appending the same entry.
    nltk.data.path.append(NLTK_DATA_DIR)
nltk.download("wordnet", download_dir=NLTK_DATA_DIR)
nltk.download("omw-1.4", download_dir=NLTK_DATA_DIR)


[nltk_data] Downloading package wordnet to /usr/local/nltk_data...
[nltk_data] Downloading package omw-1.4 to /usr/local/nltk_data...


True

In [19]:
import requests
import torch
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import spacy
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import os


# Download necessary NLTK resources
try:
    nltk.download('stopwords', quiet=True)
except Exception as e:
    print(f"Warning: NLTK resource download issue. Error: {e}")

# Load spaCy model for NER and POS tagging
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed. Only
    # that case falls back to a blank pipeline; any other error should surface.
    print("Warning: spaCy model 'en_core_web_sm' not found. Using a simple pipeline.")
    nlp = spacy.blank("en")

# Check for GPU availability
# NOTE(review): `device` is only reported here; it is not passed to the
# SentenceTransformer constructor below. Pass device=device there if the
# printed value must match the device the model actually runs on.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Initialize Pinecone
# The API key must come from the environment. Never commit a key to the
# notebook: any key that has appeared in a saved copy of this file should be
# treated as compromised and rotated.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it in the environment before running this cell."
    )
pc = Pinecone(api_key=pinecone_api_key)
index_name = "agribot"

# Handle to an existing index; nothing in this cell creates the index.
index = pc.Index(index_name)


# Load the sentence-embedding model (all-MiniLM-L6-v2; the recorded run below
# shows it producing 384-dimensional vectors).
model = SentenceTransformer('all-MiniLM-L6-v2')

def scrape_text(url, timeout=10):
    """Download a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of all <p> elements joined with spaces, with every run of
        whitespace collapsed to a single space and the ends stripped. The
        string is empty when the page has no <p> elements. Returns None when
        the request fails or the response status is not 200.
    """
    try:
        # Without a timeout, requests can wait indefinitely on a silent host.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same "report and return None"
        # path as a bad status code, so callers handle one failure mode.
        print(f"Failed to retrieve webpage: {exc}")
        return None

    if response.status_code != 200:
        print("Failed to retrieve webpage.")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    text = " ".join(para.get_text() for para in soup.find_all("p"))
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_text(text):
    """Clean a text chunk and collect spaCy annotations for it.

    Dates, punctuation and stand-alone numbers are removed, English stop
    words are dropped (compared case-insensitively), and the remaining tokens
    are run through the module-level spaCy pipeline `nlp`.

    Args:
        text: Raw text of one chunk.

    Returns:
        A tuple (processed_text, named_entities, pos_counts):
        processed_text is the space-joined lemma of every remaining token,
        named_entities is the list of entity strings found by `nlp`, and
        pos_counts maps each POS tag to the number of tokens carrying it.
    """
    stop_words = set(stopwords.words('english'))

    # Dates go first: the pattern depends on "/" and "-", which the
    # punctuation pass below would already have stripped.
    text = re.sub(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', '', text)  # e.g. 01/01/2022
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\b\d+\b', '', text)  # Remove stand-alone numbers (covers years such as 2022)

    # Only the comparison is lowercased, so capitalised stop words such as
    # "The" are dropped too while token casing is left untouched for spaCy.
    filtered_tokens = [token for token in text.split() if token.lower() not in stop_words]

    # Named entity recognition and POS tagging with spaCy
    doc = nlp(" ".join(filtered_tokens))
    named_entities = [ent.text for ent in doc.ents]

    # Count POS tags
    pos_counts = {}
    for token in doc:
        pos_counts[token.pos_] = pos_counts.get(token.pos_, 0) + 1

    # Lemmatization with spaCy. A pipeline without a lemmatizer (such as a
    # blank one) can leave lemma_ empty; fall back to the surface form so the
    # text to embed is never blanked out.
    lemmatized_tokens = [token.lemma_ or token.text for token in doc]

    return " ".join(lemmatized_tokens), named_entities, pos_counts

def chunk_text(text, max_chunk_size=200):
    """Split text into chunks of at most `max_chunk_size` words.

    The text is split into sentences after ".", "!" or "?" followed by
    whitespace, and sentences are packed greedily into chunks. A chunk is
    closed before the sentence that would push it over the limit. A single
    sentence longer than the limit is cut into word windows so that no chunk
    can exceed the limit.

    Args:
        text: Text to split.
        max_chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of chunk strings with words joined by single spaces; the list
        is empty for empty or whitespace-only text.

    Raises:
        ValueError: If `max_chunk_size` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")
    if not text:
        return []

    chunks, current = [], []

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        words = sentence.split()
        # A normal sentence is one window; an over-long one yields several.
        for start in range(0, len(words), max_chunk_size):
            piece = words[start:start + max_chunk_size]
            # Close the running chunk before it would exceed the limit. The
            # running word count is len(current), so nothing is re-joined and
            # re-split on each iteration.
            if len(current) + len(piece) > max_chunk_size:
                chunks.append(" ".join(current))
                current = []
            current.extend(piece)

    if current:
        chunks.append(" ".join(current))

    return chunks

def store_embeddings(url, batch_size=100):
    """Scrape a page, embed its chunks and upsert them into the Pinecone index.

    Each chunk of the scraped text is preprocessed, encoded with the
    module-level `model`, and stored under the id "<url>_<chunk index>" with
    metadata holding the url, the chunk index, the named entities and the
    five most frequent POS tags. Chunks whose processed text is empty are
    skipped, so chunk ids can have gaps.

    Args:
        url: Address of the page to ingest.
        batch_size: Maximum number of vectors sent per upsert request.

    Returns:
        The number of chunks stored; 0 when the page could not be scraped or
        produced no text.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    text = scrape_text(url)
    if not text:
        return 0

    vectors = []
    for i, chunk in enumerate(chunk_text(text)):
        processed_text, named_entities, pos_counts = preprocess_text(chunk)
        if not processed_text.strip():
            # Nothing survived preprocessing; an embedding of blank text
            # would only add noise to the index.
            continue

        embedding = model.encode(processed_text).tolist()

        # Print the vector size so it can be checked against the index
        # dimension (all-MiniLM-L6-v2 produced 384 in the recorded run).
        print(f"Embedding shape: {len(embedding)}")

        # Most frequent tags first; ties keep first-seen order (stable sort).
        top_pos_tags = sorted(pos_counts, key=pos_counts.get, reverse=True)[:5]

        vectors.append({
            "id": f"{url}_{i}",
            "values": embedding,
            "metadata": {
                "url": url,
                "chunk_id": i,
                "named_entities": named_entities,
                "top_pos_tags": top_pos_tags,
            },
        })

    # Send the vectors in batches instead of one request per chunk.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

    print(f"Stored {len(vectors)} chunks in Pinecone.")
    return len(vectors)

if __name__ == "__main__":
    url = input("Enter agricultural website URL: ").strip()
    # Report success only when something was actually stored.
    if store_embeddings(url):
        print("Embeddings stored successfully!")
    else:
        print("No embeddings were stored.")


Using device: cpu
Enter agricultural website URL: https://www.gvsprinklers.com.au/blog/5-types-irrigation-systems/
Embedding shape: 384
Embedding shape: 384
Embedding shape: 384
Stored 3 chunks in Pinecone.
Embeddings stored successfully!
