## Agriculture Knowledge Base 

### This workbook demonstrates how to create a knowledge base.
### In the next part we will use this knowledge base to create tools through GPT API calls.
### The agent needs to be accurate, and the solution should be low cost.

This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy.

In [None]:
import os
import ast
import pandas as pd
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

**This cell defines a `Website` class that fetches a page and extracts its title, text, and links**

In [None]:
# A class to represent a Webpage

# Request headers sent with every fetch: some websites only respond properly
# when the request carries browser-like headers.
_BROWSER_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
headers = {"User-Agent": _BROWSER_USER_AGENT}

# The links are also stored which can be used to enhance the chatbot
class Website:
    """
    A scraped web page: its title, visible body text, and outgoing links.

    Attributes set by the constructor:
      url   -- the address that was requested
      body  -- raw bytes of the HTTP response
      title -- text of the <title> element, or "No title found"
      text  -- text of <body> with script/style/img/input elements removed,
               or "" when the page has no <body>
      links -- every non-empty href found on an <a> tag, exactly as written
               in the page
    """

    def __init__(self, url, timeout=30):
        """
        Download `url` and parse it with BeautifulSoup.

        url: address of the page to fetch.
        timeout: seconds to wait for the server. Without a timeout a stalled
            server would block the caller indefinitely.

        Raises whatever requests.get raises (connection, TLS, timeout, ...).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None when <title> is empty or contains nested tags, so
        # the placeholder covers that case as well as a missing <title>.
        title = soup.title.string if soup.title else None
        self.title = title or "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually have a non-empty href.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the page title and text formatted as one labelled string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
        

In [None]:
# Input CSV (disease / keyword / links columns) and the root output folder
CSV_PATH = "diseases_with_links.csv"
ROOT_DIR = "Data"

## Helper Functions 
* ### Scrape the website content
* ### Create the folder structure
* ### Use the GPT API to create Markdown from the web content

In [None]:
# Web scraping helper built on the BeautifulSoup-based Website class
def get_all_details(url):
    """
    Download `url` and return its title and text as a single string,
    prefixed with a "Landing page:" header line.
    """
    page = Website(url)
    return "Landing page:\n" + page.get_contents()

In [None]:

# —————————————————————————————
# FUNCTION 1: BUILD FOLDER STRUCTURE
# —————————————————————————————

def build_folder_structure(csv_path: str, root_dir: str):
    """
    Create one directory per CSV row.

    The CSV at csv_path is read with its 'links' column parsed as a Python
    literal. For every row, the 'disease' and 'keyword' values determine the
    directory that is created (existing directories are left alone):
        root_dir/knowledge_base/<disease>/<keyword>/
    """
    table = pd.read_csv(csv_path, converters={'links': ast.literal_eval})
    kb_root = os.path.join(root_dir, 'knowledge_base')
    for _, record in table.iterrows():
        target = os.path.join(kb_root, record['disease'], record['keyword'])
        os.makedirs(target, exist_ok=True)
    print(f"Folder tree created under {root_dir}/knowledge_base/")

In [None]:
def make_user_prompt(disease: str, keyword: str, page_content: str) -> str:
    """
    Build the user message sent to the model for one scraped page.

    The message names the disease and keyword, embeds page_content verbatim,
    and ends with the instruction asking for a Markdown distillation.
    """
    intro = f"Here is the full text for **{disease}** ({keyword}):\n\n"
    body = f"{page_content}\n\n"
    instructions = (
        "Please distill that into a detailed Markdown with appropriate headings and bullet points,"
        "Make sure You don't add anything unecessary thimgs like add content , emails etc and extra content, which is not provided to you."
    )
    return intro + body + instructions

In [2]:
def _is_ssl_failure(exc: BaseException) -> bool:
    """Return True if `exc`, or an exception it was raised from, is a TLS/SSL error."""
    import ssl  # local import: the notebook's import cell does not bring in ssl

    seen = set()
    current = exc
    while current is not None and id(current) not in seen:
        seen.add(id(current))
        # HTTP client libraries wrap ssl.SSLError in their own exception
        # classes, which are also named SSLError; matching on the class name
        # recognises those without importing them here.
        if isinstance(current, ssl.SSLError) or type(current).__name__ == "SSLError":
            return True
        current = current.__cause__ or current.__context__
    return False


def generate_markdown_from_links(
    csv_path: str,
    root_dir: str,
    start_disease: str = None,
    start_row: int = 0
):
    """
    Summarize every linked page listed in the CSV into a Markdown file.

    Reads csv_path for 'disease', 'keyword', and 'links' (the 'links' column
    is parsed as a Python literal).
    - If start_row > 0, rows whose index is below start_row are skipped.
    - If start_disease is given, rows are skipped until a row with that
      disease is reached; processing then continues to the end.
    For each remaining row it:
      * Fetches each link with get_all_details and asks the model (global
        `openai` client, MODEL, SYSTEM_PROMPT) for a Markdown summary.
      * Writes the summary to
        root_dir/knowledge_base/<disease>/<keyword>/<idx>.md, where idx is
        the 1-based position of the link within the row.
      * On any error for a link, logs it and moves on to the next link.
    """
    df = pd.read_csv(csv_path, converters={'links': ast.literal_eval})
    total_rows = len(df)
    started = start_disease is None  # if no disease specified, start immediately

    for row_idx, row in df.iterrows():
        # skip leading rows if requested
        if row_idx < start_row:
            continue

        disease = row['disease']
        keyword = row['keyword']
        links = row['links']
        base_folder = os.path.join(root_dir, 'knowledge_base', disease, keyword)

        # skip until we hit start_disease
        if not started:
            if disease != start_disease:
                print(f"Skipping disease {disease!r} until we reach {start_disease!r}")
                continue
            print(f"Starting at disease {disease!r}")
            started = True

        os.makedirs(base_folder, exist_ok=True)

        for idx, url in enumerate(links, start=1):
            try:
                print(f"[{disease}/{keyword}] [{row_idx+1}/{total_rows}] link #{idx}: fetching content…")
                page_content = get_all_details(url)

                user_prompt = make_user_prompt(disease, keyword, page_content)
                print(f"[{disease}/{keyword}] summarizing link #{idx} via GPT…")
                resp = openai.chat.completions.create(
                    model=MODEL,
                    messages=[
                        {"role": "system",  "content": SYSTEM_PROMPT},
                        {"role": "user",    "content": user_prompt}
                    ],
                )
                md_content = resp.choices[0].message.content

                # write the markdown file
                md_path = os.path.join(base_folder, f"{idx}.md")
                with open(md_path, 'w', encoding='utf-8') as f:
                    f.write(md_content)

            except Exception as e:
                # Best-effort batch job: one bad link must never abort the run.
                # A single handler is used so that matching the exception can
                # not itself fail on an unimported exception class.
                if _is_ssl_failure(e):
                    print(f" SSL error on {url!r}: {e}. Skipping this link.")
                else:
                    print(f" Error processing link {url!r}: {e}. Skipping.")
                continue

        # end of links for this disease → next row automatically

    print(" Done generating Markdown files.")

## Configuring and calling the functions to generate Markdown files

In [None]:
# Input CSV and root output folder (same values as the earlier configuration cell)
CSV_PATH = "diseases_with_links.csv"
ROOT_DIR = "Data"


In [None]:
# Load variables from a local .env file (overriding any that are already set),
# then make sure OPENAI_API_KEY exists in the process environment; the
# placeholder is used only when no key was found.
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

# generate_markdown_from_links looks this client up by the global name
# `openai`, so it has to be created before that function is called.
openai = OpenAI()

In [None]:
# Create the knowledge_base/<disease>/<keyword>/ folders listed in the CSV
build_folder_structure(csv_path=CSV_PATH, root_dir=ROOT_DIR)

In [None]:
# Chat model used for summarization (or whichever model you prefer)
MODEL = "gpt-4o-mini"

# System message sent with every summarization request; the pieces are joined
# with no separator, exactly as written.
SYSTEM_PROMPT = "".join((
    "You are an expert Agricultural specialist writer. ",
    "Given the full page content for a disease under one of two categories (About/Cure), ",
    "produce a Markdown document with headings ### About and ### Cure as appropriate.",
    "Don't give unecessary information try to use only the information given to you.",
))

In [None]:
# Generate the Markdown files; start_row=20 skips CSV rows whose index is below 20
generate_markdown_from_links(csv_path=CSV_PATH, root_dir=ROOT_DIR, start_row=20)