In [1]:
# ----- Import Libraries -----
import openai
from dotenv import load_dotenv
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# ----- User Settings -----
# CSV of classified links; passed to process_links(), which reads its 'Type' and 'LINK' columns.
input_file_path = '../06_Data/Capstone_Data/Classified_VPC_Links.csv'
# NOTE(review): presumably the intended destination for the generated Q&A pairs, but no code
# visible in this notebook reads it -- the Q&A loop writes to its own `output_file`. Confirm.
output_file_path = '../06_Data/Capstone_Data/Documentation_QA_Pairs.csv'
# Name of the environment variable that holds the OpenAI API key (looked up with os.getenv).
openai_api_key_env_var = "OPENAI_KEY"
# NOTE(review): presumably a cap on completion tokens, but it is never passed to the OpenAI
# call in the visible code -- confirm whether it should be.
max_tokens = 1000

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import os

def fetch_and_clean_webpage(url, cache_file, timeout=30):
    """Return the visible text of a web page, consulting an in-memory cache first.

    Args:
        url: Address of the page to download.
        cache_file: Dict mapping URL -> cleaned text. It is checked before any
            network access and updated in place after a successful fetch.
            Failures are not cached, so they are retried on the next call.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole batch.

    Returns:
        The cleaned page text on success. On failure, one of the sentinel
        strings "Failed to retrieve the webpage" (non-200 status) or
        "Error fetching page: <exception>" (any exception while fetching or
        parsing).
    """
    # Serve repeat requests from the cache without touching the network.
    if url in cache_file:
        return cache_file[url]

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return "Failed to retrieve the webpage"
        soup = BeautifulSoup(response.text, 'html.parser')
        # Prefer <main>, then <body>; fall back to the whole document so that
        # pages lacking both tags still yield their text instead of an error.
        main_content = soup.find('main') or soup.body or soup
        # Script and style elements carry no readable text; drop them.
        for tag in main_content(["script", "style"]):
            tag.decompose()
        cleaned_text = main_content.get_text(separator='\n', strip=True)
        cache_file[url] = cleaned_text  # Only successful fetches are cached
        return cleaned_text
    except Exception as e:
        # Deliberately broad: one bad page must not abort a long batch run.
        return f"Error fetching page: {e}"

def process_links(csv_file, limit=None, cache_filename='content_cache.json'):
    """Fetch the text of every 'text-based' link listed in a CSV file.

    Args:
        csv_file: Path to a CSV with at least the columns 'Type' and 'LINK'.
        limit: Maximum number of links to process. None means all of them;
            0 processes nothing.
        cache_filename: JSON file holding a URL -> text mapping. It is loaded
            if it exists and rewritten before this function returns.

    Returns:
        Dict mapping each processed URL to whatever fetch_and_clean_webpage
        returned for it (page text, or a failure sentinel string).
    """
    links_df = pd.read_csv(csv_file)
    text_links = links_df[links_df['Type'] == 'text-based']['LINK']

    # Load cache file if it exists, otherwise start with an empty cache
    if os.path.exists(cache_filename):
        with open(cache_filename, 'r') as file:
            cache_file = json.load(file)
    else:
        cache_file = {}

    # Denominator for progress messages: the number of links that will
    # actually be processed, not the number available.
    if limit is None:
        total = len(text_links)
    else:
        total = min(len(text_links), max(limit, 0))

    results = {}
    try:
        for i, link in enumerate(text_links):
            # `is not None` so that limit=0 means "none", not "unlimited".
            if limit is not None and i >= limit:
                break
            content = fetch_and_clean_webpage(link, cache_file)
            results[link] = content
            print(f"Processed {i+1}/{total}: {link}")
    finally:
        # Persist the cache even if the run is interrupted part-way, so pages
        # already downloaded are not fetched again next time.
        with open(cache_filename, 'w') as file:
            json.dump(cache_file, file)

    return results

# Fetch (or load from the JSON cache) the text of every 'text-based' link in the input CSV.
# The resulting {url: text} dict is what the Q&A generation loop iterates over.
processed_contents = process_links(input_file_path)

Processed 1/392: https://docs.aws.amazon.com/cli/latest/reference/vpc-lattice/
Processed 2/392: https://docs.aws.amazon.com/vpc/latest/userguide/how-it-works.html
Processed 3/392: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-getting-started.html
Processed 4/392: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-ip-addressing.html
Processed 5/392: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-cidr-blocks.html
Processed 6/392: https://docs.aws.amazon.com/vpc/latest/userguide/subnet-sizing.html
Processed 7/392: https://docs.aws.amazon.com/vpc/latest/userguide/managed-prefix-lists.html
Processed 8/392: https://docs.aws.amazon.com/vpc/latest/userguide/working-with-managed-prefix-lists.html
Processed 9/392: https://docs.aws.amazon.com/vpc/latest/userguide/working-with-aws-managed-prefix-lists.html
Processed 10/392: https://docs.aws.amazon.com/vpc/latest/userguide/sharing-managed-prefix-lists.html
Processed 11/392: https://docs.aws.amazon.com/vpc/latest/userguide/managed-pre

In [22]:
import openai

def create_qa_prompt(content):
    """Build the instruction text sent to the model for one web page.

    The fixed instructions ask for question/answer pairs in a strict
    QUESTION/ANSWER layout, or the literal reply 'NOT ENOUGH INFORMATION'.
    The page text is appended after the instructions.

    Args:
        content: Text of the web page to generate questions from.

    Returns:
        The complete prompt as a single string.
    """
    return f"""
    Based on ONLY the contents below, please generate as many HIGH-QUALITY question answer pairs as there is information for. I want ONLY one of two responses below. Please make the question ONE SENTENCE and the answer ONE PARAGRAPH. I want you to focus on the MAIN IDEA of the articles for the questions. 

    FIRST CASE: If you determine that there IS enough information to produce a HIGH-QUALITY question answer pair, please return the answer in EXACTLY the format here:

    QUESTION: ...

    ANSWER: ...

    SECOND CASE: If you determine that there IS NOT enough information to produce a HIGH-QUALITY question answer pair, please return 'NOT ENOUGH INFORMATION'

    Here is the content of the webpage: {content}
    """

def get_openai_qa_response(prompt, model="gpt-4-1106-preview", max_tokens=None):
    """Send a prompt to the OpenAI chat completion endpoint and return the reply text.

    Args:
        prompt: User message to send; the system message is fixed.
        model: Name of the chat model to use. Defaults to the model this
            notebook was written against.
        max_tokens: Optional cap on the length of the completion. When None
            (the default) the parameter is omitted and the API default applies.

    Returns:
        The text of the first choice, or None if the request raised any
        exception (the error is printed rather than propagated so a batch
        run can continue).
    """
    request = {
        "model": model,
        "messages": [
            {"role": "system", "content": "Do EXACTLY as the instructions in the prompt say."},
            {"role": "user", "content": prompt},
        ],
    }
    # Only send max_tokens when the caller asked for a cap, so the default
    # call is identical to the original request.
    if max_tokens is not None:
        request["max_tokens"] = max_tokens

    try:
        response = openai.ChatCompletion.create(**request)
        return response['choices'][0]['message']['content']
    except Exception as e:
        print(f"Error in getting response: {e}")
        return None

# Load API key from .env file
load_dotenv()
openai_api_key_env_var = "OPENAI_KEY"
openai.api_key = os.getenv(openai_api_key_env_var)
if not openai.api_key:
    # Surface the problem once, up front, instead of as one error per URL.
    print(f"Warning: environment variable {openai_api_key_env_var} is not set; OpenAI requests will likely fail.")

output_file = "output_doc_qa_pairs.txt"  # Name of the file where you want to save the output

# Strings fetch_and_clean_webpage returns in place of page text when a
# download failed. Sending these to the model wastes an API call and
# produces a meaningless Q&A entry.
fetch_failure_prefixes = ("Failed to retrieve the webpage", "Error fetching page:")

# 'processed_contents' maps each URL to the text fetched for it
for url, content in processed_contents.items():
    if isinstance(content, str) and content.startswith(fetch_failure_prefixes):
        print(f"Skipped URL (page could not be fetched): {url}")
        continue

    prompt = create_qa_prompt(content)  # Create a prompt for generating Q&A
    qa_response = get_openai_qa_response(prompt)  # Get the Q&A response from OpenAI

    if qa_response is None:
        # The request failed (already reported); don't write the text "None".
        print(f"Skipped URL (no response from OpenAI): {url}")
        continue

    # Append after every URL so partial progress survives an interrupted run.
    # Explicit UTF-8: model output may contain characters the platform's
    # default encoding cannot represent.
    with open(output_file, "a", encoding="utf-8") as file:
        file.write(f"URL: {url}\nQ&A:\n{qa_response}\n\n")

    print(f"Processed URL: {url}")

print(f"Output saved to {output_file}")

Processed URL: https://docs.aws.amazon.com/cli/latest/reference/vpc-lattice/
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/how-it-works.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-getting-started.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-ip-addressing.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-cidr-blocks.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/subnet-sizing.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/managed-prefix-lists.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/working-with-managed-prefix-lists.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/working-with-aws-managed-prefix-lists.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/sharing-managed-prefix-lists.html
Processed URL: https://docs.aws.amazon.com/vpc/latest/userguide/managed-prefix-lists-referencing.ht