In [51]:
import os
import pandas as pd
import requests
import datetime
import re
##########################################################
from dotenv import load_dotenv

import os

load_dotenv()
ASU_key = os.environ.get("ASU_key")
file_name = os.environ.get("file_name")
LLM_url = os.environ.get("LLM_url")

now = datetime.datetime.now()
formatted_datetime = now.strftime("%Y_%m_%d_%H")

In [52]:
directory_path = 'C:\\programming_projects\\RAG_fine_tune\\RAG_pipeline_ASU_website\\data\\'
print(directory_path + 'chunked_' +file_name)
chunked_df = pd.read_csv(directory_path + 'chunked_' +file_name, nrows=100)

C:\programming_projects\RAG_fine_tune\RAG_pipeline_ASU_website\data\chunked_cleaned_ASU_webpage_04_07_2025.csv


In [53]:
chunked_df.columns

Index(['url', 'depth', 'title', 'topic', 'orig_word_count', 'orig_char_count',
       'page_text', 'filename', 'cleaned_text', 'langauage', 'chunked_text',
       'chunked_word_count', 'chunked_char_count'],
      dtype='object')

In [54]:
chunked_df[['cleaned_text', 'chunked_text']].head(10)

Unnamed: 0,cleaned_text,chunked_text
0,Nondegree requirementsTo be eligible to take c...,Nondegree requirementsTo be eligible to take c...
1,ASU Online empowers first year students to tak...,ASU Online empowers first year students to tak...
2,Becky received her BSc in Ocean and Earth Scie...,Becky received her BSc in Ocean and Earth Scie...
3,Brett is a Postdoctoral Scientist in the Coral...,Brett is a Postdoctoral Scientist in the Coral...
4,Rachel is a microbial oceanographer who invest...,Rachel is a microbial oceanographer who invest...
5,Rachel is a microbial oceanographer who invest...,F.E. B. Felts M. Breitbart P. Salamon R.A. Edw...
6,Jessica completed her Bachelor of Science in m...,Jessica completed her Bachelor of Science in m...
7,Rebecca joined BIOS in September 2021 as a sea...,Rebecca joined BIOS in September 2021 as a sea...
8,Dr. Carlson earned his BA degree at Colby Coll...,Dr. Carlson earned his BA degree at Colby Coll...
9,Steven Giovannoni has a PhD from the Universit...,Steven Giovannoni has a PhD from the Universit...


In [None]:
df_out = pd.DataFrame([])
for i in range(0, len(chunked_df['chunked_text'])):
    print(i, datetime.datetime.now())

    cleaned_string = chunked_df['cleaned_text'].loc[i]

    chunk = chunked_df['chunked_text'].loc[i]
    title = chunked_df['title'].loc[i]
    url = chunked_df['url'].loc[i]
    chunked_word_count = chunked_df['chunked_word_count'].loc[i]
    orig_word_count = chunked_df['orig_word_count'].loc[i]
    ##########################################
    ## text type
    bearer_token = ASU_key
    json_payload = {
        "query": "what is the topic of the following text from: {cleaned_string}? only respond with the topic, no other text. Please make the topic 3 words or less".format(cleaned_string=cleaned_string),
        "model_provider": "gcp-deepmind",
        "model_name": "geminiflash2",
    }
    headers = {
        "Authorization": f"Bearer {bearer_token}",
        "Content-Type": "application/json"
    }
    try:
        response = requests.post(LLM_url, headers=headers, json=json_payload)
        response.raise_for_status()
        result_document_section = response.json().get("response")
        # print("result:", result_document_section)
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

    ##########################################
    ## questions

    query = """given that the following text from the webpage {title} on url {url}, here is a text chunk limited to 500 words:\n {chunk}\n\n 
                what are some good questions to ask about the text chunk? Please respond with a question and 3 different answers for each question.  There should be a total of 3 questions, with having 3 different answers (for a total of 9 unique answers).

                the questions need to be well defined. Try to use the text as much as possible when crafting the answer. Answers need to be at least 2 sentences long. Do not use the phrase "The text," and avoid similar language. Rephrase the question in the answer.

                Please use the following format for the response:

                **Question 1:**
                **Question 1 Answer 1:**
                **Question 1 Answer 2:**
                **Question 1 Answer 3:**

                **Question 2:**
                **Question 2 Answer 1:**
                **Question 2 Answer 2:**
                **Question 2 Answer 3:**

                **Question 3:**
                **Question 3 Answer 1:**
                **Question 3 Answer 2:**
                **Question 3 Answer 3:**
                """.format(
                    title = title,
                    url = url,
                    chunk=chunk)

    json_payload = {
        "query": query,
        "model_provider": "gcp-deepmind",
        "model_name": "geminiflash2",
    }
    headers = {
        "Authorization": f"Bearer {bearer_token}",
        "Content-Type": "application/json"
    }
    try:
        response = requests.post(LLM_url, headers=headers, json=json_payload)
        response.raise_for_status()
        result = response.json().get("response")
        # print("result:", result)
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    
    ###################################################
    ## save out
    parts = result.split("**")  # Split the string at **
    parts_no = [[2, 4], [2, 6], [2,8], [10, 12], [10, 14], [10, 16], [18, 20], [18, 22], [18, 24]]


    for i in range(0, 9):
        try:
            pt1 = parts_no[i][0]
            pt2 = parts_no[i][1]

            Question = re.sub(r'[^a-zA-Z0-9.,!?\s]', ' ', str(parts[pt1]))
            Question = Answer.replace(r'\s+', ' ').strip()
            Answer = re.sub(r'[^a-zA-Z0-9.,!?\s]', ' ', str(parts[pt2]))
            Answer = Answer.replace(r'\s+', ' ').strip()


            Q1 = pd.DataFrame(data={'section':[result_document_section],
                                    'title':[title],
                                    'url': [url],
                                    'document_type':['web page'],
                                    'chunked_word_count':[chunked_word_count],
                                    'orig_word_count':[orig_word_count],
                                    'contex': [chunk],
                                    'question':[Question],
                                    'answer':[Answer]
                                    })
            df_out = pd.concat([Q1, df_out], ignore_index=True)

            if i % 10 == 0:
                df_out.to_csv(directory_path+'silver_data\\'+f'silver_data_{formatted_datetime}__{i}.csv', index=False)
                df_out = pd.DataFrame([])

        except Exception as e:
            print(f"Unexpected error: {e}")
            # print("result:", result)

0 2025-04-09 12:06:52.438888


In [65]:
df_out.head(10)

Unnamed: 0,section,title,url,document_type,chunked_word_count,orig_word_count,contex,question,answer
0,Nondegree requirements,Nondegree student Admission ASU,https://admission.asu.edu/undergrad/nondegree,web page,93,93,Nondegree requirementsTo be eligible to take c...,The advisability of speaking with a high schoo...,"Although not required, consulting with a couns..."
1,Nondegree requirements,Nondegree student Admission ASU,https://admission.asu.edu/undergrad/nondegree,web page,93,93,Nondegree requirementsTo be eligible to take c...,The provided details suggest that speaking wit...,The advisability of speaking with a high schoo...
2,Nondegree requirements,Nondegree student Admission ASU,https://admission.asu.edu/undergrad/nondegree,web page,93,93,Nondegree requirementsTo be eligible to take c...,It may also be beneficial for prospective stud...,The provided details suggest that speaking wit...
3,Nondegree requirements,Nondegree student Admission ASU,https://admission.asu.edu/undergrad/nondegree,web page,93,93,Nondegree requirementsTo be eligible to take c...,An important consideration before enrolling in...,It may also be beneficial for prospective stud...
4,Nondegree requirements,Nondegree student Admission ASU,https://admission.asu.edu/undergrad/nondegree,web page,93,93,Nondegree requirementsTo be eligible to take c...,"Before enrolling, a prospective nondegree stud...",An important consideration before enrolling in...
5,Nondegree requirements,Nondegree student Admission ASU,https://admission.asu.edu/undergrad/nondegree,web page,93,93,Nondegree requirementsTo be eligible to take c...,"Alternatively, if you are currently enrolled i...","Before enrolling, a prospective nondegree stud..."
6,Nondegree requirements,Nondegree student Admission ASU,https://admission.asu.edu/undergrad/nondegree,web page,93,93,Nondegree requirementsTo be eligible to take c...,Another requirement to be eligible as a nondeg...,"Alternatively, if you are currently enrolled i..."
7,Nondegree requirements,Nondegree student Admission ASU,https://admission.asu.edu/undergrad/nondegree,web page,93,93,Nondegree requirementsTo be eligible to take c...,To be eligible to take classes at ASU as a non...,Another requirement to be eligible as a nondeg...
