<a href="https://colab.research.google.com/github/shatinz/tripreports/blob/master/tripreport2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# dependencies

In [None]:
!pip install sentence-transformers transformers accelerate
!pip install generativeai
!pip install pandas
!pip install google-generativeai

Collecting generativeai
  Downloading generativeai-0.0.1-py3-none-any.whl.metadata (479 bytes)
Downloading generativeai-0.0.1-py3-none-any.whl (1.2 kB)
Installing collected packages: generativeai
Successfully installed generativeai-0.0.1


# extracting tables and reports


In [2]:
#1.extracting links
import requests
from bs4 import BeautifulSoup

url = "https://www.erowid.org/experiences/exp.cgi?S1=18"
response = requests.get(url, verify=False)  # Disable SSL verification
soup = BeautifulSoup(response.content, 'html.parser')

# Find all tr elements with class "exp-list-row"
rows = soup.find_all('tr', class_='exp-list-row')

# Extract href from the <a> tag within each row
links = []
for row in rows:
    link_tag = row.find('a')
    if link_tag and 'href' in link_tag.attrs:
        links.append("https://www.erowid.org/experiences/" + link_tag['href'])

# Print the extracted links
for link in links:
    print(link)



https://www.erowid.org/experiences/exp.php?ID=118252
https://www.erowid.org/experiences/exp.php?ID=112397
https://www.erowid.org/experiences/exp.php?ID=96444
https://www.erowid.org/experiences/exp.php?ID=64048
https://www.erowid.org/experiences/exp.php?ID=62835
https://www.erowid.org/experiences/exp.php?ID=34866
https://www.erowid.org/experiences/exp.php?ID=113441
https://www.erowid.org/experiences/exp.php?ID=113133
https://www.erowid.org/experiences/exp.php?ID=99695
https://www.erowid.org/experiences/exp.php?ID=94632
https://www.erowid.org/experiences/exp.php?ID=103790
https://www.erowid.org/experiences/exp.php?ID=96614
https://www.erowid.org/experiences/exp.php?ID=106677
https://www.erowid.org/experiences/exp.php?ID=89887
https://www.erowid.org/experiences/exp.php?ID=98866
https://www.erowid.org/experiences/exp.php?ID=101485
https://www.erowid.org/experiences/exp.php?ID=101541
https://www.erowid.org/experiences/exp.php?ID=93315
https://www.erowid.org/experiences/exp.php?ID=96521
http

In [3]:
#extracting html of tables
import requests
from bs4 import BeautifulSoup
from IPython.display import display

# Assuming 'links' list from the previous cell is available

extracted_data = []

for link in links:
    try:
        response = requests.get(link, verify=False)
        soup = BeautifulSoup(response.content, 'html.parser')

        substance_div = soup.find('div', class_='substance')
        substance = substance_div.text.strip() if substance_div else None

        dosechart_table = soup.find('table', class_='dosechart')
        dosechart = str(dosechart_table) if dosechart_table else None

        bodyweight_table = soup.find('table', class_='bodyweight')
        bodyweight = str(bodyweight_table) if bodyweight_table else None

        footdata_table = soup.find('table', class_='footdata')
        footdata = str(footdata_table) if footdata_table else None

        extracted_data.append({
            'link': link,
            'substance': substance,
            'dosechart': dosechart,
            'bodyweight': bodyweight,
            'footdata': footdata
        })

    except Exception as e:
        print(f"Error processing link {link}: {e}")

# You can now process the extracted_data list, for example, print it
display(extracted_data)



[{'link': 'https://www.erowid.org/experiences/exp.php?ID=118252',
  'substance': 'DMT Fumarate & Harmala Alkaloids',
  'dosechart': '<table class="dosechart">\n<tr>\n<td align="right" class="dosechart-time">DOSE:<br/> T+ 0:00</td>\n<td align="center" class="dosechart-amount">1 - 3 g</td>\n<td align="center" class="dosechart-method">oral</td>\n<td class="dosechart-substance"><a href="/plants/kratom/">Kratom</a></td>\n<td class="dosechart-form">(daily)</td>\n</tr>\n<tr>\n<td align="right" class="dosechart-time">\xa0 T+ 0:00</td>\n<td align="center" class="dosechart-amount">210 mg</td>\n<td align="center" class="dosechart-method">oral</td>\n<td class="dosechart-substance">Harmine</td>\n<td class="dosechart-form">(capsule)</td>\n</tr>\n<tr>\n<td align="right" class="dosechart-time">\xa0 T+ 0:00</td>\n<td align="center" class="dosechart-amount">210 mg</td>\n<td align="center" class="dosechart-method">oral</td>\n<td class="dosechart-substance"><a href="/chemicals/harmala/">Tetrahydroharmine<

In [4]:
#mapping scrapted tables html to schema
import pandas as pd
from bs4 import BeautifulSoup

schema_fields = [
    "previous_experience",
    "set_and_setting",
    "dose_amount", # Renamed dose_and_route to dose_amount
    "dose_method", # Added new field for dose method
    "before_after_changes",
    "extraordinary_thinking",
    "combination_with_other_substances",
    "intention",
    "experience_phases",
    "onset_description",
    "perceived_realness",
    "objective_elements",
    "entities_or_other_beings",
    "childhood_trauma",
    "time_relation",
    "sex_effects",
    "self_love_experience",
    "semantic_relations",
    "year_of_experience",
    "age",
    "gender",
    "substance_used",
    "most_important_element"
]

processed_reports = []

for report_data in extracted_data:
    processed_report = {}
    for field in schema_fields:
        processed_report[field] = None  # Initialize all fields to None

    # Map scraped data to schema_fields
    if report_data.get('substance'):
        processed_report['substance_used'] = report_data['substance']

    if report_data.get('bodyweight'):
        bodyweight_soup = BeautifulSoup(report_data['bodyweight'], 'html.parser')
        bodyweight_td = bodyweight_soup.find('td', class_='bodyweight-amount')
        processed_report['age'] = bodyweight_td.text.strip() if bodyweight_td else None

    if report_data.get('footdata'):
        footdata_soup = BeautifulSoup(report_data['footdata'], 'html.parser')

        year_td = footdata_soup.find('td', class_='footdata-expyear')
        processed_report['year_of_experience'] = year_td.text.replace('Exp Year:', '').strip() if year_td else None

        gender_td = footdata_soup.find('td', class_='footdata-gender')
        processed_report['gender'] = gender_td.text.replace('Gender:', '').strip() if gender_td else None

        age_td = footdata_soup.find('td', class_='footdata-ageofexp')
        processed_report['age'] = age_td.text.replace('Age at time of experience:', '').strip() if age_td else None

    # Extract dose amount and method from dosechart
    if report_data.get('dosechart'):
        dosechart_soup = BeautifulSoup(report_data['dosechart'], 'html.parser')
        amount_td = dosechart_soup.find('td', class_='dosechart-amount')
        method_td = dosechart_soup.find('td', class_='dosechart-method')

        processed_report['dose_amount'] = amount_td.text.strip() if amount_td else None
        processed_report['dose_method'] = method_td.text.strip() if method_td else None

    processed_reports.append(processed_report)

# Create a pandas DataFrame from the processed data
df_processed_reports = pd.DataFrame(processed_reports)

# Display the first few rows of the DataFrame
display(df_processed_reports.head())

Unnamed: 0,previous_experience,set_and_setting,dose_amount,dose_method,before_after_changes,extraordinary_thinking,combination_with_other_substances,intention,experience_phases,onset_description,...,childhood_trauma,time_relation,sex_effects,self_love_experience,semantic_relations,year_of_experience,age,gender,substance_used,most_important_element
0,,,1 - 3 g,oral,,,,,,,...,,,,,,2024,36,Female,DMT Fumarate & Harmala Alkaloids,
1,,,138 mg,smoked,,,,,,,...,,,,,,2018,25,Female,DMT & Changa,
2,,,75 mg,insufflated,,,,,,,...,,,,,,2011,21,Male,DMT & Methoxetamine,
3,,,3.0 g,oral,,,,,,,...,,,,,,2007,Not Given,Male,"Syrian Rue, Cacti - T. peruvianus, 2C-E, 4-AcO...",
4,,,,smoked,,,,,,,...,,,,,,2007,Not Given,Male,DMT,


In [5]:
#extracting reports html
import requests
from bs4 import BeautifulSoup
import json

# Assuming 'links' list is available from the previous scraping step

report_html_contents = []

for link in links:
    try:
        response = requests.get(link, verify=False) # Disable SSL verification
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        report_html_contents.append({
            'link': link,
            'html_content': response.text # Store text content instead of bytes
        })
        print(f"Fetched content for: {link}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content for {link}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred while fetching {link}: {e}")

print(f"Finished fetching content for {len(report_html_contents)} reports.")

# Save the scraped data to a JSON file
with open('reports_html.json', 'w') as f:
    json.dump(report_html_contents, f, indent=4)

print("Scraped HTML data saved to reports_html.json")



Fetched content for: https://www.erowid.org/experiences/exp.php?ID=118252




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=112397




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=96444




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=64048




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=62835




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=34866




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=113441




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=113133




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=99695




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=94632




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=103790




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=96614




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=106677




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=89887




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=98866




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=101485




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=101541




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=93315




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=96521




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=94429




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=93145




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=80226




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=82934




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=86665




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=83719




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=82837




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=80880




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=76614




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=75003




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=70283




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=66730




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=65793




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=65384




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=58800




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=58907




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=52797




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=46854




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=46856




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=41106




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=30919




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=22238




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=31979




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=25157




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=23350




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=16314




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=8842




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=2205




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=1851




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=1841




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=1839




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=118625




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=117294




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=117473




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=102105




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=115502




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=114947




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=100673




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=99247




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=106448




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=106105




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=108109




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=113511




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=106818




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=101367




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=102952




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=112845




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=112640




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=112771




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=111847




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=87759




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=96140




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=85721




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=100953




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=99900




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=103249




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=109084




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=95151




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=100929




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=97363




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=110185




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=108821




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=91964




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=110432




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=81462




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=101884




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=108077




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=94599




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=90113




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=90263




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=88428




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=107582




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=100109




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=97591




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=90557




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=101998




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=87447




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=80512




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=90269




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=83098




Fetched content for: https://www.erowid.org/experiences/exp.php?ID=92381
Finished fetching content for 100 reports.
Scraped HTML data saved to reports_html.json


In [6]:
#cleaning reports html
from bs4 import BeautifulSoup, Comment # Import Comment to find HTML comments
import json

cleaned_reports = []

for report_data in report_html_contents:
    link = report_data['link']
    html_content = report_data['html_content']

    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find the div containing the report text
        report_div = soup.find('div', class_='report-text-surround')
        report_text = None

        if report_div:
            # The main report text is often between <!-- Start Body --> and <!-- End Body --> comments
            # within the 'report-text-surround' div. We need to iterate through the contents
            # to find the text nodes and handle <br> tags.
            content_started = False
            extracted_lines = []
            for content in report_div.contents:
                if isinstance(content, Comment) and "Start Body" in content:
                    content_started = True
                    continue
                if isinstance(content, Comment) and "End Body" in content:
                    content_started = False
                    break
                if content_started:
                    if content.name == 'br':
                        extracted_lines.append('\n') # Replace <br> with newline
                    elif isinstance(content, str):
                        extracted_lines.append(content)
                    # You might need to handle other tags within the text if they exist,
                    # but for simple text and line breaks, this should work.
                    # If there are nested tags with text, you might need a recursive approach
                    # or use get_text() on specific elements if they consistently contain text.
                    elif hasattr(content, 'get_text'):
                         extracted_lines.append(content.get_text())


            report_text = "".join(extracted_lines).strip()

        if report_text:
            cleaned_reports.append({
                'link': link,
                'report_text': report_text
            })
            print(f"Successfully extracted report text for {link}")
        else:
            print(f"Warning: Could not extract report text for {link}")

    except Exception as e:
        print(f"Error processing HTML for {link}: {e}")

print(f"Finished extracting report text for {len(cleaned_reports)} reports.")

# Save the cleaned reports to a JSON file
with open('cleaned_reports.json', 'w') as f:
    json.dump(cleaned_reports, f, indent=4)

print("Cleaned report data saved to cleaned_reports.json")

Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=118252
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=112397
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=96444
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=64048
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=62835
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=34866
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=113441
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=113133
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=99695
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=94632
Successfully extracted report text for https://www.erowid.org/experiences/exp.php?ID=1

# sending extracted report to llm


In [None]:
schema_fields = [
    "previous_experience",
    "set_and_setting",
    "before_after_changes",
    "intention",
    "experience_phases",
    "onset_description",
    "perceived_realness",
    "objective_elements",
    "entities_or_other_beings",
    "childhood_trauma",
    "sex_effects",
    "self_love_experience",
    "most_important_element"
]


# Task
Extract information from each report in "cleaned_reports.json" using the Gemini API to fill a predefined schema. Chunk reports if necessary to fit within token limits, process each chunk separately, and save the extracted information for each report and chunk. Implement sleep time between API calls.

## Load cleaned reports

### Subtask:
Load the cleaned report data from `cleaned_reports.json`.


**Reasoning**:
Load the cleaned report data from the JSON file.



In [None]:
import json

with open('cleaned_reports.json', 'r') as f:
    cleaned_reports = json.load(f)

## Initialize gemini api

### Subtask:
Set up the Gemini API key and model.


**Reasoning**:
Configure the Gemini API key and specify the model for text generation.



In [None]:
import google.generativeai as genai
import os
from google.colab import userdata

# Configure the Gemini API key
# Ensure you have the API key stored in an environment variable or replace 'YOUR_API_KEY'
genai.configure(api_key=userdata.get("GOOGLE_API_KEY"))

# Specify the Gemini model to be used
model_name = "gemini-1.5-flash"

## Define schema and prompt

### Subtask:
Define the schema fields and create a clear prompt for Gemini to extract information from the reports based on the schema, emphasizing concise answers.


**Reasoning**:
Define the schema fields and create the extraction prompt for the Gemini model based on the instructions.



In [None]:
schema_fields = [
    "previous_experience",
    "set_and_setting",
    "before_after_changes",
    "intention",
    "experience_phases",
    "onset_description",
    "perceived_realness",
    "objective_elements",
    "entities_or_other_beings",
    "childhood_trauma",
    "sex_effects",
    "self_love_experience",
    "most_important_element",
    "experiencing_fear"
]

extraction_prompt = f"""
You are an AI assistant tasked with extracting specific information from a drug experience report.
Your goal is to read the provided report text and extract information for the following fields:
{', '.join(schema_fields)}

For each field, provide a concise answer directly related to the field name.
If the information for a field is not present in the report, state "Not specified".

Here is the report text:
{{report_text}}

Please provide the extracted information in a structured format (e.g., JSON or a clear list of key-value pairs).
"""

print(extraction_prompt)


You are an AI assistant tasked with extracting specific information from a drug experience report.
Your goal is to read the provided report text and extract information for the following fields:
previous_experience, set_and_setting, before_after_changes, intention, experience_phases, onset_description, perceived_realness, objective_elements, entities_or_other_beings, childhood_trauma, sex_effects, self_love_experience, most_important_element, experiencing_fear

For each field, provide a concise answer directly related to the field name.
If the information for a field is not present in the report, state "Not specified".

Here is the report text:
{report_text}

Please provide the extracted information in a structured format (e.g., JSON or a clear list of key-value pairs).



## Process reports with gemini

### Subtask:
Iterate through each cleaned report. For each report:
- Check if the report text needs chunking to fit within Gemini's token limit.
- If chunking is needed, split the report into smaller chunks.
- Send each chunk (or the whole report if no chunking is needed) to the Gemini API with the defined prompt.
- Process Gemini's response to extract the schema information.
- Store the extracted information for each report, keeping track of which chunk it came from if chunking was used.
- Implement a sleep time between API calls to avoid rate limits or token errors.


**Reasoning**:
Iterate through each cleaned report, chunk if necessary, send to Gemini API, process response, store extracted data, and implement sleep.



In [None]:
import time

extracted_data_from_llm = []
# A rough estimate of token limit for Gemini 1.5 Flash, considering prompt size and response size
# This might need adjustment based on actual usage and model capabilities
TOKEN_LIMIT = 10000  # Example token limit, adjust as needed
SLEEP_TIME = 5  # seconds

for report_data in cleaned_reports:
    link = report_data['link']
    report_text = report_data['report_text']
    report_chunks = []

    # Simple chunking mechanism based on character count as a proxy for tokens
    # A more sophisticated approach would use a proper tokenizer
    if len(report_text) > TOKEN_LIMIT * 0.8: # Chunk if text is close to the limit
        # Split into chunks roughly based on token limit
        chunk_size = int(TOKEN_LIMIT * 0.7) # Make chunks smaller than the limit
        report_chunks = [report_text[i:i + chunk_size] for i in range(0, len(report_text), chunk_size)]
        print(f"Report {link} chunked into {len(report_chunks)} parts.")
    else:
        report_chunks = [report_text]
        print(f"Report {link} does not require chunking.")

    for i, chunk in enumerate(report_chunks):
        print(f"Processing chunk {i+1}/{len(report_chunks)} for report {link}")
        prompt = extraction_prompt.format(report_text=chunk)

        try:
            # Send prompt to Gemini API
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)

            # Process the response
            # Assuming the response is in a format that can be directly used or easily parsed
            # This part might need significant adjustment based on Gemini's actual output format
            extracted_info = {
                'link': link,
                'chunk_index': i,
                'extracted_data': response.text # Store the raw response text for now
            }
            extracted_data_from_llm.append(extracted_info)
            print(f"Successfully processed chunk {i+1} for {link}")

        except Exception as e:
            print(f"Error processing chunk {i+1} for {link}: {e}")
            extracted_data_from_llm.append({
                'link': link,
                'chunk_index': i,
                'extracted_data': f"Error: {e}"
            })

        # Implement sleep time
        time.sleep(SLEEP_TIME)
        print(f"Sleeping for {SLEEP_TIME} seconds.")

# The extracted_data_from_llm list now contains the results for all reports and chunks
# You can further process or save this list as needed
# For this subtask, we just populate the list.

Report https://www.erowid.org/experiences/exp.php?ID=118252 chunked into 4 parts.
Processing chunk 1/4 for report https://www.erowid.org/experiences/exp.php?ID=118252
Error processing chunk 1 for https://www.erowid.org/experiences/exp.php?ID=118252: HTTPConnectionPool(host='localhost', port=38591): Read timed out. (read timeout=600.0)
Sleeping for 5 seconds.
Processing chunk 2/4 for report https://www.erowid.org/experiences/exp.php?ID=118252


KeyboardInterrupt: 

# embedding tripreports

In [1]:
import json
from sentence_transformers import SentenceTransformer

# Load cleaned reports
with open('cleaned_reports.json', 'r') as f:
    cleaned_reports = json.load(f)

# Initialize the Gemma embedding model
# You might need to adjust the model name based on available models in sentence-transformers
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for each report
report_embeddings = []
for report in cleaned_reports:
    report_text = report['report_text']
    embedding = embedding_model.encode(report_text)
    report_embeddings.append({
        'link': report['link'],
        'embedding': embedding.tolist() # Convert numpy array to list for JSON serialization
    })

# You can now use report_embeddings for further analysis, e.g., similarity search, clustering
# For example, save the embeddings to a JSON file
with open('report_embeddings.json', 'w') as f:
    json.dump(report_embeddings, f, indent=4)

print(f"Generated embeddings for {len(report_embeddings)} reports and saved to report_embeddings.json")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated embeddings for 100 reports and saved to report_embeddings.json


*italicized text*# New Section


# adding df processedd reports

In [8]:
# Embedding df_processed_reports and adding to report_embeddings

# Assuming 'embedding_model' is already initialized from a previous step
# Assuming 'df_processed_reports' is available from a previous step
# Assuming 'cleaned_reports' is available from a previous step and has the links

# Create a combined text representation for each row in the DataFrame
df_processed_reports['combined_text'] = df_processed_reports.apply(
    lambda row: ' '.join(f"{col}: {row[col]}" for col in df_processed_reports.columns if col not in ['link', 'combined_text'] and pd.notna(row[col])),
    axis=1
)

# Generate embeddings for the combined text and associate with the original link
df_embeddings = []
# Iterate through the processed reports and use the link from the original cleaned_reports
for index, row in df_processed_reports.iterrows():
    combined_text = row['combined_text']
    embedding = embedding_model.encode(combined_text)
    # Find the corresponding link from the cleaned_reports using the index
    original_link = cleaned_reports[index]['link']
    df_embeddings.append({
        'link': original_link,
        'embedding': embedding.tolist() # Convert numpy array to list for JSON serialization
    })

# Append the new embeddings to the existing report_embeddings list
# Load existing embeddings first if the list is not already in memory
try:
    with open('report_embeddings.json', 'r') as f:
        report_embeddings = json.load(f)
except FileNotFoundError:
    report_embeddings = [] # Start with an empty list if the file doesn't exist

report_embeddings.extend(df_embeddings)

# Save the updated embeddings to a JSON file
with open('report_embeddings.json', 'w') as f:
    json.dump(report_embeddings, f, indent=4)

print(f"Generated embeddings for {len(df_embeddings)} processed reports and added to report_embeddings.json.")
print(f"Total embeddings in report_embeddings.json: {len(report_embeddings)}")

Generated embeddings for 100 processed reports and added to report_embeddings.json.
Total embeddings in report_embeddings.json: 200


#clustering


