In [1]:
import csv
import json
import os
import time

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, HTTPError, Timeout, RequestException

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the index page whose <li> entries link to the individual question threads
url = "https://gmatclub.com/forum/actual-lsat-cr-bank-by-broall-249447.html?srsltid=AfmBOopTVwNhrVdTv_KFVTe69W7BTUYY9B36-2ziUSlcSXHUdM0olzUJ"  # Replace with the actual URL

# Fetch the page; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, timeout=10)
if response.status_code != 200:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, whereas an exception only stops this cell (and "Run All").
    raise RuntimeError(f"Failed to retrieve the webpage. Status code: {response.status_code}")

# Parse the webpage content
soup = BeautifulSoup(response.text, "html.parser")

# Only <li> elements that carry a style attribute are considered
li_elements = soup.find_all("li", style=True)

# Keep the first <a href> of each <li> when it points into the GMAT Club forum
question_links = []
for li in li_elements:
    a_tag = li.find("a", href=True)
    if a_tag and "gmatclub.com/forum" in a_tag['href']:  # Filter for valid GMAT Club links
        question_links.append(a_tag['href'])

# Persist one link per line; the explicit encoding makes the file platform-independent
with open("question_links.txt", "w", encoding="utf-8") as file:
    for link in question_links:
        file.write(link + "\n")

print(f"Extracted {len(question_links)} question links. Saved to 'question_links.txt'.")


In [None]:


# links_file:  text file holding one question URL per line (scraper input)
# output_file: CSV file that receives the scraped rows (scraper output)
links_file, output_file = "question_links.txt", "questions_and_answers5.csv"

# Function to fetch a webpage with retries
def fetch_page_with_retries(url, retries=3, backoff=2):
    """Fetch ``url`` with ``requests.get``, retrying when the request fails.

    Parameters:
        url (str): Address to download.
        retries (int): Maximum number of attempts.
        backoff (int | float): Delay in seconds before the second attempt.
            The delay doubles after every further failed attempt
            (``backoff``, ``2 * backoff``, ``4 * backoff``, ...).

    Returns:
        The response of the first attempt that completes without an error
        status, or ``None`` when every attempt failed. Failures are printed,
        never raised.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
            return response
        except ConnectionError as e:
            print(f"ConnectionError on attempt {attempt + 1} for {url}: {e}")
        except HTTPError as e:
            print(f"HTTPError on attempt {attempt + 1} for {url}: {e}")
        except Timeout as e:
            print(f"Timeout on attempt {attempt + 1} for {url}: {e}")
        except RequestException as e:
            print(f"RequestException on attempt {attempt + 1} for {url}: {e}")
        # No sleep after the final attempt: there is nothing left to wait for
        if attempt < retries - 1:
            # Exponential backoff: double the wait after each failed attempt
            time.sleep(backoff * (2 ** attempt))
    print(f"Failed to fetch {url} after {retries} attempts.")
    return None

# Scrape one thread page into a (question, answer) pair
def scrape_question_and_answer(url):
    """Download a page and split its ``<div class="item text">`` blocks.

    The text of the first matching ``<div>`` is returned as the question; the
    texts of all later ones are joined with single spaces to form the answer.

    Parameters:
        url (str): Address of the page to scrape.

    Returns:
        tuple: ``(question, answer)``, or ``(None, None)`` when the page could
        not be fetched or contains no matching ``<div>``.
    """
    page = fetch_page_with_retries(url)
    if page is None:
        return None, None

    posts = BeautifulSoup(page.text, "html.parser").find_all("div", class_="item text")
    if not posts:
        print(f"No questions or answers found for {url}")
        return None, None

    # First block is the question; everything after it counts as answer text
    first_post, later_posts = posts[0], posts[1:]
    question_text = first_post.get_text(strip=True)
    answer_parts = [post.get_text(strip=True) for post in later_posts]
    return question_text, " ".join(answer_parts)

# Read all links from the file. Blank lines are skipped: an empty string is not
# a valid URL and would otherwise go through the whole retry/sleep cycle.
with open(links_file, "r", encoding="utf-8") as file:
    links = [line.strip() for line in file if line.strip()]

# Open the output CSV file for writing
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    # Write the header row
    writer.writerow(["Question", "Answer", "URL"])

    # Process each link; a row is written only when both texts are non-empty
    for link in links:
        print(f"Processing: {link}")
        question, answer = scrape_question_and_answer(link)
        if question and answer:
            writer.writerow([question, answer, link])

print(f"Scraping completed. Extracted data saved to {output_file}.")


In [5]:
import pandas as pd

def merge_and_deduplicate_csvs(file_paths, output_path):
    """
    Merge one or more CSV files into one and remove duplicate rows.

    Parameters:
        file_paths (list of str): Paths of the CSV files to merge. Any
            non-empty collection is accepted; a single path given as a plain
            string is treated as a one-element list.
        output_path (str): File path for the output merged and deduplicated CSV.

    Returns:
        None: Writes the merged and deduplicated DataFrame to the specified output path.

    Raises:
        ValueError: If no file path is provided.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    file_paths = list(file_paths)

    # Ensure the input is valid
    if not file_paths:
        raise ValueError("Please provide at least one CSV file path.")

    # Load and concatenate CSVs; ignore_index gives the result a fresh 0..n-1 index
    dataframes = [pd.read_csv(file) for file in file_paths]
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Remove rows that are identical in every column, keeping the first occurrence
    deduplicated_df = merged_df.drop_duplicates()

    # Save the deduplicated CSV without the index column
    deduplicated_df.to_csv(output_path, index=False)
    print(f"Merged and deduplicated CSV saved to {output_path}")
# Example usage: merge the unnumbered scrape output with runs 2 through 5.
file_paths = ["questions_and_answers.csv"] + [
    f"questions_and_answers{run}.csv" for run in range(2, 6)
]
output_path = "merged.csv"
merge_and_deduplicate_csvs(file_paths, output_path)


Merged and deduplicated CSV saved to merged.csv


In [None]:

# Base URL of the chat-completions service the notebook talks to
Baseurl = 'https://api.claudeshop.top'
# The API key is read from the environment so the secret never lives in the
# notebook. NOTE(review): the key that used to be hard-coded here should be
# treated as leaked and rotated.
Skey = os.environ.get("OPENAI_API_KEY", "")
if not Skey:
    print("Warning: OPENAI_API_KEY is not set; API requests are expected to fail authentication.")

# Headers sent with every API request
headers = {
   'Accept': 'application/json',
   'Authorization': f'Bearer {Skey}',
   'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
   'Content-Type': 'application/json'
}
# Function to call OpenAI API
def process_with_openai(question, answer, timeout=60):
    """Ask the chat-completions endpoint to clean up one scraped entry.

    Parameters:
        question (str): Raw question text taken from the scraped page.
        answer (str): Raw answer/discussion text taken from the scraped page.
        timeout (int | float): Seconds to wait for the HTTP request before
            giving up.

    Returns:
        str | None: The stripped ``content`` of the first choice's message, or
        ``None`` when the request, the status check, or the decoding of the
        response fails (the error is printed).
    """
    # Define the prompt for OpenAI. The doubled braces are literal { } in an f-string;
    # the structure shown to the model must itself be valid JSON.
    prompt = f"""

The following text is extracted from a website. It contains a question, answer, some discussions about the answer, and some irrelevant text. Your task is to:
1. Filter out irrelevant text.
2. Extract the question and options.
3. Extract the correct answer.
4. Based on the explanation and discussion, explain the correct answer.
Format the output as JSON with the following structure:
{{
    "question": "The question and its options in a clean format",
    "answer": "Explanation of the answer and the correct answer letter at the end"
}}
Here is the input text:
Question: {question}
Answer and Discussions: {answer}"""

    url = Baseurl + "/v1/chat/completions"
    payload = json.dumps({
        "model": "gpt-4o-2024-08-06",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant for processing educational text."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    })
    # Call the OpenAI API
    try:
        response = requests.post(url, headers=headers, data=payload, timeout=timeout)
        # Turn 4xx/5xx replies into an exception instead of a confusing KeyError below
        response.raise_for_status()

        # The Response object is not subscriptable; decode the JSON body first
        content = response.json()['choices'][0]['message']['content']
        return content.strip()
    except Exception as e:
        # Deliberately best-effort: one failed entry must not abort a batch run
        print(f"Error processing with OpenAI: {e}")
        return None


In [14]:

# Input CSV (columns Question, Answer, URL) and output JSON file
input_csv = "output.csv"
output_json = "head_preview.json"

# One dict per entry whose API response parsed as JSON
processed_data = []

with open(input_csv, "r", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        question = row["Question"]
        answer = row["Answer"]
        url = row["URL"]

        print(f"Processing question from URL: {url}")
        result = process_with_openai(question, answer)
        if not result:
            # Nothing usable came back for this row; move on to the next one
            continue

        try:
            result_json = json.loads(result)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON for {url}: {e}")
        else:
            # Keep the origin of each entry next to the cleaned text
            result_json["source_url"] = url
            processed_data.append(result_json)

# Write everything collected above in one go
with open(output_json, "w", encoding="utf-8") as jsonfile:
    json.dump(processed_data, jsonfile, indent=4, ensure_ascii=False)

print(f"Processing completed. Data saved to {output_json}.")


Processing question from URL: https://gmatclub.com/forum/before-the-printing-press-books-could-be-purchased-only-249446.html
Error processing with OpenAI: HTTPSConnectionPool(host='api.claudeshop.top', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
Processing question from URL: https://gmatclub.com/forum/bevex-an-artificial-sweetener-used-only-in-soft-drinks-is-141503.html
Error processing with OpenAI: HTTPSConnectionPool(host='api.claudeshop.top', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by ProxyError('Unable to connect to proxy', OSError('Tunnel connection failed: 403 Forbidden')))
Processing question from URL: https://gmatclub.com/forum/many-environmentalists-have-urged-environmental-awareness-249449.html
Error processing with OpenAI: HTTPSConnectionPool(host='api.claudeshop.top', port=443): Max retries exceeded with url: /v1/chat/com