In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def _parse_question_block(question_block):
    """Build one question record from a question ``<article>`` element.

    Returns a dict with the keys "Question", "opt1".."opt5" and
    "correct_answer", or None when the block has no ``div.question-main``.
    Missing options and a missing answer are stored as None.
    """
    question_tag = question_block.find('div', class_='question-main')
    if not question_tag:
        return None

    # The answer is taken as the last space-separated token of the <strong>
    # text inside the answer container.
    correct_option = None
    answer_container = question_block.find('div', class_='answer_container')
    if answer_container:
        answer_text = answer_container.find('strong')
        if answer_text:
            correct_option = answer_text.text.strip().split(" ")[-1]

    # An option is a <p> holding at least two <label>s: the first is used as
    # the option letter, the second as the option text.
    options = []
    for paragraph in question_block.find_all('p'):
        labels = paragraph.find_all('label')
        if len(labels) >= 2:
            options.append(f"{labels[0].text.strip()}{labels[1].text.strip()}")

    record = {"Question": question_tag.text.strip()}
    for index in range(5):
        record[f"opt{index + 1}"] = options[index] if index < len(options) else None
    record["correct_answer"] = correct_option
    return record


def scrape_questions(url_template, num_sections, num_pages, timeout=30):
    """Scrape multiple-choice questions from a paginated set of pages.

    Args:
        url_template: ``str.format`` template with two positional
            placeholders, filled with the section number and page number.
        num_sections: Number of sections to visit (1..num_sections).
        num_pages: Number of pages to visit per section (1..num_pages).
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the scrape forever.

    Returns:
        A list of dicts, one per question found, with the keys "Question",
        "opt1".."opt5" and "correct_answer".

    Pages whose request fails (including timeouts and HTTP error statuses)
    are reported on stdout and skipped; they do not abort the scrape.
    """
    all_questions = []

    for section in range(1, num_sections + 1):
        for page in range(1, num_pages + 1):
            url = url_template.format(section, page)
            print(f"Scraping section {section}, page {page}...")

            # Keep the try body limited to the network call so that parsing
            # errors are not mistaken for request failures.
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Request failed for section {section}, page {page}: {e}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Get all question blocks
            question_blocks = soup.find_all(
                'article',
                class_='question single-question question-type-normal',
            )

            for question_block in question_blocks:
                question_data = _parse_question_block(question_block)
                if question_data is not None:
                    all_questions.append(question_data)

    return all_questions


# Scrape configuration: the template's two placeholders receive the section
# number and the page number, in that order.
url_template = 'https://www.examveda.com/competitive-english/practice-mcq-question-on-ordering-of-sentences/?section={}&page={}'
num_sections = 3
num_pages = 10

output_file = 'verbal_dataset.csv'
scraped_questions = scrape_questions(url_template, num_sections, num_pages)
df_new = pd.DataFrame(scraped_questions)

if os.path.exists(output_file):
    df_existing = pd.read_csv(output_file)

    # An empty scrape produces a DataFrame with no columns at all, so
    # indexing 'Question' would raise KeyError; only filter when there is
    # something to filter.
    if not df_new.empty:
        # Drop scraped rows whose question text is already in the saved file.
        existing_questions_set = set(df_existing['Question'])
        df_new = df_new[~df_new['Question'].isin(existing_questions_set)]

    df_combined = pd.concat([df_existing, df_new], ignore_index=True) if not df_new.empty else df_existing
else:
    df_combined = df_new

total_questions = len(df_combined)
print(f"Total questions in dataset: {total_questions}")
if not df_combined.empty:
    df_combined.to_csv(output_file, index=False)
    print(f"Questions saved to '{output_file}'")
else:
    print("No data to save.")
