In [1]:
from bs4 import BeautifulSoup
import requests
from pathlib import Path

In [2]:
# Probe one reading-comprehension page to confirm the site answers before crawling.
base_url = 'https://www.cracklsat.net/lsat/reading-comprehension/'
response = requests.get(base_url + 'test1.html')
response

<Response [200]>

In [9]:
# Download reading-comprehension test pages 1..39 into the current directory.
# Pages that cannot be fetched are reported and skipped so the crawl keeps going.
test_files = []  # not appended to in this cell
for i in range(1, 40):
    try:
        # Without a timeout a single stalled connection blocks the loop forever.
        response = requests.get(
            f'https://www.cracklsat.net/lsat/reading-comprehension/test-{i}.html',
            timeout=30,
        )
    except requests.RequestException as exc:
        # Treat connection-level errors like a non-200 reply: report and move on.
        print(f"Failed to get test-{i}: {exc}")
        continue
    if response.status_code != 200:
        print(f"Failed to get test-{i}")
        continue
    # Raw bytes are stored untouched so the page encoding is preserved on disk.
    with open(f'test-{i}.html', 'wb') as f:
        f.write(response.content)
    print(f"Saved test-{i}.html")

Failed to get test-1
Saved test-2.html
Saved test-3.html
Saved test-4.html
Saved test-5.html
Saved test-6.html
Failed to get test-7
Failed to get test-8
Saved test-9.html
Saved test-10.html
Saved test-11.html
Saved test-12.html
Saved test-13.html
Saved test-14.html
Saved test-15.html
Saved test-16.html
Saved test-17.html
Saved test-18.html
Saved test-19.html
Saved test-20.html
Saved test-21.html
Saved test-22.html
Saved test-23.html
Saved test-24.html
Saved test-25.html
Saved test-26.html
Saved test-27.html
Saved test-28.html
Saved test-29.html
Saved test-30.html
Saved test-31.html
Saved test-32.html
Saved test-33.html
Saved test-34.html
Saved test-35.html
Saved test-36.html
Saved test-37.html
Saved test-38.html
Saved test-39.html


Question API

In [5]:
class Question:
    """A scraped question together with its answer choices.

    The constructor stores its arguments unchanged:
    ``question`` (text), ``answer_choices`` (sequence of strings used by
    ``__repr__``) and ``correct_answer`` (optional, defaults to ``None``).
    """

    def __init__(self, question, answer_choices, correct_answer=None):
        self.question, self.answer_choices, self.correct_answer = (
            question,
            answer_choices,
            correct_answer,
        )

    def __repr__(self):
        """Render the question line followed by one ``Answer ...`` line per choice."""
        # The leading "Answer " is emitted even when there are no choices.
        joined_choices = '\nAnswer '.join(self.answer_choices)
        return f"Question {self.question}\nAnswer {joined_choices}"


In [3]:
from typing import Literal
from dataclasses import dataclass

@dataclass
class Explanation:
    """One parsed question record: stimulus, prompt, answers, key and explanation.

    ``__repr__`` is overridden to give a compact multi-line summary in which the
    stimulus and prompt are cut to their first 80 characters.
    """

    question_number: int  # numeric id of the question
    stimulus: str  # passage text; may be '' when none was found
    prompt: str  # the question asked about the stimulus
    answers: list[str]  # answer-choice strings
    correct_answer: Literal['A', 'B', 'C', 'D', 'E']  # letter of the keyed answer
    explanation: str  # explanation text

    def __repr__(self):
        """Return a labelled, line-per-field summary of the record."""
        summary_lines = [
            f"Question Number: {self.question_number}",
            f"Stimulus: {self.stimulus[:80]}",
            f"Prompt: {self.prompt[:80]}",
            f"Answers: {self.answers}",
            f"Correct Answer: {self.correct_answer}",
            f"Explanation: {self.explanation}",
        ]
        return "\n".join(summary_lines)

Example Question format in HTML:
```html
<p class=q>
    <b>1.</b> Tempest: I bought two urns from an auction at Christie’s. Christie’s advertised them as dating from the Louis XV period of the late 18th century. I now believe that they actually date from the late 19th century and are worth much less than I paid for them. Several antiques experts agree with me. Therefore, Christie’s advertised them falsely and should refund my purchase price.</p>
    <p>Which one of the following, if true, most seriously weakens Tempest’s argument?</p>
<div class="radio narrow"><label><input type="radio" name="1" value="A">A. The auction catalog described the urns as "A pair of Louis XV porphyry and gilt-bronze two-handled vases."</label></div>
<div class="radio narrow"><label><input type="radio" name="1" value="B">B. Dating antiques is an imprecise art, and often, several experts disagree about the date of origin of the same item.</label></div>
<div class="radio narrow"><label><input type="radio" name="1" value="C">C. Scientists have performed tests on the bronze linings of the urns, but they have not produced conclusive results regarding the vases' ages.</label></div>
<div class="radio narrow"><label><input type="radio" name="1" value="D">D. Christie's states in its auction catalogs that all buyers should consult outside specialists before bidding on antiques, especially in the case of extremely valuable items.</label></div>
<div class="radio narrow"><label><input type="radio" name="1" value="E">E. The experts who Tempest hired to date the urns are known as some of the best in their field.</label></div>
```

In [4]:
# List the saved test pages, ordered by the integer after the first '-' in the file stem.
test_files = sorted(
    Path('data/logical-reasoning_tests').glob('*.html'),
    key=lambda page_path: int(page_path.stem.split('-')[1]),
)
test_files

[]

In [148]:
def extract_questions(test_files):
    """Parse saved test pages into ``Question`` objects.

    Each ``<p class="q">`` element supplies the question text. Its answer
    choices are the text of the parent element of every radio ``<input>``
    whose ``name`` attribute equals the running question counter.

    The counter starts at 1 and keeps increasing across all files, so this
    assumes the radio-group names are numbered consecutively over the files
    in the order given — TODO confirm against the real pages.

    Args:
        test_files: iterable of paths to the saved HTML test pages.

    Returns:
        list of ``Question`` in document order; ``correct_answer`` is left unset.
    """
    questions = []
    question_n = 1
    for test_file in test_files:
        # Name the encoding so parsing does not depend on the platform locale.
        # The pages contain non-ASCII punctuation; UTF-8 is presumed — confirm.
        with open(test_file, encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # A distinct loop name avoids rebinding the list that is being iterated.
        for question_element in soup.find_all('p', class_='q'):
            answers = soup.find_all('input', attrs={'name': str(question_n), 'type': 'radio'})
            question_n += 1
            # The choice text is on the element wrapping each <input>.
            answer_choices = [answer.parent.text for answer in answers]
            questions.append(Question(question_element.text, answer_choices))
    return questions

In [108]:
# Parse every listed test page into Question objects and display the result.
questions = extract_questions(test_files)
questions

[Question 1. Tempest: I bought two urns from an auction at Christie’s. Christie’s advertised them as dating from the Louis XV period of the late 18th century. I now believe that they actually date from the late 19th century and are worth much less than I paid for them. Several antiques experts agree with me. Therefore, Christie’s advertised them falsely and should refund my purchase price.
 Answer A. The auction catalog described the urns as "A pair of Louis XV porphyry and gilt-bronze two-handled vases."
 Answer B. Dating antiques is an imprecise art, and often, several experts disagree about the date of origin of the same item.
 Answer C. Scientists have performed tests on the bronze linings of the urns, but they have not produced conclusive results regarding the vases' ages.
 Answer D. Christie's states in its auction catalogs that all buyers should consult outside specialists before bidding on antiques, especially in the case of extremely valuable items.
 Answer E. The experts who 

Get explanations

In [8]:
# Probe the first answer-and-explanation page before crawling the rest.
# Note: this rebinds base_url to the logical-reasoning section.
base_url = 'https://www.cracklsat.net/lsat/logical-reasoning/'
response = requests.get(base_url + 'question-1-answer-and-explanation.html')
response

<Response [200]>

In [10]:
# Download answer-and-explanation pages 1..498 into data/lr-questions-explanations/.
# Pages that cannot be fetched are reported and skipped so the crawl keeps going.
explanation_files = []  # not appended to in this cell
output_dir = Path('data/lr-questions-explanations')
# Create the target directory up front; open() would otherwise fail on a fresh checkout.
output_dir.mkdir(parents=True, exist_ok=True)
for i in range(1, 499):
    try:
        # Without a timeout a single stalled connection blocks the loop forever.
        response = requests.get(
            f'{base_url}question-{i}-answer-and-explanation.html',
            timeout=30,
        )
    except requests.RequestException as exc:
        # Treat connection-level errors like a non-200 reply: report and move on.
        print(f"Failed to get question-{i}: {exc}")
        continue
    if response.status_code != 200:
        # The message names the resource being fetched (questions, not tests).
        print(f"Failed to get question-{i}")
        continue
    # Raw bytes are stored untouched so the page encoding is preserved on disk.
    with open(output_dir / f'question-{i}.html', 'wb') as f:
        f.write(response.content)
    print(f"Saved question-{i}.html")

Saved question-1.html
Saved question-2.html
Saved question-3.html
Saved question-4.html
Saved question-5.html
Saved question-6.html
Saved question-7.html
Saved question-8.html
Saved question-9.html
Saved question-10.html
Saved question-11.html
Saved question-12.html
Saved question-13.html
Saved question-14.html
Saved question-15.html
Saved question-16.html
Saved question-17.html
Saved question-18.html
Saved question-19.html
Saved question-20.html
Saved question-21.html
Saved question-22.html
Saved question-23.html
Saved question-24.html
Saved question-25.html
Saved question-26.html
Saved question-27.html
Saved question-28.html
Saved question-29.html
Saved question-30.html
Saved question-31.html
Saved question-32.html
Saved question-33.html
Saved question-34.html
Saved question-35.html
Saved question-36.html
Saved question-37.html
Saved question-38.html
Saved question-39.html
Saved question-40.html
Saved question-41.html
Saved question-42.html
Saved question-43.html
Saved question-44.ht

In [11]:
def extract_explanations(explanation_files):
    """Parse saved answer-and-explanation pages into ``Explanation`` records.

    For every file the parser looks inside ``section.content-wrap`` →
    ``div.mcontent`` and uses the ``ul.qlist`` element as an anchor:

    * question number: second whitespace-separated token of the first ``<p>``;
    * stimulus / prompt: taken from the ``<p>`` siblings preceding the list.
      With fewer than three such paragraphs the last one holds both, and the
      text after its final ``'. '`` is treated as the prompt;
    * answers: text of the first five ``<li>`` elements;
    * correct answer: last token of the ``<p>`` following the list;
    * explanation: text of the last ``<p>`` inside the content block.

    Text up to the first ``'. '`` of the stimulus is dropped (presumably a
    leading number or label — confirm against the pages).

    Args:
        explanation_files: iterable of paths to the saved HTML pages.

    Returns:
        list of ``Explanation``, one per input file, in input order. Records
        for which no stimulus could be found carry ``stimulus == ''``.
    """
    explanations = []
    for explanation_file in explanation_files:
        # Name the encoding so parsing does not depend on the platform locale.
        # The pages contain non-ASCII punctuation; UTF-8 is presumed — confirm.
        with open(explanation_file, encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        content_wrap = soup.find('section', class_='content-wrap')
        content = content_wrap.find('div', class_='mcontent', recursive=True)
        question_number = int(content.find('p').text.split()[1])
        qlist = content.find('ul', class_='qlist')
        # If there are 2 or fewer <p> before the qlist, then there is no separate prompt element
        if len(qlist.find_previous_siblings('p')) < 3:
            print(f"Question {question_number} has no prompt element")
            stimulus_paragraph = qlist.find_previous_sibling('p').text
            # rpartition never fails to unpack: without a '. ' the whole
            # paragraph becomes the prompt and the stimulus is empty.
            stimulus, _, prompt = stimulus_paragraph.rpartition('. ')
            if len(stimulus) < 5:
                # Too short to be real text (e.g. only the leading number).
                print(f"Warning: No stimulus found for question {question_number}")
                stimulus = ''
            else:
                # NOTE(review): still raises IndexError if the text has no '. '.
                stimulus = stimulus.split('. ', maxsplit=1)[1]
        else:
            prompt_item = qlist.find_previous_sibling('p')
            prompt = prompt_item.text
            # find_previous_siblings yields nearest-first; restore document order.
            stimulus_items = prompt_item.find_previous_siblings('p')
            stimulus_items.reverse()
            if not stimulus_items:
                print(f"Warning: No stimulus found for question {question_number}")
                stimulus = ''
            else:
                stimulus = ' '.join([item.text for item in stimulus_items])
                # NOTE(review): still raises IndexError if the text has no '. '.
                stimulus = stimulus.split('. ', maxsplit=1)[1]
        # EXCEPT: prompts have the prompt in the last sentence of the stimulus.
        # Only move that sentence when there is one to move; an empty or
        # single-sentence stimulus used to raise ValueError on unpacking here.
        if 'EXCEPT' in prompt and '. ' in stimulus:
            stimulus, prompt = stimulus.rsplit('. ', 1)
            prompt = f"{prompt} EXCEPT:"
        answers = [item.text for item in content.find_all('li')[:5]]
        correct_answer = qlist.find_next_sibling('p').text.split()[-1]
        explanation = content.find_all('p', recursive=True)[-1].text
        explanations.append(Explanation(question_number, stimulus, prompt, answers, correct_answer, explanation))
    return explanations


In [12]:
# List the saved explanation pages, ordered by the integer after the first '-' in the file stem.
explanation_files = sorted(
    Path('data/lr-questions-explanations').glob('*.html'),
    key=lambda page_path: int(page_path.stem.split('-')[1]),
)
explanation_files

[PosixPath('data/lr-questions-explanations/question-1.html'),
 PosixPath('data/lr-questions-explanations/question-2.html'),
 PosixPath('data/lr-questions-explanations/question-3.html'),
 PosixPath('data/lr-questions-explanations/question-4.html'),
 PosixPath('data/lr-questions-explanations/question-5.html'),
 PosixPath('data/lr-questions-explanations/question-6.html'),
 PosixPath('data/lr-questions-explanations/question-7.html'),
 PosixPath('data/lr-questions-explanations/question-8.html'),
 PosixPath('data/lr-questions-explanations/question-9.html'),
 PosixPath('data/lr-questions-explanations/question-10.html'),
 PosixPath('data/lr-questions-explanations/question-11.html'),
 PosixPath('data/lr-questions-explanations/question-12.html'),
 PosixPath('data/lr-questions-explanations/question-13.html'),
 PosixPath('data/lr-questions-explanations/question-14.html'),
 PosixPath('data/lr-questions-explanations/question-15.html'),
 PosixPath('data/lr-questions-explanations/question-16.html'),
 

In [13]:
# Parse every saved explanation page; the parser prints a notice for each page
# that has no separate prompt element.
explanations = extract_explanations(explanation_files)

Question 102 has no prompt element
Question 103 has no prompt element
Question 106 has no prompt element
Question 107 has no prompt element
Question 108 has no prompt element
Question 109 has no prompt element
Question 110 has no prompt element
Question 111 has no prompt element
Question 112 has no prompt element
Question 113 has no prompt element
Question 114 has no prompt element
Question 115 has no prompt element
Question 116 has no prompt element
Question 117 has no prompt element
Question 118 has no prompt element
Question 123 has no prompt element
Question 124 has no prompt element
Question 126 has no prompt element
Question 127 has no prompt element
Question 128 has no prompt element
Question 129 has no prompt element
Question 130 has no prompt element
Question 131 has no prompt element
Question 132 has no prompt element
Question 133 has no prompt element
Question 134 has no prompt element
Question 135 has no prompt element
Question 137 has no prompt element
Question 138 has no 

In [14]:
# Spot-check a single parsed record (list index 118).
explanations[118]

Question Number: 119
Stimulus: Giant Market Gains Advertising Representative: Salon Harperbegan advertising wit
Prompt: Each of the following, if true, would weaken the advertising representative's co
Answers: ["A. Noah's Ark pet shop advertises with Giant Market Gains and has never matched the monthly sales of Salon Isabella.", 'B. Giant Market Gains only serves a small portion of Meinhart County.', 'C. Last month was a holiday month, when business is typically higher than average.', 'D. Zane, an internationally famous stylist, joined the staff of Salon Harperlast month.', 'E. Last month, Salon Harper offered a discount on their spa packages.']
Correct Answer: A
Explanation: C ., D ., and E . Each of these weakens by suggesting a possible alternate cause of the increase in sales.

In [15]:
# Keep only the records whose stimulus is non-empty
explanations = list(filter(lambda record: record.stimulus, explanations))

Write explanation objects to jsonl file

In [76]:
import json


def write_explanations_to_jsonl(explanations, output_file):
    """Write one JSON object per line, one line per explanation record.

    Args:
        explanations: iterable of objects whose instance ``__dict__`` is
            JSON-serialisable (e.g. ``Explanation`` records).
        output_file: path of the file to create or overwrite.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be fixed to UTF-8 rather than left to the platform default,
    # which may be unable to encode them.
    with open(output_file, 'w', encoding='utf-8') as f:
        for explanation in explanations:
            f.write(json.dumps(vars(explanation), ensure_ascii=False) + '\n')

In [194]:
# Write the explanation records to a JSON Lines file under data/.
write_explanations_to_jsonl(explanations, 'data/lr_explanations.jsonl')

Write explanation objects to tsv file

In [16]:
def write_explanations_to_tsv(explanations, output_file):
    """Write explanation records as a tab-separated file with a header row.

    Columns: question_number, stimulus, prompt, A, B, C, D, E, correct_answer,
    explanation. Every record produces exactly one line:

    * tabs, line feeds and carriage returns inside a field are replaced by a
      space, since they would otherwise shift columns or split the row;
    * records with fewer than five answers are padded with empty cells so the
      later columns stay aligned with the header.

    Args:
        explanations: iterable of records exposing ``question_number``,
            ``stimulus``, ``prompt``, ``answers``, ``correct_answer`` and
            ``explanation``.
        output_file: path of the file to create or overwrite.
    """
    unsafe_chars = {ord('\t'): ' ', ord('\n'): ' ', ord('\r'): ' '}

    def _cell(value):
        # Stringify the value and neutralise characters that break the TSV layout.
        return str(value).translate(unsafe_chars)

    with open(output_file, 'w') as f:
        # Write header
        f.write('question_number\tstimulus\tprompt\tA\tB\tC\tD\tE\tcorrect_answer\texplanation\n')
        for explanation in explanations:
            answers = list(explanation.answers)
            answers += [''] * (5 - len(answers))  # no-op when five or more are present
            fields = [
                explanation.question_number,
                explanation.stimulus,
                explanation.prompt,
                *answers,
                explanation.correct_answer,
                explanation.explanation,
            ]
            # Joining outside an f-string keeps this valid on Python < 3.12,
            # where a backslash inside an f-string expression is a syntax error.
            f.write('\t'.join(_cell(field) for field in fields) + '\n')

In [18]:
# Export the explanation records as a tab-separated file under data/.
write_explanations_to_tsv(explanations, 'data/lsat_questions.tsv')

Deduplicate rows that are identical except for the question number

In [19]:
def dedupelicate_rows(input_file, output_file):
    """Copy a TSV file, dropping rows that repeat an earlier row.

    Two rows count as the same when every tab-separated column after the first
    matches, compared after stripping surrounding whitespace from the line.
    The first occurrence is written unchanged; later ones are skipped.

    Args:
        input_file: path of the TSV file to read.
        output_file: path of the file to create or overwrite.
    """
    emitted_keys = set()
    with open(input_file, 'r') as source, open(output_file, 'w') as target:
        for raw_line in source:
            # The first column (question number) is ignored for comparison.
            _first, *remaining = raw_line.strip().split('\t')
            key = tuple(remaining)
            if key in emitted_keys:
                continue
            emitted_keys.add(key)
            target.write(raw_line)

In [20]:
# Drop repeated rows from the TSV export, writing the result to a separate file.
dedupelicate_rows('data/lsat_questions.tsv', 'data/lsat_questions_deduped.tsv')