In [1]:
import json
import pandas as pd

In [2]:
from bs4 import BeautifulSoup
from utils import initial_cleanup

### Load the dataset

In [3]:
dataset = pd.read_csv('difficulty_dataset_generated.csv')

In [4]:
dataset.shape

(1438982, 10)

In [5]:
dataset.columns

Index(['qb_id', 'student_class', 'qb_topic', 'qb_subtopic', 'subject_code',
       'question_type', 'difficulty_level', 'qns_content', 'options',
       'difficulty_percentile'],
      dtype='object')

In [6]:
qns_content_and_options = dataset[['qns_content', 'options']]

In [7]:
qns_content_and_options.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qns_content_and_options.drop_duplicates(inplace=True)


In [8]:
qns_content_and_options.shape

(807480, 2)

In [9]:
qns_content_and_options.head()

Unnamed: 0,qns_content,options
0,"""If&nbsp;<img alt=\""\\vec c = \\vec a - \\vec ...","[""c = a &ndash; b"",""c &gt; a; c &gt; b"",""c = a..."
1,"""Three forces are acting on a body to make res...","[""3N, 3N, 7N"",""10N, 8N, 2N"",""3N, 3N, 6N"",""6N, ..."
2,"""<div style=\""text-align:justify\"">A particle ...","[""<img alt=\""\\frac{{19}}{3}m\"" src=\""16619317..."
3,"""A boat is moving in direction of vector <img ...","[""<img alt=\"" - 8\\hat i - 6\\hat j\"" src=\""15..."
5,"""Acceleration v\/s time graph for a particle m...","[""t = 0, 1, 2, 3, 4"",""t = 0, 2, 4"",""t = 1, 3"",..."


In [10]:
row = qns_content_and_options.iloc[0]
qns_content, qns_options = row['qns_content'], row['options']
print(qns_content)
print(qns_options)

"If&nbsp;<img alt=\"\\vec c = \\vec a - \\vec b\" src=\"1501045229lqPVHqk7ni.png\" \/>, which of the following relations among magnitudes cannot be achieved by any choice of&nbsp;<img alt=\"\\vec a\" src=\"1501045229oznIjIJLIn.png\" \/>&nbsp;and&nbsp;<img alt=\"\\vec b\" src=\"1501045229R4rF9t0EaK.png\" \/>?"
["c = a &ndash; b","c &gt; a; c &gt; b","c = a + b","c &lt; a &ndash; b"]


### Preparing dataset for batch api

In [11]:
question_description_prompt = """You are an expert in the Class 11 and Class 12 syllabus for competitive exams in India. Given a question extract the attributes of the question that make it difficult. If there are images in the question or options they will be provided. Respond only in a json in the format below.
{
  'sentence_complexity' : 1-question is phrased in simple language, 2-question is moderately complex, 3-question phrasing is intentionally complex,
  'keywords'       : [list of keywords in the question],
  'related_topics' : [list of related topics to the question],
  'number_of_greek_and_math_symbols_in_question' : number of greek and math symbols in question,
  'number_of_greek_and_math_symbols_in_options' : number of mathematical equations in the options,
  'number_of_graphs_in_question' : number of graphs in question,
  'number_of_graphs_in_options' : number of graphs in options,
  'multi_step_reasoning_required' : yes/no,
  'options_similarity' : 1(options are different from each other), 2(options are moderately different), 3(options are extremely confusing),
  'conceptual_depth_required' : ['recall', 'application', 'analysis', 'synthesis', 'evaluation'],
  'complex_calculation_required' : yes/no,
  'reasoning_type' : direct/indirect,
  'question_image_description' : describe the image in the question. if it contains equations return latex format output. empty if no image,
  'option_image_description' : [list of descriptions of images in the options, if it contains equations return latex format output. empty if no image]
}"""

In [12]:
def get_image_urls(html_string):
    html_string = initial_cleanup(html_string)
    index = html_string.find('<div')
    if not(index == 0 or index == 1):
        html_string = '<div>' + html_string + '</div>'
    
    soup = BeautifulSoup(html_string, 'html.parser')
    image_tags = soup.find_all('img')
    
    # Only include URLs where the img tag doesn't have an alt attribute
    img_urls = [img['src'] for img in image_tags if not img.has_attr('alt')]
    return img_urls

### Test image urls code

In [13]:
print(qns_content)
print(get_image_urls(qns_content))

"If&nbsp;<img alt=\"\\vec c = \\vec a - \\vec b\" src=\"1501045229lqPVHqk7ni.png\" \/>, which of the following relations among magnitudes cannot be achieved by any choice of&nbsp;<img alt=\"\\vec a\" src=\"1501045229oznIjIJLIn.png\" \/>&nbsp;and&nbsp;<img alt=\"\\vec b\" src=\"1501045229R4rF9t0EaK.png\" \/>?"
[]


In [14]:
def options_image_urls(options):
    options = eval(options)
    img_urls = []
    for option in options:
        img_urls.extend(get_image_urls(option))
    return img_urls

In [15]:
print(qns_options)
print(options_image_urls(qns_options))

["c = a &ndash; b","c &gt; a; c &gt; b","c = a + b","c &lt; a &ndash; b"]
[]


In [16]:
def get_urls(question, options):
    img_urls = []
    img_urls.extend(get_image_urls(question))
    img_urls.extend(options_image_urls(options))
    return img_urls

In [17]:
def attach_prefix(img_urls, prefix='https://d2lbh14zkcqlst.cloudfront.net/content_data/content_images/'):
    urls = []
    for img_url in img_urls:
        if prefix in img_url:
            urls.append(img_url)
        else:
            urls.append(prefix + img_url)
    return urls

### Test the attach_prefix code

In [18]:
img_urls = ['1673608527588314775phplV0tBhimage.png', '16736085481010609036.png', '1673608552290044040.png']
attach_prefix(img_urls)

['https://d2lbh14zkcqlst.cloudfront.net/content_data/content_images/1673608527588314775phplV0tBhimage.png',
 'https://d2lbh14zkcqlst.cloudfront.net/content_data/content_images/16736085481010609036.png',
 'https://d2lbh14zkcqlst.cloudfront.net/content_data/content_images/1673608552290044040.png']

In [19]:
img_urls = ['https://d2lbh14zkcqlst.cloudfront.net/content_data/content_images/1673608527588314775phplV0tBhimage.png', '16736085481010609036.png', '1673608552290044040.png']
attach_prefix(img_urls)

['https://d2lbh14zkcqlst.cloudfront.net/content_data/content_images/1673608527588314775phplV0tBhimage.png',
 'https://d2lbh14zkcqlst.cloudfront.net/content_data/content_images/16736085481010609036.png',
 'https://d2lbh14zkcqlst.cloudfront.net/content_data/content_images/1673608552290044040.png']

### Create batch dataset

In [20]:
num_questions = qns_content_and_options.shape[0]
batch_size = 49500
num_batches = 1 + (num_questions // batch_size)
print(num_batches)

17


In [21]:
num_urls = 0
for batch in range(num_batches):
    tasks = []
    start = batch * batch_size
    end = (batch + 1) * batch_size
    batch_questions = qns_content_and_options[start:end]
    index = 0
    for idx, row in batch_questions.iterrows():
        question = row['qns_content']
        options = row['options']
        img_urls = get_urls(question, options)
        img_urls = attach_prefix(img_urls)
        num_urls += len(img_urls)
        content = [
                    {
                        "type": "text",
                        "text": f'question_content: {question}\noptions: {options}'
                    }
                ]
        d = []
        for img_url in img_urls:
            d.append({'type' : 'image_url', 'image_url' : {'url' : img_url, 'detail' : 'low'}})
        if d:
            content.extend(d) 
        task = {
            "custom_id": f"task-{batch+1}-{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                    "model" :"gpt-4o-mini",
                    "temperature" : 0.1,
                    "max_tokens" : 500,
                    "response_format" : { 
                    "type": "json_object"
                    },
                    "messages" :[
                    {
                        "role": "system",
                        "content": question_description_prompt
                    },
                    {
                        "role": "user",
                        "content": content
                    }
                ],
            }
        }
        tasks.append(task)
        index += 1

    # Creating the file
    file_name = f"data_for_extraction/batch_tasks_dn_batch_{batch+1}.jsonl"

    with open(file_name, "w") as file:
        for obj in tasks:
            file.write(json.dumps(obj) + "\n")
    print("Done creating tasks. Saved to file")

Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file
Done creating tasks. Saved to file


In [22]:
num_urls

276649