In [1]:
# Imports
import signal
import numpy as np
import pandas as pd
from sympy import *
from tqdm import tqdm

from type1 import *
from type2 import *
from type4 import * # type4 already imports type3
from verify import *
from deduplication import *

## Generate Polynomial Question and Answer Data Samples

In [2]:
# Number of desired problems to generate for each type:
n_problems = 200

eval_path = '../../data/train/small_batch_examples/' 
data_path = '../../data/train/final_csvs/' 

### Type 1: Symbolic Nondimensionalisation

Note: deduplication is included in `nondimensionalize_polynomial1` function. These solutions do not require rounding of numerical values. 

In [3]:
# Eval problems for comparison
type_1_eval_path = 'polynomial_type_1_100.xlsx'

# Create empty dataframe
df = pd.DataFrame(columns=['question', 'solution', 
                           'question_type', 'answer_type',
                           'extracted_answer',
                           'small_eval_point', 'small_analytical', 'small_numerical',
                           'large_eval_point', 'large_analytical', 'large_numerical'])

# Function to generate type 1 problems and add to df
def generate_and_add_problem(df):
    n1, n2 = generate_n1n2(max_degree=25)
    res = nondimensionalize_polynomial1(n1, n2, type_1_eval_path)
    if res is not None:
        question, answer, question_type, answer_type, extracted_answer = res
        row_df = pd.DataFrame({'question': [question],
                               'solution': [answer],
                               'question_type': [question_type],
                               'answer_type': [answer_type],
                               'extracted_answer': [extracted_answer],
                               'small_eval_point': [None],
                               'small_analytical': [None],
                               'small_numerical': [None],
                               'large_eval_point': [None],
                               'large_analytical': [None],
                               'large_numerical': [None]})
        df = pd.concat([df, row_df], ignore_index=True)
    return df


# Generate n_problems
while len(df) < n_problems:
    df = generate_and_add_problem(df)


# Save to csv
# df.to_csv(data_path + f'polynomial_type_1_{n_problems}.csv', index=False)
df.to_csv(f'polynomial_type_1_{n_problems}.csv', index=False)

### Type 2: Numeric Nondimensionalisation

Note: rounding already implemented in function.

In [5]:
# Eval problems for comparison
type_2_eval_path = 'polynomial_type_2_100.xlsx'
type_2_data = pd.read_excel(type_2_eval_path)

# Create empty dataframe
df = pd.DataFrame(columns=['question', 'solution', 
                           'question_type', 'answer_type',
                           'extracted_answer',
                           'small_eval_point', 'small_analytical', 'small_numerical',
                           'large_eval_point', 'large_analytical', 'large_numerical'])

# Function to generate type 2 questions and add to dataframe
def generate_and_add_problem(df):
    while True:
        coefficients = generate_polynomial(max_degree=10, num_terms=3, coeff_bounds=[-10, 10])
        new_poly_latex = latex(sympy_polynomial_from_coefficients(coefficients))

        if q_exists(new_poly_latex, type_2_data):
            print('Question already exists in eval set. Generating again.')
            continue
          
        question, answer, question_type, answer_type, extracted_answer = nondimensionalize_polynomial2(coefficients)
        row_df = pd.DataFrame({'question': [question],
                               'solution': [answer],
                               'question_type': [question_type],
                               'answer_type': [answer_type],
                               'extracted_answer': [extracted_answer],
                               'small_eval_point': [None],
                               'small_analytical': [None],
                               'small_numerical': [None],
                               'large_eval_point': [None],
                               'large_analytical': [None],
                               'large_numerical': [None]})
        df = pd.concat([df, row_df], ignore_index=True)
        break
    
    return df


# Generate n_problems
while len(df) < n_problems:
    df = generate_and_add_problem(df)


# Save to csv
# df.to_csv(data_path + f'polynomial_type_2_{n_problems}.csv', index=False)
df.to_csv(f'polynomial_type_2_{n_problems}.csv', index=False)

### Type 3: Root-finding

In [7]:
# check against existing set:
type_3_eval_path = eval_path + 'polynomial_type_3_100.xlsx'
type_3_data = pd.read_excel(type_3_eval_path)

# Create empty dataframe
df = pd.DataFrame(columns=['question', 'solution', 
                           'question_type', 'answer_type',
                           'extracted_answer',
                           'small_eval_point', 'small_analytical', 'small_numerical',
                           'large_eval_point', 'large_analytical', 'large_numerical'])


# Function to generate type 3 questions and add them to a df
def generate_and_add_problem(df):
    while True:
        polynomial = generate_nondimensionalized(10)  # Example function to generate polynomial
        
        if q_exists(latex(polynomial), type_3_data):
            #print('Question already exists in eval set. Generating again.')
            continue
        
        soln_info, q, a, q_type, a_type, extracted_answer = solve_roots(polynomial)  # Example function to solve roots
        
        # Check validity of roots
        good_roots, comparisons = check_roots(soln_info, polynomial, x_var=x, eps_var=eps, corr=False)  # Example function to check roots
        
        if good_roots:
            # Convert complex numbers in comparisons to np.complex128
            small_analytical = [[np.complex128(complex(x, y)) for x, y in comparisons[0][1]]]
            large_analytical = [[np.complex128(complex(x, y)) for x, y in comparisons[1][1]]]
            
            # Add row for the new question and answer to the dataframe
            row_df = pd.DataFrame({'question': [q],
                                   'solution': [a],
                                   'question_type': [q_type],
                                   'answer_type': [a_type],
                                   'extracted_answer': [extracted_answer],
                                   'small_eval_point': [0.01],
                                   'small_analytical': [small_analytical],
                                   'small_numerical': [comparisons[0][0]],
                                   'large_eval_point': [100],
                                   'large_analytical': [large_analytical],
                                   'large_numerical': [comparisons[1][0]]})
            df = pd.concat([df, row_df], ignore_index=True)
            break
        else:
            # print(f"Approximation for polynomial {polynomial} not good enough. Trying again.")
            pass
    
    return df

# Generate n_problems
while len(df) < n_problems:
    df = generate_and_add_problem(df)

  df = pd.concat([df, row_df], ignore_index=True)


In [8]:
print(len(df))

150


In [9]:
for i in range(len(df)): 
    df.loc[i,'solution']=round_numbers_in_string(df.loc[i, 'solution'])
    df.loc[i,'extracted_answer']=round_numbers_in_string(df.loc[i,'extracted_answer'])

In [11]:
# Save dataframe to csv
df.to_csv(data_path + f'polynomial_type_3_{len(df)}.csv', index=False)

### Type 4: Root-finding with Correction Terms

Note: there is currently no Type 4 evaluation set to check against, so we don't need to worry about duplicate problems. 

In [10]:
TIMEOUT = 5 # seconds

def timeout(func, args=(), kwargs={}, timeout_duration=TIMEOUT):
    def handler(signum, frame):
        raise TimeoutError("Function timed out")

    # Set up the signal handler for SIGALRM
    old_handler = signal.signal(signal.SIGALRM, handler)
    try:
        # Start the alarm clock
        signal.alarm(timeout_duration)
        try:
            # Execute the function with specified arguments and keyword arguments
            result = func(*args, **kwargs)
        except Exception as e:
            # Catch any exception raised by the function
            result = e
    finally:
        # Restore the original signal handler
        signal.signal(signal.SIGALRM, old_handler)
        # Cancel the alarm clock
        signal.alarm(0)

    # Check if a TimeoutError was raised
    if isinstance(result, TimeoutError):
        raise TimeoutError(f"Function execution exceeded {timeout_duration} seconds")
    else:
        return result

In [11]:
# Create empty dataframe
df = pd.DataFrame(columns=['question', 'solution', 
                            'question_type', 'answer_type',
                            'extracted_answer',
                            'small_eval_point', 'small_analytical', 'small_numerical',
                            'large_eval_point', 'large_analytical', 'large_numerical'])

# Initialize tqdm progress bar
pbar = tqdm(total=n_problems)

while len(df) < n_problems:
    try:
        # Problem type 3
        polynomial = generate_nondimensionalized(10)
        soln_info, q, a, q_type, a_type, extracted_answer3 = solve_roots(polynomial)

        # Problem type 4
        x, eps = symbols('x epsilon')
        order = random.choice([1, 2])
        soln_info, newq, newa, new_q_type, new_a_type, extracted_answer4 = timeout(
            get_delta_corrections,
            args=(soln_info, polynomial, a, x),
            kwargs={'eps_var': eps, 'term_trunc': order + 1}
        )

        # Check validity of roots and corrections (3 and 4)
        good_roots, good_corr, comparisons = check_roots(
            soln_info,
            polynomial,
            x_var=x, eps_var=eps
        )

        if good_roots and good_corr:
            # Add row for the new question and answer to the dataframe
            row_df = pd.DataFrame({
                'question': [newq],
                'solution': [newa],
                'question_type': [new_q_type],
                'answer_type': [new_a_type],
                'extracted_answer': [extracted_answer4],
                'small_eval_point': [0.01],
                'small_analytical': [[np.complex128(complex(x, y)) for x, y in comparisons[0][1]]],
                'small_numerical': [comparisons[0][0]],
                'large_eval_point': [100],
                'large_analytical': [[np.complex128(complex(x, y)) for x, y in comparisons[1][1]]],
                'large_numerical': [comparisons[1][0]]
            })

            # Ensure the row is not empty before concatenating
            if not row_df.isnull().all().all():
                df = pd.concat([df, row_df], ignore_index=True)
                pbar.update(1)  # Update progress bar
        else:
            print(f"Approximation for polynomial {polynomial} not good enough. Trying again.")

    except TimeoutError:
        print("Timeout occurred. Trying again.")
        continue  
    except Exception as e:
        print(f"An error occurred: {e}. Trying again.")
        continue  

pbar.close() 

 18%|█▊        | 27/150 [04:55<22:24, 10.93s/it]
  df = pd.concat([df, row_df], ignore_index=True)


Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Approximation for polynomial epsilon*x**6 + x - 1 not good enough. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.




Timeout occurred. Trying again.
Timeout occurred. Trying again.




Timeout occurred. Trying again.


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1029d1400>>
Traceback (most recent call last):
  File "/Users/smart/Library/Python/3.9/lib/python/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
  File "/var/folders/88/dz760h_j197bwmcvghz9mrkh0000gn/T/ipykernel_66559/1606000727.py", line 5, in handler
TimeoutError: Function timed out


KeyboardInterrupt: 

In [12]:
print(len(df))

77


In [13]:
for i in range(len(df)): 
    df.loc[i,'solution']=round_numbers_in_string(df.loc[i, 'solution'])
    df.loc[i,'extracted_answer']=round_numbers_in_string(df.loc[i,'extracted_answer'])

In [15]:
for i in range(len(df)): 
    df.loc[i,'solution']=round_numbers_in_string(df.loc[i, 'solution'])
    df.loc[i,'extracted_answer']=round_numbers_in_string(df.loc[i,'extracted_answer'])

In [17]:
# Save dataframe to csv file
df.to_csv(data_path + f'polynomial_type_4_{len(df)}.csv', index=False)