In [1]:
import itertools
import string
import random
import json
import pandas as pd
# Raise the pandas display limits (up to 200 rows / 50 columns) for inspecting
# the generated sheets in cell output.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)

In [2]:
def generate_row_letters(num_strings: int, max_length: int = 2):
    """Yield spreadsheet-style row labels: ``A`` .. ``Z``, then ``AA``, ``AB``, ...

    Labels come out shortest-first and in alphabetical order within each
    length. Generation stops after ``num_strings`` labels, or earlier once every
    label of up to ``max_length`` letters has been produced.
    """
    every_label = itertools.chain.from_iterable(
        itertools.product(string.ascii_uppercase, repeat=width)
        for width in range(1, max_length + 1)
    )
    # islice rejects a negative stop; a non-positive request just yields nothing.
    for letters in itertools.islice(every_label, max(num_strings, 0)):
        yield "".join(letters)

In [9]:
def generate_plate(rows: int, cols: int):
    """Return a ``rows`` x ``cols`` NumPy array of random integers in ``[1, 100)``.

    Args:
        rows: Number of rows in the array.
        cols: Number of columns in the array.

    Returns:
        A 2-D integer ``numpy.ndarray`` of shape ``(rows, cols)``.
    """
    # Imported here because the notebook's import cell does not import NumPy;
    # without it the reference to ``np`` fails with NameError on a fresh kernel.
    import numpy as np

    return np.random.randint(1, 100, size=(rows, cols))

In [3]:
def _gen_filler(n_cols: int, has_empty: bool = True) -> list[float]:
    fill = []
    for _ in range(n_cols):
        if has_empty:
            fill.append(None)
        # bias towards blanks since most plates are surrounded by blanks
        r = random.randint(1, 6 if has_empty else 3)
        if r == 1:
            fill.append(random.random())
        elif r == 2:
            fill.append(random.randint(1, 1000))
        elif r == 3:
            res = ''.join(random.choices(string.ascii_lowercase + string.ascii_uppercase +
                             string.digits, k=random.randint(5, 50)))
            fill.append(res)
        else:
            fill.append(None)
    return fill
    

In [4]:
def create_nontabular_plate(
    n_rows: int,
    n_cols: int,
    offset: int,
    contents: str = 'Plate',
    n_filler: int = 10
) -> tuple[pd.DataFrame, dict]:
    """Build one synthetic sheet: a labelled plate embedded in random filler.

    Layout, top to bottom: 1..``n_filler`` junk rows, a header row carrying the
    column numbers ``1..n_cols``, ``n_rows`` plate rows (each with its row
    letter in the cell immediately left of the first well), then
    1..``n_filler`` junk rows. A random number (1..``n_filler``) of filler
    columns is placed on each side of the plate.

    Args:
        n_rows: Number of plate rows (labelled ``A``, ``B``, ...).
        n_cols: Number of plate columns (labelled ``1..n_cols``).
        offset: Value added to the reported row indices; the caller passes the
            number of rows already stacked above this sheet.
        contents: Label stored under ``'contents'`` in the location dict.
        n_filler: Upper bound for each random filler row/column count.

    Returns:
        ``(sheet, location)`` where ``sheet`` is the DataFrame and ``location``
        holds ``row_start``/``row_end``/``col_start``/``col_end`` (0-based
        positions, inclusive at both ends) of the well values, plus
        ``contents``. The header row and the row-letter column are outside that
        rectangle. The coordinates assume ``_gen_filler(k)`` contributes
        exactly ``k`` cells.
    """
    cols_before = random.randint(1, n_filler)
    cols_after = random.randint(1, n_filler)
    tot_cols = n_cols + cols_before + cols_after
    rows_before = random.randint(1, n_filler)
    rows_after = random.randint(1, n_filler)
    # Add one for the header
    row_start = rows_before + offset + 1
    results = {'row_start': row_start,
               'row_end': row_start + (n_rows - 1), # it's inclusive
               'col_start': cols_before,
               'col_end': cols_before + (n_cols - 1), # it's inclusive
               'contents': contents}
    letters = list(generate_row_letters(n_rows))

    # Generate junk rows before the header
    plate_rows = [_gen_filler(tot_cols) for _ in range(rows_before)]

    # Generate the header which is a monotonically increasing sequence
    plate_header = (
        _gen_filler(cols_before) + list(range(1, n_cols + 1)) + _gen_filler(cols_after)
    )
    plate_rows.append(plate_header)

    # Fill out the rest of the plate
    for row in range(n_rows):
        data_row = (
            # One filler cell fewer than the header: the row letter takes the
            # last slot before the wells.
            _gen_filler(cols_before - 1)
            + [letters[row]]
            # This is the plate contents
            + _gen_filler(n_cols, has_empty=False)
            + _gen_filler(cols_after)
        )
        plate_rows.append(data_row)

    # Generate junk rows after the plate
    plate_rows.extend(_gen_filler(tot_cols) for _ in range(rows_after))
    return pd.DataFrame(plate_rows), results

In [5]:
def create_plates(
    n_plates: int,
    n_rows: int,
    n_cols: int,
    return_dict: bool = False,
) -> tuple[pd.DataFrame | dict[str, pd.DataFrame], list[dict]]:
    """Generate ``n_plates`` synthetic sheets and the location of each plate.

    Args:
        n_plates: Number of sheets to generate.
        n_rows: Plate rows per sheet.
        n_cols: Plate columns per sheet.
        return_dict: When false (default) the sheets are stacked vertically
            into one DataFrame; when true they are returned separately, keyed
            ``"plate_0"``, ``"plate_1"``, ...

    Returns:
        ``(sheets, results)``. ``results`` has one location dict per plate, in
        generation order. Row indices in each dict are positions in the stacked
        frame when ``return_dict`` is false, and positions within that plate's
        own frame when ``return_dict`` is true.
    """
    plates: list[pd.DataFrame] = []
    results: list[dict] = []
    offset = 0
    for _ in range(n_plates):
        plate_df, result = create_nontabular_plate(n_rows, n_cols, offset)
        plates.append(plate_df)
        results.append(result)
        if not return_dict:
            # Only stacked output shifts later plates downwards; separate
            # frames each start at row 0, so their coordinates stay local.
            offset += plate_df.shape[0]

    if return_dict:
        return {f"plate_{i}": plate_data for i, plate_data in enumerate(plates)}, results
    if not plates:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return pd.DataFrame(), results
    return pd.concat(plates, ignore_index=True), results

In [6]:
def _get_plate_from_result_dict(df: pd.DataFrame, result_dict: dict) -> pd.DataFrame:
    row_start, row_end = result_dict['row_start'], result_dict['row_end'] + 1
    col_start, col_end = result_dict['col_start'], result_dict['col_end'] + 1
    proposed_plate = df.iloc[
        row_start:row_end,
        col_start:col_end,
    ]
    return proposed_plate

In [10]:
# Smoke check: build two stacked 4x6 plates, print their reported locations,
# then slice the first plate back out of the combined frame.
n_plates = 2
plates_df, results = create_plates(n_plates, n_rows=4, n_cols=6)
print(results)
_get_plate_from_result_dict(df=plates_df, result_dict=results[0])

[{'row_start': 8, 'row_end': 11, 'col_start': 4, 'col_end': 9, 'contents': 'Plate'}, {'row_start': 19, 'row_end': 22, 'col_start': 7, 'col_end': 12, 'contents': 'Plate'}]


  return {f"plate_{i}": plate_data for i, plate_data in enumerate(plates)} if return_dict else pd.concat(plates, ignore_index=True), results


Unnamed: 0,4,5,6,7,8,9
8,,,A,1,956,F1ajilfsxvnnuhyaDTRaqcHtEy4So1KFKtJe3k5KL53IF6pJ
9,,,B,574,RQGhiToKkXE5NaNOtwv7qKEb38iaOX,584
10,,872.0,C,402,0.701383,250
11,,,D,348,0.023827,m7ruCPPjbSjVL3L


In [11]:
# Display the combined frame produced by the cell above.
plates_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,,XgW1yG2JziKl,,0.831536,,,,950,,,,0.080083,,0.972764,,,,,,0.438795,,UVIidz9GEzObkBTZ3Q5VwtiD,,,,,,,,,,
1,,,,IRIBOFhzALkofBWwqpJ2vYlBITfPGg2wmQUbisMiKvo4Z,,881,,,,,,,,,,,,,,,,Kz7y1Bo1kSxXpYs6OC8tE55AcAJFAx1xWlCEa6oAYSO7G9,,,,,,,,,,
2,,,,0.139817,,578,,,,5qsRyFt2ICMTLUxk,,0.440834,,233.0,,,,0.665498,,,,,,,,0.236294,,,,,,
3,,955,,,,,,vvo6c8d3uRKjMqfVVGoP7Re0m6Pby1,,,,820,,,,,,628,,72,,0.737257,,0.509196,,BGmdzi4MBDB9BYyYCalKAwwPzUJkwB5iMZt3u3nJM0r9wkQK,,,,,,
4,,,,,,106,,0.266888,,,,478,,,,904,,79,,,,0.611219,,278,,VGnvKB2ObtB6ewjY0mMVbkjWhyVNxO,,,,,,
5,,,,,,,,,,2U41GGmCM,,8mBBWFVIFrsB9S179QlBLFVVwpgR3CgSjnc,,0.613069,,0.209509,,NXZqzJyEUe,,,,615,,VXearSrtExKHMjTTxhqvTQaGil2AGVzPwufIsM3DgRsxX,,0.720037,,,,,,
6,,,,664,,,,,,0.67897,,,,0.461165,,,,,,mRuUQW0wzjuCV,,528,,wH7ssTVjc,,,,,,,,
7,,,,0.841097,,24,,,1,2,3,4,5,6.0,,44,,HQ1xIQxv1rZDjuXRZQZ,,,,,,,,,,,,,,
8,,8ndV05sYjJdGxdqDP4zqiwj7ucMij,,0.930407,,,A,1,956,F1ajilfsxvnnuhyaDTRaqcHtEy4So1KFKtJe3k5KL53IF6pJ,480,464,27,,0.275999,,958.0,,,,,,,,,,,,,,,
9,,,,,,,B,574,RQGhiToKkXE5NaNOtwv7qKEb38iaOX,584,0.94756,315,0.261854,,,,571.0,,,,,,,,,,,,,,,


In [103]:
# One entry per plate geometry: (number of examples, plate rows, plate columns).
# The largest geometry gets fewer examples than the others.
eval_specs = [
    (100, 4, 6),
    (100, 8, 12),
    (100, 16, 24),
    (10, 32, 48),
]

evals = []
for n_examples, spec_rows, spec_cols in eval_specs:
    for _ in range(n_examples):
        # Each example stacks between 1 and 10 plates of the same geometry.
        plates_df, results = create_plates(random.randint(1, 10), spec_rows, spec_cols)
        # The CSV is written without index or header so that line k of the
        # text corresponds to row position k in the reported coordinates.
        evals.append({"input": plates_df.to_csv(index=False, header=False), "output": results})

In [108]:
# Persist every generated example as one JSON array.
with open('gen_evals.json', mode='w') as f:
    json.dump(evals, fp=f)

In [114]:
# Display whichever frame is currently bound to plates_df.
plates_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
0,,0lGZsTtZqRq1YGyEHYGFhR0DW7bIfUNSxJlbjCPHQLfa11KwC,,,,,,,,,,0.121591,,966,,,,112,,,,0.983377,,608.0,,,,,,gZO1PtE0nKJ3lwzDGw,,,,
1,,0.2955,,l5sIozlzJfnOQ67xrmUzpiBVZoB7TD,,0.269455,,,,,,129,,kVtQ5VBE49u,,,,,1,2,3,4,5,6.0,,,,,,,,,,
2,,722,,,,i6AstG22s8Cb88ktNduWfu2O4ZSMtT18ySfhFlK0d9t,,,,1z3LMVH9UTx5OxbazLSJl8pgxjLLspp7zWZPrJwAf02Nnshd7,,0.029341,,,,,A,0.020699,0.776257,oaqsnXhXqqFyvPybUFaapUftTdEroY303ahWMKxYBIZ,TryYzSOqwPejSMmr4DMJrEhoO55nbsTJ3SY,0.861957,lfBqcQsv2oMWoJCzLygxKIedwyQ7iXT1juIUDJThZdsB3r...,,kmzohBqQcxtPHkJmhODI6Ov2joDNmBMY,,,,,,,,,
3,,0.702402,,,,,,YTOxGPEGlSH7KlONI6uSzZyVj7PdJ3eVlf9fFALXmRgUdzcU,,,,268,,0.209087,,0.4656,B,20us17NGMB5AHUn72EXfiEXupxTMXtyUL5xDUg3MKuYID,70,D28qisjvRiONwW9nKhGuQoHmgHxVppiLJhgg,0.683478,907,0.827364,,,,,,,,,,,
4,,,,,,,,,,,,,,478,,,C,41,xhHtGvH9bRmofTl6PT4uRMUdZpqCUrHl7nib72o,8,0.013126,0.337339,948,,,,,,,,,,,
5,,0.15484,,,,,,,,945,,538,,KUk8wQH99CzjAR2t94GxwRMwLiXB7WRZhQDk29i7HBxm0i...,,0.224276,D,340,66,Fv78o91,e32TAsxqeqtjGXrxl9cQc5,NE73bDert9O6LTTieu0pDwOJtYlPjJ78z1A,cHuOBrgAI7jgxAusDWMMUlOx5tRa9KRz4Vem,,,,,,,,,,,
6,,,,,,0.531886,,,,0.342786,,,,,,0.696387,,DVg1fdLMN3vu,,,,151,,0.521799,,308.0,,,,,,,,
7,,0.045334,,,,,,,,,,bYGUODKj0l,,,,546.0,,,,IfBupSHirZ,,,,0.12874,,0.161239,,0.758832,,,,k98otDxWQYlNmX7Fr1uVsWi8y,,
8,,0.439353,,,,,,148,,,,0.665736,,0.716956,,,,975,,,,158,,411.0,,,,,,,,,,
9,,ooMs0vYObIIXBsYwkp,,0.904097,,eyODFr,,,,,,,,,,,,337,,252,,,,,,,,,,,,,,a6EurjfKbqRwNAsTHOw6LlU6yEoFL26LXPVofBm0
