In [1]:
import PyPDF2
import re

In [2]:
# Open the swim psych sheet PDF; the extraction cells below iterate over reader.pages.
# NOTE: `reader` is rebound to the diving PDF later in this notebook, so re-run this
# cell before re-running any of the swim extraction cells.
reader = PyPDF2.PdfReader('./c1 psych.pdf')

In [3]:
# Problem: text extraction can fuse two swimmers' entries into a single line
# (the first entry's place number runs straight into the next school name).
reader.pages[1].extract_text().split('\n')[61]

'Central Cape Gir 1:58.61  SO Kennedy Ringwald 7St. Francis Borg 1:59.16  SO Addison Pfeiffer 8'

In [4]:
def split_merged_line(line):
    """Separate two swimmer entries that were extracted as one line.

    A line is treated as merged only when it contains at least two seed times
    (``m:ss.hh`` or ``ss.hh``). The cut is made at the first place where a run
    of digits is immediately followed by an uppercase letter, i.e. where one
    entry's place number touches the next entry's school name.

    Returns a list with the two stripped halves, or ``[line]`` unchanged when
    the line holds fewer than two times or no digit/uppercase boundary exists.
    """
    time_pattern = r'(\d{1,2}:\d{2}\.\d{2}|\d{2}\.\d{2})'

    # Fewer than two seed times: a normal single-entry line.
    if len(re.findall(time_pattern, line)) < 2:
        return [line]

    boundary = re.search(r'(\d+)([A-Z])', line)
    if boundary is None:
        return [line]

    # Cut just before the uppercase letter so the digits stay with the first entry.
    cut = boundary.start(2)
    first_entry, second_entry = line[:cut], line[cut:]
    return [first_entry.strip(), second_entry.strip()]

In [5]:
# Sanity check: the fused line from the psych sheet should come back as two entries.
split_merged_line('Central Cape Gir 1:58.61  SO Kennedy Ringwald 7St. Francis Borg 1:59.16  SO Addison Pfeiffer 8')

['Central Cape Gir 1:58.61  SO Kennedy Ringwald 7',
 'St. Francis Borg 1:59.16  SO Addison Pfeiffer 8']

In [6]:
# Extract every individual (non-relay) entry from the psych sheet into a
# tab-separated text file: one "EVENT NAME: ..." line per event, followed by
# school / time / grade / name / place rows.
output_file = 'psych_res_individual.txt'

with open(output_file, 'w') as f:
    event_name = ""

    for page in reader.pages:
        lines = page.extract_text().split('\n')

        for line in lines:
            # Event header for an individual event; relays are processed in a later cell.
            is_individual_header = (
                all(word in line for word in ("Event", "Girls", "Yard"))
                and "Relay" not in line
            )
            if is_individual_header:
                # Headers of the form "Event N ...(name)" are reduced to the text
                # inside the parentheses; any other header is kept as-is.
                new_name = re.sub(r'Event\s+\d+\s+\.+\((.+?)\)', r'\1', line).strip()
                new_name = " ".join(new_name.split())
                # Substring test (not equality): a reduced header is contained in the
                # full name already recorded, which marks a repeated page header.
                if new_name not in event_name:
                    event_name = new_name
                    f.write(f"EVENT NAME: {event_name}\n")
                continue

            # A line may hold two fused entries; handle each one separately.
            for record in split_merged_line(line):
                match = re.match(r'(.+?)\s+(\d{1,2}:\d{2}\.\d{2}|\d{2}\.\d{2})\s+(\w{2})\s+([A-Za-z-]+(?:\s+[A-Za-z-]+)*)\s+(\d+)', record)
                if not match:
                    continue  # not a swimmer entry

                school, time, grade, name, place = match.groups()
                f.write(f"{school}\t{time}\t{grade}\t{name}\t{place}\n")

print(f"Extraction complete. Data saved to {output_file}")

Extraction complete. Data saved to psych_res_individual.txt


In [7]:
# Another problem: in relay sections the last swimmer's grade can be fused with
# the next team's seed time (e.g. "SO1:59.35").
reader.pages[0].extract_text().split('\n')[60]

'7) Claire Weber SO 8) India Browning SO1:59.35 Parkway Central 12'

In [8]:
def split_merged_relay_line(line):
    """Separate a swimmer line from the next relay team's line when they are fused.

    The fusion point is two letters immediately followed by an ``m:ss.hh`` time;
    the cut is made right before the time.

    Returns the two stripped halves, or ``[line]`` unchanged when no such
    letters-then-time sequence is present.
    """
    fused = re.search(r'([A-Za-z][A-Za-z])(\d{1,2}:\d{2}\.\d{2})', line)
    if fused is None:
        return [line]

    # Group 2 is the seed time, which starts the next team's record.
    cut = fused.start(2)
    swimmers_part, team_part = line[:cut], line[cut:]
    return [swimmers_part.strip(), team_part.strip()]

In [9]:
# Sanity check: the fused relay line should split right before the seed time.
split_merged_relay_line('7) Claire Weber SO 8) India Browning SO1:59.35 Parkway Central 12')

['7) Claire Weber SO 8) India Browning SO', '1:59.35 Parkway Central 12']

In [10]:
# Extract relay entries from the psych sheet into a text file. Every event gets
# an "EVENT NAME: ..." line; within relay events each team is written as a
# "time<TAB>team<TAB>place" line followed by one "name<TAB>grade" line per swimmer.
output_file = 'psych_res_relays.txt'

with open(output_file, 'w') as f:
    event_name = ""
    relay_mode = False
    # Lines of the team currently being assembled (team line first, then its
    # swimmers). An empty list means no team is pending.
    current_relay = []

    for page in reader.pages:
        lines = page.extract_text().split('\n')

        for line in lines:
            if "Event" in line and "Girls" in line and "Yard" in line:
                # Headers of the form "Event N ...(name)" are reduced to the text
                # inside the parentheses; any other header is kept as-is.
                new_name = re.sub(r'Event\s+\d+\s+\.+\((.+?)\)', r'\1', line).strip()
                new_name = " ".join(new_name.split())
                # Substring test (not equality): a reduced header is contained in the
                # full name already recorded, which marks a repeated page header.
                if new_name in event_name:
                    continue

                # Flush the previous event's last team BEFORE writing the new header;
                # otherwise it would be emitted under a later event.
                if current_relay:
                    f.write("\n".join(current_relay) + "\n")
                    current_relay = []

                event_name = new_name
                f.write(f"EVENT NAME: {event_name}\n")
                relay_mode = "Relay" in event_name
                continue

            if not relay_mode:
                continue

            # A swimmer line may be fused with the next team's line.
            for record in split_merged_relay_line(line):
                relay_match = re.match(r'(\d{1,2}:\d{2}\.\d{2})\s+(.+?)\s+(\d+)', record)

                if relay_match:
                    # A new team starts: write out the one collected so far.
                    if current_relay:
                        f.write("\n".join(current_relay) + "\n")

                    seed_time, team, place = relay_match.groups()
                    current_relay = [f"{seed_time}\t{team}\t{place}"]
                    continue

                swimmer_match = re.findall(r'(\d\))\s+([A-Za-z-]+(?:\s+[A-Za-z-]+)*)\s+([A-Z]{2})', record)
                # Only attach swimmers when a team line has been seen for this event.
                if swimmer_match and current_relay:
                    for _, name, grade in swimmer_match:
                        current_relay.append(f"{name}\t{grade}")

    # Write the final team of the last relay event.
    if current_relay:
        f.write("\n".join(current_relay) + "\n")

print(f"Extraction complete. Data saved to {output_file}")

Extraction complete. Data saved to psych_res_relays.txt


In [11]:
# read individual events
# Builds i_events: event name -> list of entry dicts, in file order.
i_events = {}
with open("./psych_res_individual.txt", "r") as times:
    lines = times.readlines()
    curr_evnt = ""

    for l in lines:
        # Strip the newline explicitly so a file without a trailing newline
        # does not lose the last character of its final line.
        row = l.rstrip("\n")

        if "EVENT NAME" in row:
            curr_evnt = row[len("EVENT NAME: "):]
            i_events[curr_evnt] = []
            continue

        as_list = row.split('\t')
        # str.split never returns an empty list, so check the field count instead:
        # blank or incomplete rows are skipped rather than raising IndexError.
        if len(as_list) < 5:
            continue

        i_events[curr_evnt].append({
            "team": as_list[0], "time": as_list[1], "grade": as_list[2], "name": as_list[3],
            "place": int(as_list[4])})

In [32]:
# Spot-check: first ten parsed entries of one individual event.
i_events['Event 11 Girls 100 Yard Breaststroke'][0:10]

[{'team': 'Visitation Acade',
  'time': '1:05.35',
  'grade': 'SR',
  'name': 'Ashlyn Canale',
  'place': 1},
 {'team': 'Villa Duchesne',
  'time': '1:06.10',
  'grade': 'JR',
  'name': 'Charlotte Brown',
  'place': 2},
 {'team': "St. Joseph's Aca",
  'time': '1:06.52',
  'grade': 'FR',
  'name': 'Payton Robic',
  'place': 3},
 {'team': "St. Joseph's Aca",
  'time': '1:07.38',
  'grade': 'JR',
  'name': 'Vienna Schindler',
  'place': 4},
 {'team': 'John Burroughs',
  'time': '1:07.48',
  'grade': 'SR',
  'name': 'Jacqueline Hu',
  'place': 5},
 {'team': 'Ursuline Academy',
  'time': '1:07.48',
  'grade': 'SO',
  'name': 'Kennedy Chrun',
  'place': 6},
 {'team': 'St. Charles West',
  'time': '1:07.49',
  'grade': 'SR',
  'name': 'Stella Garrison',
  'place': 7},
 {'team': 'Notre Dame de Si',
  'time': '1:09.02',
  'grade': 'SO',
  'name': 'Finley Glennon',
  'place': 8},
 {'team': 'Parkway Central',
  'time': '1:09.57',
  'grade': 'JR',
  'name': 'Serena Huang',
  'place': 9},
 {'team':

In [13]:
# read relay events
# Builds r_events: event name -> list of relay team dicts, in file order.
r_events = {}
# NOTE(review): this reads "psych_res_relays_test.txt", but the extraction cell
# above writes "psych_res_relays.txt". Presumably the "_test" file is a manually
# adjusted copy; confirm, because it is not produced by this notebook and a
# fresh Restart & Run All depends on it already existing.
with open("./psych_res_relays_test.txt", "r") as times:
    lines = times.readlines()
    curr_evnt = ""
    for l in lines:
        # Strip the newline explicitly so a file without a trailing newline
        # does not lose the last character of its final line.
        row = l.rstrip("\n")

        if "EVENT NAME" in row:
            curr_evnt = row[len("EVENT NAME: "):]
            r_events[curr_evnt] = []
            continue

        as_list = row.split('\t')
        # Team rows start with the seed time (a digit); swimmer rows start with a
        # name and are ignored here. Rows with too few fields are skipped.
        if len(as_list) >= 3 and as_list[0][:1].isdigit():
            r_events[curr_evnt].append({
                # Drop the leading "Event N Girls" words from the event name.
                "event": " ".join(curr_evnt.split()[3:]), "time": as_list[0], "team": as_list[1], "place": int(as_list[2])})

In [33]:
# Spot-check: first ten parsed teams of one relay event.
r_events['Event 1 Girls 200 Yard Medley Relay'][0:10]

[{'event': '200 Yard Medley Relay',
  'time': '1:46.01',
  'team': "St. Joseph's Aca",
  'place': 1},
 {'event': '200 Yard Medley Relay',
  'time': '1:51.63',
  'team': 'Pembroke Hill',
  'place': 2},
 {'event': '200 Yard Medley Relay',
  'time': '1:52.04',
  'team': 'Visitation Acade',
  'place': 3},
 {'event': '200 Yard Medley Relay',
  'time': '1:53.36',
  'team': 'Villa Duchesne',
  'place': 4},
 {'event': '200 Yard Medley Relay',
  'time': '1:53.81',
  'team': 'Clayton',
  'place': 5},
 {'event': '200 Yard Medley Relay',
  'time': '1:54.14',
  'team': 'John Burroughs',
  'place': 6},
 {'event': '200 Yard Medley Relay',
  'time': '1:55.81',
  'team': 'Parkway West',
  'place': 7},
 {'event': '200 Yard Medley Relay',
  'time': '1:55.95',
  'team': 'Parkway Central',
  'place': 8},
 {'event': '200 Yard Medley Relay',
  'time': '1:56.25',
  'team': 'West Plains',
  'place': 9},
 {'event': '200 Yard Medley Relay',
  'time': '1:58.16',
  'team': 'Monett',
  'place': 10}]

In [15]:
# score
# Points are awarded by position in each event's list (first entry gets the
# first value of the table), not by the stored 'place' field. Only as many
# entries as the table has values can score.
school_scores = {}
school_events = {}
person_scores = {}
person_events = {}

scoring = [20,17,16,15,14,13,12,11,9,7,6,5,4,3,2,1]
relay_scoring = [40,34,32,30,28,26,24,22,18,14,12,10,8,6,4,2]

# Individual events credit both the school and the swimmer (keyed by name).
for entries in i_events.values():
    for points, info in zip(scoring, entries):
        team = info["team"]
        swimmer = info["name"]

        school_scores[team] = school_scores.get(team, 0) + points
        school_events.setdefault(team, []).append(info)

        person_scores[swimmer] = person_scores.get(swimmer, 0) + points
        person_events.setdefault(swimmer, []).append(info)

# Relays credit the school only, using the relay point table.
for entries in r_events.values():
    for points, info in zip(relay_scoring, entries):
        team = info["team"]
        school_scores[team] = school_scores.get(team, 0) + points
        school_events.setdefault(team, []).append(info)

In [16]:
# Ten highest-scoring schools from the swim events, best first.
top_10 = sorted(school_scores.items(), key=lambda entry: entry[1], reverse=True)[:10]

for rank, (school, points) in enumerate(top_10, start=1):
    print(f"{rank}: {points},\t{school}")

1: 301,	St. Joseph's Aca
2: 171,	Central Cape Gir
3: 163,	Parkway Central
4: 160,	Pembroke Hill
5: 144,	Clayton
6: 126,	Visitation Acade
7: 117,	Villa Duchesne
8: 96,	St. Teresa's Aca
9: 93,	West Plains
10: 89,	Father Tolton Re


In [17]:
# Five highest individual point totals. Swimmers are keyed by name only, so two
# swimmers with the same name would be counted as one person.
sorted(person_scores.items(), key=lambda x:x[1], reverse=True)[0:5]

[('Anna-Grace Guenther', 40),
 ('Charlotte Brown', 37),
 ('Helena Tietjen', 37),
 ('Sydney Ringwald', 37),
 ('Ashlyn Canale', 37)]

In [18]:
# List every scoring entry (individual and relay) credited to one school.
for e in school_events['Parkway Central']:
    print(e)

{'team': 'Parkway Central', 'time': '25.04', 'grade': 'JR', 'name': 'Cecilia Tremont', 'place': 8}
{'team': 'Parkway Central', 'time': '25.43', 'grade': 'SO', 'name': 'Kenadie Johnson', 'place': 12}
{'team': 'Parkway Central', 'time': '57.71', 'grade': 'JR', 'name': 'Reese Anderson', 'place': 3}
{'team': 'Parkway Central', 'time': '59.02', 'grade': 'JR', 'name': 'Serena Huang', 'place': 5}
{'team': 'Parkway Central', 'time': '54.71', 'grade': 'JR', 'name': 'Cecilia Tremont', 'place': 6}
{'team': 'Parkway Central', 'time': '56.89', 'grade': 'SO', 'name': 'Gabrielle Salmans', 'place': 16}
{'team': 'Parkway Central', 'time': '5:12.36', 'grade': 'JR', 'name': 'Reese Anderson', 'place': 3}
{'team': 'Parkway Central', 'time': '1:09.57', 'grade': 'JR', 'name': 'Serena Huang', 'place': 9}
{'event': '200 Yard Medley Relay', 'time': '1:55.95', 'team': 'Parkway Central', 'place': 8}
{'event': '200 Yard Freestyle Relay', 'time': '1:41.69', 'team': 'Parkway Central', 'place': 5}
{'event': '400 Yard

In [19]:
# time for diving!
# NOTE: this rebinds `reader` from the swim psych sheet to the district diving
# results; cells that read the swim PDF must not be re-run after this point
# without first re-opening that PDF.

reader = PyPDF2.PdfReader('./c1 district diving.pdf')

In [20]:
# Count the whitespace-separated tokens on one result line of the first diving page.
len(reader.pages[0].extract_text().split('\n')[5].split())

8

In [21]:
# Dump every extracted line of the second diving page to see its layout.
reader.pages[1].extract_text().split('\n')

['2025 C1D2 MSHSAA Girls District Championship',
 'Event Results',
 'Girls Diving 11 Dives',
 'Score Diver Name Rank Order Team Affiliation Pts',
 '111421.00 Eiley Minich Parkway West',
 '210370.60 Sarah Fields Parkway Central',
 '3 9320.10 Gabriella Pupillo Parkway West',
 '4 5301.40 Abigail Hirsch Ft. Zumwalt South',
 '5 8295.15 Eliza Loveless Parkway West',
 '6 7284.00 Natalie Bruins Parkway Central',
 '7 2277.85 Samantha Walden Ft. Zumwalt East',
 '8 4272.85 Shelbey Hoagland Ft. Zumwalt South',
 '9 6271.10 Catherine Bowman Parkway West',
 '10 3241.30 Caitlyn Gagnepain St. Dominic',
 '11 1194.35 Savannah Richards Lutheran/Veritas',
 'Mary Institute and Country Day School 2/15/2025 1:06:40 PM Page 1 of 1']

In [22]:
def breakup_long_diver_string(input_str):
    """Parse a diving result line that ends with three decimal scores.

    Expected layout: place, whitespace, diving order fused with the first name,
    last name, school (free text), then three scores with two decimals each.

    Returns ``[place, diving_order, diver_name, school, prelims, semis, finals]``
    with ints for the first two items and floats for the scores, or ``None``
    when the line does not fit this layout.
    """
    parsed = re.match(
        r'^(\d+)\s+(\d+)([A-Za-z]+)\s+([A-Za-z]+)\s+(.*?)\s+(\d+\.\d{2})\s+(\d+\.\d{2})\s+(\d+\.\d{2})$',
        input_str,
    )
    if parsed is None:
        return None

    place, diving_order, first_name, last_name, school, *round_scores = parsed.groups()
    prelims, semis, finals = (float(value) for value in round_scores)

    return [
        int(place),
        int(diving_order),
        first_name + " " + last_name,
        school,
        prelims,
        semis,
        finals,
    ]

In [23]:
def breakup_short_diver_string(input_str):
    """Parse a diving result line that carries a single total score.

    The line starts with place and diving order, either separated by whitespace
    or fused together (single-digit place), immediately followed by a score of
    the form ``ddd.dd``, then a two-word diver name and the school.

    Returns ``[place, diving_order, score, diver_name, school]`` or ``None``
    when the line does not fit this layout.
    """
    pattern = r'^(?:(\d+)\s+(\d{1,2})|(\d)(\d{1,2}))(\d{3}\.\d{2})\s+([A-Za-z]+\s[A-Za-z]+)\s+(.*)$'
    found = re.match(pattern, input_str)
    if found is None:
        return None

    (spaced_place, spaced_order,
     fused_place, fused_order,
     score, diver_name, school) = found.groups()

    # Exactly one of the two alternatives (spaced vs fused) has captured values.
    if spaced_place is not None:
        place, diving_order = int(spaced_place), int(spaced_order)
    else:
        place, diving_order = int(fused_place), int(fused_order)

    return [place, diving_order, float(score), diver_name, school]

In [24]:
def can_convert_to_float(value):
    """Return True when ``float(value)`` succeeds, False on ValueError or TypeError."""
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True

In [25]:
# Parse every district diving page into a flat list of
# {'score', 'school', 'name'} dicts. Lines ending in a number use the
# three-score ("long") layout; all others use the single-score ("short") layout.
all_district_res = []

for page in reader.pages:
    lines = page.extract_text().split('\n')

    for i, line in enumerate(lines):
        # Skip the four lines at the top of each page and the last line
        # (headers and footer in the page dump shown earlier).
        if i<4 or i==len(lines)-1:
            continue

        # Hyphens are dropped so hyphenated names fit the letters-only name patterns.
        line = line.replace('-', '')
        split_line = line.split()
        if not split_line:
            continue  # blank line: nothing to parse (would otherwise raise IndexError)

        if can_convert_to_float(split_line[-1]):
            res = breakup_long_diver_string(line)
            if not res:
                if 'and DIVING' not in line:
                    print(line, 'long')
                    continue
                # The school name wrapped onto this line: rejoin it with the
                # previous line (place, order, name, start of school) and retry.
                # Hyphens are stripped from the joined text too, since the
                # previous line is taken from the raw extraction.
                line = (lines[i-1] + " " + line).replace('-', '')
                res = breakup_long_diver_string(line)
                if not res:
                    continue
            # The final-round score (last of the three) is the one kept.
            all_district_res.append({'score': res[6], 'school': res[3], 'name': res[2], })
        else:
            res = breakup_short_diver_string(line)
            if not res:
                # Includes the first half of a wrapped entry, which is picked up
                # when the following 'and DIVING' line is processed.
                print(line, 'short')
                continue
            all_district_res.append({'score': res[2], 'school': res[4], 'name': res[3]})

6 5Addison Branch MONETT HIGH SCHOOL SWIMMING  short
11 1Addalyn Brownlee MONETT HIGH SCHOOL SWIMMING  short


In [34]:
# Spot-check one parsed diver record.
all_district_res[4]

{'score': 306.8, 'school': 'Saint Joesphs Academy', 'name': 'Tatum Smiley'}

In [27]:
# Sixteen highest diving scores across all districts, best first
# (sixteen matches the number of scoring places in the point table).
top_16_divers = sorted(all_district_res, key=lambda diver: diver['score'], reverse=True)[:16]
top_16_divers

[{'score': 423.5, 'school': 'Notre Dame de Sion', 'name': 'Naya Narciso'},
 {'score': 421.0, 'school': 'Parkway West', 'name': 'Eiley Minich'},
 {'score': 393.2,
  'school': 'Central High School',
  'name': 'Journey Wildschuetz'},
 {'score': 385.6, 'school': 'Notre Dame de Sion', 'name': 'Stella Henderson'},
 {'score': 373.45, 'school': 'Kearney', 'name': 'Anna Williams'},
 {'score': 370.6, 'school': 'Parkway Central', 'name': 'Sarah Fields'},
 {'score': 361.55, 'school': 'Villa Duchesne', 'name': 'Hannah Ponciroli'},
 {'score': 358.5, 'school': 'Greenwood', 'name': 'Sophia Sechler'},
 {'score': 345.3, 'school': 'Notre Dame de Sion', 'name': 'Kate Sobba'},
 {'score': 332.6, 'school': 'Belton High School', 'name': 'Emma Gillespie'},
 {'score': 325.4, 'school': 'Notre Dame de Sion', 'name': 'Brooke Sobba'},
 {'score': 322.75, 'school': 'Poplar Bluff', 'name': 'Allison Williamson'},
 {'score': 320.1, 'school': 'Parkway West', 'name': 'Gabriella Pupillo'},
 {'score': 313.55, 'school': 'St 

In [28]:
# Add diving points to the school totals.
# NOTE: this cell mutates school_scores / school_events in place, so running it
# twice without re-running the swim scoring cell double-counts diving.
scoring = [20,17,16,15,14,13,12,11,9,7,6,5,4,3,2,1]

# Team names parsed from the swim psych sheet are cut off at 16 characters
# (e.g. 'Notre Dame de Si'), while the diving results carry full names
# (e.g. 'Notre Dame de Sion'). Without reconciling them, one school's points
# are split across two keys.
swim_name_width = 16

for i, info in enumerate(top_16_divers):
    info['place'] = i+1

    school_key = info["school"]
    truncated_key = school_key[:swim_name_width]
    # Reuse the swim-sheet key when the only difference is the truncation.
    # Names that differ in other ways (e.g. 'Saint Joesphs Academy' vs
    # "St. Joseph's Aca", presumably the same school) still need a manual alias.
    if school_key not in school_scores and truncated_key in school_scores:
        school_key = truncated_key

    if school_key in school_scores:
        school_scores[school_key] += scoring[i]
        school_events[school_key].append(info)
    else:
        school_scores[school_key] = scoring[i]
        school_events[school_key] = [info]

In [29]:
# Ten highest-scoring schools after diving points have been added, best first.
new_top_10 = sorted(school_scores.items(), key=lambda entry: entry[1], reverse=True)[:10]

for rank, (school, points) in enumerate(new_top_10, start=1):
    print(f"{rank}: {points},\t{school}")

1: 301,	St. Joseph's Aca
2: 176,	Parkway Central
3: 171,	Central Cape Gir
4: 160,	Pembroke Hill
5: 144,	Clayton
6: 129,	Villa Duchesne
7: 126,	Visitation Acade
8: 96,	St. Teresa's Aca
9: 93,	West Plains
10: 89,	Parkway West


In [30]:
# List every scoring entry credited to one school, now including diving.
for e in school_events['Parkway Central']:
    print(e)

{'team': 'Parkway Central', 'time': '25.04', 'grade': 'JR', 'name': 'Cecilia Tremont', 'place': 8}
{'team': 'Parkway Central', 'time': '25.43', 'grade': 'SO', 'name': 'Kenadie Johnson', 'place': 12}
{'team': 'Parkway Central', 'time': '57.71', 'grade': 'JR', 'name': 'Reese Anderson', 'place': 3}
{'team': 'Parkway Central', 'time': '59.02', 'grade': 'JR', 'name': 'Serena Huang', 'place': 5}
{'team': 'Parkway Central', 'time': '54.71', 'grade': 'JR', 'name': 'Cecilia Tremont', 'place': 6}
{'team': 'Parkway Central', 'time': '56.89', 'grade': 'SO', 'name': 'Gabrielle Salmans', 'place': 16}
{'team': 'Parkway Central', 'time': '5:12.36', 'grade': 'JR', 'name': 'Reese Anderson', 'place': 3}
{'team': 'Parkway Central', 'time': '1:09.57', 'grade': 'JR', 'name': 'Serena Huang', 'place': 9}
{'event': '200 Yard Medley Relay', 'time': '1:55.95', 'team': 'Parkway Central', 'place': 8}
{'event': '200 Yard Freestyle Relay', 'time': '1:41.69', 'team': 'Parkway Central', 'place': 5}
{'event': '400 Yard