In [None]:
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np

In [None]:
# Page numbers to request for every year; page 0 is skipped below, so the
# scraper asks for pages '1' .. '2599'.
page_count = range(2600)

# Race years to scrape (2020 is not in the list).
years_int = [*range(2001, 2020), 2021]

# The URL is assembled by string concatenation, so keep string versions too.
years = list(map(str, years_int))
pages = list(map(str, page_count[1:]))

# Column titles of the scraped results table.
df_titles = [
    'place_overall',
    'gender_place',
    'name_ctz',
    'event',
    'bib',
    'division',
    'final_time',
]
final_df = pd.DataFrame(columns=df_titles)

df_dict = {}

# ---------------------------------------------------------------------------
# Scrape the results list year by year and write one CSV per year
# (<year>_chicago_results.csv).  For every year the result pages are fetched
# in order, each result row is turned into one record, and the combined table
# is filtered to the 'Marathon' event, sorted by overall place and given an
# inferred 'gender' column before it is saved.
# ---------------------------------------------------------------------------

REQUEST_TIMEOUT = 30  # seconds before a single HTTP request is abandoned
REQUEST_RETRIES = 3   # attempts per page before the error is propagated
GENDER_WINDOW = 15    # see infer_gender()

# CSS classes identifying the two "place" cells inside a result row.
OVERALL_PLACE_CLASSES = {'list-field', 'type-place', 'place-secondary', 'hidden-xs'}
GENDER_PLACE_CLASSES = {'list-field', 'type-place', 'place-primary'}


def fetch_page(url, retries=REQUEST_RETRIES, timeout=REQUEST_TIMEOUT):
    """Return the body of *url* as text.

    Connection errors, timeouts and HTTP 429/5xx answers are retried up to
    *retries* times with a short, growing pause; the last failure is
    re-raised as a ``requests.RequestException``.  Other status codes are
    returned unchanged so the caller decides what the page means.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            with requests.get(url, timeout=timeout) as response:
                if response.status_code == 429 or response.status_code >= 500:
                    response.raise_for_status()
                return response.text
        except requests.RequestException:
            if attempt == attempts:
                raise
            time.sleep(2 * attempt)


def has_no_results(soup):
    """Return True when the page shows the 'No results found.' info alert."""
    alerts = soup.find_all('div', class_='alert alert-info text-center')
    return any('No results found.' in alert.get_text() for alert in alerts)


def _first_with_classes(row, required):
    """Return the first <div> in *row* carrying every class in *required*, or None."""
    return row.find(
        lambda tag: tag.name == 'div' and required.issubset(tag.get('class') or ())
    )


def _place_text(row, required):
    """Return the text of the place cell identified by *required*, or None if absent."""
    cell = _first_with_classes(row, required)
    if cell is None:
        return None
    return cell.get_text(separator=' ', strip=True)


def _labelled_text(row, css_class, label):
    """Return the text of the <div class=css_class> cell without its *label*, or None."""
    cell = row.find('div', class_=css_class)
    if cell is None:
        return None
    return cell.get_text(strip=True).replace(label, '').strip()


def _runner_name(row):
    """Return the runner name of *row* (link text when the name is a link), or None."""
    heading = row.find('h4', class_='list-field type-fullname')
    if heading is None:
        return None
    link = heading.find('a')
    source = heading if link is None else link
    return source.get_text(strip=True)


def _finish_time_div(row):
    """Return the first time cell of *row* that is not a 'HALF' split, or None."""
    # Half-marathon splits are skipped because earlier years do not have them.
    for cell in row.find_all('div', class_='list-field type-time'):
        if 'HALF' not in str(cell):
            return cell
    return None


def clean_finish_time(text: str) -> str:
    """Strip the 'Finish' label, the '–' placeholder and whitespace from *text*."""
    return text.replace('Finish', '').replace('–', '').strip()


def parse_results_page(soup):
    """Turn one results page into a list of per-runner dicts.

    Returns ``None`` when any row (header included) has no finish-time cell;
    the scrape loop treats that as the end of the current year.  Otherwise
    returns one dict per data row, keyed by the results-table column names.
    A field that is missing from a row is stored as ``None`` so the columns
    can never drift out of alignment.
    """
    rows = soup.find_all(
        lambda tag: tag.name == 'li'
        and {'list-group-item', 'row'}.issubset(tag.get('class') or ())
    )
    finish_cells = [_finish_time_div(row) for row in rows]
    if any(cell is None for cell in finish_cells):
        return None

    records = []
    # The first matching row is skipped: it is treated as the table header
    # (NOTE(review): this mirrors the positional [1:] slicing the scraper has
    # always used — confirm against the live markup if the site changes).
    for row, finish_cell in zip(rows[1:], finish_cells[1:]):
        records.append({
            'place_overall': _place_text(row, OVERALL_PLACE_CLASSES),
            'gender_place': _place_text(row, GENDER_PLACE_CLASSES),
            'name_ctz': _runner_name(row),
            'event': _labelled_text(row, 'list-field type-event_name', 'Event'),
            'bib': _labelled_text(row, 'list-field type-field', 'BIB'),
            'division': _labelled_text(row, 'list-field type-age_class', 'Division'),
            'final_time': clean_finish_time(finish_cell.get_text()),
        })
    return records


def to_place_numbers(values) -> pd.Series:
    """Convert scraped place texts to a nullable ``Int64`` Series.

    Anything that is not a number ('–', 'PND', empty or missing values, other
    status texts) becomes ``<NA>`` instead of aborting the whole year.
    """
    return pd.to_numeric(pd.Series(values), errors='coerce').astype('Int64')


def infer_gender(gender_places, window: int = GENDER_WINDOW) -> list:
    """Guess 'M'/'F' for rows ordered by overall place from their gender places.

    The first row is labelled 'M'.  Each following row keeps the previous
    row's label when its gender place is equal to, or less than *window*
    above, the previous row's gender place; any other change (a drop, or a
    jump of *window* or more) flips the label.  Labelling stops at the first
    missing gender place; that row and all later rows are left as NaN.  If
    the previous gender place is missing the previous label is kept, since
    there is nothing to compare against.

    NOTE(review): this is a heuristic — it assumes the first finisher is male
    and that the two genders' place counters stay at least *window* apart.
    """
    places = list(gender_places)
    if not places:
        return []

    genders = ['M']
    for previous, current in zip(places, places[1:]):
        if pd.isna(current):
            break
        same_gender = pd.isna(previous) or 0 <= current - previous < window
        last = genders[-1]
        if same_gender:
            genders.append(last)
        else:
            genders.append('F' if last == 'M' else 'M')

    genders.extend([np.nan] * (len(places) - len(genders)))
    return genders


def finalize_year_results(results: pd.DataFrame, year: str) -> pd.DataFrame:
    """Filter, clean, sort and annotate one year's scraped rows for export."""
    table = results[results['event'] == 'Marathon'].copy()
    table['place_overall'] = to_place_numbers(table['place_overall'])
    table['gender_place'] = to_place_numbers(table['gender_place'])
    # reset_index() (without drop=True) is kept on purpose: it adds the
    # 'index' column that the exported CSVs have always contained.
    table = table.sort_values('place_overall').reset_index()
    # Use overall and gender place to determine gender.
    table['gender'] = infer_gender(table['gender_place'])
    table['marathon'] = 'chicago'
    table['year'] = year
    return table


if __name__ == "__main__":
    for year in tqdm(years, desc='Years', position=0):
        # Pause before each year (presumably to go easy on the server).
        time.sleep(5)
        page_frames = []
        for page in tqdm(pages, desc='Pages', position=1, leave=False):
            url = ('https://chicago-history.r.mikatiming.com/2022/?page=' + page
                   + '&event=ALL_EVENT_GROUP_' + year + '&lang=EN_CAP&pid=search')
            soup = BeautifulSoup(fetch_page(url), 'html.parser')

            # If the "no more results" element is found, stop this year.
            if has_no_results(soup):
                print("No more results. Ending the loop.")
                break

            records = parse_results_page(soup)
            if records is None:
                print('Year '+year+' complete')
                break
            if records:
                page_frames.append(pd.DataFrame(records, columns=df_titles))

        # Concatenate once per year instead of once per page.
        if page_frames:
            final_df = pd.concat(page_frames)
        else:
            final_df = pd.DataFrame(columns=df_titles)

        final_df = finalize_year_results(final_df, year)
        final_df.to_csv(year+'_chicago_results.csv', index=False)

    print('DONE')