In [1]:
from googleapiclient.discovery import build
import pandas as pd
from datetime import datetime
import calendar
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

with open('api_key.txt', 'r') as f:
    API_KEY = f.read().strip()

year = 2026

# === CONFIG ===
SHEET_ID = '1Er54lmX1jBCbjQWkcFjWX6hp0WsoTiSqeb8U9SNIReo'
RANGE = f'{year}!A1:Z215'

# === SETUP ===
service = build('sheets', 'v4', developerKey=API_KEY)
sheet = service.spreadsheets()

# === STEP 1: Get full sheet grid with formatting ===
result = sheet.get(
    spreadsheetId=SHEET_ID,
    ranges=[RANGE],
    includeGridData=True
).execute()

grid = result['sheets'][0]['data'][0]['rowData']

In [2]:
# Step 1: Find all month labels (like 'JANUARY', 'FEBRUARY', etc.)
month_positions = {}  # (row_idx, col_idx) => month_num
month_names = {name.upper(): i for i, name in enumerate(calendar.month_name) if name}

for row_idx, row in enumerate(grid):
    for col_idx, cell in enumerate(row.get('values', [])):
        val = cell.get('formattedValue', '')
        val_clean = val.strip().upper()
        if val_clean in month_names:
            month_num = month_names[val_clean]
            month_positions[(row_idx, col_idx)] = month_num

# Step 2: For each day-number cell, find the nearest month label *above and to the right*
date_map = {}  # (row_idx, col_idx) -> datetime

for row_idx, row in enumerate(grid):
    for col_idx, cell in enumerate(row.get('values', [])):
        val = cell.get('formattedValue', '')
        if not val.isdigit():
            continue
        day = int(val)

        # Find the closest month label above and to the right
        best_month = None
        best_distance = float('inf')

        for (m_row, m_col), month_num in month_positions.items():
            if m_row < row_idx and m_col <= col_idx:
                dist = (row_idx - m_row) + (col_idx - m_col)
                if dist < best_distance:
                    best_distance = dist
                    best_month = month_num

        if best_month is not None:
            try:
                date = datetime(year, best_month, day)
                date_map[(row_idx, col_idx)] = date
            except ValueError:
                pass  # skip invalid dates like Feb 30


In [3]:
events = []
merged_ranges = result['sheets'][0].get('merges', [])
visited = set()

# Helper to find merged range for a given cell
def get_merged_range(row, col):
    for mr in merged_ranges:
        if (
            mr['startRowIndex'] <= row < mr['endRowIndex']
            and mr['startColumnIndex'] <= col < mr['endColumnIndex']
        ):
            return mr
    return None

for row_idx, row in enumerate(grid):
    for col_idx, cell in enumerate(row.get('values', [])):
        if (row_idx, col_idx) in visited:
            continue

        val = cell.get('formattedValue')
        if not val or val.strip().isdigit():
            continue
        link = cell.get('hyperlink', '')

        # === Get event start date and event type color from the date cell above ===
        event_date = None
        event_type = 'Other'

        for r2 in range(row_idx, 0, -1):
            if (r2, col_idx) in date_map:
                event_date = date_map[(r2, col_idx)]

                # Try to read the background color from the date cell
                try:
                    date_cell = grid[r2]['values'][col_idx]  # <-- correct path
                    color = date_cell.get('effectiveFormat', {}).get('backgroundColor', {})
                    r = color.get('red', 0)
                    g = color.get('green', 0)
                    b = color.get('blue', 0)

                    if r > 0.8 and g < 0.5:
                        event_type = 'GBHL100'
                    elif r > 0.8 and g > 0.8:
                        event_type = 'GBHL90'
                    elif b > 0.8:
                        event_type = 'GBHL80'
                    else:
                        event_type = 'Other'
                except Exception as e:
                    print(f"Warning: Failed to read event type color from ({r2},{col_idx}): {e}")

                break

        if not event_date:
            continue

        # Parse event text
        lines = val.strip().split('\n')
        if len(lines) < 3:
            continue

        event_name = lines[0].strip()
        organizer = lines[1].strip()
        region_loc = lines[2].strip().strip('[]')
        region, location = region_loc.split(' - ') if ' - ' in region_loc else ('Unknown', region_loc)

        # Determine merged range (if any)
        merged = get_merged_range(row_idx, col_idx)
        if merged:
            end_col = merged['endColumnIndex'] - 1
            # Mark all merged cells as visited
            for r in range(merged['startRowIndex'], merged['endRowIndex']):
                for c in range(merged['startColumnIndex'], merged['endColumnIndex']):
                    visited.add((r, c))
        else:
            end_col = col_idx
            visited.add((row_idx, col_idx))

        # Get end date from the right-most merged cell's column
        end_date = event_date
        for r2 in range(row_idx, 0, -1):
            if (r2, end_col) in date_map:
                end_date = date_map[(r2, end_col)]
                break

        # === Extract format (Singles / Doubles) from the row below ===
        format_type = 'Unknown'

        # Determine the row below the event block
        format_row_idx = (merged['endRowIndex'] if merged else row_idx + 1)

        # Use the first column of the merged block if available, else same col
        format_col_idx = merged['startColumnIndex'] if merged else col_idx

        # Safely access the format cell
        if format_row_idx < len(grid):
            format_row = grid[format_row_idx]
            if format_col_idx < len(format_row.get('values', [])):
                format_cell = format_row['values'][format_col_idx]
                format_val = format_cell.get('formattedValue', '').strip().lower()
                if 'double' in format_val:
                    format_type = 'Doubles'
                elif 'single' in format_val:
                    format_type = 'Singles'
                elif 'team' in format_val:
                    format_type = 'Team'
                # Assume missing formats are singles events
                else:
                    format_type = 'Singles'

        event = {
            'start_date': event_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'event_name': event_name,
            'organizer': organizer,
            'region': region,
            'location': location,
            'format': format_type,
            'event_type': event_type,
            'link': link
        }

        events.append(event)


In [4]:
# === STEP 4: Show / Save Results ===
df = pd.DataFrame(events)
print(df.head())

   start_date    end_date                  event_name      organizer  \
0  2026-01-02  2026-01-02             A New Adventure      Matt King   
1  2026-01-03  2026-01-04          Into The West 2026      Matt King   
2  2026-03-28  2026-03-01    Defence of North Bristol  David Clubley   
3  2026-03-28  2026-03-01      Bonds of Fellowship VI  Callum Slater   
4  2026-03-01  2026-03-01  Scottish Team Championship  Stuart Dobbie   

     region location   format event_type link  
0        SW  Cardiff  Singles     GBHL90       
1        SW  Cardiff  Singles    GBHL100       
2        SW  Bristol  Singles     GBHL90       
3        SE  Havantl  Singles     GBHL90       
4  Sc & Ire  Falkirk  Singles     GBHL90       


In [5]:
df.event_type.value_counts()

event_type
GBHL90     70
GBHL80     56
GBHL100    20
Other       1
Name: count, dtype: int64

In [6]:
print(df.tail())

     start_date    end_date                        event_name  \
142  2026-10-25  2026-10-25           Battle for Middle Earth   
143  2026-10-31  2026-10-01  The Crownless Shall Be King 2026   
144  2026-11-28  2026-11-29           He Eats It By The Block   
145  2026-10-31  2026-10-31       The Brecon Beacons Are Lit!   
146  2026-11-28  2026-11-28            Grand Anglian Alliance   

           organizer region      location   format event_type link  
142    Chris Jackson     SE    Eastbourne  Singles     GBHL90       
143     Tom Culleton     SE  High Wycombe  Singles     GBHL90       
144  Jack Darlington      C     Leicester  Singles    GBHL100       
145    James Donovan     SW   Abergavenny  Singles     GBHL90       
146     James Palmer     SE       Woolpit  Singles     GBHL80       


In [7]:
old_df = pd.read_csv('data/gbhl_events.csv')

In [8]:
old_df = old_df[["event_name", "start_date", "lat", "lon"]]

In [9]:
df = df.merge(old_df, on=["event_name", "start_date"], how="left")

In [10]:
# Manual fixes on location typos
df["location"].replace("Abergevanny", "Abergavenny", inplace=True)
df["location"].replace("Bedwroth", "Bedworth", inplace=True)

In [11]:
# Create a location string for geocoding
df['location_str'] = df['location'] + ', UK'

# Set up Nominatim geocoder
geolocator = Nominatim(user_agent="gbhl-event-locator")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)  # be polite!

# Apply geocoding
def get_coords(place):
    try:
        loc = geocode(place)
        if loc:
            return pd.Series([loc.latitude, loc.longitude])
    except:
        pass
    return pd.Series([None, None])

# Find rows where lat or lon is null
missing_coords = df[df['lat'].isnull() | df['lon'].isnull()]

In [12]:
missing_coords

Unnamed: 0,start_date,end_date,event_name,organizer,region,location,format,event_type,link,lat,lon,location_str
3,2026-03-28,2026-03-01,Bonds of Fellowship VI,Callum Slater,SE,Havantl,Singles,GBHL90,,,,"Havantl, UK"
11,2026-02-07,2026-02-07,Battle For Ravenhill,Nicky Forsythe,Sc & Ire,Bangor,Singles,GBHL80,,,,"Bangor, UK"
15,2026-02-14,2026-02-15,Honour The Allegiance,Natalie Pearson,N.Eng,Sheffield,Singles,GBHL80,,,,"Sheffield, UK"
19,2026-03-14,2026-03-14,There And Back Again,Richard Ancliff,Unknown,N.Eng- Sheffield,Singles,GBHL80,,,,"N.Eng- Sheffield, UK"
37,2026-02-28,2026-02-01,Bonds of Fellowship VI,Callum Slater,SE,Havantl,Singles,GBHL90,,,,"Havantl, UK"
42,2026-04-04,2026-04-05,British Team Championship,Matt King,SW,Cardiff,Singles,GBHL90,,,,"Cardiff, UK"
47,2026-06-06,2026-06-07,Leeds By Example Again!,Adam Bird,N.Eng,Leeds,Singles,GBHL90,,,,"Leeds, UK"
49,2026-05-02,2026-05-02,The Ollyphaunt Cup 2026,Olly Jackson,SE,London,Singles,GBHL90,,,,"London, UK"
51,2026-05-02,2026-05-02,Take Them Head On!,Nick Walker,SE,Portsmouth,Singles,GBHL80,,,,"Portsmouth, UK"
59,2026-05-09,2026-05-09,Piggin Mayhem,Sean White,SW,Bath,Singles,GBHL80,,,,"Bath, UK"


In [13]:
if not missing_coords.empty:
    # Apply geocoding function just to these rows
    missing_coords[['lat', 'lon']] = missing_coords['location_str'].apply(get_coords)

    # Update the original dataframe only for those indices
    df.loc[missing_coords.index, ['lat', 'lon']] = missing_coords[['lat', 'lon']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_coords[['lat', 'lon']] = missing_coords['location_str'].apply(get_coords)


In [14]:
def classify_event_duration(start_str, end_str):
    start_date = datetime.strptime(start_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_str, '%Y-%m-%d')
    start_dow = start_date.weekday()  # 0=Monday ... 5=Saturday ... 6=Sunday
    end_dow = end_date.weekday()

    if start_dow == 5 and end_dow == 5:
        return "Saturday"
    elif start_dow == 6 and end_dow == 6:
        return "Sunday"
    elif start_dow == 5 and end_dow == 6:
        return "Weekend"
    else:
        return "Other"

# Apply to dataframe
df['event_duration'] = df.apply(
    lambda row: classify_event_duration(row['start_date'], row['end_date']),
    axis=1
)


In [15]:
df

Unnamed: 0,start_date,end_date,event_name,organizer,region,location,format,event_type,link,lat,lon,location_str,event_duration
0,2026-01-02,2026-01-02,A New Adventure,Matt King,SW,Cardiff,Singles,GBHL90,,51.481655,-3.179193,"Cardiff, UK",Other
1,2026-01-03,2026-01-04,Into The West 2026,Matt King,SW,Cardiff,Singles,GBHL100,,51.481655,-3.179193,"Cardiff, UK",Weekend
2,2026-03-28,2026-03-01,Defence of North Bristol,David Clubley,SW,Bristol,Singles,GBHL90,,51.453802,-2.597298,"Bristol, UK",Weekend
3,2026-03-28,2026-03-01,Bonds of Fellowship VI,Callum Slater,SE,Havantl,Singles,GBHL90,,,,"Havantl, UK",Weekend
4,2026-03-01,2026-03-01,Scottish Team Championship,Stuart Dobbie,Sc & Ire,Falkirk,Singles,GBHL90,,55.999196,-3.784376,"Falkirk, UK",Sunday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,2026-10-25,2026-10-25,Battle for Middle Earth,Chris Jackson,SE,Eastbourne,Singles,GBHL90,,50.766437,0.278155,"Eastbourne, UK",Sunday
143,2026-10-31,2026-10-01,The Crownless Shall Be King 2026,Tom Culleton,SE,High Wycombe,Singles,GBHL90,,51.631745,-0.755960,"High Wycombe, UK",Other
144,2026-11-28,2026-11-29,He Eats It By The Block,Jack Darlington,C,Leicester,Singles,GBHL100,,52.636200,-1.133197,"Leicester, UK",Weekend
145,2026-10-31,2026-10-31,The Brecon Beacons Are Lit!,James Donovan,SW,Abergavenny,Singles,GBHL90,,51.821321,-3.014794,"Abergavenny, UK",Saturday


In [16]:
df.to_csv("data/gbhl_events.csv", index=False)

In [17]:
df.to_json('data/events.json', orient='records')