In [1]:
import pandas as pd
import csv
import numpy as np
import re

import urllib.request

In [17]:
## load all station data into one dataframe

# utils
def clean_data(data):
    try:
        if type(data) == str:
            data.replace('*', '')
        return float(data)
    
    except ValueError:
        return float('nan')
    
    except TypeError:
        if data == None:
            float('nan')
        
latitude_pattern = r'lat.*?(-?\d+\.?\d*)'
longitude_pattern = r'lon.*?(-?\d+\.?\d*)'
    
def extract_coords(location_string):

    lat_match = re.search(latitude_pattern, location_string)
    long_match = re.search(longitude_pattern, location_string)
    
    try:
        return (
            float(lat_match.group(1)), # even if multiple coords in the string, the first set of coordinates is acceptable for all use cases in this assignment
            float(long_match.group(1))
        )
    except AttributeError:
        print('could not extract coordinates from ' + location_string)
        return None
    
# load data 

headers = ['yyyy','mm','tmax', 'tmin','af','rain','sun']
df = pd.DataFrame(columns=headers, dtype='Float64')
converters = {header: clean_data for header in headers} # apply the same cleaning function to all columns

with open('./stations.txt', 'r') as f:
    stations = f.read().splitlines() #import stations into a list

for station in stations:
    url = f"http://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/{station}data.txt"
    location_string = ""

    for idx, line in enumerate(urllib.request.urlopen(url)):
        line = line.decode('utf-8')

        if re.search(longitude_pattern, line.lower()) or re.match(latitude_pattern, line.lower()):
            location_string += line

        if 'yyyy' in line.lower():
            data_start_row = idx
            break
    
    station_data = pd.read_csv(url,
                               sep=r'\s{2,}',
                               converters=converters,
                               na_values=['---'],
                               header=data_start_row,
                               skipinitialspace=True, 
                               engine='python',
                               on_bad_lines=lambda line: line[:len(headers)] #for lines longer than headers, truncate
                               )
        
    station_data = station_data.dropna(how='all') # remove rows with no data (only NaN values)

    coords = extract_coords(location_string.lower())
    station_data['latitude'] = coords[0]
    station_data['longitude'] = coords[1]
    
    station_data['station'] = station #add station name to all rows for this station

    df = pd.concat([df, station_data])

print(df)




    


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



       yyyy   mm  tmax  tmin   af   rain  sun  latitude  longitude    station
1    1941.0  1.0   NaN   NaN  NaN   74.7  NaN    52.139     -4.570  aberporth
2    1941.0  2.0   NaN   NaN  NaN   69.1  NaN    52.139     -4.570  aberporth
3    1941.0  3.0   NaN   NaN  NaN   76.2  NaN    52.139     -4.570  aberporth
4    1941.0  4.0   NaN   NaN  NaN   33.7  NaN    52.139     -4.570  aberporth
5    1941.0  5.0   NaN   NaN  NaN   51.3  NaN    52.139     -4.570  aberporth
..      ...  ...   ...   ...  ...    ...  ...       ...        ...        ...
714  2024.0  2.0  11.8   5.6  3.0  111.8  NaN    51.006     -2.641  yeovilton
715  2024.0  3.0  12.3   5.0  3.0   86.6  NaN    51.006     -2.641  yeovilton
716  2024.0  4.0  14.0   6.5  0.0   50.4  NaN    51.006     -2.641  yeovilton
717  2024.0  5.0  18.4   8.9  0.0  136.2  NaN    51.006     -2.641  yeovilton
718  2024.0  6.0  19.7   8.9  0.0   13.0  NaN    51.006     -2.641  yeovilton

[39399 rows x 10 columns]


In [3]:
print(df.loc[df['station']=='braemar'])

       yyyy   mm  tmax  tmin    af  rain    sun  latitude  longitude  station
1    1959.0  1.0   1.7  -5.7  27.0   NaN   34.2    57.006     -3.396  braemar
2    1959.0  2.0   6.2  -3.2  15.0   NaN   68.6    57.006     -3.396  braemar
3    1959.0  3.0   7.6   0.8   7.0   NaN   80.9    57.006     -3.396  braemar
4    1959.0  4.0   NaN   NaN   NaN   NaN  105.0    57.006     -3.396  braemar
5    1959.0  5.0  15.6   4.6   1.0   NaN  182.6    57.006     -3.396  braemar
..      ...  ...   ...   ...   ...   ...    ...       ...        ...      ...
782  2024.0  2.0   6.7  -0.4  14.0  64.8    NaN    57.006     -3.396  braemar
783  2024.0  3.0   7.2   0.8  10.0  93.6    NaN    57.006     -3.396  braemar
784  2024.0  4.0   9.9   1.9   7.0  88.4    NaN    57.006     -3.396  braemar
785  2024.0  5.0  17.3   7.7   1.0  62.2    NaN    57.006     -3.396  braemar
786  2024.0  6.0  15.3   7.0   0.0  45.2    NaN    57.006     -3.396  braemar

[786 rows x 10 columns]


In [15]:
station_data[station_data.isna()]

Unnamed: 0,yyyy,mm,tmax,tmin,af,rain,sun,latitude,longitude,station
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
714,,,,,,,,,,
715,,,,,,,,,,
716,,,,,,,,,,
717,,,,,,,,,,


In [28]:
from sklearn.cluster import k_means

means_by_station = df.dropna(how='any').groupby(['station']).mean()

centroid, label, inertia = k_means(means_by_station[['mm','tmax','tmin','af','rain','sun']],
                                   n_clusters=3
)

clustered_means = means_by_station.assign(cluster=label)

clustered_means

Unnamed: 0_level_0,yyyy,mm,tmax,tmin,af,rain,sun,latitude,longitude,cluster
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
aberporth,1978.627119,6.474576,12.222976,7.026177,1.717514,73.907345,127.874953,52.139,-4.57,0
armagh,1975.735081,6.472875,13.126944,5.834629,3.315552,68.494394,104.225769,54.352,-6.649,1
bradford,1959.501324,6.474846,12.129391,5.60256,3.88173,72.719153,102.480936,53.813,-1.772,1
braemar,1982.291139,6.487342,10.361181,2.733544,8.681435,74.153376,99.032911,57.006,-3.396,1
camborne,1992.870206,6.469027,13.350737,8.251032,0.769912,89.090265,137.336873,50.218,-5.327,0
cambridge,1984.56962,6.491863,14.174503,6.153888,3.674503,45.803436,124.640145,52.245,0.102,0
cardiff,1986.480769,6.490385,14.146635,6.783173,3.096154,92.924519,123.493269,51.488,-3.187,1
cwmystwyth,1986.868321,6.44084,11.651145,4.951336,4.996183,151.117366,96.800573,52.358,-3.802,2
dunstaffnage,1991.509434,6.551887,12.39717,6.30283,2.221698,147.696698,102.238208,56.451,-5.439,2
durham,1944.375854,6.489749,12.261579,4.743812,4.849658,53.561276,111.454594,54.768,-1.585,0


In [29]:
import plotly.express as px

fig = px.scatter_geo(clustered_means,
                     lat='latitude',
                     lon='longitude',
                     color='cluster',)

fig.show()