In [15]:
import pandas as pd
from scipy.stats import normaltest
import numpy as np
import re

import urllib.request

In [9]:
# utils
def clean_data(data):
    try:
        if type(data) == str:
            data.replace('*', '')
        return float(data)
    
    except ValueError:
        return float('nan')
    
    except TypeError:
        if data == None:
            float('nan')
        
latitude_pattern = r'lat.*?(-?\d+\.?\d*)'
longitude_pattern = r'lon.*?(-?\d+\.?\d*)'

def parse_top_metadata(url):

    location_string = ""

    for idx, line in enumerate(urllib.request.urlopen(url)):
        line = line.decode('utf-8')

        if 'yyyy' in line.lower():
            data_start_row = idx
            return location_string, data_start_row

        if re.search(longitude_pattern, line.lower()) or re.match(latitude_pattern, line.lower()):
            location_string += line


    
def extract_coords(location_string):

    lat_match = re.search(latitude_pattern, location_string)
    long_match = re.search(longitude_pattern, location_string)
    
    try:
        return (
            float(lat_match.group(1)), # even if multiple coords in the string, the first set of coordinates is acceptable for all use cases in this assignment
            float(long_match.group(1))
        )
    except AttributeError:
        print('could not extract coordinates from ' + location_string)
        return None
    

In [58]:
  
# load data 

headers = ['yyyy','mm','tmax', 'tmin','af','rain','sun']
df = pd.DataFrame(columns=headers, dtype='Float64')
converters = {header: clean_data for header in headers} # apply the same cleaning function to all columns

with open('./stations.txt', 'r') as f:
    stations = f.read().splitlines() #import stations into a list

for station in stations:

    url = f"http://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/{station}data.txt"

    location_string, data_start_row = parse_above_data(url)
    
    station_data = pd.read_csv(url,
                               sep=r'\s{2,}',
                               converters=converters,
                               na_values=['---'],
                               header=data_start_row,
                               skipinitialspace=True, 
                               engine='python',
                               on_bad_lines=lambda line: line[:len(headers)] #for lines longer than headers, truncate
                               )
        
    station_data = station_data.dropna(how='all') # remove rows with no data (only NaN values)

    coords = extract_coords(location_string.lower())
    station_data['latitude'] = coords[0]
    station_data['longitude'] = coords[1]
    
    station_data['station'] = station #add station name to all rows for this station

    df = pd.concat([df, station_data])


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



In [103]:
def normalise_features(features):
    maxs = features.max()
    mins = features.min()
    normalised_features = (features - mins) / (maxs - mins)  
    return normalised_features


In [104]:
from sklearn.cluster import k_means

means_by_station = df.dropna(how='any').groupby(['station']).mean()

# normalise so each feature has unit variance
normalised_means = normalise_features(means_by_station)

centroid, label, inertia = k_means(normalised_means[['mm','tmax','tmin','af','rain','sun']],
                                   n_clusters=3,
                                   init='k-means++')

colours = ['red', 'blue', 'green']

clustered_means = means_by_station.assign(cluster=[colours[label] for label in label]) ## convert cluster names to ascii chars 


In [105]:
import plotly.express as px

fig = px.scatter_geo(clustered_means,
                     lat='latitude',
                     lon='longitude',
                     color='cluster',
                     hover_name=means_by_station.reset_index()['station']
                     )

fig.show()

In [78]:
# Investigate the third cluster

means_by_cluster = clustered_means.groupby('cluster').mean()

means_by_cluster

Unnamed: 0_level_0,yyyy,mm,tmax,tmin,af,rain,sun,latitude,longitude
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
blue,1971.392548,6.483705,12.246341,5.490268,3.836495,69.789397,110.299761,55.210786,-3.419429
green,1981.909613,6.493088,10.958755,4.496016,5.306795,120.195603,97.479215,56.253,-3.4052
red,1981.080279,6.47566,13.803169,6.81267,2.877165,62.695898,133.859005,51.838938,-1.593313


In [128]:
## Question B.2

uk_latitude_cutoffs = np.linspace(49.9, 60.9, 4)

third_labels = pd.cut(means_by_station['latitude'], 
                         bins=uk_latitude_cutoffs, 
                         labels=['Bottom', 'Middle', 'Top'], 
                         include_lowest=True)

test_features = normalised_means[-5:]
training_features = normalised_means[:-5]

test_labels = third_labels[-5:]
training_labels = third_labels[:-5]

In [161]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=5)

classifier.fit(training_features, means_by_station['uk_region'][:-5])

predictions = classifier.predict(test_features)

In [163]:
#evaluate classification performance

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_labels, predictions)

print(f"{int(accuracy*100)}%")

predictions_series = pd.Series(predictions, index=test_labels.index, name="prediction")
test_labels_comparison = pd.concat([means_by_station[-5:], predictions_series],
                                   axis=1,
                                   )

test_labels_comparison

80%


Unnamed: 0_level_0,yyyy,mm,tmax,tmin,af,rain,sun,latitude,longitude,uk_region,prediction
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
valley,1965.904706,6.495294,12.903765,7.374353,1.622353,70.843882,134.134235,53.252,-4.535,Bottom,Bottom
waddington,1977.668571,6.477143,13.059429,5.788857,3.864286,50.324286,127.784714,53.175,-0.522,Bottom,Bottom
whitby,2011.653846,6.41958,13.514685,6.70035,2.381119,58.254895,140.484266,54.481,-0.624,Middle,Bottom
wickairport,1969.148789,6.520761,10.509343,4.852249,3.854671,65.838235,104.285813,58.454,-3.088,Top,Top
yeovilton,1993.131915,6.485106,14.457872,6.12,4.32766,59.970638,128.466383,51.006,-2.641,Bottom,Bottom
