# Prerequisites

In [1]:
# Unpack archive.zip into the working directory. `-u` (update) only extracts
# entries that are missing on disk or newer than the existing copy, so
# re-running this cell is cheap.
# NOTE(review): presumably the archive provides GlobalWeatherRepository.csv — confirm.
!unzip -u archive.zip

Archive:  archive.zip


In [2]:
!pip install pandas==2.2.3



# Library Imports

In [3]:
import pandas as pd

# Constants

In [4]:
# Path of the raw CSV that the pipeline reads.
RAW_DATA_FILEPATH = 'GlobalWeatherRepository.csv'

# Columns dropped right after loading. 'last_updated' is listed here because it
# is rebuilt afterwards from 'last_updated_epoch'.
REMOVE_COLUMNS = [
    'timezone', 'last_updated', 'temperature_fahrenheit', 'wind_mph', 'wind_kph',
    'wind_degree', 'wind_direction', 'pressure_mb', 'pressure_in', 'precip_mm', 'precip_in',
    'cloud', 'feels_like_celsius', 'feels_like_fahrenheit', 'visibility_km', 'visibility_miles',
    'uv_index', 'gust_mph', 'gust_kph', 'air_quality_Carbon_Monoxide', 'air_quality_Ozone',
    'air_quality_Nitrogen_dioxide', 'air_quality_Sulphur_dioxide', 'air_quality_PM2.5',
    'air_quality_PM10', 'air_quality_us-epa-index', 'air_quality_gb-defra-index', 'sunrise',
    'sunset', 'moonrise', 'moonset', 'moon_phase', 'moon_illumination',
]

# Countries kept by the country filter (EU member states plus the United Kingdom).
COUNTRIES = {
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia',
    'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania',
    'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'United Kingdom',
}

# Inclusive date window applied to the records.
DATASET_START_DATE = '2024-05-16'
# Origin of the 1-based week index; 2024-05-13 is the Monday of the week that
# contains DATASET_START_DATE.
DATASET_START_WEEK = '2024-05-13'
DATASET_END_DATE = '2025-02-16'
# Minimum number of rows a location (and, in a second pass, a country) must
# have to be kept.
MIN_RECORDS_PER_LOCATION = 250

# Calendar month number -> season label.
MONTH_SEASON_MAPPING = {
    1: 'Winter',
    2: 'Winter',
    3: 'Spring',
    4: 'Spring',
    5: 'Spring',
    6: 'Summer',
    7: 'Summer',
    8: 'Summer',
    9: 'Autumn',
    10: 'Autumn',
    11: 'Autumn',
    12: 'Winter',
}

# Coarse condition categories written to the output 'condition' column.
CONDITION_CLEAR_SUNNY = 'Clear/Sunny'
CONDITION_CLOUDY_OVERCAST = 'Cloudy/Overcast'
CONDITION_RAINY = 'Rainy'
CONDITION_SLEET = 'Sleet'
CONDITION_SNOWY = 'Snowy'
CONDITION_FOG_MIST = 'Fog/Mist'
CONDITION_THUNDERSTORMS = 'Thunderstorms'

# Lower-cased raw condition text -> coarse condition category.
# Every key must appear exactly once: in a dict literal a repeated key silently
# keeps only the last value.
CONDITIONS_MAPPING = {
    'sunny': CONDITION_CLEAR_SUNNY,
    'clear': CONDITION_CLEAR_SUNNY,
    'partly cloudy': CONDITION_CLEAR_SUNNY,
    'overcast': CONDITION_CLOUDY_OVERCAST,
    'cloudy': CONDITION_CLOUDY_OVERCAST,
    'patchy rain possible': CONDITION_RAINY,
    'patchy light drizzle': CONDITION_RAINY,
    'patchy rain nearby': CONDITION_RAINY,
    'light rain': CONDITION_RAINY,
    'light rain shower': CONDITION_RAINY,
    'moderate rain': CONDITION_RAINY,
    'moderate rain at times': CONDITION_RAINY,
    'light drizzle': CONDITION_RAINY,
    # NOTE(review): the other rain-with-thunder texts below map to
    # Thunderstorms; confirm whether this one is intentionally Rainy.
    'moderate or heavy rain with thunder': CONDITION_RAINY,
    'heavy rain': CONDITION_RAINY,
    'heavy rain at times': CONDITION_RAINY,
    'patchy light rain': CONDITION_RAINY,
    'torrential rain shower': CONDITION_RAINY,
    'moderate or heavy rain shower': CONDITION_RAINY,
    'light freezing rain': CONDITION_SLEET,
    'light sleet': CONDITION_SLEET,
    'light sleet showers': CONDITION_SLEET,
    'moderate or heavy sleet': CONDITION_SLEET,
    'freezing drizzle': CONDITION_SLEET,
    'patchy moderate snow': CONDITION_SNOWY,
    'light snow': CONDITION_SNOWY,
    'light snow showers': CONDITION_SNOWY,
    'moderate snow': CONDITION_SNOWY,
    'moderate or heavy snow showers': CONDITION_SNOWY,
    'patchy light snow': CONDITION_SNOWY,
    'patchy heavy snow': CONDITION_SNOWY,
    'heavy snow': CONDITION_SNOWY,
    'blowing snow': CONDITION_SNOWY,
    'patchy snow possible': CONDITION_SNOWY,
    'patchy snow nearby': CONDITION_SNOWY,
    'blizzard': CONDITION_SNOWY,
    'moderate or heavy snow in area with thunder': CONDITION_SNOWY,
    'mist': CONDITION_FOG_MIST,
    'fog': CONDITION_FOG_MIST,
    'freezing fog': CONDITION_FOG_MIST,
    'thundery outbreaks in nearby': CONDITION_THUNDERSTORMS,
    'thundery outbreaks possible': CONDITION_THUNDERSTORMS,
    'patchy light rain with thunder': CONDITION_THUNDERSTORMS,
    # This key used to be listed a second time under Rainy; that earlier entry
    # was overridden by this one, so only this entry is kept.
    'patchy light rain in area with thunder': CONDITION_THUNDERSTORMS,
    'moderate or heavy rain in area with thunder': CONDITION_THUNDERSTORMS,
}


# Path of the processed CSV written at the end of the pipeline.
OUT_DATA_FILEPATH = 'EU-UK-Weather-Data.csv'

# Data Pre-Processing

## Load and Parse Data

In [5]:
# read the raw CSV and discard the columns listed in REMOVE_COLUMNS
df = pd.read_csv(RAW_DATA_FILEPATH).drop(columns=REMOVE_COLUMNS)

# rebuild the update timestamp from the epoch seconds, then take its calendar date
epoch_seconds = df['last_updated_epoch']
df['last_updated'] = pd.to_datetime(epoch_seconds, unit='s')
df['date'] = df['last_updated'].dt.date

## Filter Data Items

In [6]:
print('Total Records (Before Filtering):', len(df))

# keep only rows whose country is in the EU + UK set
country_in_scope = df['country'].isin(COUNTRIES)
df = df[country_in_scope]

# keep only rows inside the dataset date range (both bounds inclusive)
start_date = pd.to_datetime(DATASET_START_DATE).date()
end_date = pd.to_datetime(DATASET_END_DATE).date()
on_or_after_start = df['date'] >= start_date
on_or_before_end = df['date'] <= end_date
df = df[on_or_after_start & on_or_before_end]

# one row per (location, date): the row with the latest update timestamp
df = df.sort_values(by=['location_name', 'last_updated'], ascending=[True, False])
latest_row_labels = df.groupby(['location_name', 'date'])['last_updated'].idxmax()
df = df.loc[latest_row_labels]


def has_enough_records(group):
    """Return True when the group has at least MIN_RECORDS_PER_LOCATION rows."""
    return len(group) >= MIN_RECORDS_PER_LOCATION


# drop locations, then countries, that fall below the minimum row count
df = df.groupby('location_name').filter(has_enough_records)
df = df.groupby('country').filter(has_enough_records)
print('Total Records (After Filtering):', len(df))

Total Records (Before Filtering): 54763
Total Records (After Filtering): 7943


## Derive Features

In [7]:
# derive date-related columns from the update timestamp
df['day'] = df['last_updated'].dt.day
df['month'] = df['last_updated'].dt.month
df['year'] = df['last_updated'].dt.year
# 1-based week index: whole 7-day blocks elapsed since DATASET_START_WEEK, plus one
df['week'] = (df['last_updated'] - pd.to_datetime(DATASET_START_WEEK)).dt.days // 7 + 1
df['time'] = df['last_updated'].dt.time
df = df.drop(columns=['last_updated'])

# derive season from month
df['season'] = df['month'].map(MONTH_SEASON_MAPPING)

# group condition texts into conditions; stray surrounding whitespace and
# letter case must not prevent a match against the lower-case mapping keys
condition_keys = df['condition_text'].str.strip().str.lower()
df['condition'] = condition_keys.map(CONDITIONS_MAPPING)

# Series.map leaves NaN for any text that is not a key of CONDITIONS_MAPPING.
# Fail here and name the offending texts instead of letting the NaNs surface
# later as an unspecific "missing values" failure.
unmapped_conditions = condition_keys[df['condition'].isna()].unique().tolist()
if unmapped_conditions:
    raise ValueError(
        f'Condition texts missing from CONDITIONS_MAPPING: {unmapped_conditions}'
    )
df = df.drop(columns=['condition_text'])

## Save Processed Data

In [8]:
# the processed frame must not contain a single missing value
missing_value_count = df.isna().sum().sum()
assert missing_value_count == 0, 'There are missing values in the DataFrame!'

In [9]:
# write the processed frame to CSV (row index omitted) for the visualisations,
# then show it as the cell output
df.to_csv(path_or_buf=OUT_DATA_FILEPATH, index=False)
df

Unnamed: 0,country,location_name,latitude,longitude,last_updated_epoch,temperature_celsius,humidity,date,day,month,year,week,time,season,condition
317,Netherlands,Amsterdam,52.37,4.89,1715868000,20.0,68,2024-05-16,16,5,2024,1,14:00:00,Spring,Clear/Sunny
511,Netherlands,Amsterdam,52.37,4.89,1715961600,16.0,88,2024-05-17,17,5,2024,1,16:00:00,Spring,Clear/Sunny
704,Netherlands,Amsterdam,52.37,4.89,1716042600,22.0,50,2024-05-18,18,5,2024,1,14:30:00,Spring,Rainy
898,Netherlands,Amsterdam,52.37,4.89,1716127200,22.0,57,2024-05-19,19,5,2024,1,14:00:00,Spring,Clear/Sunny
1093,Netherlands,Amsterdam,52.37,4.89,1716216300,19.0,78,2024-05-20,20,5,2024,2,14:45:00,Spring,Rainy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52661,Croatia,Zagreb,45.80,16.00,1739355300,2.3,87,2025-02-12,12,2,2025,40,10:15:00,Winter,Rainy
52856,Croatia,Zagreb,45.80,16.00,1739441700,5.4,87,2025-02-13,13,2,2025,40,10:15:00,Winter,Fog/Mist
53051,Croatia,Zagreb,45.80,16.00,1739527200,1.2,86,2025-02-14,14,2,2025,40,10:00:00,Winter,Snowy
53245,Croatia,Zagreb,45.80,16.00,1739614500,2.3,55,2025-02-15,15,2,2025,40,10:15:00,Winter,Clear/Sunny
