# Pre-Requisites

In [1]:
!unzip -u archive.zip

Archive:  archive.zip


In [2]:
!pip install pandas==2.2.3



# Library Imports

In [3]:
import pandas as pd

# Constants

In [4]:
RAW_DATA_FILEPATH = 'GlobalWeatherRepository.csv'

REMOVE_COLUMNS = ['last_updated_epoch', 'timezone',
                  'temperature_fahrenheit', 'wind_mph', 'wind_kph',
                  'wind_degree', 'wind_direction', 'pressure_mb', 'pressure_in',
                  'precip_mm', 'precip_in', 'humidity', 'cloud', 
                  'feels_like_fahrenheit', 'visibility_km', 'visibility_miles',
                  'gust_mph', 'gust_kph', 'air_quality_Carbon_Monoxide',
                  'air_quality_Sulphur_dioxide', 'air_quality_PM2.5','air_quality_PM10',
                  'air_quality_Ozone', 'air_quality_Nitrogen_dioxide', 'air_quality_gb-defra-index', 'sunrise',
                  'sunset', 'moonrise', 'moonset', 'moon_phase', 'moon_illumination']
DATASET_START_DATE = '2024-05-16'
DATASET_END_DATE = '2025-02-16'
MIN_RECORDS_PER_LOCATION = 250

OUT_DATA_FILEPATH = 'Global-Weather-Data.csv'

# Data Pre-Processing

## Load and Parse Data

In [5]:
df = pd.read_csv(RAW_DATA_FILEPATH)
df = df.drop(columns=REMOVE_COLUMNS, axis=1)

# convert columns to datetime objects
df['last_updated'] = pd.to_datetime(df['last_updated'])
df['date'] = df['last_updated'].dt.date

## Filter Data Items

In [6]:
print('Total Records (Before Filtering):', len(df))

# filter based on dataste date range
start_date = pd.to_datetime(DATASET_START_DATE).date()
end_date = pd.to_datetime(DATASET_END_DATE).date()
df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

# ensure unique records for (location, date)
df = df.sort_values(by=['location_name', 'last_updated'], ascending=[True, False])
df = df.loc[df.groupby(['location_name', 'date'])['last_updated'].idxmax()]

# filter locations based on minimum number of records
df = df.groupby('location_name').filter(lambda x: len(x) >= MIN_RECORDS_PER_LOCATION)
df = df.groupby('country').filter(lambda x: len(x) >= MIN_RECORDS_PER_LOCATION)
print('Total Records (After Filtering):', len(df))

Total Records (Before Filtering): 54763
Total Records (After Filtering): 50819


## Extrapolate Features

In [7]:
# extrapolate date related columns
df['month'] = df['last_updated'].dt.month
df['year'] = df['last_updated'].dt.year
df['week'] = (df['last_updated'] - pd.to_datetime(DATASET_START_DATE)).dt.days // 7 + 1
df['time'] = df['last_updated'].dt.time
df = df.drop(columns=['last_updated'], axis=1)

In [8]:
# extrapolate hemisphere based on latitude
df['hemisphere'] = df['latitude'].map(lambda latitude: 'Northern' if latitude >= 0 else 'Southern')

# extrapolate season based on hemisphere and month
def get_season(hemisphere: str, month: int) -> str:
    if hemisphere == 'Northern':
        if month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        elif month in [9, 10, 11]:
            return 'Autumn'
        else:
            return 'Winter'
    else:
        if month in [3, 4, 5]:
            return 'Autumn'
        elif month in [6, 7, 8]:
            return 'Winter'
        elif month in [9, 10, 11]:
            return 'Spring'
        else:
            return 'Summer'
df['season'] = df.apply(lambda row: get_season(row['latitude'], row['month']), axis=1)

In [9]:
# extrapolate continent based on country
country_mapping = {
    'Afghanistan': 'Asia', 'Albania': 'Europe', 'Algeria': 'Africa', 'Andorra': 'Europe', 'Angola': 'Africa',
    'Antigua and Barbuda': 'North America', 'Argentina': 'South America', 'Armenia': 'Asia', 'Australia': 'Australia',
    'Austria': 'Europe', 'Azerbaijan': 'Asia', 'Bahamas': 'North America', 'Bahrain': 'Asia', 'Bangladesh': 'Asia',
    'Barbados': 'North America', 'Belarus': 'Europe', 'Belgium': 'Europe', 'Belize': 'North America', 'Benin': 'Africa',
    'Bhutan': 'Asia', 'Bolivia': 'South America', 'Bosnia and Herzegovina': 'Europe', 'Botswana': 'Africa', 'Brazil': 'South America',
    'Brunei Darussalam': 'Asia', 'Bulgaria': 'Europe', 'Burkina Faso': 'Africa', 'Burundi': 'Africa', 'Cambodia': 'Asia',
    'Canada': 'North America', 'Cape Verde': 'Africa', 'Central African Republic': 'Africa', 'Chad': 'Africa', 'Chile': 'South America',
    'China': 'Asia', 'Comoros': 'Africa', 'Congo': 'Africa', 'Costa Rica': 'North America', 'Croatia': 'Europe', 'Cuba': 'North America',
    'Cyprus': 'Europe', 'Czech Republic': 'Europe', 'Democratic Republic of Congo': 'Africa', 'Denmark': 'Europe', 'Djibouti': 'Africa',
    'Dominica': 'North America', 'Dominican Republic': 'North America', 'Ecuador': 'South America', 'Egypt': 'Africa', 'El Salvador': 'North America',
    'Equatorial Guinea': 'Africa', 'Eritrea': 'Africa', 'Estonia': 'Europe', 'Ethiopia': 'Africa', 'Fiji Islands': 'Oceania',
    'Finland': 'Europe', 'France': 'Europe', 'Gabon': 'Africa', 'Gambia': 'Africa', 'Georgia': 'Asia', 'Germany': 'Europe',
    'Ghana': 'Africa', 'Greece': 'Europe', 'Grenada': 'North America', 'Guatemala': 'North America', 'Guinea': 'Africa',
    'Guinea-Bissau': 'Africa', 'Guyana': 'South America', 'Haiti': 'North America', 'Honduras': 'North America', 'Hungary': 'Europe',
    'India': 'Asia', 'Indonesia': 'Asia', 'Iran': 'Asia', 'Iraq': 'Asia', 'Ireland': 'Europe', 'Israel': 'Asia', 'Italy': 'Europe',
    'Japan': 'Asia', 'Jordan': 'Asia', 'Kazakhstan': 'Asia', 'Kenya': 'Africa', 'Kiribati': 'Oceania', 'Kuwait': 'Asia', 'Kyrghyzstan': 'Asia',
    'Latvia': 'Europe', 'Lebanon': 'Asia', 'Lesotho': 'Africa', 'Liberia': 'Africa', 'Liechtenstein': 'Europe', 'Lithuania': 'Europe',
    'Luxembourg': 'Europe', 'Macedonia': 'Europe', 'Madagascar': 'Africa', 'Malawi': 'Africa', 'Malaysia': 'Asia', 'Mali': 'Africa',
    'Malta': 'Europe', 'Marshall Islands': 'Oceania', 'Mauritania': 'Africa', 'Mauritius': 'Africa', 'Mexico': 'North America',
    'Micronesia': 'Oceania', 'Monaco': 'Europe', 'Mongolia': 'Asia', 'Montenegro': 'Europe', 'Morocco': 'Africa', 'Mozambique': 'Africa',
    'Namibia': 'Africa', 'Nepal': 'Asia', 'Netherlands': 'Europe', 'New Zealand': 'Oceania', 'Nicaragua': 'North America', 'Niger': 'Africa',
    'Nigeria': 'Africa', 'North Korea': 'Asia', 'Norway': 'Europe', 'Oman': 'Asia', 'Pakistan': 'Asia', 'Panama': 'North America',
    'Papua New Guinea': 'Oceania', 'Peru': 'South America', 'Philippines': 'Asia', 'Poland': 'Europe', 'Portugal': 'Europe',
    'Qatar': 'Asia', 'Romania': 'Europe', 'Russia': 'Europe/Asia', 'Rwanda': 'Africa', 'Saint Kitts and Nevis': 'North America',
    'Saint Lucia': 'North America', 'Saint Vincent and the Grenadines': 'North America', 'Samoa': 'Oceania', 'San Marino': 'Europe',
    'Saudi Arabia': 'Asia', 'Senegal': 'Africa', 'Serbia': 'Europe', 'Seychelles Islands': 'Africa', 'Sierra Leone': 'Africa', 'Singapore': 'Asia',
    'Slovakia': 'Europe', 'Slovenia': 'Europe', 'Solomon Islands': 'Oceania', 'Somalia': 'Africa', 'South Africa': 'Africa',
    'South Korea': 'Asia', 'Spain': 'Europe', 'Sri Lanka': 'Asia', 'Sudan': 'Africa', 'Suriname': 'South America', 'Swaziland': 'Africa',
    'Sweden': 'Europe', 'Switzerland': 'Europe', 'Syria': 'Asia', 'Tajikistan': 'Asia', 'Tanzania': 'Africa', 'Thailand': 'Asia',
    'Timor-Leste': 'Asia', 'Tonga': 'Oceania', 'Trinidad and Tobago': 'North America', 'Tunisia': 'Africa', 'Turkey': 'Asia/Europe',
    'Turkmenistan': 'Asia', 'Tuvalu': 'Oceania', 'Uganda': 'Africa', 'Ukraine': 'Europe', 'United Arab Emirates': 'Asia',
    'United Kingdom': 'Europe', 'United States of America': 'North America', 'Uruguay': 'South America', 'Uzbekistan': 'Asia',
    'Vanuatu': 'Oceania', 'Vatican City': 'Europe', 'Venezuela': 'South America', 'Vietnam': 'Asia', 'Yemen': 'Asia', 'Zambia': 'Africa',
    'Zimbabwe': 'Africa'
}
df['continent'] = df['country'].map(country_mapping)

In [10]:
# map conditions
CONDITION_CLEAR_SUNNY = 'Clear/Sunny'
CONDITION_CLOUDY_OVERCAST = 'Cloudy/Overcast'
CONDITION_RAINY = 'Rainy'
CONDITION_SLEET = 'Sleet'
CONDITION_SNOWY = 'Snowy'
CONDITION_FOG_MIST = 'Fog/Mist'
CONDITION_THUNDERSTORMS = 'Thunderstorms'

condition_mapping = {
    'sunny': CONDITION_CLEAR_SUNNY,
    'clear': CONDITION_CLEAR_SUNNY,
    'partly cloudy': CONDITION_CLEAR_SUNNY,
    'overcast': CONDITION_CLOUDY_OVERCAST,
    'cloudy': CONDITION_CLOUDY_OVERCAST,
    'patchy rain possible': CONDITION_RAINY,
    'patchy light drizzle': CONDITION_RAINY,
    'patchy rain nearby': CONDITION_RAINY,
    'light rain': CONDITION_RAINY,
    'light rain shower': CONDITION_RAINY,
    'moderate rain': CONDITION_RAINY,
    'moderate rain at times': CONDITION_RAINY,
    'light drizzle': CONDITION_RAINY,
    'moderate or heavy rain with thunder': CONDITION_RAINY,
    'heavy rain': CONDITION_RAINY,
    'heavy rain at times': CONDITION_RAINY,
    'patchy light rain': CONDITION_RAINY,
    'patchy light rain in area with thunder': CONDITION_RAINY,
    'torrential rain shower': CONDITION_RAINY,
    'moderate or heavy rain shower': CONDITION_RAINY,
    'light freezing rain': CONDITION_SLEET,
    'light sleet': CONDITION_SLEET,
    'light sleet showers': CONDITION_SLEET,
    'moderate or heavy sleet': CONDITION_SLEET,
    'freezing drizzle': CONDITION_SLEET,
    'patchy moderate snow': CONDITION_SNOWY,
    'light snow': CONDITION_SNOWY,
    'light snow showers': CONDITION_SNOWY,
    'moderate snow': CONDITION_SNOWY,
    'moderate or heavy snow showers': CONDITION_SNOWY,
    'patchy light snow': CONDITION_SNOWY,
    'patchy heavy snow': CONDITION_SNOWY,
    'heavy snow': CONDITION_SNOWY,
    'blowing snow': CONDITION_SNOWY,
    'patchy snow possible': CONDITION_SNOWY,
    'patchy snow nearby': CONDITION_SNOWY,
    'blizzard': CONDITION_SNOWY,
    'moderate or heavy snow in area with thunder': CONDITION_SNOWY,
    'mist': CONDITION_FOG_MIST,
    'fog': CONDITION_FOG_MIST,
    'freezing fog': CONDITION_FOG_MIST,
    'thundery outbreaks in nearby': CONDITION_THUNDERSTORMS,
    'thundery outbreaks possible': CONDITION_THUNDERSTORMS,
    'patchy light rain with thunder': CONDITION_THUNDERSTORMS,
    'patchy light rain in area with thunder': CONDITION_THUNDERSTORMS,
    'moderate or heavy rain in area with thunder': CONDITION_THUNDERSTORMS,
}
df['condition'] = df['condition_text'].str.lower().map(condition_mapping)
df = df.drop(columns=['condition_text'])

## Save Processed Data

In [11]:
# confirm no missing values
assert df.isna().sum().sum() == 0, "There are missing values in the DataFrame!"

In [12]:
# save dataframe as csv for visualisations
df.to_csv(OUT_DATA_FILEPATH, index=False)
df

Unnamed: 0,country,location_name,latitude,longitude,temperature_celsius,feels_like_celsius,uv_index,air_quality_us-epa-index,date,month,year,week,time,hemisphere,season,continent,condition
379,United Arab Emirates,Abu Dhabi,24.47,54.37,36.0,46.3,8.0,2,2024-05-16,5,2024,1,18:15:00,Northern,Autumn,Asia,Clear/Sunny
572,United Arab Emirates,Abu Dhabi,24.47,54.37,33.0,40.2,1.0,3,2024-05-17,5,2024,1,20:15:00,Northern,Autumn,Asia,Clear/Sunny
765,United Arab Emirates,Abu Dhabi,24.47,54.37,35.0,45.9,8.0,2,2024-05-18,5,2024,1,18:30:00,Northern,Autumn,Asia,Clear/Sunny
960,United Arab Emirates,Abu Dhabi,24.47,54.37,32.0,38.4,7.0,2,2024-05-19,5,2024,1,18:15:00,Northern,Autumn,Asia,Clear/Sunny
1155,United Arab Emirates,Abu Dhabi,24.47,54.37,32.0,39.5,7.0,2,2024-05-20,5,2024,1,18:45:00,Northern,Autumn,Asia,Clear/Sunny
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52661,Croatia,Zagreb,45.80,16.00,2.3,0.1,0.2,3,2025-02-12,2,2025,39,11:15:00,Northern,Summer,Europe,Rainy
52856,Croatia,Zagreb,45.80,16.00,5.4,4.7,0.5,4,2025-02-13,2,2025,40,11:15:00,Northern,Summer,Europe,Fog/Mist
53051,Croatia,Zagreb,45.80,16.00,1.2,-2.8,0.8,1,2025-02-14,2,2025,40,11:00:00,Northern,Summer,Europe,Snowy
53245,Croatia,Zagreb,45.80,16.00,2.3,-0.8,1.7,2,2025-02-15,2,2025,40,11:15:00,Northern,Summer,Europe,Clear/Sunny
