In [70]:
import pandas as pd
import numpy as np
import sqlite3

In [71]:
# Station CSV files and the location label attached to each one.
# The two lists are paired by position: file_paths[i] is labelled locations[i].
file_paths = [
    'data/cork_airport.csv',
    'data/dublin_airport.csv',
    'data/knock_airport.csv',
    'data/malin_head.csv',
    'data/mullingar.csv'
]
locations = ['Cork', 'Dublin', 'Knock', 'Malin Head', 'Mullingar']

# Number of lines skipped at the top of every file before the column header row.
# NOTE(review): presumably a metadata preamble in the raw files - confirm it is
# the same length for every station.
HEADER_ROWS = 19


def load_station(path, location, skiprows=HEADER_ROWS):
    """Read one station CSV and tag every row with its location.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.
    location : str
        Value written into the new 'Location' column for every row.
    skiprows : int, optional
        Number of leading lines to skip before the header row.

    Returns
    -------
    pandas.DataFrame
        The file contents plus a 'Location' column.
    """
    station = pd.read_csv(path, skiprows=skiprows)
    station['Location'] = location
    return station


# One frame per station; unpacking fails loudly if the lists do not yield
# exactly five (path, location) pairs.
cork_data, dublin_data, knock_data, malin_data, mullingar_data = (
    load_station(path, location)
    for path, location in zip(file_paths, locations)
)

   year  month  meant maxtp mintp  mnmax  mnmin  rain gmin  wdsp maxgt    sun  \
0  1962      1    5.5  11.6  -3.8    8.2    2.7       -6.3  12.8    70          
1  1962      2    5.3  12.3  -2.7    7.9    2.7       -4.4  13.6    57          
2  1962      3    4.2  12.2  -5.4    7.5    0.9       -7.2  10.3    63          
3  1962      4    7.7  18.8   0.6   11.2    4.1  68.6 -1.3  11.7    62  201.1   
4  1962      5    9.7  16.1   2.3   13.3    6.2  80.7 -0.7  12.0    45  216.5   

  Location  
0     Cork  
1     Cork  
2     Cork  
3     Cork  
4     Cork  


In [None]:
# Stack the five station frames into one table with a fresh 0..n-1 index
station_frames = [cork_data, dublin_data, knock_data, malin_data, mullingar_data]
all_data = pd.concat(station_frames, ignore_index=True)

# Persist the combined table (row index is not written)
output_file_path = 'data/consolidated_weather_data.csv'
all_data.to_csv(output_file_path, index=False)

# Show the first five rows of the combined table
print(all_data.head(5))

In [72]:
# First year kept in the filtered data set.
START_YEAR = 2000

# Keep rows where year >= START_YEAR and month >= 1.
# NOTE(review): the month test passes for every month number 1 and above, so it
# looks redundant for calendar months - confirm whether a narrower month
# window was intended.
filtered_data = all_data[(all_data['year'] >= START_YEAR) & (all_data['month'] >= 1)]

# Preview the filtered DataFrame
print(filtered_data.head())

# Save the filtered DataFrame to a new CSV file (row index is not written)
filtered_output_file_path = 'data/filtered_weather_data.csv'
filtered_data.to_csv(filtered_output_file_path, index=False)

     year  month  meant maxtp mintp  mnmax  mnmin   rain gmin  wdsp maxgt  \
456  2000      1    5.6  12.5  -2.0    8.0    3.1   50.2 -7.0  10.1    48   
457  2000      2    6.8  12.7   0.0    9.9    3.7  112.0 -3.6  12.0    50   
458  2000      3    8.0  15.7   0.4   11.3    4.7   20.2 -5.2   7.6    38   
459  2000      4    7.3  14.4  -1.8   10.6    3.9   50.2 -6.7  10.1    45   
460  2000      5   11.7  22.1   4.0   15.6    7.8   70.8  0.0   9.1    38   

       sun Location  
456   77.1     Cork  
457   85.6     Cork  
458  122.3     Cork  
459  147.0     Cork  
460  237.8     Cork  


In [73]:
# Build a new frame in which cells that are empty or whitespace-only become NaN.
# replace() returns a new DataFrame, so the frame produced by the filtering
# step is not modified and no copy warning is triggered.
blank_pattern = r'^\s*$'
filtered_data = filtered_data.replace(blank_pattern, np.nan, regex=True)

# Tally the NaN cells of every column separately for each location
null_flags = filtered_data.isnull()
missing_values = null_flags.groupby(filtered_data['Location']).sum()

# Report the per-location counts
print("Missing values by column and location:")
print(missing_values)


Missing values by column and location:
            year  month  meant  maxtp  mintp  mnmax  mnmin  rain  gmin  wdsp  \
Location                                                                       
Cork           0      0      0      0      0      0      0     0     0     0   
Dublin         0      0      0      0      0      0      0     0     1     0   
Knock          0      0      0      0      0      0      0     0     0     0   
Malin Head     0      0      0      0      0      0      0     0     0     1   
Mullingar      0      0      0      0      0      0      0     0     1     0   

            maxgt  sun  Location  
Location                          
Cork            0    0         0  
Dublin          0    0         0  
Knock           0   55         0  
Malin Head      0  166         0  
Mullingar       0  214         0  


[Reference: Python Raw Strings](https://realpython.com/python-raw-strings/)<br/>
[Reference: Replace values in Pandas dataframe using regex](https://www.geeksforgeeks.org/replace-values-in-pandas-dataframe-using-regex/)<br/>
[Reference: Pandas: How to Replace Zero with NaN](https://www.statology.org/pandas-replace-0-with-nan/)<br/>
[Reference: A Guide to R Regular Expressions](https://www.datacamp.com/tutorial/regex-r-regular-expressions-guide) <br/>
[Reference: Using isnull() and groupby() on a pandas dataframe](https://stackoverflow.com/questions/46106954/using-isnull-and-groupby-on-a-pandas-dataframe)

