In [1]:
import pandas as pd
from scipy import stats
import numpy as np

In [2]:
# import co data
file_path = 'Milan_Data/co_milan.csv'
co_data = pd.read_csv(file_path)
co_data.head()

Unnamed: 0,Samplingpoint,Pollutant,Start,Value,Unit
0,IT/SPO-IT0467A_00010_100,10,2006-01-01 00:00:00,1.5,mg.m-3
1,IT/SPO-IT0467A_00010_100,10,2006-01-01 01:00:00,1.4,mg.m-3
2,IT/SPO-IT0467A_00010_100,10,2006-01-01 02:00:00,1.6,mg.m-3
3,IT/SPO-IT0467A_00010_100,10,2006-01-01 03:00:00,1.7,mg.m-3
4,IT/SPO-IT0467A_00010_100,10,2006-01-01 04:00:00,1.2,mg.m-3


In [3]:
# import no2 data
no2_file_path = 'Milan_Data/no2_milan.csv'
no2_data = pd.read_csv(no2_file_path)

# Display the first few rows of the NO2 data to understand its structure
no2_data.head()

Unnamed: 0,Samplingpoint,Pollutant,Start,Value,Unit
0,IT/SPO-IT0466A_00008_100,8,2006-01-01 00:00:00,65.0,ug.m-3
1,IT/SPO-IT0466A_00008_100,8,2006-01-01 01:00:00,65.0,ug.m-3
2,IT/SPO-IT0466A_00008_100,8,2006-01-01 02:00:00,71.0,ug.m-3
3,IT/SPO-IT0466A_00008_100,8,2006-01-01 03:00:00,74.0,ug.m-3
4,IT/SPO-IT0466A_00008_100,8,2006-01-01 04:00:00,74.0,ug.m-3


In [4]:
# import o3 data
o3_file_path = 'Milan_Data/O3_Milan.csv'
o3_data = pd.read_csv(o3_file_path)

# Display the first few rows of the O3 data to understand its structure
o3_data.head()

Unnamed: 0.1,Unnamed: 0,Samplingpoint,Pollutant,Start,Value,Unit
0,0,IT/SPO-IT0522A_00007_500,7,2003-01-01 00:00:00,0.0,ug.m-3
1,1,IT/SPO-IT0522A_00007_500,7,2003-01-01 01:00:00,8.0,ug.m-3
2,2,IT/SPO-IT0522A_00007_500,7,2003-01-01 02:00:00,8.0,ug.m-3
3,3,IT/SPO-IT0522A_00007_500,7,2003-01-01 03:00:00,8.0,ug.m-3
4,4,IT/SPO-IT0522A_00007_500,7,2003-01-01 04:00:00,8.0,ug.m-3


In [5]:
aqi_plus_path = "clean.csv"
clean_data = pd.read_csv(aqi_plus_path)
clean_data.head()

Unnamed: 0,PT08.S1(CO),PT08.S2(NMHC),PT08.S3(NOx),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,DateTime,Season
0,1360.0,1046.0,1056.0,1692.0,1268.0,13.6,48.9,0.7578,2004-03-10 18:00:00,winter
1,1292.0,955.0,1174.0,1559.0,972.0,13.3,47.7,0.7255,2004-03-10 19:00:00,winter
2,1402.0,939.0,1140.0,1555.0,1074.0,11.9,54.0,0.7502,2004-03-10 20:00:00,winter
3,1376.0,948.0,1092.0,1584.0,1203.0,11.0,60.0,0.7867,2004-03-10 21:00:00,winter
4,1272.0,836.0,1205.0,1490.0,1110.0,11.2,59.6,0.7888,2004-03-10 22:00:00,winter


In [6]:
# Convert the Start column to datetime
co_data['Start'] = pd.to_datetime(co_data['Start'])

# Filter the data for the specified sampling point and date range
filtered_co_data = co_data[
    (co_data['Samplingpoint'] == 'IT/SPO-IT0770A_00010_500') &
    (co_data['Start'] >= '2004-03-10') &
    (co_data['Start'] <= '2005-04-04')
]

# Display the filtered data
filtered_co_data.head()

Unnamed: 0,Samplingpoint,Pollutant,Start,Value,Unit
171096,IT/SPO-IT0770A_00010_500,10,2004-03-10 00:00:00,3.0,mg.m-3
171097,IT/SPO-IT0770A_00010_500,10,2004-03-10 01:00:00,2.8,mg.m-3
171098,IT/SPO-IT0770A_00010_500,10,2004-03-10 02:00:00,2.1,mg.m-3
171099,IT/SPO-IT0770A_00010_500,10,2004-03-10 03:00:00,1.9,mg.m-3
171100,IT/SPO-IT0770A_00010_500,10,2004-03-10 04:00:00,1.8,mg.m-3


In [7]:
nan_indices = filtered_co_data[filtered_co_data['Value'].isna()].index
print(nan_indices)

#No Values are NaN

Index([], dtype='int64')


In [8]:
# Filter the NO2 data for a different sampling point within the specified date range
alternative_sampling_point = 'IT/SPO-IT0770A_00008_500'
filtered_no2_data = no2_data[
    (no2_data['Samplingpoint'] == alternative_sampling_point) &
    (no2_data['Start'] >= '2004-03-10') &
    (no2_data['Start'] <= '2005-04-04')
]

# Display the filtered NO2 data
filtered_no2_data.head()

Unnamed: 0,Samplingpoint,Pollutant,Start,Value,Unit
171000,IT/SPO-IT0770A_00008_500,8,2004-03-10 00:00:00,94.0,ug.m-3
171001,IT/SPO-IT0770A_00008_500,8,2004-03-10 01:00:00,84.0,ug.m-3
171002,IT/SPO-IT0770A_00008_500,8,2004-03-10 02:00:00,73.0,ug.m-3
171003,IT/SPO-IT0770A_00008_500,8,2004-03-10 03:00:00,62.0,ug.m-3
171004,IT/SPO-IT0770A_00008_500,8,2004-03-10 04:00:00,56.0,ug.m-3


In [9]:
nan_indices = filtered_no2_data[filtered_no2_data['Value'].isna()].index
print(nan_indices)

#No Values are NaN

Index([], dtype='int64')


In [10]:
# Filter the O3 data for the selected sampling point within the specified date range
o3_sampling_point = 'IT/SPO-IT0770A_00007_500'
filtered_o3_data = o3_data[
    (o3_data['Samplingpoint'] == o3_sampling_point) &
    (o3_data['Start'] >= '2004-03-10') &
    (o3_data['Start'] <= '2005-04-04')
]

# Display the first few rows of the data with the 8-hour rolling average
filtered_o3_data.tail()

Unnamed: 0.1,Unnamed: 0,Samplingpoint,Pollutant,Start,Value,Unit
28027,28027,IT/SPO-IT0770A_00007_500,7,2005-04-03 19:00:00,77.0,ug.m-3
28028,28028,IT/SPO-IT0770A_00007_500,7,2005-04-03 20:00:00,56.0,ug.m-3
28029,28029,IT/SPO-IT0770A_00007_500,7,2005-04-03 21:00:00,19.0,ug.m-3
28030,28030,IT/SPO-IT0770A_00007_500,7,2005-04-03 22:00:00,9.0,ug.m-3
28031,28031,IT/SPO-IT0770A_00007_500,7,2005-04-03 23:00:00,8.0,ug.m-3
