In [1]:
import pandas as pd
import glob

### Import and concatenate all Ozone CSV files into one DataFrame

In [2]:
# Glob pattern matching every raw Ozone CSV file
csv_files_path = r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Data\Ozone\*.csv" # Adjust this path as necessary 

# glob.glob returns matches in a filesystem-dependent order; sort them so the
# row order of the concatenated DataFrame is the same on every run and machine
csv_files = sorted(glob.glob(csv_files_path))

# Fail early with a clear message when the pattern matches nothing, instead of
# letting pd.concat raise its generic "No objects to concatenate" error
if not csv_files:
    raise FileNotFoundError(f"No CSV files matched the pattern: {csv_files_path}")

# Read every CSV file and concatenate them into a single DataFrame with a fresh 0..n-1 index
ozone_df = pd.concat([pd.read_csv(file, low_memory=False) for file in csv_files], ignore_index=True)

# Now ozone_df contains all the data from the matched CSV files (20 at the time of writing)


### Data cleaning and preprocessing

In [3]:
# Columns we do not want to carry into the cleaned dataset
unwanted_columns = [
    'Parameter Code',
    'Sample Duration',
    'Pollutant Standard',
    'Event Type',
    'Observation Count',
    'Observation Percent',
    'Method Code',
    'Method Name',
    'Local Site Name',
    'Address',
    'Date of Last Change',
]

# Remove them from ozone_df in place
ozone_df.drop(columns=unwanted_columns, inplace=True)

In [4]:
# Drop Canada: locate the rows whose State Code is the 'CC' marker and remove them
canada_rows = ozone_df.index[ozone_df['State Code'] == 'CC']
ozone_df.drop(index=canada_rows, inplace=True)

# Change State Code dtype to int64 (done only after the 'CC' rows are gone)
ozone_df['State Code'] = ozone_df['State Code'].astype('int64')

# Drop Mexico, Virgin Islands and Puerto Rico (State Codes 80, 78 and 72)
excluded_rows = ozone_df.index[ozone_df['State Code'].isin([80, 78, 72])]
ozone_df.drop(index=excluded_rows, inplace=True)

In [5]:
# Collapse the data to one measurement per station, per day, per parameter.
# Every column listed here identifies a group; rows sharing all of them are merged.
station_day_keys = [
    'State Code',
    'State Name',
    'County Code',
    'County Name',
    'City Name',
    'Site Num',
    'Datum',
    'Latitude',
    'Longitude',
    'Date Local',
    'Parameter Name',
    'Units of Measure',
]

# How each remaining column is reduced within a group
# NOTE(review): '1st Max Hour' takes the largest hour in the group, which is not
# necessarily the hour of the row that supplies '1st Max Value' - confirm this is intended
daily_aggregations = {
    'Arithmetic Mean': 'mean',
    '1st Max Value': 'max',
    '1st Max Hour': 'max',
    'CBSA Name': 'first',
    'AQI': 'mean',
}

# reset_index turns the group keys back into ordinary columns
ozone_df = ozone_df.groupby(station_day_keys).agg(daily_aggregations).reset_index()

In [6]:
# Build a unique identifier for each measurement:
# <State Code>_<County Code>_<Site Num>_<Date Local with the dashes removed>
state_part = ozone_df['State Code'].map(str)
county_part = ozone_df['County Code'].map(str)
site_part = ozone_df['Site Num'].map(str)
date_part = ozone_df['Date Local'].str.replace('-','')

ozone_df['measurement_id'] = state_part + '_' + county_part + '_' + site_part + '_' + date_part

In [7]:
# List the distinct parameters present in the DataFrame
pd.unique(ozone_df['Parameter Name'])

array(['Ozone'], dtype=object)

In [8]:
# Rename the generic measurement columns to match the value measured
ozone_column_names = {
    'Arithmetic Mean': 'Ozone(ppm)',
    '1st Max Value': 'Ozone(ppm) Max Value',
    '1st Max Hour': 'Ozone(ppm) Max Hour',
}
ozone_df.rename(columns=ozone_column_names, inplace=True)

# Drop the columns that are no longer necessary
ozone_df.drop(columns=['Parameter Name', 'Units of Measure'], inplace=True)

In [9]:
# Convert date column to datetime.
# The dates are 'YYYY-MM-DD' strings (measurement_id is built by stripping the
# dashes from them), so the format is passed explicitly: parsing is then strict
# and does not depend on per-value format inference across millions of rows.
ozone_df['Date Local'] = pd.to_datetime(ozone_df['Date Local'], format='%Y-%m-%d')

In [10]:
# Display the cleaned DataFrame as a visual check of the columns and values
ozone_df

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,Ozone(ppm),Ozone(ppm) Max Value,Ozone(ppm) Max Hour,CBSA Name,AQI,measurement_id
0,1,Alabama,3,Baldwin,Fairhope,10,NAD83,30.497478,-87.880258,2004-02-29,0.023000,0.023,23,"Daphne-Fairhope-Foley, AL",21.0,1_3_10_20040229
1,1,Alabama,3,Baldwin,Fairhope,10,NAD83,30.497478,-87.880258,2004-03-01,0.016118,0.023,7,"Daphne-Fairhope-Foley, AL",21.0,1_3_10_20040301
2,1,Alabama,3,Baldwin,Fairhope,10,NAD83,30.497478,-87.880258,2004-03-02,0.015235,0.018,9,"Daphne-Fairhope-Foley, AL",17.0,1_3_10_20040302
3,1,Alabama,3,Baldwin,Fairhope,10,NAD83,30.497478,-87.880258,2004-03-03,0.018529,0.023,8,"Daphne-Fairhope-Foley, AL",21.0,1_3_10_20040303
4,1,Alabama,3,Baldwin,Fairhope,10,NAD83,30.497478,-87.880258,2004-03-04,0.010059,0.015,7,"Daphne-Fairhope-Foley, AL",14.0,1_3_10_20040304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7290717,56,Wyoming,45,Weston,Not in a city,3,WGS84,43.873056,-104.191944,2023-06-26,0.048118,0.050,7,,46.0,56_45_3_20230626
7290718,56,Wyoming,45,Weston,Not in a city,3,WGS84,43.873056,-104.191944,2023-06-27,0.049941,0.054,16,,50.0,56_45_3_20230627
7290719,56,Wyoming,45,Weston,Not in a city,3,WGS84,43.873056,-104.191944,2023-06-28,0.045235,0.052,9,,48.0,56_45_3_20230628
7290720,56,Wyoming,45,Weston,Not in a city,3,WGS84,43.873056,-104.191944,2023-06-29,0.047118,0.051,12,,47.0,56_45_3_20230629


In [11]:
# Save the final DataFrame to CSV, leaving out the row index
# Alternative destination, kept for reference:
#ozone_df.to_csv(r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Cleaning\all_ozone_clean.csv",index=False)
clean_csv_path = r"C:\Users\stlva\Google Drive\Big blue data academy\main_branch\concatenated data\all_ozone_clean.csv"
ozone_df.to_csv(clean_csv_path, index=False)