In [1]:
import pandas as pd
import numpy as np
import gc

## Importing and merging preprocessed air pollutants data

In [2]:
# Import clean ozone data
# (relative path: the CSV is read from the kernel's current working directory)
ozone_df = pd.read_csv("all_ozone_clean.csv")

In [3]:
# Print row count, column dtypes and memory usage of the ozone table
ozone_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7290722 entries, 0 to 7290721
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   State Code            int64  
 1   State Name            object 
 2   County Code           int64  
 3   County Name           object 
 4   City Name             object 
 5   Site Num              int64  
 6   Datum                 object 
 7   Latitude              float64
 8   Longitude             float64
 9   Date Local            object 
 10  Ozone(ppm)            float64
 11  Ozone(ppm) Max Value  float64
 12  Ozone(ppm) Max Hour   int64  
 13  CBSA Name             object 
 14  AQI                   float64
 15  measurement_id        object 
dtypes: float64(5), int64(4), object(7)
memory usage: 890.0+ MB


In [4]:
# Import clean SO2 data.
# The saved output of this cell shows a warning raised on the read_csv line
# (the message text was not preserved) -- presumably pandas' mixed-type
# DtypeWarning; TODO confirm on the next run.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which is the remedy pandas suggests for that warning.
SO2_df = pd.read_csv("all_SO2_clean.csv", low_memory=False)

  SO2_df = pd.read_csv("all_SO2_clean.csv")


In [5]:
# Print row count, column dtypes and memory usage of the SO2 table
SO2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3164519 entries, 0 to 3164518
Data columns (total 16 columns):
 #   Column              Dtype  
---  ------              -----  
 0   State Code          int64  
 1   State Name          object 
 2   County Code         int64  
 3   County Name         object 
 4   City Name           object 
 5   Site Num            int64  
 6   Datum               object 
 7   Latitude            float64
 8   Longitude           float64
 9   Date Local          object 
 10  SO2(ppb)            float64
 11  SO2(ppb) Max Value  float64
 12  SO2(ppb) Max Hour   int64  
 13  CBSA Name           object 
 14  AQI                 float64
 15  measurement_id      object 
dtypes: float64(5), int64(4), object(7)
memory usage: 386.3+ MB


In [6]:
# Outer-join ozone with SO2 on measurement_id, so measurements that exist in
# only one of the two tables are kept. Columns present in both tables come
# back suffixed with _x (ozone side) and _y (SO2 side).
air_all_df = ozone_df.merge(SO2_df, how='outer', on='measurement_id')
air_all_df.head()

Unnamed: 0,State Code_x,State Name_x,County Code_x,County Name_x,City Name_x,Site Num_x,Datum_x,Latitude_x,Longitude_x,Date Local_x,...,Site Num_y,Datum_y,Latitude_y,Longitude_y,Date Local_y,SO2(ppb),SO2(ppb) Max Value,SO2(ppb) Max Hour,CBSA Name_y,AQI_y
0,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-03-31,...,,,,,,,,,,
1,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-01,...,,,,,,,,,,
2,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-02,...,,,,,,,,,,
3,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-03,...,,,,,,,,,,
4,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-04,...,,,,,,,,,,


In [7]:
# Fill NA values in each left-table (_x) column from the corresponding
# right-table (_y) column.
# Left-hand values win wherever both sides are present, so e.g. AQI_y is only
# used where AQI_x is missing.
# NOTE(review): if AQI is computed per pollutant, this mixes different
# pollutants' AQI values in one column -- confirm this is intended.
left_columns = [name for name in air_all_df.columns if name.endswith('_x')]
for left_name in left_columns:
    right_name = left_name[:-2] + '_y'
    if right_name not in air_all_df.columns:
        continue
    air_all_df[left_name] = air_all_df[left_name].fillna(air_all_df[right_name])

In [8]:
# Drop every right-table (_y) column so that no duplicate columns remain;
# their values were already copied into the _x columns where needed.
right_table_columns = air_all_df.filter(regex='_y$').columns
air_all_df = air_all_df.drop(columns=right_table_columns)

In [9]:
# Rename the columns: strip the _x merge suffix from the columns that both
# tables had in common, restoring their original names.
shared_columns = ['State Code', 'State Name', 'County Code', 'County Name',
                  'City Name', 'Site Num', 'Datum', 'Latitude', 'Longitude',
                  'Date Local', 'CBSA Name', 'AQI']
suffix_to_plain = {name + '_x': name for name in shared_columns}
air_all_df.rename(columns=suffix_to_plain, inplace=True)

In [10]:
# Preview the first rows of the merged ozone + SO2 table
air_all_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,Ozone(ppm),Ozone(ppm) Max Value,Ozone(ppm) Max Hour,CBSA Name,AQI,measurement_id,SO2(ppb),SO2(ppb) Max Value,SO2(ppb) Max Hour
0,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-03-31,0.0125,0.014,23.0,"Dover, DE",13.0,10_1_2_20040331,,,
1,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-01,0.016647,0.027,7.0,"Dover, DE",25.0,10_1_2_20040401,,,
2,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-02,0.027412,0.033,17.0,"Dover, DE",31.0,10_1_2_20040402,,,
3,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-03,0.018412,0.022,11.0,"Dover, DE",20.0,10_1_2_20040403,,,
4,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-04,0.037176,0.042,19.0,"Dover, DE",39.0,10_1_2_20040404,,,


In [11]:
# Import clean NO2 data
# (relative path: the CSV is read from the kernel's current working directory)
NO2_df = pd.read_csv("all_NO2_clean.csv")

In [12]:
# Print row count, column dtypes and memory usage of the NO2 table
NO2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2833926 entries, 0 to 2833925
Data columns (total 16 columns):
 #   Column              Dtype  
---  ------              -----  
 0   State Code          int64  
 1   State Name          object 
 2   County Code         int64  
 3   County Name         object 
 4   City Name           object 
 5   Site Num            int64  
 6   Datum               object 
 7   Latitude            float64
 8   Longitude           float64
 9   Date Local          object 
 10  NO2(ppm)            float64
 11  NO2(ppm) Max Value  float64
 12  NO2(ppm) Max Hour   int64  
 13  CBSA Name           object 
 14  AQI                 float64
 15  measurement_id      object 
dtypes: float64(5), int64(4), object(7)
memory usage: 345.9+ MB


In [13]:
# Outer-join the NO2 table onto the table merged so far, keyed on
# measurement_id. Columns present on both sides come back suffixed _x / _y.
air_all_df = air_all_df.merge(NO2_df, how='outer', on='measurement_id')

In [14]:
# Fill NA values in each left-table (_x) column from the corresponding
# right-table (_y) column.
# Left-hand values win wherever both sides are present, so e.g. AQI_y is only
# used where AQI_x is missing.
# NOTE(review): if AQI is computed per pollutant, this mixes different
# pollutants' AQI values in one column -- confirm this is intended.
left_columns = [name for name in air_all_df.columns if name.endswith('_x')]
for left_name in left_columns:
    right_name = left_name[:-2] + '_y'
    if right_name not in air_all_df.columns:
        continue
    air_all_df[left_name] = air_all_df[left_name].fillna(air_all_df[right_name])

In [15]:
# Drop every right-table (_y) column so that no duplicate columns remain;
# their values were already copied into the _x columns where needed.
right_table_columns = air_all_df.filter(regex='_y$').columns
air_all_df = air_all_df.drop(columns=right_table_columns)

In [16]:
# Rename the columns: strip the _x merge suffix from the columns that both
# tables had in common, restoring their original names.
shared_columns = ['State Code', 'State Name', 'County Code', 'County Name',
                  'City Name', 'Site Num', 'Datum', 'Latitude', 'Longitude',
                  'Date Local', 'CBSA Name', 'AQI']
suffix_to_plain = {name + '_x': name for name in shared_columns}
air_all_df.rename(columns=suffix_to_plain, inplace=True)

In [17]:
# Preview the first rows of the merged table after adding NO2
air_all_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,...,Ozone(ppm) Max Hour,CBSA Name,AQI,measurement_id,SO2(ppb),SO2(ppb) Max Value,SO2(ppb) Max Hour,NO2(ppm),NO2(ppm) Max Value,NO2(ppm) Max Hour
0,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-03-31,...,23.0,"Dover, DE",13.0,10_1_2_20040331,,,,,,
1,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-01,...,7.0,"Dover, DE",25.0,10_1_2_20040401,,,,,,
2,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-02,...,17.0,"Dover, DE",31.0,10_1_2_20040402,,,,,,
3,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-03,...,11.0,"Dover, DE",20.0,10_1_2_20040403,,,,,,
4,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-04-04,...,19.0,"Dover, DE",39.0,10_1_2_20040404,,,,,,


In [18]:
# Import clean PM2_5 data
# (relative path: the CSV is read from the kernel's current working directory)
PM2_5_df = pd.read_csv("all_PM2_5_clean.csv")

In [19]:
# Print row count, column dtypes and memory usage of the PM2.5 table
PM2_5_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3832669 entries, 0 to 3832668
Data columns (total 16 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   State Code                 int64  
 1   State Name                 object 
 2   County Code                int64  
 3   County Name                object 
 4   City Name                  object 
 5   Site Num                   int64  
 6   Datum                      object 
 7   Latitude                   float64
 8   Longitude                  float64
 9   Date Local                 object 
 10  PM2_5(µg/m³ LC)            float64
 11  PM2_5(µg/m³ LC) Max Value  float64
 12  PM2_5(µg/m³ LC) Max Hour   int64  
 13  CBSA Name                  object 
 14  AQI                        float64
 15  measurement_id             object 
dtypes: float64(5), int64(4), object(7)
memory usage: 467.9+ MB


In [20]:
# Outer-join the PM2.5 table onto the table merged so far, keyed on
# measurement_id. Columns present on both sides come back suffixed _x / _y.
air_all_df = air_all_df.merge(PM2_5_df, how='outer', on='measurement_id')

In [21]:
# Fill NA values in each left-table (_x) column from the corresponding
# right-table (_y) column.
# Left-hand values win wherever both sides are present, so e.g. AQI_y is only
# used where AQI_x is missing.
# NOTE(review): if AQI is computed per pollutant, this mixes different
# pollutants' AQI values in one column -- confirm this is intended.
left_columns = [name for name in air_all_df.columns if name.endswith('_x')]
for left_name in left_columns:
    right_name = left_name[:-2] + '_y'
    if right_name not in air_all_df.columns:
        continue
    air_all_df[left_name] = air_all_df[left_name].fillna(air_all_df[right_name])

In [22]:
# Drop every right-table (_y) column so that no duplicate columns remain;
# their values were already copied into the _x columns where needed.
right_table_columns = air_all_df.filter(regex='_y$').columns
air_all_df = air_all_df.drop(columns=right_table_columns)

In [23]:
# Rename the columns: strip the _x merge suffix from the columns that both
# tables had in common, restoring their original names.
shared_columns = ['State Code', 'State Name', 'County Code', 'County Name',
                  'City Name', 'Site Num', 'Datum', 'Latitude', 'Longitude',
                  'Date Local', 'CBSA Name', 'AQI']
suffix_to_plain = {name + '_x': name for name in shared_columns}
air_all_df.rename(columns=suffix_to_plain, inplace=True)

In [24]:
# Preview the first rows of the merged table after adding PM2.5
air_all_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,...,measurement_id,SO2(ppb),SO2(ppb) Max Value,SO2(ppb) Max Hour,NO2(ppm),NO2(ppm) Max Value,NO2(ppm) Max Hour,PM2_5(µg/m³ LC),PM2_5(µg/m³ LC) Max Value,PM2_5(µg/m³ LC) Max Hour
0,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-01,...,10_1_2_20040101,,,,,,,7.6,7.6,0.0
1,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-04,...,10_1_2_20040104,,,,,,,16.5,16.5,0.0
2,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-07,...,10_1_2_20040107,,,,,,,6.3,6.3,0.0
3,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-10,...,10_1_2_20040110,,,,,,,6.7,6.7,0.0
4,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-13,...,10_1_2_20040113,,,,,,,8.3,8.3,0.0


In [25]:
# Import clean PM10 data
# (relative path: the CSV is read from the kernel's current working directory)
PM10_df = pd.read_csv("all_PM10_clean.csv")

In [26]:
# Print row count, column dtypes and memory usage of the PM10 table
PM10_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2911905 entries, 0 to 2911904
Data columns (total 16 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   State Code                            int64  
 1   State Name                            object 
 2   County Code                           int64  
 3   County Name                           object 
 4   City Name                             object 
 5   Site Num                              int64  
 6   Datum                                 object 
 7   Latitude                              float64
 8   Longitude                             float64
 9   Date Local                            object 
 10  PM10 Tot_0-10um STP(μg/m3)            float64
 11  PM10 Tot_0-10um STP(μg/m3) Max Value  float64
 12  PM10 Tot_0-10um STP(μg/m3) Max Hour   int64  
 13  CBSA Name                             object 
 14  AQI                                   float64
 15  measurement_id 

In [27]:
# Outer-join the PM10 table onto the table merged so far, keyed on
# measurement_id. Columns present on both sides come back suffixed _x / _y.
air_all_df = air_all_df.merge(PM10_df, how='outer', on='measurement_id')

In [28]:
# Fill NA values in each left-table (_x) column from the corresponding
# right-table (_y) column.
# Left-hand values win wherever both sides are present, so e.g. AQI_y is only
# used where AQI_x is missing.
# NOTE(review): if AQI is computed per pollutant, this mixes different
# pollutants' AQI values in one column -- confirm this is intended.
left_columns = [name for name in air_all_df.columns if name.endswith('_x')]
for left_name in left_columns:
    right_name = left_name[:-2] + '_y'
    if right_name not in air_all_df.columns:
        continue
    air_all_df[left_name] = air_all_df[left_name].fillna(air_all_df[right_name])

In [29]:
# Drop every right-table (_y) column so that no duplicate columns remain;
# their values were already copied into the _x columns where needed.
right_table_columns = air_all_df.filter(regex='_y$').columns
air_all_df = air_all_df.drop(columns=right_table_columns)

In [30]:
# Rename the columns: strip the _x merge suffix from the columns that both
# tables had in common, restoring their original names.
shared_columns = ['State Code', 'State Name', 'County Code', 'County Name',
                  'City Name', 'Site Num', 'Datum', 'Latitude', 'Longitude',
                  'Date Local', 'CBSA Name', 'AQI']
suffix_to_plain = {name + '_x': name for name in shared_columns}
air_all_df.rename(columns=suffix_to_plain, inplace=True)

In [31]:
# Preview the first rows of the merged table after adding PM10
air_all_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,...,SO2(ppb) Max Hour,NO2(ppm),NO2(ppm) Max Value,NO2(ppm) Max Hour,PM2_5(µg/m³ LC),PM2_5(µg/m³ LC) Max Value,PM2_5(µg/m³ LC) Max Hour,PM10 Tot_0-10um STP(μg/m3),PM10 Tot_0-10um STP(μg/m3) Max Value,PM10 Tot_0-10um STP(μg/m3) Max Hour
0,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-01,...,,,,,7.6,7.6,0.0,,,
1,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-04,...,,,,,16.5,16.5,0.0,,,
2,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-07,...,,,,,6.3,6.3,0.0,,,
3,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-10,...,,,,,6.7,6.7,0.0,,,
4,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-13,...,,,,,8.3,8.3,0.0,,,


In [32]:
# Import clean CO data
# (relative path: the CSV is read from the kernel's current working directory)
CO_df = pd.read_csv("all_CO_clean.csv")

In [33]:
# Print row count, column dtypes and memory usage of the CO table
CO_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2113222 entries, 0 to 2113221
Data columns (total 16 columns):
 #   Column             Dtype  
---  ------             -----  
 0   State Code         int64  
 1   State Name         object 
 2   County Code        int64  
 3   County Name        object 
 4   City Name          object 
 5   Site Num           int64  
 6   Datum              object 
 7   Latitude           float64
 8   Longitude          float64
 9   Date Local         object 
 10  CO(ppm)            float64
 11  CO(ppm) Max Value  float64
 12  CO(ppm) Max Hour   int64  
 13  CBSA Name          object 
 14  AQI                float64
 15  measurement_id     object 
dtypes: float64(5), int64(4), object(7)
memory usage: 258.0+ MB


In [34]:
# Outer-join the CO table onto the table merged so far, keyed on
# measurement_id. Columns present on both sides come back suffixed _x / _y.
air_all_df = air_all_df.merge(CO_df, how='outer', on='measurement_id')

In [35]:
# Fill NA values in each left-table (_x) column from the corresponding
# right-table (_y) column.
# Left-hand values win wherever both sides are present, so e.g. AQI_y is only
# used where AQI_x is missing.
# NOTE(review): if AQI is computed per pollutant, this mixes different
# pollutants' AQI values in one column -- confirm this is intended.
left_columns = [name for name in air_all_df.columns if name.endswith('_x')]
for left_name in left_columns:
    right_name = left_name[:-2] + '_y'
    if right_name not in air_all_df.columns:
        continue
    air_all_df[left_name] = air_all_df[left_name].fillna(air_all_df[right_name])

In [36]:
# Drop every right-table (_y) column so that no duplicate columns remain;
# their values were already copied into the _x columns where needed.
right_table_columns = air_all_df.filter(regex='_y$').columns
air_all_df = air_all_df.drop(columns=right_table_columns)

In [37]:
# Rename the columns: strip the _x merge suffix from the columns that both
# tables had in common, restoring their original names.
shared_columns = ['State Code', 'State Name', 'County Code', 'County Name',
                  'City Name', 'Site Num', 'Datum', 'Latitude', 'Longitude',
                  'Date Local', 'CBSA Name', 'AQI']
suffix_to_plain = {name + '_x': name for name in shared_columns}
air_all_df.rename(columns=suffix_to_plain, inplace=True)

In [38]:
# Preview the first rows of the merged table after adding CO
air_all_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,...,NO2(ppm) Max Hour,PM2_5(µg/m³ LC),PM2_5(µg/m³ LC) Max Value,PM2_5(µg/m³ LC) Max Hour,PM10 Tot_0-10um STP(μg/m3),PM10 Tot_0-10um STP(μg/m3) Max Value,PM10 Tot_0-10um STP(μg/m3) Max Hour,CO(ppm),CO(ppm) Max Value,CO(ppm) Max Hour
0,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-01,...,,7.6,7.6,0.0,,,,,,
1,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-04,...,,16.5,16.5,0.0,,,,,,
2,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-07,...,,6.3,6.3,0.0,,,,,,
3,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-10,...,,6.7,6.7,0.0,,,,,,
4,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2004-01-13,...,,8.3,8.3,0.0,,,,,,


In [39]:
# List the column names of the fully merged table
air_all_df.columns

Index(['State Code', 'State Name', 'County Code', 'County Name', 'City Name',
       'Site Num', 'Datum', 'Latitude', 'Longitude', 'Date Local',
       'Ozone(ppm)', 'Ozone(ppm) Max Value', 'Ozone(ppm) Max Hour',
       'CBSA Name', 'AQI', 'measurement_id', 'SO2(ppb)', 'SO2(ppb) Max Value',
       'SO2(ppb) Max Hour', 'NO2(ppm)', 'NO2(ppm) Max Value',
       'NO2(ppm) Max Hour', 'PM2_5(µg/m³ LC)', 'PM2_5(µg/m³ LC) Max Value',
       'PM2_5(µg/m³ LC) Max Hour', 'PM10 Tot_0-10um STP(μg/m3)',
       'PM10 Tot_0-10um STP(μg/m3) Max Value',
       'PM10 Tot_0-10um STP(μg/m3) Max Hour', 'CO(ppm)', 'CO(ppm) Max Value',
       'CO(ppm) Max Hour'],
      dtype='object')

In [40]:
# The outer merges left these identifier columns as floats (e.g. 10.0 in the
# previews above). Cast them to int first so the trailing ".0" is dropped,
# then to str.
for id_column in ('State Code', 'County Code', 'Site Num'):
    air_all_df[id_column] = air_all_df[id_column].astype(int).astype(str)


In [42]:
# Save the merged df to csv
# (relative path: written to the kernel's current working directory;
# index=False leaves the DataFrame's row index out of the file)
air_all_df.to_csv("air_data_merged.csv",index=False)