In [1]:
import pandas as pd

In [2]:
import dask.dataframe as dd

## Read pollutants CSV

In [3]:
# Declare the CSV read with Dask, keeping only the site, location and date
# columns plus the measurement_id key. Extra keyword arguments such as
# low_memory are passed through to pandas.read_csv.
ddf = dd.read_csv(
    "air_data_merged.csv",
    usecols=[
        'State Code', 'State Name', 'County Code', 'County Name', 'City Name',
        'Site Num', 'Datum', 'Latitude', 'Longitude', 'Date Local',
        'CBSA Name', 'measurement_id',
    ],
    low_memory=False,
)

# compute() materializes the whole Dask DataFrame as one in-memory pandas DataFrame
result = ddf.compute()
result.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,CBSA Name,measurement_id
0,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-01,"Dover, DE",10_1_2_20040101
1,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-04,"Dover, DE",10_1_2_20040104
2,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-07,"Dover, DE",10_1_2_20040107
3,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-10,"Dover, DE",10_1_2_20040110
4,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-13,"Dover, DE",10_1_2_20040113


## Read AQI CSV

In [4]:
# Load the AQI table with pandas; it is joined to the pollutant rows on measurement_id below
aqi = pd.read_csv("AQI_final_df.csv")

## Merge pollutants CSV and AQI CSV

In [5]:
# Attach the AQI columns to the pollutant/site rows, joining on measurement_id.
# how='outer' keeps ids that appear in only one of the two tables (their
# missing columns become NaN). validate='one_to_one' makes pandas raise
# MergeError if measurement_id repeats on either side, so a bad key cannot
# silently multiply rows in the merged frame.
merge = pd.merge(result, aqi, on='measurement_id', how='outer', validate='one_to_one')
merge.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,CBSA Name,measurement_id,max_value,AQI_cr_param
0,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-01,"Dover, DE",10_1_2_20040101,32.0,AQI_PM2_5
1,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-04,"Dover, DE",10_1_2_20040104,60.0,AQI_PM2_5
2,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-07,"Dover, DE",10_1_2_20040107,26.0,AQI_PM2_5
3,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-10,"Dover, DE",10_1_2_20040110,28.0,AQI_PM2_5
4,10,Delaware,1,Kent,Not in a city,2,WGS84,38.986672,-75.5568,2004-01-13,"Dover, DE",10_1_2_20040113,35.0,AQI_PM2_5


In [6]:
# Summarize the merged frame: row count, column dtypes and memory footprint
merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12349571 entries, 0 to 12349570
Data columns (total 14 columns):
 #   Column          Dtype  
---  ------          -----  
 0   State Code      int64  
 1   State Name      object 
 2   County Code     int64  
 3   County Name     object 
 4   City Name       object 
 5   Site Num        int64  
 6   Datum           object 
 7   Latitude        float64
 8   Longitude       float64
 9   Date Local      object 
 10  CBSA Name       object 
 11  measurement_id  object 
 12  max_value       float64
 13  AQI_cr_param    object 
dtypes: float64(3), int64(3), object(8)
memory usage: 1.3+ GB


In [7]:
# Check that the merge produced no duplicate measurement IDs or duplicate rows

In [8]:
# Count distinct keys in the merged frame; compare against the row count reported by merge.info()
merge['measurement_id'].nunique()

12349571

In [9]:
# Count distinct keys in the pollutant frame, for comparison with the merged frame's count
result['measurement_id'].nunique()

12349571

In [10]:
# Summarize the AQI frame so its row count can be compared with the key counts above
aqi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12349571 entries, 0 to 12349570
Data columns (total 3 columns):
 #   Column          Dtype  
---  ------          -----  
 0   measurement_id  object 
 1   max_value       float64
 2   AQI_cr_param    object 
dtypes: float64(1), object(2)
memory usage: 282.7+ MB


In [11]:
# Select every row that is an exact duplicate of another row across all columns;
# keep=False marks all members of a duplicate group, so an empty result means none exist
merge[merge.duplicated(keep=False)]

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,CBSA Name,measurement_id,max_value,AQI_cr_param


In [13]:
# Write the merged frame to disk as UTF-8 CSV, leaving out the pandas index column
merge.to_csv(
    "daily_AQI.csv",
    index=False,
    encoding='utf-8',
)