In [1]:
import pandas as pd
import glob 

### Import and concatenate all RH and dewpoint csv files into one

In [2]:
# Path to the directory containing CSV files 
csv_files_path = r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Data\RH_DP\*.csv" # Adjust this path as necessary 

# Use glob to get all CSV file paths 
csv_files = glob.glob(csv_files_path) 

# Use list comprehension to read all CSV files and concatenate them into a single DataFrame 
rh_dp_df = pd.concat([pd.read_csv(file,low_memory=False) for file in csv_files], ignore_index=True) 

# Now rh_dp_df contains all the data from the 20 CSV files 


### Data cleaning and preprocessing

In [3]:
#Drop unwanted columns
rh_dp_df.drop(['Parameter Code', 
               'Sample Duration', 
               'Pollutant Standard', 
               'Event Type', 
               'Observation Count', 
               'Observation Percent', 
               'AQI', 'Method Code', 
               'Method Name', 
               'Local Site Name', 
               'Address', 
               'Date of Last Change'],axis=1,inplace=True)

In [4]:
# Grouping by so we have 1 measurement per station per day per parameter
rh_dp_df = rh_dp_df.groupby(['State Code',
                             'State Name',
                             'County Code',
                             'County Name', 
                             'City Name', 
                             'Site Num', 
                             'Datum', 
                             'Latitude', 
                             'Longitude', 
                             'Date Local', 
                             'Parameter Name', 
                             'Units of Measure']).agg({'Arithmetic Mean':'mean', '1st Max Value':'max','1st Max Hour':'max','CBSA Name':'first'}).reset_index()

In [5]:
#Drop Mexico, Virgin Islands and Puerto Rico
rh_dp_df.drop(rh_dp_df[(rh_dp_df['State Code']==80)|(rh_dp_df['State Code']==78)|(rh_dp_df['State Code']==72)].index,inplace=True)

In [6]:
# Create unique identifier for each measurement
rh_dp_df['measurement_id'] = rh_dp_df['State Code'].apply(str) + '_' + rh_dp_df['County Code'].apply(str) + '_' + rh_dp_df['Site Num'].apply(str) + '_' + rh_dp_df['Date Local'].str.replace('-','')

In [7]:
# In 'Parameter Name' column, both wind speed and wind direction were recorded. So we had to separate them in order to proceed
# Spliting the dataframe based on the parameter name
dp_df = rh_dp_df[rh_dp_df['Parameter Name']=='Dew Point']
rh_df = rh_dp_df[rh_dp_df['Parameter Name']=='Relative Humidity ']

In [8]:
# Correcting the blankspace at the end
rh_df['Parameter Name'] = rh_df['Parameter Name'].str.replace('Relative Humidity ','Relative Humidity')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rh_df['Parameter Name'] = rh_df['Parameter Name'].str.replace('Relative Humidity ','Relative Humidity')


In [9]:
#Convert date column to datetime
rh_df['Date Local'] = pd.to_datetime(rh_df['Date Local'])
dp_df['Date Local'] = pd.to_datetime(dp_df['Date Local'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rh_df['Date Local'] = pd.to_datetime(rh_df['Date Local'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_df['Date Local'] = pd.to_datetime(dp_df['Date Local'])


In [10]:
#Renaming column to match the value measured and droping unnecessary columns
rh_df.rename(columns={'Arithmetic Mean':'RH(%)' ,'1st Max Value':'RH(%) Max Value','1st Max Hour':'RH(%) Max Hour'},inplace=True)
rh_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rh_df.rename(columns={'Arithmetic Mean':'RH(%)' ,'1st Max Value':'RH(%) Max Value','1st Max Hour':'RH(%) Max Hour'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rh_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)


In [11]:
rh_df.describe()

Unnamed: 0,State Code,County Code,Site Num,Latitude,Longitude,Date Local,RH(%),RH(%) Max Value,RH(%) Max Hour
count,2621757.0,2621757.0,2621757.0,2621757.0,2621757.0,2621757,2621757.0,2621757.0,2621757.0
mean,27.05367,68.50981,1155.85,38.77961,-100.5296,2014-06-12 23:40:24.692907008,61.39943,79.91581,7.771548
min,1.0,1.0,1.0,19.4167,-160.5083,2004-01-01 00:00:00,0.0,0.0,0.0
25%,6.0,21.0,7.0,35.33161,-116.7683,2010-01-26 00:00:00,47.79167,71.0,3.0
50%,25.0,43.0,43.0,39.06429,-105.2202,2014-10-03 00:00:00,64.125,86.0,5.0
75%,42.0,97.0,1015.0,42.19438,-85.10182,2019-01-18 00:00:00,76.66667,94.0,9.0
max,56.0,510.0,9997.0,67.0931,-67.26541,2023-09-30 00:00:00,140.0708,143.3,23.0
std,17.8756,73.95954,2260.797,4.903382,17.33428,,20.21197,18.95064,7.780643


In [12]:
# Replace RH values above 100 with 100
rh_df['RH(%)'] = rh_df['RH(%)'].apply(lambda x: 100 if x > 100 else x)
rh_df['RH(%) Max Value'] = rh_df['RH(%) Max Value'].apply(lambda x: 100 if x > 100 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rh_df['RH(%)'] = rh_df['RH(%)'].apply(lambda x: 100 if x > 100 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rh_df['RH(%) Max Value'] = rh_df['RH(%) Max Value'].apply(lambda x: 100 if x > 100 else x)


In [13]:
#Renaming column to match the value measured and droping unnecessary columns
dp_df.rename(columns={'Arithmetic Mean':'Dp Temperature(F)', '1st Max Value':'Dp Temperature(F) Max Value', '1st Max Hour':'Dp Temperature(F) Max Hour'},inplace=True)
dp_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_df.rename(columns={'Arithmetic Mean':'Dp Temperature(F)', '1st Max Value':'Dp Temperature(F) Max Value', '1st Max Hour':'Dp Temperature(F) Max Hour'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)


In [14]:
rh_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,RH(%),RH(%) Max Value,RH(%) Max Hour,CBSA Name,measurement_id
0,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-06,86.875,96.0,23,"Florence-Muscle Shoals, AL",1_33_1003_20090506
1,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-07,82.0,98.0,1,"Florence-Muscle Shoals, AL",1_33_1003_20090507
2,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-08,76.791667,93.0,3,"Florence-Muscle Shoals, AL",1_33_1003_20090508
3,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-09,90.208333,99.0,8,"Florence-Muscle Shoals, AL",1_33_1003_20090509
4,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-10,95.25,100.0,22,"Florence-Muscle Shoals, AL",1_33_1003_20090510


In [18]:
dp_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,Dp Temperature(F),Dp Temperature(F) Max Value,Dp Temperature(F) Max Hour,CBSA Name,measurement_id
161544,6,California,23,Humboldt,Not in a city,2001,WGS84,41.289,-123.857,2006-10-01,49.833333,54.0,15,"Eureka-Arcata-Fortuna, CA",6_23_2001_20061001
161546,6,California,23,Humboldt,Not in a city,2001,WGS84,41.289,-123.857,2006-10-02,50.0,55.0,14,"Eureka-Arcata-Fortuna, CA",6_23_2001_20061002
161548,6,California,23,Humboldt,Not in a city,2001,WGS84,41.289,-123.857,2006-10-03,49.0,53.0,11,"Eureka-Arcata-Fortuna, CA",6_23_2001_20061003
161550,6,California,23,Humboldt,Not in a city,2001,WGS84,41.289,-123.857,2006-10-04,51.083333,59.0,14,"Eureka-Arcata-Fortuna, CA",6_23_2001_20061004
161552,6,California,23,Humboldt,Not in a city,2001,WGS84,41.289,-123.857,2006-10-05,51.958333,56.0,14,"Eureka-Arcata-Fortuna, CA",6_23_2001_20061005


In [16]:
#Saving final df to csv
#rh_df.to_csv(r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Cleaning\all_rh_clean.csv",index=False)
#dp_df.to_csv(r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Cleaning\all_dp_clean.csv",index=False)
rh_df.to_csv(r"C:\Users\stlva\Google Drive\Big blue data academy\main_branch\concatenated data\all_rh_clean.csv",index=False)
dp_df.to_csv(r"C:\Users\stlva\Google Drive\Big blue data academy\main_branch\concatenated data\all_dp_clean.csv",index=False)