In [1]:
import pandas as pd
import glob 

### Import and append all pressure csv files into one

In [2]:
# Path to the directory containing CSV files 
csv_files_path = r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Data\Pressure\*.csv" # Adjust this path as necessary 

# Use glob to get all CSV file paths 
csv_files = glob.glob(csv_files_path) 

# Use list comprehension to read all CSV files and concatenate them into a single DataFrame 
pressure_df = pd.concat([pd.read_csv(file,low_memory=False) for file in csv_files], ignore_index=True) 

# Now pressure_df contains all the data from the 20 CSV files 


In [3]:
#Drop unwanted columns
pressure_df.drop(['Parameter Code',
                  'Sample Duration', 
                  'Pollutant Standard', 
                  'Event Type', 
                  'Observation Count', 
                  'Observation Percent', 
                  'AQI', 
                  'Method Code', 
                  'Method Name', 
                  'Local Site Name', 
                  'Address', 
                  'Date of Last Change'],axis=1,inplace=True)

In [4]:
# Grouping by so we have 1 measurement per station per day
pressure_df = pressure_df.groupby(['State Code',
                                   'State Name',
                                   'County Code',
                                   'County Name',
                                   'City Name',
                                   'Site Num',
                                   'Datum',
                                   'Latitude',
                                   'Longitude',
                                   'Date Local',
                                   'Parameter Name',
                                   'Units of Measure']).agg({'Arithmetic Mean':'mean', '1st Max Value':'max','1st Max Hour':'max','CBSA Name':'first'}).reset_index()

In [5]:
#Drop Mexico, Virgin Islands and Puerto Rico
pressure_df.drop(pressure_df[(pressure_df['State Code']==80)|(pressure_df['State Code']==78)|(pressure_df['State Code']==72)].index,inplace=True)

In [6]:
#create unique identifier for each measurement
pressure_df['measurement_id'] = pressure_df['State Code'].apply(str)+'_'+pressure_df['County Code'].apply(str)+'_'+pressure_df['Site Num'].apply(str)+'_'+pressure_df['Date Local'].str.replace('-','')

In [7]:
#Renaming column to match the value measured and droping unnecessary columns
pressure_df.rename(columns={'Arithmetic Mean':'Barometric pressure(mb)','1st Max Value':'Barometric pressure(mb) Max Value', '1st Max Hour':'Barometric pressure(mb) Max Hour'},inplace=True)
pressure_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)

In [8]:
#Convert date column to datetime
pressure_df['Date Local'] = pd.to_datetime(pressure_df['Date Local'])

In [9]:
pressure_df.describe()

Unnamed: 0,State Code,County Code,Site Num,Latitude,Longitude,Date Local,Barometric pressure(mb),Barometric pressure(mb) Max Value,Barometric pressure(mb) Max Hour
count,1949314.0,1949314.0,1949314.0,1949314.0,1949314.0,1949314,1949314.0,1949314.0,1949314.0
mean,28.3553,63.43131,877.912,40.14453,-97.72728,2014-05-13 10:01:03.374909696,974.9914,978.0129,9.267589
min,1.0,1.0,1.0,19.42051,-155.2879,2004-01-01 00:00:00,0.0,0.0,0.0
25%,16.0,19.0,6.0,37.19954,-116.3479,2010-01-24 00:00:00,913.9875,917.0,0.0
50%,26.0,39.0,26.0,41.24749,-95.29472,2014-07-20 00:00:00,987.4657,991.0,8.0
75%,41.0,83.0,1005.0,42.86183,-83.00014,2018-11-10 00:00:00,1004.25,1007.0,19.0
max,56.0,510.0,9997.0,64.84569,-68.03301,2023-09-30 00:00:00,35159.28,35184.7,23.0
std,16.97818,72.66866,1741.683,4.416176,17.78388,,784.4686,787.1435,8.135893


In [10]:
#Clearing outliers

#Define reasonable ranges
reasonable_pressure_range = (870, 1100)  # Millibars

# Filter out unreasonable values and pressure
pressure_df.loc[~pressure_df['Barometric pressure(mb)'].between(*reasonable_pressure_range), 'Barometric pressure(mb)'] = None
pressure_df.loc[~pressure_df['Barometric pressure(mb) Max Value'].between(*reasonable_pressure_range), 'Barometric pressure(mb) Max Value'] = None

# Apply linear interpolation to fill NaN values
pressure_df[['Barometric pressure(mb)','Barometric pressure(mb) Max Value']].interpolate(method='linear', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pressure_df[['Barometric pressure(mb)','Barometric pressure(mb) Max Value']].interpolate(method='linear', inplace=True)


In [11]:
pressure_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,Barometric pressure(mb),Barometric pressure(mb) Max Value,Barometric pressure(mb) Max Hour,CBSA Name,measurement_id
0,1,Alabama,53,Escambia,Not in a city,1000,NAD83,31.0921,-87.5435,2016-01-01,1013.7,1016.1,20,,1_53_1000_20160101
1,1,Alabama,53,Escambia,Not in a city,1000,NAD83,31.0921,-87.5435,2016-01-02,1014.254167,1016.7,9,,1_53_1000_20160102
2,1,Alabama,53,Escambia,Not in a city,1000,NAD83,31.0921,-87.5435,2016-01-03,1010.05,1012.1,0,,1_53_1000_20160103
3,1,Alabama,53,Escambia,Not in a city,1000,NAD83,31.0921,-87.5435,2016-01-04,1011.875,1015.7,23,,1_53_1000_20160104
4,1,Alabama,53,Escambia,Not in a city,1000,NAD83,31.0921,-87.5435,2016-01-05,1017.504167,1019.5,9,,1_53_1000_20160105


In [12]:
#Saving final df to csv
#pressure_df.to_csv(r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Cleaning\all_pressure_clean.csv",index=False)
pressure_df.to_csv(r"C:\Users\stlva\Google Drive\Big blue data academy\main_branch\concatenated data\all_pressure_clean.csv",index=False)