In [1]:
import pandas as pd
import glob 

### Import and concatenate all temperature csv files into one

In [2]:
# Path to the directory containing CSV files 
csv_files_path = r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Data\Temperature\*.csv" # Adjust this path as necessary 

# Use glob to get all CSV file paths 
csv_files = glob.glob(csv_files_path) 

# Use list comprehension to read all CSV files and concatenate them into a single DataFrame 
temp_df = pd.concat([pd.read_csv(file,low_memory=False) for file in csv_files], ignore_index=True) 

# Now temp_df contains all the data from the 20 CSV files 

### Data cleaning and preprocessing

In [3]:
#Drop unwanted columns
temp_df.drop(['Parameter Code',
              'Sample Duration', 
              'Pollutant Standard', 
              'Event Type', 
              'Observation Count', 
              'Observation Percent', 
              'AQI', 
              'Method Code', 
              'Method Name', 
              'Local Site Name', 
              'Address', 
              'Date of Last Change'],axis=1,inplace=True)

In [4]:
#Grouping by so we have 1 measurement per station per day per
temp_df = temp_df.groupby(['State Code', 
                           'State Name', 
                           'County Code', 
                           'County Name', 
                           'City Name', 
                           'Site Num', 
                           'Datum', 
                           'Latitude', 
                           'Longitude', 
                           'Date Local', 
                           'Parameter Name', 
                           'Units of Measure']).agg({'Arithmetic Mean':'mean', '1st Max Value':'max','1st Max Hour':'max','CBSA Name':'first'}).reset_index()

In [5]:
#Drop Mexico, Virgin Islands and Puerto Rico
temp_df.drop(temp_df[(temp_df['State Code']==80)|(temp_df['State Code']==78)|(temp_df['State Code']==72)].index,inplace=True)

In [6]:
#Renaming column to match the value measured and droping unnecessary columns
temp_df.rename(columns={'Arithmetic Mean':'Temperature(F)','1st Max Value':'Temperature(F) Max Value', '1st Max Hour':'Temperature(F) Max Hour'},inplace=True)
temp_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)

In [7]:
#create unique identifier for each measurement
temp_df['measurement_id'] = temp_df['State Code'].apply(str) + '_' + temp_df['County Code'].apply(str) + '_' + temp_df['Site Num'].apply(str) + '_' + temp_df['Date Local'].str.replace('-','')

In [8]:
#Convert date column to datetime
temp_df['Date Local'] = pd.to_datetime(temp_df['Date Local'])

In [9]:
#Fahrenheit to Celsius conversion
temp_df['Temperature(F)'] =((( temp_df['Temperature(F)'] - 32 ) * 5 )/ 9).round(2)
temp_df.rename(columns={'Temperature(F)':'Temperature(C)'}, inplace=True)
temp_df['Temperature(F) Max Value'] = ((( temp_df['Temperature(F) Max Value'] - 32 ) * 5) / 9).round(2)
temp_df.rename(columns={'Temperature(F) Max Value':'Temperature(C) Max Value'}, inplace=True)

In [10]:
#Clearing outliers

#Define reasonable ranges
reasonable_temp_range = (-50, 50)  # Celsius

# Filter out unreasonable values for temperature
temp_df.loc[~temp_df['Temperature(C)'].between(*reasonable_temp_range), 'Temperature(C)'] = None
temp_df.loc[~temp_df['Temperature(C) Max Value'].between(*reasonable_temp_range), 'Temperature(C) Max Value'] = None

# Apply linear interpolation to fill NaN values
temp_df[['Temperature(C)','Temperature(C) Max Value']].interpolate(method='linear', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[['Temperature(C)','Temperature(C) Max Value']].interpolate(method='linear', inplace=True)


In [11]:
temp_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,Temperature(C),Temperature(C) Max Value,Temperature(F) Max Hour,CBSA Name,measurement_id
0,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-06,20.69,23.33,16,"Florence-Muscle Shoals, AL",1_33_1003_20090506
1,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-07,22.71,29.44,16,"Florence-Muscle Shoals, AL",1_33_1003_20090507
2,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-08,25.44,29.44,15,"Florence-Muscle Shoals, AL",1_33_1003_20090508
3,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-09,21.83,26.11,14,"Florence-Muscle Shoals, AL",1_33_1003_20090509
4,1,Alabama,33,Colbert,Not in a city,1003,WGS84,34.7589,-88.0138,2009-05-10,17.15,18.89,15,"Florence-Muscle Shoals, AL",1_33_1003_20090510


In [12]:
#Saving final df to csv
#temp_df.to_csv(r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Cleaning\all_temperature_clean.csv",index=False)
temp_df.to_csv(r"C:\Users\stlva\Google Drive\Big blue data academy\main_branch\concatenated data\all_temp_clean.csv",index=False)