In [1]:
import pandas as pd
import glob 

### Import and concatenate all wind csv files into one

In [2]:
# Path to the directory containing CSV files 
csv_files_path = r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Data\Wind\*.csv" # Adjust this path as necessary 

# Use glob to get all CSV file paths 
csv_files = glob.glob(csv_files_path) 

# Use list comprehension to read all CSV files and concatenate them into a single DataFrame 
wind_df = pd.concat([pd.read_csv(file,low_memory=False) for file in csv_files], ignore_index=True) 

# Now wind_df contains all the data from the 20 CSV files 


### Data cleaning and preprocessing

In [3]:
# Drop unwanted columns
wind_df.drop(['Parameter Code', 
              'Sample Duration', 
              'Pollutant Standard', 
              'Event Type', 
              'Observation Count', 
              'Observation Percent', 
              'AQI', 'Method Code', 
              'Method Name', 
              'Local Site Name', 
              'Address', 
              'Date of Last Change'],axis=1,inplace=True)

In [4]:
# Aggregating so we have 1 measurement per station per day per parameter
wind_df = wind_df.groupby(['State Code', 
                           'State Name', 
                           'County Code', 
                           'County Name', 
                           'City Name', 
                           'Site Num', 
                           'Datum', 
                           'Latitude', 
                           'Longitude', 
                           'Date Local', 
                           'Parameter Name', 
                           'Units of Measure']).agg({'Arithmetic Mean':'mean', '1st Max Value':'max','1st Max Hour':'max','CBSA Name':'first'}).reset_index()

In [5]:
# Drop Mexico, Virgin Islands and Puerto Rico
wind_df.drop(wind_df[(wind_df['State Code']==80)|(wind_df['State Code']==78)|(wind_df['State Code']==72)].index,inplace=True)

In [6]:
# Create unique identifier for each measurement
wind_df['measurement_id'] = wind_df['State Code'].apply(str) + '_' + wind_df['County Code'].apply(str) + '_' + wind_df['Site Num'].apply(str) + '_' + wind_df['Date Local'].str.replace('-','')

In [7]:
# In 'Parameter Name' column, both wind speed and wind direction were recorded. So we had to separate them in order to proceed
# Spliting the dataframe based on the parameter name
wind_dir_df = wind_df[wind_df['Parameter Name']=='Wind Direction - Resultant']
wind_speed_df = wind_df[wind_df['Parameter Name']=='Wind Speed - Resultant']

In [8]:
# Renaming column to match the value measured and droping unnecessary columns
wind_dir_df.rename(columns={'Arithmetic Mean':'Wind Direction(deg)','1st Max Value':'Wind Direction(deg) Max Value', '1st Max Hour':'Wind Direction(deg) Max Hour'},inplace=True)
wind_dir_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wind_dir_df.rename(columns={'Arithmetic Mean':'Wind Direction(deg)','1st Max Value':'Wind Direction(deg) Max Value', '1st Max Hour':'Wind Direction(deg) Max Hour'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wind_dir_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)


In [9]:
# Renaming column to match the value measured and droping unnecessary columns
wind_speed_df.rename(columns={'Arithmetic Mean':'Wind Speed(kn)','1st Max Value':'Wind Speed(kn) Max Value', '1st Max Hour':'Wind Speed(kn) Max Hour'},inplace=True)
wind_speed_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wind_speed_df.rename(columns={'Arithmetic Mean':'Wind Speed(kn)','1st Max Value':'Wind Speed(kn) Max Value', '1st Max Hour':'Wind Speed(kn) Max Hour'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wind_speed_df.drop(['Parameter Name', 'Units of Measure'],axis=1,inplace=True)


In [10]:
# Convert date column to datetime
wind_dir_df['Date Local'] = pd.to_datetime(wind_dir_df['Date Local'])
wind_speed_df['Date Local'] = pd.to_datetime(wind_speed_df['Date Local'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wind_dir_df['Date Local'] = pd.to_datetime(wind_dir_df['Date Local'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wind_speed_df['Date Local'] = pd.to_datetime(wind_speed_df['Date Local'])


In [11]:
wind_dir_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4064328 entries, 0 to 8085267
Data columns (total 15 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   State Code                     int64         
 1   State Name                     object        
 2   County Code                    int64         
 3   County Name                    object        
 4   City Name                      object        
 5   Site Num                       int64         
 6   Datum                          object        
 7   Latitude                       float64       
 8   Longitude                      float64       
 9   Date Local                     datetime64[ns]
 10  Wind Direction(deg)            float64       
 11  Wind Direction(deg) Max Value  float64       
 12  Wind Direction(deg) Max Hour   int64         
 13  CBSA Name                      object        
 14  measurement_id                 object        
dtypes: datetime64[ns](1)

In [12]:
wind_speed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4020941 entries, 1 to 8085268
Data columns (total 15 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   State Code                int64         
 1   State Name                object        
 2   County Code               int64         
 3   County Name               object        
 4   City Name                 object        
 5   Site Num                  int64         
 6   Datum                     object        
 7   Latitude                  float64       
 8   Longitude                 float64       
 9   Date Local                datetime64[ns]
 10  Wind Speed(kn)            float64       
 11  Wind Speed(kn) Max Value  float64       
 12  Wind Speed(kn) Max Hour   int64         
 13  CBSA Name                 object        
 14  measurement_id            object        
dtypes: datetime64[ns](1), float64(4), int64(4), object(6)
memory usage: 490.8+ MB


In [13]:
# Merging the two dataframes again
wind_df = pd.merge(wind_dir_df,wind_speed_df,how='outer',on='measurement_id')

In [14]:
# Fill NA values in the left table, from the corresponing column in the right table.
for column in wind_df.columns:
    if column.endswith('_x'):
        corresponding_column = column[:-2] + '_y'
        if corresponding_column in wind_df.columns:
            wind_df[column] = wind_df[column].fillna(wind_df[corresponding_column])

In [15]:
# Drop all columns from the right table, so that we don't have dupblicate columns
wind_df = wind_df.drop(columns=wind_df.filter(regex='_y$').columns)

In [16]:
# Rename the columns
wind_df.rename(columns={'State Code_x':'State Code', 'State Name_x':'State Name', 'County Code_x':'County Code', 'County Name_x':'County Name', 
                        'City Name_x':'City Name', 'Site Num_x':'Site Num', 'Datum_x':'Datum', 'Latitude_x':'Latitude', 'Longitude_x':'Longitude', 'Date Local_x':'Date Local','CBSA Name_x':'CBSA Name'},inplace=True)

In [17]:
wind_df.head()

Unnamed: 0,State Code,State Name,County Code,County Name,City Name,Site Num,Datum,Latitude,Longitude,Date Local,Wind Direction(deg),Wind Direction(deg) Max Value,Wind Direction(deg) Max Hour,CBSA Name,measurement_id,Wind Speed(kn),Wind Speed(kn) Max Value,Wind Speed(kn) Max Hour
0,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2012-02-01,119.166667,150.7,22.0,"Dover, DE",10_1_2_20120201,8.708333,14.5,13.0
1,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2012-02-02,109.125,159.7,19.0,"Dover, DE",10_1_2_20120202,6.158333,11.4,21.0
2,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2012-02-03,137.041667,161.3,1.0,"Dover, DE",10_1_2_20120203,5.541667,8.9,10.0
3,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2012-02-04,83.866667,157.4,23.0,"Dover, DE",10_1_2_20120204,4.254167,6.9,11.0
4,10.0,Delaware,1.0,Kent,Not in a city,2.0,WGS84,38.986672,-75.5568,2012-02-05,93.604167,158.8,17.0,"Dover, DE",10_1_2_20120205,5.9625,10.0,8.0


In [18]:
#Saving final df to csv
#wind_df.to_csv(r"C:\Users\stlva\Documents\Python_Scripts\Data_analytics_bootcamp\air-quality-and-weather-analysis\Cleaning\all_winds_clean.csv",index=False)
wind_df.to_csv(r"C:\Users\stlva\Google Drive\Big blue data academy\main_branch\concatenated data\all_wind_clean.csv",index=False)