In [1]:
import pandas as pd
import numpy as np
import gc

## Importing and merging preprocessed weather data

In [2]:
# Import clean pressure data.
# The file is read with pandas' default options; the path is relative to the
# notebook's working directory.
pres_df = pd.read_csv("all_pressure_clean.csv")

In [3]:
# Print a summary of the pressure frame: row count, column names, dtypes and memory usage.
pres_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949314 entries, 0 to 1949313
Data columns (total 15 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   State Code                         int64  
 1   State Name                         object 
 2   County Code                        int64  
 3   County Name                        object 
 4   City Name                          object 
 5   Site Num                           int64  
 6   Datum                              object 
 7   Latitude                           float64
 8   Longitude                          float64
 9   Date Local                         object 
 10  Barometric pressure(mb)            float64
 11  Barometric pressure(mb) Max Value  float64
 12  Barometric pressure(mb) Max Hour   int64  
 13  CBSA Name                          object 
 14  measurement_id                     object 
dtypes: float64(4), int64(4), object(7)
memory usage: 223.1+ MB


In [5]:
# Import clean RH (relative humidity) data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns at the cost of
# more memory while parsing.
rh_df = pd.read_csv("all_rh_clean.csv", low_memory=False)

In [6]:
# Print a summary of the RH frame: row count, column names, dtypes and memory usage.
rh_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2621757 entries, 0 to 2621756
Data columns (total 15 columns):
 #   Column           Dtype  
---  ------           -----  
 0   State Code       int64  
 1   State Name       object 
 2   County Code      int64  
 3   County Name      object 
 4   City Name        object 
 5   Site Num         int64  
 6   Datum            object 
 7   Latitude         float64
 8   Longitude        float64
 9   Date Local       object 
 10  RH(%)            float64
 11  RH(%) Max Value  float64
 12  RH(%) Max Hour   int64  
 13  CBSA Name        object 
 14  measurement_id   object 
dtypes: float64(4), int64(4), object(7)
memory usage: 300.0+ MB


In [7]:
# Number of missing values in each RH column.
rh_df.isna().sum()

State Code              0
State Name              0
County Code             0
County Name             0
City Name               0
Site Num                0
Datum                   0
Latitude                0
Longitude               0
Date Local              0
RH(%)                   0
RH(%) Max Value         0
RH(%) Max Hour          0
CBSA Name          364069
measurement_id          0
dtype: int64

In [8]:
# Outer-join pressure and RH on measurement_id so that measurements present in
# only one of the two tables are kept. Columns that exist in both tables come
# back suffixed '_x' (pressure side) and '_y' (RH side).
all_df = pres_df.merge(rh_df, on='measurement_id', how='outer')

In [9]:
# For every column that came from the left table ('_x'), fill its missing
# values from the matching right-table column ('_y') when that column exists.
left_side_cols = [name for name in all_df.columns if name.endswith('_x')]
for left_col in left_side_cols:
    right_col = left_col[:-2] + '_y'
    if right_col not in all_df.columns:
        continue
    all_df[left_col] = all_df[left_col].fillna(all_df[right_col])

In [10]:
# Remove every column whose name ends in '_y' (the right-table copies), so the
# frame no longer carries duplicate columns.
right_side_cols = all_df.filter(regex='_y$').columns
all_df = all_df.drop(columns=right_side_cols)

In [11]:
# Strip the '_x' merge suffix from the shared station-metadata columns so they
# carry their original names again.
# The mapping is built from a single list so each label is spelled only once,
# and the result is assigned back instead of using inplace=True. rename ignores
# labels that are not present, so re-running this cell is harmless.
shared_cols = ['State Code', 'State Name', 'County Code', 'County Name', 'City Name',
               'Site Num', 'Datum', 'Latitude', 'Longitude', 'Date Local', 'CBSA Name']
all_df = all_df.rename(columns={f'{name}_x': name for name in shared_cols})

In [16]:
# Import clean temperature data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns at the cost of
# more memory while parsing.
temp_df = pd.read_csv("all_temperature_clean.csv", low_memory=False)

In [18]:
# Outer-join the temperature table onto the frame merged so far, keyed on
# measurement_id; rows found on only one side are kept. Columns present on
# both sides come back suffixed '_x' (left) and '_y' (right).
all_df = all_df.merge(temp_df, on='measurement_id', how='outer')

In [19]:
# Coalesce the columns duplicated by the merge above.
# pandas suffixes column names shared by both tables with '_x' (left table)
# and '_y' (right table). Collect those pairs once and drive every step from
# them, so that
#   * only genuine merge duplicates are dropped (not any column that merely
#     happens to end in '_y'), and
#   * every coalesced column gets its original name back, without a
#     hand-maintained rename list that has to be kept in sync with the data.
existing_cols = set(all_df.columns)
suffix_pairs = [
    (left_col, left_col[:-2] + '_y')
    for left_col in all_df.columns
    if left_col.endswith('_x') and (left_col[:-2] + '_y') in existing_cols
]

# Fill NA values in the left-table column from the corresponding right-table column.
for left_col, right_col in suffix_pairs:
    all_df[left_col] = all_df[left_col].fillna(all_df[right_col])

# Drop the right-table copies, then strip the '_x' suffix from the kept columns.
# With no pairs left (e.g. when the cell is re-run) both calls are no-ops.
all_df = (
    all_df
    .drop(columns=[right_col for _, right_col in suffix_pairs])
    .rename(columns={left_col: left_col[:-2] for left_col, _ in suffix_pairs})
)

In [22]:
# Import clean wind data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns at the cost of
# more memory while parsing.
wind_df = pd.read_csv("all_winds_clean.csv", low_memory=False)

In [24]:
# Outer-join the wind table onto the frame merged so far, keyed on
# measurement_id; rows found on only one side are kept. Columns present on
# both sides come back suffixed '_x' (left) and '_y' (right).
all_df = all_df.merge(wind_df, on='measurement_id', how='outer')

In [25]:
# Coalesce the columns duplicated by the merge above.
# pandas suffixes column names shared by both tables with '_x' (left table)
# and '_y' (right table). Collect those pairs once and drive every step from
# them, so that
#   * only genuine merge duplicates are dropped (not any column that merely
#     happens to end in '_y'), and
#   * every coalesced column gets its original name back, without a
#     hand-maintained rename list that has to be kept in sync with the data.
existing_cols = set(all_df.columns)
suffix_pairs = [
    (left_col, left_col[:-2] + '_y')
    for left_col in all_df.columns
    if left_col.endswith('_x') and (left_col[:-2] + '_y') in existing_cols
]

# Fill NA values in the left-table column from the corresponding right-table column.
for left_col, right_col in suffix_pairs:
    all_df[left_col] = all_df[left_col].fillna(all_df[right_col])

# Drop the right-table copies, then strip the '_x' suffix from the kept columns.
# With no pairs left (e.g. when the cell is re-run) both calls are no-ops.
all_df = (
    all_df
    .drop(columns=[right_col for _, right_col in suffix_pairs])
    .rename(columns={left_col: left_col[:-2] for left_col, _ in suffix_pairs})
)

In [27]:
# Show the column labels of the fully merged frame.
# NOTE(review): the recorded output lists 'Temperature(F) Max Hour' next to
# 'Temperature(C)' / 'Temperature(C) Max Value' — the unit in that label looks
# inconsistent; confirm against the temperature preprocessing step.
all_df.columns

Index(['State Code', 'State Name', 'County Code', 'County Name', 'City Name',
       'Site Num', 'Datum', 'Latitude', 'Longitude', 'Date Local',
       'Barometric pressure(mb)', 'Barometric pressure(mb) Max Value',
       'Barometric pressure(mb) Max Hour', 'CBSA Name', 'measurement_id',
       'RH(%)', 'RH(%) Max Value', 'RH(%) Max Hour', 'Temperature(C)',
       'Temperature(C) Max Value', 'Temperature(F) Max Hour',
       'Wind Direction(deg)', 'Wind Direction(deg) Max Value',
       'Wind Direction(deg) Max Hour', 'Wind Speed(kn)',
       'Wind Speed(kn) Max Value', 'Wind Speed(kn) Max Hour'],
      dtype='object')

In [28]:
# Normalise the three station identifier columns to plain integer strings.
# Casting to int before str keeps float-typed codes from rendering as e.g. '6.0'
# (presumably the outer merges upcast these columns to float — TODO confirm).
# Note that astype(int) raises if any code is still missing at this point.
station_key_cols = ['State Code', 'County Code', 'Site Num']
for key_col in station_key_cols:
    all_df[key_col] = all_df[key_col].astype(int).astype(str)

# Create a unique value for every station: '<state>_<county>_<site>'.
# The columns already hold strings after the cast above, so they are
# concatenated directly instead of being passed through .apply(str) again.
all_df['station_id'] = all_df['State Code'] + '_' + all_df['County Code'] + '_' + all_df['Site Num']

In [29]:
# Save the merged df to csv in the notebook's working directory.
# index=False leaves the DataFrame index out of the written file.
all_df.to_csv("weather_data.csv",index=False)