# ENGO 645 Final Project: Preprocessing

By: Lalitha Guru Swaminathan, Mabel Heffring, Saroj Kumar

## Part 1: Data Cleaning and Preparation

### Step 1: Extract Raster Data for each Water Quality Station using QGIS
Talk about it here (to be filled)

### Step 2: Cleaning and Preparing River Water Quality (WQ) Data

- Selecting relevant values
- Removing missing values
- Checking column types and removing prefix


In [38]:
import pandas as pd


# Importing WQ data for all 5 years
wq2018_rivers = pd.read_csv("data/water_quality/2018_WQ_rivers.csv")
wq2019_rivers = pd.read_csv("data/water_quality/2019_WQ_rivers.csv")
wq2020_rivers = pd.read_csv("data/water_quality/2020_WQ_rivers.csv")
wq2021_rivers = pd.read_csv("data/water_quality/2021_WQ_rivers.csv")
wq2022_rivers = pd.read_csv("data/water_quality/2022_WQ_rivers.csv")

# Selecting relevant features
relevant_features = ['StationNumber', 'LatitudeDecimalDegrees','LongitudeDecimalDegrees', 'SampleDatetime',
                     'RiverSubBasinCode', '100923 PH (FIELD) pH units','80558 OXYGEN DISSOLVED (FIELD METER) mg/L',
                     '10602 HARDNESS TOTAL CACO3 (CALCD.) mg/L','102647 NITROGEN NITRATE mg/L','2014 PHOSPHATE DISSOLVED ORTHO mg/L',
                     '2002 TURBIDITY NTU','100629 COLIFORMS FECAL No/100 mL','2003 CHLORIDE DISSOLVED mg/L',
                     '201 TOTAL DISSOLVED SOLIDS (CALCD.) mg/L',
                     #'103949 LEAD DISSOLVED ug/L','103928 ARSENIC DISSOLVED ug/L', '109749 MERCURY DISSOLVED ng/L',
                     '100924 SPECIFIC CONDUCTANCE (FIELD) uS/cm','106256 FLOW, ESTIMATE N/A',
                     '100925 TEMPERATURE WATER deg C']

wq2018_rivers_cleaned = wq2018_rivers[relevant_features]
wq2019_rivers_cleaned = wq2019_rivers[relevant_features]
wq2020_rivers_cleaned = wq2020_rivers[relevant_features]
wq2021_rivers_cleaned = wq2021_rivers[relevant_features]
wq2022_rivers_cleaned = wq2022_rivers[relevant_features]

# removing nan values
wq2018_rivers_cleaned = wq2018_rivers_cleaned.dropna()
wq2019_rivers_cleaned = wq2019_rivers_cleaned.dropna()
wq2020_rivers_cleaned = wq2020_rivers_cleaned.dropna()
wq2021_rivers_cleaned = wq2021_rivers_cleaned.dropna()
wq2022_rivers_cleaned = wq2022_rivers_cleaned.dropna()

# function to fix the data types and remove L and G prefix (standing for less than or greater than)
def change_datatypes(data):
    for column_name in data.columns:
        if column_name not in ['StationNumber','SampleDatetime', 'RiverSubBasinCode']:
            if data[column_name].dtype != float:
                data[column_name] = data[column_name].replace({'L': '','G':''}, regex=True).astype(float)
        elif column_name == 'SampleDatetime':
            data[column_name] = pd.to_datetime(data[column_name])
    return data

wq2018_rivers_cleaned = change_datatypes(wq2018_rivers_cleaned)
wq2019_rivers_cleaned = change_datatypes(wq2019_rivers_cleaned)
wq2020_rivers_cleaned = change_datatypes(wq2020_rivers_cleaned)
wq2021_rivers_cleaned = change_datatypes(wq2021_rivers_cleaned)
wq2022_rivers_cleaned = change_datatypes(wq2022_rivers_cleaned)

# combining water quality results for all 5 years
wq_rivers = pd.concat([wq2018_rivers_cleaned, wq2019_rivers_cleaned, wq2020_rivers_cleaned, wq2021_rivers_cleaned, wq2022_rivers_cleaned], axis=0, ignore_index=True)

# creating year and month columns
wq_rivers['year'] = wq_rivers['SampleDatetime'].dt.year
wq_rivers['month'] = wq_rivers['SampleDatetime'].dt.month

# printing data frame information
wq_rivers.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909 entries, 0 to 4908
Data columns (total 19 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   StationNumber                              4909 non-null   object        
 1   LatitudeDecimalDegrees                     4909 non-null   float64       
 2   LongitudeDecimalDegrees                    4909 non-null   float64       
 3   SampleDatetime                             4909 non-null   datetime64[ns]
 4   RiverSubBasinCode                          4909 non-null   object        
 5   100923 PH (FIELD) pH units                 4909 non-null   float64       
 6   80558 OXYGEN DISSOLVED (FIELD METER) mg/L  4909 non-null   float64       
 7   10602 HARDNESS TOTAL CACO3 (CALCD.) mg/L   4909 non-null   float64       
 8   102647 NITROGEN NITRATE mg/L               4909 non-null   float64       
 9   2014 PHOSPHATE DISS

### Step 3: Cleaning and Preparing Lake Water Quality (WQ) Data

- Selecting relevant values
- Removing missing values
- Checking column types and removing prefix

In [39]:
# Importing WQ data for all 5 years
wq2018_lakes = pd.read_csv("data/water_quality/2018_WQ_lakes.csv")
wq2019_lakes = pd.read_csv("data/water_quality/2019_WQ_lakes.csv")
wq2020_lakes = pd.read_csv("data/water_quality/2020_WQ_lakes.csv")
wq2021_lakes = pd.read_csv("data/water_quality/2021_WQ_lakes.csv")
wq2022_lakes = pd.read_csv("data/water_quality/2022_WQ_lakes.csv")

# Selecting relevant features
relevant_features = ['StationNumber', 'LatitudeDecimalDegrees','LongitudeDecimalDegrees', 'SampleDatetime',
                     'RiverSubBasinCode', '100923 PH (FIELD) pH units','80558 OXYGEN DISSOLVED (FIELD METER) mg/L',
                     '100924 SPECIFIC CONDUCTANCE (FIELD) uS/cm','100925 TEMPERATURE WATER deg C']

wq2018_lakes_cleaned = wq2018_lakes[relevant_features]
wq2019_lakes_cleaned = wq2019_lakes[relevant_features]
wq2020_lakes_cleaned = wq2020_lakes[relevant_features]
wq2021_lakes_cleaned = wq2021_lakes[relevant_features]
wq2022_lakes_cleaned = wq2022_lakes[relevant_features]

# removing nan values
wq2018_lakes_cleaned = wq2018_lakes_cleaned.dropna()
wq2019_lakes_cleaned = wq2019_lakes_cleaned.dropna()
wq2020_lakes_cleaned = wq2020_lakes_cleaned.dropna()
wq2021_lakes_cleaned = wq2021_lakes_cleaned.dropna()
wq2022_lakes_cleaned = wq2022_lakes_cleaned.dropna()

# fixing data types
wq2018_lakes_cleaned = change_datatypes(wq2018_lakes_cleaned)
wq2019_lakes_cleaned = change_datatypes(wq2019_lakes_cleaned)
wq2020_lakes_cleaned = change_datatypes(wq2020_lakes_cleaned)
wq2021_lakes_cleaned = change_datatypes(wq2021_lakes_cleaned)
wq2022_lakes_cleaned = change_datatypes(wq2022_lakes_cleaned)

# combining water quality results for all 5 years
wq_lakes = pd.concat([wq2018_lakes_cleaned, wq2019_lakes_cleaned, wq2020_lakes_cleaned, wq2021_lakes_cleaned, wq2022_lakes_cleaned], axis=0, ignore_index=True)

# creating year and month columns
wq_lakes['year'] = wq_lakes['SampleDatetime'].dt.year
wq_lakes['month'] = wq_lakes['SampleDatetime'].dt.month

# printing data frame information
wq_lakes.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14088 entries, 0 to 14087
Data columns (total 11 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   StationNumber                              14088 non-null  object        
 1   LatitudeDecimalDegrees                     14088 non-null  float64       
 2   LongitudeDecimalDegrees                    14088 non-null  float64       
 3   SampleDatetime                             14088 non-null  datetime64[ns]
 4   RiverSubBasinCode                          14088 non-null  object        
 5   100923 PH (FIELD) pH units                 14088 non-null  float64       
 6   80558 OXYGEN DISSOLVED (FIELD METER) mg/L  14088 non-null  float64       
 7   100924 SPECIFIC CONDUCTANCE (FIELD) uS/cm  14088 non-null  float64       
 8   100925 TEMPERATURE WATER deg C             14088 non-null  float64       
 9   year             

### Step 4: Cleaning and Preparing Climate Data
- Selecting relevant values
- Combining data for each month
- Removing missing values 

In [40]:
import os

# path to files
dir = 'data/precipitation'

# initiating data frame
climate_data = pd.DataFrame()

# months to loop through
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# years to loop through
years = ['2018', '2019', '2020', '2021', '2022']

# Looping through each file and selecting relevant values
for y in years:
    for m in months:
        current_path = os.path.join(dir, 'en_climate_summaries_AB_' + m + '-' + y +'.csv')
        current_data = pd.read_csv(current_path)

        current_data_clean = current_data[['Long', 'Lat', 'Clim_ID', 'Tm', 'P']].copy()

        current_data_clean['year'] = int(y)
        current_data_clean['month'] = int(m)

        climate_data = pd.concat([climate_data, current_data_clean], axis=0, ignore_index=True)

# removing missing values
climate_data = climate_data.dropna()

# printing data frame information
climate_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 14036 entries, 0 to 15148
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Long     14036 non-null  float64
 1   Lat      14036 non-null  float64
 2   Clim_ID  14036 non-null  object 
 3   Tm       14036 non-null  float64
 4   P        14036 non-null  float64
 5   year     14036 non-null  int64  
 6   month    14036 non-null  int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 877.2+ KB


### Step 5: Cleaning and Preparing NDVI Data
- Convert date column
- remove missing values


In [41]:
# reading in the NDVI data
ndvi_data = pd.read_csv("data/ndvi/NDVI.csv")

# converting date column to datetime
ndvi_data['Week'] = pd.to_datetime(ndvi_data['Week'])

# removing missing values
ndvi_data = ndvi_data.dropna()

# printing data type values
ndvi_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Week    135 non-null    datetime64[ns]
 1   4810    135 non-null    float64       
 2   4820    135 non-null    float64       
 3   4830    135 non-null    float64       
 4   4840    135 non-null    float64       
 5   4841    135 non-null    float64       
 6   4850    135 non-null    float64       
 7   4860    135 non-null    float64       
 8   4870    135 non-null    float64       
dtypes: datetime64[ns](1), float64(8)
memory usage: 9.6 KB


### Step 6: Cleaning and Preparing Emission Data
- Selecting relevant values
- Combining data for each year
- Removing missing values 

In [42]:
import os

# path to files
dir = 'data/release_points'

# initiating data frame
emission_data = pd.DataFrame()

# years to loop through
years = ['2018', '2019', '2020', '2021', '2022']

# Looping through each file and selecting relevant values
for y in years:
    current_path = os.path.join(dir, y + '_release_points.csv')
    with open(current_path, encoding='utf-8', errors='replace') as f:
        current_data = pd.read_csv(f)

        current_data_clean = current_data[['FacilityName', 'RPLatitude', 'RPLongitude']].copy()

        current_data_clean['year'] = int(y)

        emission_data = pd.concat([emission_data, current_data_clean], axis=0, ignore_index=True)

# removing missing values
emission_data = emission_data.dropna()

# printing data frame information
emission_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28971 entries, 0 to 29094
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   FacilityName  28971 non-null  object 
 1   RPLatitude    28971 non-null  float64
 2   RPLongitude   28971 non-null  float64
 3   year          28971 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 1.1+ MB


### Step 7: Cleaning and Preparing QGIS data
- Selecting relevant values
- Removing missing values 

In [43]:
# reading in the NDVI data
qgis_data = pd.read_csv("data/Station_Inventory_Filtered_w_dem_slope_LC_1km_5km.csv")

# dividing river and lake data
qgis_data_rivers = qgis_data[qgis_data['StationType'] == '0 (RIVER OR STREAM)']
qgis_data_lakes = qgis_data[qgis_data['StationType'] == '1 (LAKE)']

# selecting relevant data
qgis_data_rivers = qgis_data_rivers[['StationNumber', 'LatitudeDecimalDegrees', 'LongitudeDecimalDegrees', 'AGG_ID', 'Elevation', 'Slope', 'LC_1km', 'LC_5km']]
qgis_data_lakes = qgis_data_lakes[['StationNumber', 'LatitudeDecimalDegrees', 'LongitudeDecimalDegrees', 'AGG_ID', 'Elevation', 'Slope', 'LC_1km', 'LC_5km']]

# removing missing values
qgis_data_rivers = qgis_data_rivers.dropna()
qgis_data_lakes = qgis_data_lakes.dropna()

# printing data type values
qgis_data_rivers.info()
qgis_data_lakes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 209 entries, 4 to 431
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   StationNumber            209 non-null    object 
 1   LatitudeDecimalDegrees   209 non-null    float64
 2   LongitudeDecimalDegrees  209 non-null    float64
 3   AGG_ID                   209 non-null    int64  
 4   Elevation                209 non-null    float64
 5   Slope                    209 non-null    float64
 6   LC_1km                   209 non-null    float64
 7   LC_5km                   209 non-null    float64
dtypes: float64(6), int64(1), object(1)
memory usage: 14.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 191 entries, 2 to 428
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   StationNumber            191 non-null    object 
 1   LatitudeDecimalDegrees   191 non

## Part 2: Feature Engineering

### Step 1: Calculate Number of Nearby Emission Release Points

In [44]:
import geopandas as gpd

# convert emission data and qgis data to geopandas
emission_data_geo = gpd.GeoDataFrame(emission_data, geometry=gpd.points_from_xy(emission_data['RPLongitude'], emission_data['RPLatitude']))
qgis_data_rivers_geo = gpd.GeoDataFrame(qgis_data_rivers, geometry=gpd.points_from_xy(qgis_data_rivers['LongitudeDecimalDegrees'], qgis_data_rivers['LatitudeDecimalDegrees']))
qgis_data_lakes_geo = gpd.GeoDataFrame(qgis_data_lakes, geometry=gpd.points_from_xy(qgis_data_lakes['LongitudeDecimalDegrees'], qgis_data_lakes['LatitudeDecimalDegrees']))

# apply WGS84 crs to baseline datasets
emission_data_geo.set_crs(4326, allow_override=True, inplace=True)
qgis_data_rivers_geo.set_crs(4326, allow_override=True, inplace=True)
qgis_data_lakes_geo.set_crs(4326, allow_override=True, inplace=True)

# define proximity radius
proximity_radius = 1000 # 1000 m, or 1 km

# creating projected versions of data for accurate distance calculations (EPSG:3400)
emission_data_proj= emission_data_geo.to_crs(epsg=3400) 
qgis_data_rivers_proj = qgis_data_rivers_geo.to_crs(epsg=3400)
qgis_data_lakes_proj = qgis_data_lakes_geo.to_crs(epsg=3400)

# creating function for calculating number of nearby emission points
def compute_nearby_rp(row, emission_data, radius):
    # point to point distance
    dpoints = emission_data.geometry.distance(row.geometry)
    nearby_points = dpoints[dpoints <= radius]
    
    # Count how many points are within the defined radius
    nearby_count = len(nearby_points)
    
    return nearby_count

# looping through each year and calculating nearby emission release points
years = ['2018', '2019', '2020', '2021', '2022']
for y in years:
    current_emission_data = emission_data_proj[emission_data_proj['year']== int(y)]
    feature = 'RP_'+ y

    qgis_data_rivers_geo[feature] = qgis_data_rivers_proj.apply(compute_nearby_rp, axis=1, emission_data=current_emission_data, radius=proximity_radius)
    qgis_data_lakes_geo[feature] = qgis_data_lakes_proj.apply(compute_nearby_rp, axis=1, emission_data=current_emission_data, radius=proximity_radius)



### Step 2: Merge  QGIS data and Water Quality Data

In [45]:
# merging according to matching Station Number
wq_rivers_merged = pd.merge(wq_rivers, qgis_data_rivers_geo, on='StationNumber', how='inner')
wq_lakes_merged = pd.merge(wq_lakes, qgis_data_lakes_geo, on='StationNumber', how='inner')

# printing new data frame information
wq_rivers_merged.info()
wq_lakes_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909 entries, 0 to 4908
Data columns (total 32 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   StationNumber                              4909 non-null   object        
 1   LatitudeDecimalDegrees_x                   4909 non-null   float64       
 2   LongitudeDecimalDegrees_x                  4909 non-null   float64       
 3   SampleDatetime                             4909 non-null   datetime64[ns]
 4   RiverSubBasinCode                          4909 non-null   object        
 5   100923 PH (FIELD) pH units                 4909 non-null   float64       
 6   80558 OXYGEN DISSOLVED (FIELD METER) mg/L  4909 non-null   float64       
 7   10602 HARDNESS TOTAL CACO3 (CALCD.) mg/L   4909 non-null   float64       
 8   102647 NITROGEN NITRATE mg/L               4909 non-null   float64       
 9   2014 PHOSPHATE DISS

### Step 3: Adding NDVI values
- pivot the NDVI look up table
- apply NDVI based on week number, year, and Agg_ID
- remove any missing values

In [46]:
import pandas as pd

# calculate week number for water quality data and NDVI data
wq_rivers_merged['week_number'] = wq_rivers_merged['SampleDatetime'].dt.isocalendar().week
wq_lakes_merged['week_number'] = wq_lakes_merged['SampleDatetime'].dt.isocalendar().week
ndvi_data['week_number'] = ndvi_data['Week'].dt.isocalendar().week
ndvi_data['year'] = ndvi_data['Week'].dt.year

# create NDVI look up table according to Agg_ID and Week number
ndvi_lookup = ndvi_data.melt(id_vars=['week_number', 'year'], value_vars=['4810', '4820', '4830', '4840', '4841', '4850', '4860', '4870'], var_name='AGG_ID', value_name='NDVI')
ndvi_lookup['AGG_ID'] = ndvi_lookup['AGG_ID'].astype(int)

# combining 
wq_rivers_merged_wNDVI = pd.merge(wq_rivers_merged, ndvi_lookup, on=['week_number', 'year', 'AGG_ID'], how='left')
wq_lakes_merged_wNDVI = pd.merge(wq_lakes_merged, ndvi_lookup, on=['week_number', 'year', 'AGG_ID'], how='left')

# removing missing values (NDVI only available from October to May)
wq_rivers_merged_wNDVI = wq_rivers_merged_wNDVI.dropna()
wq_lakes_merged_wNDVI = wq_lakes_merged_wNDVI.dropna()

# printing updated data frame information
wq_rivers_merged_wNDVI.info()
wq_lakes_merged_wNDVI.info()



<class 'pandas.core.frame.DataFrame'>
Index: 2870 entries, 3 to 4907
Data columns (total 34 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   StationNumber                              2870 non-null   object        
 1   LatitudeDecimalDegrees_x                   2870 non-null   float64       
 2   LongitudeDecimalDegrees_x                  2870 non-null   float64       
 3   SampleDatetime                             2870 non-null   datetime64[ns]
 4   RiverSubBasinCode                          2870 non-null   object        
 5   100923 PH (FIELD) pH units                 2870 non-null   float64       
 6   80558 OXYGEN DISSOLVED (FIELD METER) mg/L  2870 non-null   float64       
 7   10602 HARDNESS TOTAL CACO3 (CALCD.) mg/L   2870 non-null   float64       
 8   102647 NITROGEN NITRATE mg/L               2870 non-null   float64       
 9   2014 PHOSPHATE DISSOLVED

### Step 4: Adding Climate Data

In [47]:
import geopandas as gpd

# months to loop through
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# years to loop through
years = ['2018', '2019', '2020', '2021', '2022']

wq_rivers_merged_wNDVI_wClim = pd.DataFrame()
wq_lakes_merged_wNDVI_wClim = pd.DataFrame()

# Looping through each month and year join climate data
for y in years:
    for m in months:
        # extract data for current month and year
        current_climate_data = climate_data[(climate_data['year']==int(y)) & (climate_data['month']==int(m))]
        current_wq_rivers = wq_rivers_merged_wNDVI[(wq_rivers_merged_wNDVI['year']==int(y)) & (wq_rivers_merged_wNDVI['month']==int(m))]
        current_wq_lakes = wq_lakes_merged_wNDVI[(wq_lakes_merged_wNDVI['year']==int(y)) & (wq_lakes_merged_wNDVI['month']==int(m))]

        # converting to geopandas
        current_climate_data_geo = gpd.GeoDataFrame(current_climate_data, geometry=gpd.points_from_xy(current_climate_data['Long'], current_climate_data['Lat']))
        current_wq_rivers_geo = gpd.GeoDataFrame(current_wq_rivers, geometry=gpd.points_from_xy(current_wq_rivers['LongitudeDecimalDegrees_x'], current_wq_rivers['LatitudeDecimalDegrees_x']))
        current_wq_lakes_geo = gpd.GeoDataFrame(current_wq_lakes, geometry=gpd.points_from_xy(current_wq_lakes['LongitudeDecimalDegrees_x'], current_wq_lakes['LatitudeDecimalDegrees_x']))

        # apply WGS84 crs to baseline datasets
        current_climate_data_geo.set_crs(4326, allow_override=True, inplace=True)
        current_wq_rivers_geo.set_crs(4326, allow_override=True, inplace=True)
        current_wq_lakes_geo.set_crs(4326, allow_override=True, inplace=True)

        # creating projected versions of data for accurate distance calculations (EPSG:3400)
        current_climate_data_proj= current_climate_data_geo.to_crs(epsg=3400) 
        current_wq_rivers_proj = current_wq_rivers_geo.to_crs(epsg=3400)
        current_wq_lakes_proj = current_wq_lakes_geo.to_crs(epsg=3400)

        # computing nearest climate station
        current_wq_rivers_merged = gpd.sjoin_nearest(current_wq_rivers_proj, current_climate_data_proj, how="left", distance_col="clim_distance")
        current_wq_lakes_merged = gpd.sjoin_nearest(current_wq_lakes_proj, current_climate_data_proj, how="left", distance_col="clim_distance")

        # combining results for each month
        wq_rivers_merged_wNDVI_wClim = pd.concat([wq_rivers_merged_wNDVI_wClim, current_wq_rivers_merged], axis=0, ignore_index=True)
        wq_lakes_merged_wNDVI_wClim = pd.concat([wq_lakes_merged_wNDVI_wClim, current_wq_lakes_merged], axis=0, ignore_index=True)

# output new data frame information
wq_rivers_merged_wNDVI_wClim.info()
wq_lakes_merged_wNDVI_wClim.info()
        

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2916 entries, 0 to 2915
Data columns (total 43 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   StationNumber                              2916 non-null   object        
 1   LatitudeDecimalDegrees_x                   2916 non-null   float64       
 2   LongitudeDecimalDegrees_x                  2916 non-null   float64       
 3   SampleDatetime                             2916 non-null   datetime64[ns]
 4   RiverSubBasinCode                          2916 non-null   object        
 5   100923 PH (FIELD) pH units                 2916 non-null   float64       
 6   80558 OXYGEN DISSOLVED (FIELD METER) mg/L  2916 non-null   float64       
 7   10602 HARDNESS TOTAL CACO3 (CALCD.) mg/L   2916 non-null   float64       
 8   102647 NITROGEN NITRATE mg/L               2916 non-null   float64       
 9   2014 PHOSPH

### Step 5: Cleaning and outputting database

In [48]:
# selecting relevant values
relevant_features_rivers = ['StationNumber', 'LatitudeDecimalDegrees_x','LongitudeDecimalDegrees_x', 'SampleDatetime',
                     'RiverSubBasinCode', '100923 PH (FIELD) pH units','80558 OXYGEN DISSOLVED (FIELD METER) mg/L',
                     '10602 HARDNESS TOTAL CACO3 (CALCD.) mg/L','102647 NITROGEN NITRATE mg/L','2014 PHOSPHATE DISSOLVED ORTHO mg/L',
                     '2002 TURBIDITY NTU','100629 COLIFORMS FECAL No/100 mL','2003 CHLORIDE DISSOLVED mg/L',
                     '201 TOTAL DISSOLVED SOLIDS (CALCD.) mg/L',
                     #'103949 LEAD DISSOLVED ug/L','103928 ARSENIC DISSOLVED ug/L','109749 MERCURY DISSOLVED ng/L',
                     '100924 SPECIFIC CONDUCTANCE (FIELD) uS/cm','106256 FLOW, ESTIMATE N/A',
                     '100925 TEMPERATURE WATER deg C','year_left', 'month_left', 'Elevation', 'Slope', 
                     'LC_1km', 'LC_5km', 'RP_2018', 'RP_2019', 'RP_2020', 'RP_2021', 'RP_2022', 'NDVI', 'Tm', 'P']

wq_rivers_cleaned = wq_rivers_merged_wNDVI_wClim[relevant_features_rivers]

relevant_features_lakes = ['StationNumber', 'LatitudeDecimalDegrees_x','LongitudeDecimalDegrees_x', 'SampleDatetime',
                     'RiverSubBasinCode', '100923 PH (FIELD) pH units','80558 OXYGEN DISSOLVED (FIELD METER) mg/L',
                     '100924 SPECIFIC CONDUCTANCE (FIELD) uS/cm','100925 TEMPERATURE WATER deg C', 'year_left', 'month_left', 'Elevation', 'Slope', 
                     'LC_1km', 'LC_5km', 'RP_2018', 'RP_2019', 'RP_2020', 'RP_2021', 'RP_2022', 'NDVI', 'Tm', 'P']

wq_lakes_cleaned = wq_lakes_merged_wNDVI_wClim[relevant_features_lakes]

# renaming columns
wq_rivers_cleaned = wq_rivers_cleaned.rename(columns={
    'LatitudeDecimalDegrees_x': 'Latitude',
    'LongitudeDecimalDegrees_x': 'Longitude',
    'year_left': 'Year',
    'month_left': 'Month',
    'Tm' : 'Temperature',
    'P': 'Precipitation'
})

wq_lakes_cleaned = wq_lakes_cleaned.rename(columns={
    'LatitudeDecimalDegrees_x': 'Latitude',
    'LongitudeDecimalDegrees_x': 'Longitude',
    'year_left': 'Year',
    'month_left': 'Month',
    'Tm' : 'Temperature',
    'P': 'Precipitation'
})

# calculating the number of release points for the specific year
def rp_per_year(row):
    current_year = row['Year']
    feature = 'RP_'+str(current_year)
    RP_count = row[feature]
    return RP_count

wq_rivers_cleaned['RP_count'] = wq_rivers_cleaned.apply(rp_per_year, axis=1)
wq_lakes_cleaned['RP_count'] = wq_lakes_cleaned.apply(rp_per_year, axis=1)

# dropping unnecessary columns
wq_rivers_cleaned = wq_rivers_cleaned.drop(columns=['RP_2018', 'RP_2019', 'RP_2020', 'RP_2021', 'RP_2022'])
wq_lakes_cleaned = wq_lakes_cleaned.drop(columns=['RP_2018', 'RP_2019', 'RP_2020', 'RP_2021', 'RP_2022'])

# printing final data frame information
wq_rivers_cleaned.info()
wq_lakes_cleaned.info()

# printing data to csv for further use
wq_rivers_cleaned.to_csv('data/cleaned/wq_rivers.csv', index=False)
wq_lakes_cleaned.to_csv('data/cleaned/wq_lakes.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916 entries, 0 to 2915
Data columns (total 27 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   StationNumber                              2916 non-null   object        
 1   Latitude                                   2916 non-null   float64       
 2   Longitude                                  2916 non-null   float64       
 3   SampleDatetime                             2916 non-null   datetime64[ns]
 4   RiverSubBasinCode                          2916 non-null   object        
 5   100923 PH (FIELD) pH units                 2916 non-null   float64       
 6   80558 OXYGEN DISSOLVED (FIELD METER) mg/L  2916 non-null   float64       
 7   10602 HARDNESS TOTAL CACO3 (CALCD.) mg/L   2916 non-null   float64       
 8   102647 NITROGEN NITRATE mg/L               2916 non-null   float64       
 9   2014 PHOSPHATE DISS