In [3]:
import pandas as pd
import numpy as np
import os

In [6]:
# Read the three cleaned tables that the rest of the notebook transforms.
# NOTE(review): the previews below show an 'Unnamed: 0' column in every frame,
# which looks like a row index saved by an earlier to_csv call - confirm
# whether it should be dropped before the transformed files are written.
air_quality, water_quality, deforestation = map(
    pd.read_csv,
    (
        "../data/processed/air_quality_cleaned.csv",
        "../data/processed/water_quality_cleaned.csv",
        "../data/processed/deforestation_cleaned.csv",
    ),
)

In [7]:
# Show the first rows of the air-quality table as a quick sanity check.
print("Air Quality Dataset:")
air_quality_preview = air_quality.head()
display(air_quality_preview)

Air Quality Dataset:


Unnamed: 0,Name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value
0,Boiler Emissions- Total SO2 Emissions,Number per km2,number,UHF42,409.0,Southeast Queens,2015,01/01/2015,0.3
1,Boiler Emissions- Total SO2 Emissions,Number per km2,number,UHF42,209.0,Bensonhurst - Bay Ridge,2015,01/01/2015,1.2
2,Fine particles (PM 2.5),Mean,mcg/m3,UHF42,209.0,Bensonhurst - Bay Ridge,Annual Average 2012,12/01/2011,8.6
3,Fine particles (PM 2.5),Mean,mcg/m3,UHF42,409.0,Southeast Queens,Annual Average 2012,12/01/2011,8.0
4,Fine particles (PM 2.5),Mean,mcg/m3,UHF42,409.0,Southeast Queens,Summer 2022,06/01/2022,6.1


In [8]:
# Show the first rows of the water-quality table as a quick sanity check.
print("Water Quality Dataset:")
water_quality_preview = water_quality.head()
display(water_quality_preview)

Water Quality Dataset:


Unnamed: 0,Site_Id,Read_Date,Salinity (ppt),Dissolved Oxygen (mg/L),pH (standard units),Secchi Depth (m),Water Depth (m),Water Temp (?C),Air Temp (?F),AirTemp (C),Year
0,Bay,1/3/1994,1.3,11.7,7.3,0.4,0.4,5.9,46.4,8.0,1994
1,Bay,1/31/1994,1.5,12.0,7.4,0.2,0.35,3.0,36.68,2.6,1994
2,Bay,2/7/1994,1.0,10.5,7.2,0.25,0.6,5.9,45.68,7.6,1994
3,Bay,2/23/1994,1.0,10.1,7.4,0.35,0.5,10.0,36.86,2.7,1994
4,Bay,2/28/1994,1.0,12.6,7.2,0.2,0.4,1.6,32.0,0.0,1994


In [9]:
# Show the first rows of the deforestation table as a quick sanity check.
print("Deforestation Dataset:")
deforestation_preview = deforestation.head()
display(deforestation_preview)

Deforestation Dataset:


Unnamed: 0,iso3c,forests_2000,forests_2020,trend
0,AFG,1.9,1.9,0.0
1,ALB,28.1,28.8,2.5
2,DZA,0.7,0.8,14.3
3,ASM,88.7,85.7,-3.4
4,AND,34.0,34.0,0.0


In [11]:
print("\nFeature Engineering: Air Quality")
if 'Time Period' in air_quality.columns:
    # Take the first run of four digits in each label
    # (e.g. "Annual Average 2012" -> 2012.0). Labels without such a run
    # yield NaN, which is why the column is stored as float.
    year_text = air_quality['Time Period'].str.extract(r'(\d{4})', expand=False)
    air_quality['Year'] = year_text.astype(float)


Feature Engineering: Air Quality


In [12]:
# Min-max scale 'Data Value' onto [0, 1] using the minimum and maximum of the
# whole column.
# NOTE(review): the preview rows carry different 'Measure Info' units
# ("number" and "mcg/m3"), so one global scale may not be comparable across
# indicators - confirm whether per-indicator scaling is wanted instead.
data_value_min = air_quality['Data Value'].min()
data_value_range = air_quality['Data Value'].max() - data_value_min
if data_value_range == 0:
    # Constant column: the plain formula would divide 0 by 0 and turn every
    # row into NaN. Map present values to 0.0 and keep missing ones as NaN.
    air_quality['Data Value Normalized'] = air_quality['Data Value'] * 0.0
else:
    # A NaN range (empty or all-missing column) also lands here and produces
    # NaN throughout, exactly as the plain formula would.
    air_quality['Data Value Normalized'] = (
        air_quality['Data Value'] - data_value_min
    ) / data_value_range

In [13]:
print("\nFeature Engineering: Water Quality")
if "Water Temp (?C)" in water_quality.columns:
    water_temp = water_quality['Water Temp (?C)']
    # True where the reading exceeds 25; rows with a missing temperature
    # compare as False rather than staying missing.
    water_quality['High Temp Flag'] = water_temp.gt(25)


Feature Engineering: Water Quality


In [14]:
if 'Read_Date' in water_quality.columns:
    # Year taken from the reading date; errors='coerce' turns unparseable
    # dates into missing values instead of raising.
    read_year = pd.to_datetime(water_quality['Read_Date'], errors='coerce').dt.year
    if 'Year' in water_quality.columns:
        # The preview of the loaded table already shows a 'Year' column.
        # Keep its value wherever Read_Date could not be parsed, rather than
        # overwriting a known year with a missing one.
        read_year = read_year.fillna(pd.to_numeric(water_quality['Year'], errors='coerce'))
    # Nullable integer dtype keeps years as whole numbers even when some rows
    # remain missing (a plain float column would be written as e.g. 1994.0).
    water_quality['Year'] = read_year.astype('Int64')

In [15]:
if 'pH (standard units)' in water_quality.columns:
    # Copy of the pH column with missing readings replaced by the column mean;
    # the original column is left untouched.
    ph_readings = water_quality['pH (standard units)']
    ph_mean = ph_readings.mean()
    water_quality['pH_Filled'] = ph_readings.fillna(ph_mean)

In [16]:
print("\nFeature Engineering: Deforestation")
# Change from 'forests_2000' to 'forests_2020', expressed as a percentage of
# the 2000 value.
# A zero 2000 baseline makes the ratio undefined: mask it to NaN so the
# division yields NaN instead of +/-inf in the exported file.
# NOTE(review): in the preview rows the existing 'trend' column matches this
# value rounded to one decimal - confirm whether the new column is redundant.
forests_2000_nonzero = deforestation['forests_2000'].where(deforestation['forests_2000'] != 0)
deforestation['forest_change_percentage'] = (
    (deforestation['forests_2020'] - deforestation['forests_2000']) / forests_2000_nonzero * 100
)


Feature Engineering: Deforestation


In [17]:
# Label each row by the sign of 'trend'.
# A plain `trend > 0` test would file both zero trend (e.g. the 0.0 rows in the
# preview) and missing trend under 'Negative'. Classifying by sign keeps
# 'Positive'/'Negative' for non-zero values, gives zero its own label, and
# leaves rows with a missing trend unlabelled (NaN).
trend_sign = np.sign(deforestation['trend'])
deforestation['trend_category'] = trend_sign.map(
    {1.0: 'Positive', -1.0: 'Negative', 0.0: 'No Change'}
)

In [18]:
# Destination files for the transformed tables, one per dataset.
(
    air_quality_transformed_path,
    water_quality_transformed_path,
    deforestation_transformed_path,
) = (
    "../data/processed/air_quality_transformed.csv",
    "../data/processed/water_quality_transformed.csv",
    "../data/processed/deforestation_transformed.csv",
)

In [19]:
# Write each transformed table to its destination, omitting the row index.
for frame, destination in (
    (air_quality, air_quality_transformed_path),
    (water_quality, water_quality_transformed_path),
    (deforestation, deforestation_transformed_path),
):
    frame.to_csv(destination, index=False)