# EDA 
Organizing the raw data into a master data sheet.

In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import xarray as xr
import numpy as np
import netCDF4

In [22]:
# Strip the coordinate columns from every station CSV, rewriting each file
# in place.
folder_path = 'dataset/Training_Anomalies_Station Data'

csv_names = [name for name in os.listdir(folder_path) if name.endswith('.csv')]

for csv_name in csv_names:
    csv_path = os.path.join(folder_path, csv_name)
    # errors='ignore' turns the drop into a no-op for files that lack
    # either column.
    trimmed = pd.read_csv(csv_path).drop(
        columns=['longitude', 'latitude'], errors='ignore'
    )
    trimmed.to_csv(csv_path, index=False)

In [51]:
# Gather the (t, anomaly, location) records from every station CSV.
folder_path = 'dataset/Training_Anomalies_Station Data'
all_data = [
    pd.read_csv(os.path.join(folder_path, csv_name))[['t', 'anomaly', 'location']]
    for csv_name in os.listdir(folder_path)
    if csv_name.endswith('.csv')
]

# Stack the per-file frames, then average repeated (t, location) readings so
# each pair is unique before reshaping.
combined_df = (
    pd.concat(all_data)
    .groupby(['t', 'location'], as_index=False)
    .mean()
)

# Wide layout: one row per 't' value, one column per location; the index is
# labelled 'Date'.
result = (
    combined_df
    .pivot(index='t', columns='location', values='anomaly')
    .rename_axis(index='Date')
)

result.to_csv('Station_Anomaly.csv')


# Read the sheet back so 'Date' is an ordinary column rather than the index.
Station_Anomaly = pd.read_csv('Station_Anomaly.csv')
Station_Anomaly.head()

Unnamed: 0,Date,Atlantic City,Baltimore,Eastport,Fort Pulaski,Lewes,New London,Newport,Portland,Sandy Hook,Sewells Point,The Battery,Washington
0,1993-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1993-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1993-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1993-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1993-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Load the first 'sla' slice of every .nc file in the directory into a
# DataFrame with one row per file:
#   'Date' - YYYY-MM-DD, built from the third underscore-separated token of
#            the file name (expected to be 8 characters, YYYYMMDD)
#   'Map'  - the slice as a plain array with masked cells replaced by -10
directory = "dataset/Copernicus_ENA_Satelite_Maps_Training_Data"
data = []

# sorted() makes the row order independent of the filesystem's listing order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".nc"):
        continue

    # Files whose name does not carry an 8-character date token in the third
    # position are skipped instead of raising IndexError on the split.
    name_parts = filename.split("_")
    if len(name_parts) < 3 or len(name_parts[2]) != 8:
        continue
    date_str = name_parts[2]
    formatted_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"

    file_path = os.path.join(directory, filename)

    # The context manager closes the file handle even if reading fails;
    # leaving thousands of datasets open leaks file descriptors.
    with netCDF4.Dataset(file_path, mode="r") as dataset:
        # Extract the 'sla' variable and keep its first entry.
        sla = dataset.variables["sla"][:][0]
        # filled() returns a regular array, so it is safe to use after close.
        sla = sla.filled(-10)

    data.append({'Date': formatted_date, 'Map': sla})

# Explicit columns keep 'Date' and 'Map' present even when no file matched,
# so later column selections do not fail with a KeyError.
Map_df = pd.DataFrame(data, columns=['Date', 'Map'])
Map_df.head()

Unnamed: 0,Date,Map
0,1993-01-01,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [78]:
# Attach each day's map to the station anomalies by matching on 'Date'.
# The default inner join keeps only dates present in both frames.
map_columns = Map_df[['Date', 'Map']]
Raw_Master_df = pd.merge(Station_Anomaly, map_columns, on='Date')
print(Raw_Master_df.shape)
Raw_Master_df.head()

(7302, 14)


Unnamed: 0,Date,Atlantic City,Baltimore,Eastport,Fort Pulaski,Lewes,New London,Newport,Portland,Sandy Hook,Sewells Point,The Battery,Washington,Map
0,1993-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [79]:
# Atlantic City table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Atlantic_City = Raw_Master_df[['Date', 'Atlantic City', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Atlantic_City.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Atlantic_City.to_csv(output_file, index=False)
print(Atlantic_City.shape)
Atlantic_City.head()

(7063, 3)


Unnamed: 0,Date,Atlantic City,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [80]:
# Baltimore table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Baltimore = Raw_Master_df[['Date', 'Baltimore', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Baltimore.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Baltimore.to_csv(output_file, index=False)
print(Baltimore.shape)
Baltimore.head()

(7286, 3)


Unnamed: 0,Date,Baltimore,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [81]:
# Eastport table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Eastport = Raw_Master_df[['Date', 'Eastport', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Eastport.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Eastport.to_csv(output_file, index=False)
print(Eastport.shape)
Eastport.head()

(7127, 3)


Unnamed: 0,Date,Eastport,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [83]:
# Fort Pulaski table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Fort_Pulaski = Raw_Master_df[['Date', 'Fort Pulaski', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): this file name contains a space, while the other station
# cells with two-word names write underscore-separated names (e.g.
# "New_London.csv"). Left unchanged in case readers depend on it - confirm.
output_file = os.path.join(output_dir, "Fort Pulaski.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Fort_Pulaski.to_csv(output_file, index=False)
print(Fort_Pulaski.shape)
Fort_Pulaski.head()

(7262, 3)


Unnamed: 0,Date,Fort Pulaski,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [84]:
# Lewes table: the 'Date', station anomaly and 'Map' columns, keeping only
# rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Lewes = Raw_Master_df[['Date', 'Lewes', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Lewes.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Lewes.to_csv(output_file, index=False)
print(Lewes.shape)
Lewes.head()

(7271, 3)


Unnamed: 0,Date,Lewes,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [85]:
# New London table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
New_London = Raw_Master_df[['Date', 'New London', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "New_London.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
New_London.to_csv(output_file, index=False)
print(New_London.shape)
New_London.head()

(7240, 3)


Unnamed: 0,Date,New London,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [86]:
# Newport table: the 'Date', station anomaly and 'Map' columns, keeping only
# rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Newport = Raw_Master_df[['Date', 'Newport', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Newport.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Newport.to_csv(output_file, index=False)
print(Newport.shape)
Newport.head()

(7255, 3)


Unnamed: 0,Date,Newport,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [87]:
# Portland table: the 'Date', station anomaly and 'Map' columns, keeping only
# rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Portland = Raw_Master_df[['Date', 'Portland', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Portland.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Portland.to_csv(output_file, index=False)
print(Portland.shape)
Portland.head()

(7254, 3)


Unnamed: 0,Date,Portland,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [88]:
# Sandy Hook table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Sandy_Hook = Raw_Master_df[['Date', 'Sandy Hook', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Sandy_Hook.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Sandy_Hook.to_csv(output_file, index=False)
print(Sandy_Hook.shape)
Sandy_Hook.head()

(7202, 3)


Unnamed: 0,Date,Sandy Hook,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [89]:
# Sewells Point table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Sewells_Point = Raw_Master_df[['Date', 'Sewells Point', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Sewells_Point.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Sewells_Point.to_csv(output_file, index=False)
print(Sewells_Point.shape)
Sewells_Point.head()

(7290, 3)


Unnamed: 0,Date,Sewells Point,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [90]:
# The Battery table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
The_Battery = Raw_Master_df[['Date', 'The Battery', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "The_Battery.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
The_Battery.to_csv(output_file, index=False)
print(The_Battery.shape)
The_Battery.head()

(6892, 3)


Unnamed: 0,Date,The Battery,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [91]:
# Washington table: the 'Date', station anomaly and 'Map' columns, keeping
# only rows with no missing value.
# dropna() returns a new frame instead of mutating a selection of
# Raw_Master_df in place, so no SettingWithCopyWarning filter is needed.
Washington = Raw_Master_df[['Date', 'Washington', 'Map']].dropna()

output_dir = 'cleaned_dataset'
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "Washington.csv")
# NOTE(review): 'Map' appears to hold array objects, which to_csv writes as
# their string form; numpy abbreviates large arrays with '...', so the saved
# CSV may not round-trip the full map - confirm before relying on it.
Washington.to_csv(output_file, index=False)
print(Washington.shape)
Washington.head()

(6912, 3)


Unnamed: 0,Date,Washington,Map
0,1993-01-01,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
