# EDA
Organizing the raw data into a single master data sheet.

In [152]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import xarray as xr
import numpy as np
import netCDF4
from matplotlib import pyplot as plt
from collections import defaultdict
import sklearn
from sklearn import linear_model
import numpy
import gzip
import math


In [None]:
# Calendar bounds used when turning dates into features.
# MIN_YEAR and MAX_YEAR are read by preprocess_date to rescale the year
# relative to this range.
# NOTE(review): MIN_DAY, MIN_MONTH, MAX_DAY and MAX_MONTH are not referenced
# by any cell visible in this notebook — confirm they are still needed.
MIN_YEAR=1993
MIN_DAY=1
MIN_MONTH=1

MAX_YEAR=2013
MAX_DAY=31
MAX_MONTH=12

In [154]:
def preprocess_date(data_str):
    """Encode a ``YYYY-MM-DD`` date string as a 5-element feature vector.

    The year is rescaled linearly with the notebook-level ``MIN_YEAR`` and
    ``MAX_YEAR`` constants. Month and day are each mapped to a sine/cosine
    pair so that the last value of a cycle sits next to the first one.

    Parameters
    ----------
    data_str : str
        Date text of exactly 10 characters: year in characters 0-3, month in
        characters 5-6 and day in characters 8-9.

    Returns
    -------
    numpy.ndarray or None
        ``[year_norm, month_sin, month_cos, day_sin, day_cos]``, or ``None``
        when the input is rejected. A rejection prints the reason instead of
        raising, so the function can be used with ``Series.apply``.
    """
    try:
        # Non-string cells (NaN, None, already-encoded arrays) have no usable
        # length/slices; reject them through the same path as malformed text
        # instead of letting len() raise an uncaught TypeError.
        if not isinstance(data_str, str) or len(data_str) != 10:
            raise ValueError(f"Invalid date format: {data_str}")
        year = int(data_str[:4])
        month = int(data_str[5:7])
        day = int(data_str[8:10])
        # The cyclic encodings below assume a 12-month / 31-day cycle; values
        # outside it would be encoded silently as a wrong position.
        if not (1 <= month <= 12 and 1 <= day <= 31):
            raise ValueError(f"Invalid date format: {data_str}")
        year_norm = (year - MIN_YEAR) / (MAX_YEAR - MIN_YEAR)
        month_angle = 2 * np.pi * (month - 1) / 12
        day_angle = 2 * np.pi * (day - 1) / 31
        return np.array([
            year_norm,
            np.sin(month_angle),
            np.cos(month_angle),
            np.sin(day_angle),
            np.cos(day_angle),
        ])
    except ValueError as e:
        print(e)
        return None

In [155]:
folder_path = 'dataset/Training_Anomalies_Station Data'
coordinate_columns = ['longitude', 'latitude']

# Strip the coordinate columns from every station CSV.
# NOTE: this overwrites the raw files under `folder_path` in place.
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):  # Ensure it's a CSV file
        continue
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)

    # Only rewrite files that still contain a coordinate column. A file that
    # is already clean is left untouched, so re-running this cell does not
    # re-serialise the raw data through a read_csv/to_csv round trip.
    columns_present = [col for col in coordinate_columns if col in df.columns]
    if not columns_present:
        continue

    df = df.drop(columns=columns_present)
    df.to_csv(file_path, index=False)

In [156]:
folder_path = 'dataset/Training_Anomalies_Station Data'

# Read every station CSV, keeping only the three columns used below.
all_data = [
    pd.read_csv(os.path.join(folder_path, csv_name))[['t', 'anomaly', 'location']]
    for csv_name in os.listdir(folder_path)
    if csv_name.endswith('.csv')
]

# Stack the stations and average any duplicated (t, location) pairs so that
# the pivot below sees a single value per cell.
combined_df = pd.concat(all_data)
combined_df = combined_df.groupby(['t', 'location']).mean().reset_index()

# Wide layout: one row per time stamp, one column per location; the index is
# labelled 'Date'.
result = (
    combined_df
    .pivot(index='t', columns='location', values='anomaly')
    .rename_axis('Date')
)
result.to_csv('Station_Anomaly.csv')

# Read the file back so that 'Date' becomes a regular column.
Station_Anomaly = pd.read_csv('Station_Anomaly.csv')
Station_Anomaly.head()

Unnamed: 0,Date,Atlantic City,Baltimore,Eastport,Fort Pulaski,Lewes,New London,Newport,Portland,Sandy Hook,Sewells Point,The Battery,Washington
0,1993-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1993-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1993-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1993-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1993-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
directory = "dataset/Copernicus_ENA_Satelite_Maps_Training_Data"
MASK_FILL_VALUE = -10  # value written into masked cells of the 'sla' grid
data = []

# sorted() makes the row order of Map_df independent of the order in which the
# file system happens to list the directory.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".nc"):
        continue

    # The date is taken from the third "_"-separated token of the file name
    # and must be 8 characters (YYYYMMDD); other files are skipped.
    name_parts = filename.split("_")
    if len(name_parts) < 3 or len(name_parts[2]) != 8:
        continue
    date_str = name_parts[2]
    formatted_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"

    file_path = os.path.join(directory, filename)

    # The context manager closes the file handle once the grid is in memory,
    # so handles do not accumulate as the loop walks through the files.
    with netCDF4.Dataset(file_path, mode="r") as dataset:
        # First entry along the leading axis of the 'sla' variable, with the
        # masked cells replaced by the fill value.
        sla = dataset.variables["sla"][:][0]
        sla = sla.filled(MASK_FILL_VALUE)

    data.append({'Date': formatted_date, 'Map': sla})

Map_df = pd.DataFrame(data)
Map_df.head()

Unnamed: 0,Date,Map
0,1993-01-01,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [158]:
# Join the station anomalies with the maps on the shared 'Date' column
# (default inner join: only dates present in both frames are kept).
map_columns = Map_df[['Date', 'Map']]
Raw_Master_df = pd.merge(Station_Anomaly, map_columns, on='Date')
print(Raw_Master_df.shape)
Raw_Master_df.head()

(7302, 14)


Unnamed: 0,Date,Atlantic City,Baltimore,Eastport,Fort Pulaski,Lewes,New London,Newport,Portland,Sandy Hook,Sewells Point,The Battery,Washington,Map
0,1993-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,1993-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,1993-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,1993-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,1993-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [159]:
# Replace each date string with its feature vector. Only string entries are
# converted, so re-running this cell leaves already-encoded rows untouched
# instead of turning them into None.
Raw_Master_df['Date'] = Raw_Master_df['Date'].apply(
    lambda value: preprocess_date(value) if isinstance(value, str) else value
)

In [161]:
def LogRegProc(City_df, City_name):
    """Build the feature matrix and target array for one station.

    Parameters
    ----------
    City_df : pandas.DataFrame
        Frame with a ``'Date'`` column, a ``'Map'`` column and the column
        named by ``City_name``. Each ``'Date'`` entry must be a 1-D array and
        each ``'Map'`` entry an array that can be flattened.
    City_name : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is the array built from one feature vector per row
        (the ``'Date'`` entry followed by the flattened ``'Map'`` entry) and
        ``y`` is the target column as a single-column 2-D array.
    """
    date_map_pairs = City_df[['Date', 'Map']].to_numpy()
    targets = City_df[[City_name]].to_numpy()
    feature_rows = [
        np.concatenate((date_vec, grid.flatten()))
        for date_vec, grid in date_map_pairs
    ]
    return np.array(feature_rows), targets

In [176]:
def scoreStation(df, name, train_frac=0.8, max_iter=100):
    """Fit a logistic regression for one station and score it on held-out rows.

    Rows are split in their existing order: the first ``train_frac`` share is
    used for fitting and the remainder for scoring (no shuffling).

    Parameters
    ----------
    df : pandas.DataFrame
        Station frame accepted by ``LogRegProc``.
    name : str
        Name of the station's target column in ``df``.
    train_frac : float, optional
        Share of the rows used for training. Defaults to 0.8.
    max_iter : int, optional
        Iteration cap passed to ``LogisticRegression``. The default of 100
        equals scikit-learn's own default; raise it if the solver reports
        that the iteration limit was reached.

    Returns
    -------
    float
        Value of ``LogisticRegression.score`` on the held-out rows.
    """
    X, y = LogRegProc(df, name)
    # LogRegProc returns y as a single-column 2-D array; scikit-learn expects
    # 1-D labels and otherwise emits the `column_or_1d` conversion warning.
    y = y.ravel()

    split = int(len(X) * train_frac)
    xTrain, xTest = X[:split], X[split:]
    yTrain, yTest = y[:split], y[split:]

    mod = sklearn.linear_model.LogisticRegression(
        fit_intercept=False, max_iter=max_iter
    )
    mod.fit(xTrain, yTrain)
    return mod.score(xTest, yTest)

In [180]:
# Station frame: date features, the station's anomaly column and the map,
# keeping only rows without missing values. Chaining .dropna() returns a new
# frame, so no in-place edit of a slice (and no warning filter) is needed.
Atlantic_City = Raw_Master_df[['Date', 'Atlantic City', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Atlantic_City.csv")
# NOTE(review): 'Date' and 'Map' hold arrays, which to_csv writes as text —
# confirm the resulting file can be read back as intended.
Atlantic_City.to_csv(output_file, index=False)
Atlantic_City.head()

Unnamed: 0,Date,Atlantic City,Map
0,"[0.0, 0.0, 1.0, 0.0, 1.0]",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,"[0.0, 0.0, 1.0, 0.20129852008866006, 0.9795299...",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,"[0.0, 0.0, 1.0, 0.39435585511331855, 0.9189578...",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,"[0.0, 0.0, 1.0, 0.5712682150947923, 0.82076344...",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,"[0.0, 0.0, 1.0, 0.72479278722912, 0.6889669190...",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [165]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Baltimore = Raw_Master_df[['Date', 'Baltimore', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Baltimore.csv")
Baltimore.to_csv(output_file, index=False)

In [166]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Eastport = Raw_Master_df[['Date', 'Eastport', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Eastport.csv")
Eastport.to_csv(output_file, index=False)

In [167]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Fort_Pulaski = Raw_Master_df[['Date', 'Fort Pulaski', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
# NOTE(review): this file name uses a space while the other multi-word
# stations use an underscore; kept as-is because readers of the file may
# depend on the exact name.
output_file = os.path.join(output_dir, "Fort Pulaski.csv")
Fort_Pulaski.to_csv(output_file, index=False)

In [168]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Lewes = Raw_Master_df[['Date', 'Lewes', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Lewes.csv")
Lewes.to_csv(output_file, index=False)

In [169]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
New_London = Raw_Master_df[['Date', 'New London', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "New_London.csv")
New_London.to_csv(output_file, index=False)

In [170]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Newport = Raw_Master_df[['Date', 'Newport', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Newport.csv")
Newport.to_csv(output_file, index=False)

In [171]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Portland = Raw_Master_df[['Date', 'Portland', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Portland.csv")
Portland.to_csv(output_file, index=False)

In [172]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Sandy_Hook = Raw_Master_df[['Date', 'Sandy Hook', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Sandy_Hook.csv")
Sandy_Hook.to_csv(output_file, index=False)

In [173]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Sewells_Point = Raw_Master_df[['Date', 'Sewells Point', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Sewells_Point.csv")
Sewells_Point.to_csv(output_file, index=False)

In [174]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
The_Battery = Raw_Master_df[['Date', 'The Battery', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "The_Battery.csv")
The_Battery.to_csv(output_file, index=False)

In [175]:
# Station frame without missing values; .dropna() is chained so that a new
# frame is produced instead of editing a slice in place behind a warning filter.
Washington = Raw_Master_df[['Date', 'Washington', 'Map']].dropna()
output_dir = 'cleaned_dataset'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
output_file = os.path.join(output_dir, "Washington.csv")
Washington.to_csv(output_file, index=False)
print(Washington.shape)
Washington.head()

(6912, 3)


Unnamed: 0,Date,Washington,Map
0,"[0.0, 0.0, 1.0, 0.0, 1.0]",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
1,"[0.0, 0.0, 1.0, 0.20129852008866006, 0.9795299...",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2,"[0.0, 0.0, 1.0, 0.39435585511331855, 0.9189578...",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
3,"[0.0, 0.0, 1.0, 0.5712682150947923, 0.82076344...",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
4,"[0.0, 0.0, 1.0, 0.72479278722912, 0.6889669190...",0.0,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [177]:
# Station name (the column label used in Raw_Master_df) paired with the
# cleaned frame built for that station, in the same order as the columns.
stations_dict = dict([
    ('Atlantic City', Atlantic_City),
    ('Baltimore', Baltimore),
    ('Eastport', Eastport),
    ('Fort Pulaski', Fort_Pulaski),
    ('Lewes', Lewes),
    ('New London', New_London),
    ('Newport', Newport),
    ('Portland', Portland),
    ('Sandy Hook', Sandy_Hook),
    ('Sewells Point', Sewells_Point),
    ('The Battery', The_Battery),
    ('Washington', Washington),
])

# Starts empty; one score per station is stored here by the scoring loop.
score_dict = dict()

In [178]:
# Train and score one model per station. scoreStation refits the model on
# every call, so it is invoked exactly once per station.
for k, v in stations_dict.items():
    score_dict[k] = scoreStation(v, k)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

{'Atlantic City': 0.9200283085633404, 'Baltimore': 0.9554183813443072, 'Eastport': 0.906030855539972, 'Fort Pulaski': 0.975223675154852, 'Lewes': 0.931958762886598, 'New London': 0.9924033149171271, 'Newport': 0.9986216402481047, 'Portland': 0.9634734665747761, 'Sandy Hook': 0.922970159611381, 'Sewells Point': 0.9821673525377229, 'The Battery': 0.9862218999274837, 'Washington': 0.9045553145336226}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [179]:
# Show the score stored for each station by the scoring loop.
print(score_dict)

{'Atlantic City': 0.9200283085633404, 'Baltimore': 0.9554183813443072, 'Eastport': 0.906030855539972, 'Fort Pulaski': 0.975223675154852, 'Lewes': 0.931958762886598, 'New London': 0.9924033149171271, 'Newport': 0.9986216402481047, 'Portland': 0.9634734665747761, 'Sandy Hook': 0.922970159611381, 'Sewells Point': 0.9821673525377229, 'The Battery': 0.9862218999274837, 'Washington': 0.9045553145336226}


In [None]:
class Model:
    """Minimal ``fit`` / ``predict`` wrapper around a logistic regression.

    NOTE(review): the estimator lives in an attribute called ``kmeans`` even
    though it is a ``LogisticRegression``. The name is left unchanged because
    code outside this cell may read the attribute.
    """

    def __init__(self):
        """Create the wrapped estimator, configured without an intercept."""
        estimator = sklearn.linear_model.LogisticRegression(fit_intercept=False)
        self.kmeans = estimator

    def fit(self, X, y):
        """Train the wrapped estimator on features ``X`` and targets ``y``."""
        self.kmeans.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        predictions = self.kmeans.predict(X)
        return predictions