In [11]:
import pandas as pd
import numpy as np
from datetime import datetime
import math
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
from itertools import combinations
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression

In [19]:
def Clean_date(df):
    """
    Convert the "Collection_Date" column of ``df`` to datetime values.

    Strings must follow the '%Y-%m-%d %H:%M:%S' layout; a string that does
    not match raises ValueError. Missing entries become NaT, and a column
    that is already datetime is left as it is, so the function can safely be
    called more than once on the same frame (e.g. when a cell is re-run).

    The column is overwritten on ``df`` itself and the same DataFrame object
    is returned.
    """
    # pd.to_datetime (unlike element-wise datetime.strptime) tolerates
    # missing values and input that has already been converted.
    df["Collection_Date"] = pd.to_datetime(df["Collection_Date"],
                                           format='%Y-%m-%d %H:%M:%S')
    return df

In [20]:
# Station identifiers.
Station_list = [
    "POLESOUT",
    "KISSR0.0",
    "LZ2",
    "S133",
    "TCNS228",
    "S135",
    "FEBIN",
    "MBOXSOU",
    "MH24000",
    "FEBOUT",
    "L005",
    "L008",
    "LZ40",
    "L004",
    "S308C",
    "PALMOUNT",
    "S169",
    "S236",
    "POLE3S",
    "RITTAE2",
    "LZ2FA",
    "L007",
    "PELBAY3",
    "L006",
    "LZ30",
]

# Test names: nitrogen, phosphorus and carbon tests, silica, and
# chlorophyll-a (LC).
Variable_list = [
    "NITRATE+NITRITE-N",
    "NITRITE-N",
    "AMMONIA-N",
    "KJELDAHL NITROGEN, TOTAL",
    "PHOSPHATE, ORTHO AS P",
    "PHOSPHATE, TOTAL AS P",
    "NITRATE-N",
    "SILICA",
    "CARBON, TOTAL ORGANIC",
    "CARBON, DISSOLVED ORGANIC",
    "TOTAL NITROGEN",
    "PHOSPHATE, DISSOLVED AS P",
    "KJELDAHL NITROGEN, DIS",
    "CARBON, TOTAL",
    "CARBON, TOTAL INORGANIC",
    "NITROGEN, TOTAL DISSOLVED",
    "CHLOROPHYLL-A(LC)",
]

# Further test names, mostly chlorophyll / pigment tests.
Others = [
    "CHLOROPHYLL-A",
    "PHEOPHYTIN",
    "CHLOROPHYLL-A, CORRECTED",
    "CHLOROPHYLL-C",
    "CAROTENOIDS",
    "CHLOROPHYLL-B",
    "CHLOROPHYLL-A(LC)",
    "PHEOPHYTIN-A(LC)",
    "CHLOROPHYLL-B(LC)",
    "RESP. PLANKTONIC",
]

# Subsets of Variable_list grouped by element.
Nitrogen_list = [
    "NITRATE+NITRITE-N",
    "NITRITE-N",
    "AMMONIA-N",
    "KJELDAHL NITROGEN, TOTAL",
    "TOTAL NITROGEN",
    "NITRATE-N",
    "KJELDAHL NITROGEN, DIS",
    "NITROGEN, TOTAL DISSOLVED",
]
Phosphorus_list = [
    "PHOSPHATE, ORTHO AS P",
    "PHOSPHATE, TOTAL AS P",
    "PHOSPHATE, DISSOLVED AS P",
]
Carbon_list = [
    "CARBON, TOTAL ORGANIC",
    "CARBON, DISSOLVED ORGANIC",
    "CARBON, TOTAL",
    "CARBON, TOTAL INORGANIC",
]

In [21]:
def Data_clean_up_3(data, monthly_average=False, start_year=2000, end_year=2019):
    """
    Extract the two chlorophyll-a test series and combine them into one.

    Only rows whose "Test Name" is 'CHLOROPHYLL-A(LC)' or
    'CHLOROPHYLL-A, CORRECTED' are used; rows with a missing or negative
    "Value" (or a missing date) are discarded.
    (The original note describes this as a variant of Data_clean_up_1, which
    is not defined in this part of the notebook.)

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table with at least the columns "Collection_Date", "Test Name"
        and "Value". The caller's frame is not modified.
    monthly_average : bool, default False
        If True, return one row per calendar month ("MonthYear" column, first
        day of the month) holding the mean of each test in that month.
        If False, return one row per sampling timestamp
        ("Collection_Date" column).
    start_year, end_year : int, default 2000 and 2019
        First and last calendar year (both inclusive) of the monthly grid.
        Only used when ``monthly_average`` is True; samples outside the
        window are ignored.

    Returns
    -------
    pandas.DataFrame
        The time column, one column per chlorophyll-a test and, when at
        least one valid measurement exists, "FINAL CHLOROPHYLL-A": the
        row-wise mean of the two test columns, ignoring missing values.
    """
    chla_columns = ['CHLOROPHYLL-A(LC)', 'CHLOROPHYLL-A, CORRECTED']

    # Clean_date overwrites "Collection_Date" on the frame it receives, so
    # hand it a copy; the same raw frame can then be cleaned repeatedly.
    data = Clean_date(data.copy())

    # `>= 0` is False for NaN, so this also removes missing values.
    keep = (
        data["Test Name"].isin(chla_columns)
        & data["Collection_Date"].notna()
        & (data["Value"] >= 0)
    )
    valid = data.loc[keep, ["Collection_Date", "Test Name", "Value"]]

    if monthly_average:
        months = pd.date_range(start=pd.Timestamp(start_year, 1, 1),
                               end=pd.Timestamp(end_year, 12, 1),
                               freq="MS")
        # First day of the month each sample belongs to.
        sample_month = (pd.to_datetime(valid["Collection_Date"])
                        .dt.to_period("M").dt.to_timestamp())

        clean_data = pd.DataFrame({"MonthYear": months})
        for variable in chla_columns:
            is_variable = valid["Test Name"] == variable
            monthly_mean = (valid.loc[is_variable, "Value"]
                            .groupby(sample_month[is_variable]).mean())
            # Months without a valid sample come out as NaN.
            clean_data[variable] = monthly_mean.reindex(months).to_numpy()

    else:
        per_variable = [
            valid.loc[valid["Test Name"] == variable, ["Collection_Date", "Value"]]
                 .rename(columns={"Value": variable})
            for variable in chla_columns
        ]
        clean_data = per_variable[0]
        for other in per_variable[1:]:
            # Outer join: keep timestamps measured by only one of the
            # methods, as the monthly branch does. A left join would drop
            # every date that has no 'CHLOROPHYLL-A(LC)' value.
            clean_data = clean_data.merge(other, on=["Collection_Date"], how='outer')

    # The combined column is only added when there is something to combine.
    if clean_data[chla_columns].notna().to_numpy().any():
        # mean(axis=1) skips NaN like np.nanmean, but returns NaN for an
        # all-missing row without emitting a RuntimeWarning.
        clean_data["FINAL CHLOROPHYLL-A"] = clean_data[chla_columns].mean(axis=1)
    return clean_data

In [22]:
# Read the CSV export of one station, clean it sample by sample (no monthly
# averaging) and put the rows in chronological order.
station = "POLESOUT"
data = (
    Data_clean_up_3(pd.read_csv(f"{station}.csv"), monthly_average=False)
    .sort_values(by='Collection_Date', ascending=True)
)

In [23]:
# Display the cleaned frame for the selected station (bare last expression,
# so the notebook renders it as a table).
data

Unnamed: 0,Collection_Date,CHLOROPHYLL-A(LC),"CHLOROPHYLL-A, CORRECTED",FINAL CHLOROPHYLL-A
0,2010-10-20 12:01:00,6.1,7.0,6.55
1,2010-12-15 14:50:00,7.4,8.0,7.70
2,2011-01-04 11:14:00,20.5,20.0,20.25
3,2011-02-14 11:05:00,12.6,15.0,13.80
4,2011-03-08 13:26:00,12.4,19.0,15.70
...,...,...,...,...
113,2020-04-07 13:39:00,34.3,,34.30
114,2020-05-05 13:14:00,18.8,,18.80
115,2020-05-20 13:23:00,35.0,,35.00
116,2020-06-02 14:15:00,46.5,,46.50
