# Loading Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from hashlib import md5
from IPython.display import display, Markdown, Latex, Code
from scipy.special import logsumexp
import os.path


# Preprocessing data

In [None]:

def _normalize_water_levels(data, mwdata):
    """Return a date-indexed copy of *data* with levels relative to yearly MW.

    Args:
        data: Raw observations with the columns 'Vuosi', 'Kk', 'Pv',
            'Klo' (an "%H:%M" string) and 'Vedenkorkeus (mm)'.
        mwdata: Table with a 'Vuosi' column; its second column is used as
            the value subtracted from every observation of that year.

    Returns:
        A new DataFrame indexed by 'date' in which 'Vedenkorkeus (mm)' has
        been replaced by 'water_level' (observation minus that year's MW
        value) and a fractional 'year' column has been added.

    Raises:
        IndexError: If an observation's year has no row in *mwdata*.
    """
    n_data = data.copy()

    # Build a year -> MW lookup once instead of filtering mwdata for every
    # row. When a year occurs several times the first row wins.
    mw_by_year = pd.Series(mwdata.iloc[:, 1].values, index=mwdata['Vuosi'].values)
    mw_by_year = mw_by_year[~mw_by_year.index.duplicated(keep='first')]

    unknown = ~n_data['Vuosi'].isin(mw_by_year.index)
    if unknown.any():
        missing_years = sorted(n_data.loc[unknown, 'Vuosi'].unique())
        raise IndexError(f"No MW value available for year(s): {missing_years}")

    # Address the column by name so the result does not depend on the
    # column order of the input file.
    n_data['Vedenkorkeus (mm)'] = (
        n_data['Vedenkorkeus (mm)'] - n_data['Vuosi'].map(mw_by_year)
    )

    # Only the hour of 'Klo' is used; minutes are discarded.
    hours = pd.to_datetime(n_data['Klo'], format="%H:%M").dt.hour
    n_data['date'] = pd.to_datetime(pd.DataFrame({
        'year': n_data['Vuosi'],
        'month': n_data['Kk'],
        'day': n_data['Pv'],
        'hour': hours,
    }))
    n_data.set_index('date', inplace=True)

    # NOTE: every year is treated as 365 days long, so the last day of a
    # leap year maps to a value >= year + 1. Kept as is so that previously
    # written cache files stay consistent with freshly computed ones.
    stamps = n_data.index
    n_data['year'] = (
        stamps.year + (stamps.dayofyear - 1) / 365 + stamps.hour / (365 * 24)
    )

    return n_data.rename(columns={'Vedenkorkeus (mm)': 'water_level'})


def process_data(data_path, mw_path, debug=False):
    """Load water-level data, normalize it against MW and aggregate it.

    The normalized table is cached next to the input as
    ``f"{data_path}_n.csv"``. If that file already exists it is loaded
    instead of being recomputed; the cache is keyed on *data_path* only, so
    it must be deleted by hand after the input or the MW file changes.

    Args:
        data_path: Path of the CSV file with the raw observations (columns
            'Vuosi', 'Kk', 'Pv', 'Klo', 'Vedenkorkeus (mm)').
        mw_path: Path of the CSV file with a 'Vuosi' column and the yearly
            MW value in its second column.
        debug: When true, show matplotlib plots of the MW table, the
            normalized series and the yearly/monthly means.

    Returns:
        A tuple ``(yearly_means, monthly_means)`` of DataFrames holding the
        means of the numeric columns per calendar year / calendar month,
        indexed by period end, with periods that contain NaN dropped.

    Raises:
        IndexError: If an observation's year has no row in the MW file.
    """
    print(f"Processing data from files: {data_path}, {mw_path}")

    data = pd.read_csv(data_path)
    mwdata = pd.read_csv(mw_path)

    if debug:
        plt.title('MW data')
        plt.plot(mwdata.values[:, 0], mwdata.values[:, 1])
        plt.show()

    # Normalize data (or reuse the cached result of an earlier run)
    normalized_data_path = f"{data_path}_n.csv"
    if os.path.isfile(normalized_data_path):
        n_data = pd.read_csv(normalized_data_path)
        n_data['date'] = pd.to_datetime(n_data['date'])
        n_data.set_index('date', inplace=True)
        print(f"Read normalized data from {normalized_data_path}")
    else:
        print(f"Normalizing {data_path}")
        n_data = _normalize_water_levels(data, mwdata)
        n_data.to_csv(normalized_data_path)
        print(f"\nWrote output to {normalized_data_path}")

    # Offset objects are used instead of the 'M'/'Y' string aliases because
    # the accepted spelling of those aliases differs between pandas
    # versions. numeric_only=True skips text columns such as 'Klo', which
    # cannot be averaged.
    monthly_means = (
        n_data.resample(pd.offsets.MonthEnd()).mean(numeric_only=True).dropna()
    )
    yearly_means = (
        n_data.resample(pd.offsets.YearEnd()).mean(numeric_only=True).dropna()
    )

    if debug:
        plt.title('Data relative to MW')
        plt.plot(n_data['year'], n_data['water_level'])
        plt.show()

        plt.title('Year means')
        plt.plot(yearly_means['year'], yearly_means['water_level'])
        plt.show()

        plt.title('Month means')
        plt.plot(monthly_means['year'], monthly_means['water_level'])
        plt.show()

    return yearly_means, monthly_means
