# Project for Don Bowes 2023-01-12

This notebook takes an input CSV, *Turtle_Daily.csv*, and produces a single row for each date in the data set that has flow data. Each row contains two columns: the date and the corresponding flow value.

In [None]:
# Load the file and confirm
import pandas as pd
import re
from typing import List
# Read the input CSV from the notebook's working directory into a DataFrame.
df = pd.read_csv("Turtle_Daily.csv")

# Confirm the load by previewing the first few rows (rendered as the cell's output).
df.head()

In [None]:
# Names of the columns of interest: YEAR, MONTH, and the daily columns FLOW1 .. FLOW31.
# `day_cols` holds just the 31 per-day column names; `cols_to_keep` prefixes them with
# the two identifier columns.
day_cols: List[str] = ["FLOW" + str(day) for day in range(1, 32)]
cols_to_keep: List[str] = ["YEAR", "MONTH", *day_cols]

In [None]:
# Reshape the wide table (columns YEAR, MONTH, and one FLOW1..FLOW31 column per day of
# the month) into a long table indexed by a datetime "Date" with a single "Flow" column
# holding the corresponding value from the original DataFrame.
#
# Expects `df` to contain YEAR and MONTH plus every column named in `day_cols`.

# First do the melt: each (YEAR, MONTH, FLOWn) cell becomes its own row.
# dropna() discards rows with missing values, so only dates that have flow data remain.
flow_long = pd.melt(
    df,
    id_vars=['YEAR', 'MONTH'],
    value_vars=day_cols,
    var_name='FLOW',
    value_name='Flow',
).dropna()

# Assemble the date from numeric year/month/day parts rather than concatenating strings.
# The integer cast matters: if YEAR or MONTH were loaded as floats (pandas does this when
# a column contains blank cells), string concatenation would produce text such as
# "2023.0-1.0-01", which cannot be parsed as a %Y-%m-%d date.
date_parts = pd.DataFrame(
    {
        'year': flow_long['YEAR'].astype('int64'),
        'month': flow_long['MONTH'].astype('int64'),
        # The day of the month is the numeric suffix of the FLOWn column name.
        'day': flow_long['FLOW'].str.replace('FLOW', '', regex=False).astype('int64'),
    }
)

# Attach the Date column, drop the now-unused helper columns, and index by Date.
# pd.to_datetime raises ValueError if a row describes an impossible calendar date,
# so bad input is surfaced here instead of being written to the output.
# Built as a new frame (no inplace=True) so the intermediate data is never mutated.
df_melted = (
    flow_long
    .assign(Date=pd.to_datetime(date_parts))
    .drop(columns=['YEAR', 'MONTH', 'FLOW'])
    .set_index('Date')
)

# Preview
df_melted.head()

In [None]:
# Write the result to disk: order the rows by Date first, then save as a UTF-8 CSV.
flows_by_date = df_melted.sort_values(by=['Date'])
flows_by_date.to_csv("output.csv", encoding='utf-8')