### Import libraries

In [58]:
import datetime

import numpy as np
import pandas as pd

### Read input data

In [63]:
# Location of the raw comma-separated export.
path = 'datasets/HOT-DOGS_timeSerie.txt'

# Names given to the seven columns that are kept, in file order.
columns = ['id', 'julian', 'pression', 'temperature', 'salinity', 'fluor', 'nitrate']

# The first two lines of the file are skipped and no header row is parsed,
# so pandas labels the columns 0..7. Column 7 is discarded before naming the
# rest (presumably an empty column caused by a trailing delimiter -- confirm
# against the raw file).
data = pd.read_csv(
    path,
    header=None,
    skiprows=2,
    sep=",",
    skipinitialspace=True,
).drop(columns=7)
data.columns = columns

data.head()

Unnamed: 0,id,julian,pression,temperature,salinity,fluor,nitrate
0,1,30,10.0,26.2697,35.2109,0.056,-9.0
1,2,62,10.0,25.6832,34.9849,0.0793,-9.0
2,3,99,10.0,24.6181,35.0315,0.2356,-9.0
3,4,148,10.0,23.5353,34.8368,0.1414,-9.0
4,5,177,10.0,24.4434,34.6834,-9.0,-9.0


### Mark not available values

In the raw data, every `-9.0` value marks a measurement that was not available, so these values are replaced with `NaN`.

In [64]:
# -9.0 is the placeholder for unavailable measurements; swap it for NaN so
# pandas treats those cells as missing. The result is bound back to `data`
# instead of mutating the existing frame in place.
missing_marker = -9.0
data = data.replace(to_replace=missing_marker, value=np.nan)

data.head()

Unnamed: 0,id,julian,pression,temperature,salinity,fluor,nitrate
0,1,30,10.0,26.2697,35.2109,0.056,
1,2,62,10.0,25.6832,34.9849,0.0793,
2,3,99,10.0,24.6181,35.0315,0.2356,
3,4,148,10.0,23.5353,34.8368,0.1414,
4,5,177,10.0,24.4434,34.6834,,


### Convert dates

The `julian` column counts the days elapsed since the initial day, `1988-10-01`, so each value is converted to a calendar date by adding it to that origin.

In [65]:
# First day of the series: a `julian` value of 0 maps to this date.
ORIGIN = datetime.date(1988, 10, 1)


def julian_to_date(julian):
    """Convert a day offset from ``ORIGIN`` into a calendar date.

    Parameters
    ----------
    julian : int or float
        Number of days elapsed since ``ORIGIN``. NumPy scalars are accepted
        as well as built-in numbers.

    Returns
    -------
    datetime.date
        ``ORIGIN`` shifted forward by ``julian`` days.
    """
    # datetime.timedelta raises TypeError for NumPy integer scalars (what
    # indexing an integer pandas column yields); float() normalises them
    # while leaving built-in ints and floats with the same result as before.
    return ORIGIN + datetime.timedelta(days=float(julian))

# Vectorised conversion of the whole `julian` column: every day offset is
# added to ORIGIN in a single pandas operation instead of calling
# julian_to_date once per row through DataFrame.apply(axis=1).
# This no longer depends on the row-wise upcast to float that apply(axis=1)
# performs, and a missing offset propagates as a missing date (NaT) rather
# than raising.
day_offsets = pd.to_timedelta(data['julian'], unit='D')

# `.dt.date` keeps the column as plain datetime.date objects, matching what
# the per-row conversion produced.
data = data.assign(date=(pd.Timestamp(ORIGIN) + day_offsets).dt.date)

data.head()

Unnamed: 0,id,julian,pression,temperature,salinity,fluor,nitrate,date
0,1,30,10.0,26.2697,35.2109,0.056,,1988-10-31
1,2,62,10.0,25.6832,34.9849,0.0793,,1988-12-02
2,3,99,10.0,24.6181,35.0315,0.2356,,1989-01-08
3,4,148,10.0,23.5353,34.8368,0.1414,,1989-02-26
4,5,177,10.0,24.4434,34.6834,,,1989-03-27


### Build output dataset

In [66]:
# Destination file and the two columns exported to it, in output order.
output_path = 'datasets/TemperatureByDate.csv'
columns = ['date', 'temperature']

# Select the exported columns first, then write them without the row index.
data[columns].to_csv(output_path, sep=',', index=False)