# Working with Dates

> with reference to EpiRHandbook Chapter 9

In [None]:
#|hide
from nbdev import *
from fastcore.test import *
from fastcore.utils import *

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.style as style

In [None]:
#|hide
# To use colourbind colour schemes (optional).  To get the color, see https://github.com/matplotlib/matplotlib/blob/main/lib/matplotlib/mpl-data/stylelib/tableau-colorblind10.mplstyle

In [None]:
#|hide
style.use('tableau-colorblind10')

In [None]:
#|hide
# set display options in pandas
pd.set_option('display.max_columns', 100)  
pd.set_option('display.max_rows',100)
pd.set_option('display.width', 1000)

## Data

Import data from https://github.com/appliedepi/epiRhandbook_eng/blob/master/data/linelist_cleaned.xlsx and then save under "epiRhandbook_data" folder.  
Note: Installation of "openpyxl" is required.

In [None]:
linelist = pd.read_excel('../epiRhandbook_data/linelist_cleaned.xlsx')
linelist.head(3)

Unnamed: 0,case_id,generation,date_infection,date_onset,date_hospitalisation,date_outcome,outcome,gender,age,age_unit,age_years,age_cat,age_cat5,hospital,lon,lat,infector,source,wt_kg,ht_cm,ct_blood,fever,chills,cough,aches,vomit,temp,time_admission,bmi,days_onset_hosp
0,5fe599,4,2014-05-08,2014-05-13,2014-05-15,NaT,,m,2.0,years,2.0,0-4,0-4,Other,-13.215735,8.468973,f547d6,other,27,48,22,no,no,yes,no,yes,36.8,,117.1875,2.0
1,8689b7,4,NaT,2014-05-13,2014-05-14,2014-05-18,Recover,f,3.0,years,3.0,0-4,0-4,Missing,-13.215234,8.451719,,,25,59,22,,,,,,36.9,09:36,71.818443,1.0
2,11f8ea,2,NaT,2014-05-16,2014-05-18,2014-05-30,Recover,m,56.0,years,56.0,50-69,55-59,St. Mark's Maternity Hospital (SMMH),-13.212911,8.464817,,,91,238,21,,,,,,36.9,16:48,16.06525,2.0


## Current Date

Print system date and time using Linux comment (starting with `!`)

In [None]:
!date

Thu  5 Jan 10:50:30 AWST 2023


## Convert to Date

### Reading date columns (use parse_dates to specify the columns if necessary)

In [None]:
df_date = pd.read_excel('../epiRhandbook_data/linelist_cleaned.xlsx', 
                        parse_dates=['date_infection', 'date_onset', 'date_hospitalisation', 'date_outcome', 'time_admission'])
df_date.head(3)

Unnamed: 0,case_id,generation,date_infection,date_onset,date_hospitalisation,date_outcome,outcome,gender,age,age_unit,age_years,age_cat,age_cat5,hospital,lon,lat,infector,source,wt_kg,ht_cm,ct_blood,fever,chills,cough,aches,vomit,temp,time_admission,bmi,days_onset_hosp
0,5fe599,4,2014-05-08,2014-05-13,2014-05-15,NaT,,m,2.0,years,2.0,0-4,0-4,Other,-13.215735,8.468973,f547d6,other,27,48,22,no,no,yes,no,yes,36.8,,117.1875,2.0
1,8689b7,4,NaT,2014-05-13,2014-05-14,2014-05-18,Recover,f,3.0,years,3.0,0-4,0-4,Missing,-13.215234,8.451719,,,25,59,22,,,,,,36.9,09:36,71.818443,1.0
2,11f8ea,2,NaT,2014-05-16,2014-05-18,2014-05-30,Recover,m,56.0,years,56.0,50-69,55-59,St. Mark's Maternity Hospital (SMMH),-13.212911,8.464817,,,91,238,21,,,,,,36.9,16:48,16.06525,2.0


### Customizing a Date Time using `dt`

In [None]:
df_dp = pd.read_excel('../epiRhandbook_data/linelist_cleaned.xlsx')
df_dp['date_infection'] = df_dp['date_infection'].dt.strftime('%Y-%m-%d')  # YYYY-MM-DD
df_dp['date_onset'] = df_dp['date_onset'].dt.strftime('%b/%d/%y')  # MMM-DD-YY
df_dp['date_hospitalisation'] = df_dp['date_hospitalisation'].dt.strftime('%a %b/%d/%Y')  # DDD DD/MM/YYYY
df_dp.head()

Unnamed: 0,case_id,generation,date_infection,date_onset,date_hospitalisation,date_outcome,outcome,gender,age,age_unit,age_years,age_cat,age_cat5,hospital,lon,lat,infector,source,wt_kg,ht_cm,ct_blood,fever,chills,cough,aches,vomit,temp,time_admission,bmi,days_onset_hosp
0,5fe599,4,2014-05-08,May/13/14,Thu May/15/2014,NaT,,m,2.0,years,2.0,0-4,0-4,Other,-13.215735,8.468973,f547d6,other,27,48,22,no,no,yes,no,yes,36.8,,117.1875,2.0
1,8689b7,4,,May/13/14,Wed May/14/2014,2014-05-18,Recover,f,3.0,years,3.0,0-4,0-4,Missing,-13.215234,8.451719,,,25,59,22,,,,,,36.9,09:36,71.818443,1.0
2,11f8ea,2,,May/16/14,Sun May/18/2014,2014-05-30,Recover,m,56.0,years,56.0,50-69,55-59,St. Mark's Maternity Hospital (SMMH),-13.212911,8.464817,,,91,238,21,,,,,,36.9,16:48,16.06525,2.0
3,b8812a,3,2014-05-04,May/18/14,Tue May/20/2014,NaT,,f,18.0,years,18.0,15-19,15-19,Port Hospital,-13.236371,8.475476,f90f5f,other,41,135,23,no,no,no,no,no,36.8,11:22,22.496571,2.0
4,893f25,3,2014-05-18,May/21/14,Thu May/22/2014,2014-05-29,Recover,m,3.0,years,3.0,0-4,0-4,Military Hospital,-13.222864,8.460824,11f8ea,other,36,71,23,no,no,yes,no,yes,36.9,12:60,71.414402,1.0


Tips: For the full list of `strftime` and `strptime` format codes, see https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

To convert a 2-digit year into a 4-digit year (all in the same century)

In [None]:
two_digit_years = ["15", "15", "16", "17"]
['20'+yy for yy in two_digit_years]

['2015', '2015', '2016', '2017']

### Combine columns to create a date column

In [None]:
dd = [1,2,3]
mm = [5,6,7]
yy = [21, 22, 23]
df = pd.DataFrame(list(zip(dd, mm, yy)), columns = ['dd', 'mm', 'yy'])
df

Unnamed: 0,dd,mm,yy
0,1,5,21
1,2,6,22
2,3,7,23


In [None]:
df['yy-mm-dd'] = pd.to_datetime(df['dd'].astype(str) + "-" + 
                                df['mm'].astype(str) + "-" + 
                                df['yy'].astype(str))
df

Unnamed: 0,dd,mm,yy,yy-mm-dd
0,1,5,21,2021-01-05
1,2,6,22,2022-02-06
2,3,7,23,2023-03-07


## Excel Dates

If the dataset you imported from Excel shows dates as numbers or characters like “41369”, add `origin = '1899-12-30'` when reading the file.

In [None]:
date = [41369, 41370, 41371]

In [None]:
df = pd.DataFrame(list(date), columns = ['Excel_date'])
df.to_excel('Excel_date.xlsx', index=False)

In [None]:
df = pd.read_excel('Excel_date.xlsx', index_col=False)
df['date'] = pd.to_datetime(df['Excel_date'], unit='d', origin='1899-12-30')

In [None]:
df

Unnamed: 0,Excel_date,date
0,41369,2013-04-05
1,41370,2013-04-06
2,41371,2013-04-07


## Messy Dates

In [None]:
m_dates = ("03 January 2018", "07/03/1982", "08/20/85")
df = pd.DataFrame(list(m_dates), columns = ['Messy_date'])
df['date'] = pd.to_datetime(df['Messy_date'])
df

Unnamed: 0,Messy_date,date
0,03 January 2018,2018-01-03
1,07/03/1982,1982-07-03
2,08/20/85,1985-08-20


# Working with Date-time class

### Convert dates with times (most of the cases)

In [None]:
dt_time = ("2020-01-01 16hrs", "2020-01-01 4PM", "01 January 2020 16:20", 
           "01 January 2020, 16:20:40", "01 January 2020, 16:20:40 PST")

In [None]:
df = pd.DataFrame(list(dt_time), columns = ['Date_time'])

In [None]:
df['PublishDateTime'] = pd.to_datetime(df['Date_time'], 
                                       errors='coerce')



In [None]:
df

Unnamed: 0,Date_time,PublishDateTime
0,2020-01-01 16hrs,NaT
1,2020-01-01 4PM,2020-01-01 16:00:00
2,01 January 2020 16:20,2020-01-01 16:20:00
3,"01 January 2020, 16:20:40",2020-01-01 16:20:40
4,"01 January 2020, 16:20:40 PST",2020-01-01 16:20:40


## Convert times alone 

In [None]:
time1 = "13:45"
time2 = "15:20"

from datetime import datetime, timedelta

In [None]:
t1 = datetime.strptime(time1, '%H:%M')
t1

datetime.datetime(1900, 1, 1, 13, 45)

In [None]:
t2 = datetime.strptime(time2, '%H:%M')
t2

datetime.datetime(1900, 1, 1, 15, 20)

In [None]:
T=(t2-t1)
T

datetime.timedelta(seconds=5700)

In [None]:
5700/60/60

1.5833333333333333

In [None]:
def format_timedelta(td):
    minutes, seconds = divmod(td.seconds + td.days * 86400, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:d}:{:02d}:{:02d}'.format(hours, minutes, seconds)

In [None]:
format_timedelta(T)

'1:35:00'

Alternatively, including microseconds

In [None]:
def format_timedelta(td):
    minutes, seconds = divmod(td.seconds + td.days * 86400, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:d}:{:02d}:{:02d}.{:06d}'.format(hours, minutes, seconds, td.microseconds)

In [None]:
format_timedelta(T)

'1:35:00.000000'

## Extract time

In [None]:
hh = list()
for i, r in enumerate(linelist['time_admission']):
    try:
        h = int(r[:2])
        if (h > 6) & (h < 12):
            hh.append("Morning")
        elif (h >= 12) & (h <17):
            hh.append("Afternoon")
        elif (h >=17) & (h < 21):
            hh.append("Evening")
        else:
            hh.append("Night")
    except:
        hh.append("No info")

In [None]:
len(hh)

5888

In [None]:
m = 0
a = 0
e = 0
n = 0
ni = 0
for i in hh:
    if i == "Morning":
        m+=1
    elif i == "Afternoon":
        a += 1
    elif i == "Evening":
        e += 1
    elif i == "Night":
        n += 1
    elif i == "No info":
        ni += 1

In [None]:
m, a, e, n, ni, m+a+e+n+ni

(1596, 2312, 823, 392, 765, 5888)

# Working with Dates 

In [None]:
example_date = datetime.strptime("2023-01-04", '%Y-%m-%d')
example_date

datetime.datetime(2023, 1, 4, 0, 0)

## Extract Date Components

In [None]:
example_date.month  # month number

1

In [None]:
example_date.year  # year

2023

In [None]:
example_date.day  # day (number) of the month

4

In [None]:
example_date.weekday()  # `()` is required in this case  2 == Wednesday

2

To display the date in English

In [None]:
import calendar
my_date = datetime.today()
calendar.day_name[my_date.weekday()]  

'Thursday'

## Date Math

In [None]:
example_date + timedelta(days=3)

datetime.datetime(2023, 1, 7, 0, 0)

In [None]:
example_date + timedelta(weeks=2, days=-2)

datetime.datetime(2023, 1, 16, 0, 0)

## Date intervals

In [None]:
output = example_date - datetime.strptime("2022-02-20", '%Y-%m-%d')
output

datetime.timedelta(days=318)

In [None]:
df_di = linelist[['date_onset', 'date_hospitalisation']]
df_di.head()

Unnamed: 0,date_onset,date_hospitalisation
0,2014-05-13,2014-05-15
1,2014-05-13,2014-05-14
2,2014-05-16,2014-05-18
3,2014-05-18,2014-05-20
4,2014-05-21,2014-05-22


In [None]:
df_march = df_di[df_di['date_onset'].dt.month == 3]
df_march

Unnamed: 0,date_onset,date_hospitalisation
993,2015-03-01,2015-03-02
994,2015-03-01,2015-03-02
995,2015-03-01,2015-03-01
996,2015-03-02,2015-03-04
997,2015-03-02,2015-03-03
...,...,...
5864,2015-03-26,2015-03-29
5865,2015-03-27,2015-03-30
5866,2015-03-27,2015-03-31
5867,2015-03-27,2015-03-30


In [None]:
df_mar = df_march.copy(deep=True)

In [None]:
df_mar.notnull()

Unnamed: 0,date_onset,date_hospitalisation
993,True,True
994,True,True
995,True,True
996,True,True
997,True,True
...,...,...
5864,True,True
5865,True,True
5866,True,True
5867,True,True


In [None]:
df_mar['days_onset_to_hosp'] = df_mar['date_hospitalisation'] - df_mar['date_onset']
df_mar

Unnamed: 0,date_onset,date_hospitalisation,days_onset_to_hosp
993,2015-03-01,2015-03-02,1 days
994,2015-03-01,2015-03-02,1 days
995,2015-03-01,2015-03-01,0 days
996,2015-03-02,2015-03-04,2 days
997,2015-03-02,2015-03-03,1 days
...,...,...,...
5864,2015-03-26,2015-03-29,3 days
5865,2015-03-27,2015-03-30,3 days
5866,2015-03-27,2015-03-31,4 days
5867,2015-03-27,2015-03-30,3 days


In [None]:
df_mar['days_onset_to_hosp'].median(skipna=True)

Timedelta('1 days 00:00:00')

In [None]:
df_mar['days_onset_to_hosp'].median()

Timedelta('1 days 00:00:00')

# Date display

In [None]:
current_date = datetime.now().date()
current_date

datetime.date(2023, 1, 5)

In [None]:
formatted_date = current_date.strftime('%d %B %Y')
formatted_date

'05 January 2023'

This will output the week number starting from the first week of the year, according to the ISO-8601 standard:

In [None]:
weeks = current_date.isocalendar()[1]
weeks

1

In [None]:
current_date_time = datetime.now()
current_date_time

datetime.datetime(2023, 1, 5, 10, 50, 34, 186921)

In [None]:
formatted_date_time = current_date_time.strftime('%A, %B %d %Y, %z, %Z, %H:%M:%S')
formatted_date_time

'Thursday, January 05 2023, , , 10:50:34'

#TODO need to make sure systemd setup correctly.  

## Month-year

In [None]:
s = pd.Series(linelist['date_onset'])

# Convert the values in the series to datetime objects
s = pd.to_datetime(s)

# Extract the month and year from the datetime values
s = s.dt.strftime('%b %Y')

# Count the number of occurrences of each unique value
counts = s.value_counts()

# Sort the counts by the month and year
counts = counts.sort_index()

# Reset the index of the counts, so that the unique values become columns
counts = counts.reset_index()

# Rename the columns
counts.columns = ['month_year', 'count']

counts

Unnamed: 0,month_year,count
0,Apr 2014,7
1,Apr 2015,186
2,Aug 2014,528
3,Dec 2014,562
4,Feb 2015,306
5,Jan 2015,431
6,Jul 2014,226
7,Jun 2014,100
8,Mar 2015,277
9,May 2014,64


Sort by month

In [None]:
df = pd.DataFrame(counts)
df['month_year'] = pd.to_datetime(df['month_year'])
df.sort_values(by='month_year')

Unnamed: 0,month_year,count
0,2014-04-01,7
9,2014-05-01,64
7,2014-06-01,100
6,2014-07-01,226
2,2014-08-01,528
12,2014-09-01,1070
11,2014-10-01,1112
10,2014-11-01,763
3,2014-12-01,562
5,2015-01-01,431


## Epidemiological weeks

In [None]:
#|hide
w_cnt = (df_date["date_onset"].dropna())
df = pd.DataFrame(w_cnt) - pd.to_timedelta(7, unit='d')  # swift 7 days prior
df

Unnamed: 0,date_onset
0,2014-05-06
1,2014-05-06
2,2014-05-09
3,2014-05-11
4,2014-05-14
...,...
5883,2015-04-11
5884,2015-04-12
5885,2015-04-14
5886,2015-04-15


In [None]:
#|hide
df['WeekNumber'] = df['date_onset'].dt.isocalendar().week
df

Unnamed: 0,date_onset,WeekNumber
0,2014-05-06,19
1,2014-05-06,19
2,2014-05-09,19
3,2014-05-11,19
4,2014-05-14,20
...,...,...
5883,2015-04-11,15
5884,2015-04-12,15
5885,2015-04-14,16
5886,2015-04-15,16


In [None]:
#|hide
weeks = pd.Series(df['WeekNumber'])
years = pd.Series(df['date_onset'].dt.year)

In [None]:
#|hide
from datetime import date
dates = [date.fromisocalendar(year, week, 7) for year, week in zip(years, weeks)]
df['weekly_cases'] = dates

In [None]:
#|hide
df

Unnamed: 0,date_onset,WeekNumber,weekly_cases
0,2014-05-06,19,2014-05-11
1,2014-05-06,19,2014-05-11
2,2014-05-09,19,2014-05-11
3,2014-05-11,19,2014-05-11
4,2014-05-14,20,2014-05-18
...,...,...,...
5883,2015-04-11,15,2015-04-12
5884,2015-04-12,15,2015-04-12
5885,2015-04-14,16,2015-04-19
5886,2015-04-15,16,2015-04-19


In [None]:
#|hide
n = df['weekly_cases'].value_counts()
n = n.sort_index()
n = n.reset_index()
# Rename the columns
n.columns = ['weekly_cases', 'n']
n

Unnamed: 0,weekly_cases,n
0,2014-01-05,49
1,2014-04-06,1
2,2014-04-13,1
3,2014-04-20,5
4,2014-04-27,4
5,2014-05-04,12
6,2014-05-11,17
7,2014-05-18,13
8,2014-05-25,18
9,2014-06-01,22


In [None]:
#|hide
w_cnt = (df_date["date_onset"].dropna())
df = pd.DataFrame(w_cnt) - pd.to_timedelta(7, unit='d')  # swift 7 days prior
df['WeekNumber'] = df['date_onset'].dt.isocalendar().week
weeks = pd.Series(df['WeekNumber'])
years = pd.Series(df['date_onset'].dt.year)
dates = [date.fromisocalendar(year, week, 7) for year, week in zip(years, weeks)]
df['weekly_cases'] = dates
n = df['weekly_cases'].value_counts().sort_index().reset_index()
n.columns = ['weekly_cases', 'n']
n

Unnamed: 0,weekly_cases,n
0,2014-01-05,49
1,2014-04-06,1
2,2014-04-13,1
3,2014-04-20,5
4,2014-04-27,4
5,2014-05-04,12
6,2014-05-11,17
7,2014-05-18,13
8,2014-05-25,18
9,2014-06-01,22


In [None]:
df["date_onset"] = pd.to_datetime(df_date["date_onset"])

# Shift the dates back 7 days and extract the ISO week number and year from each date
df["date_onset"] -= pd.to_timedelta(7, unit='d')
df["WeekNumber"] = df["date_onset"].dt.isocalendar().week
df["Year"] = df["date_onset"].dt.year

# Convert the week numbers to dates and add them to the DataFrame
df["week_started"] = df.apply(lambda row: date.fromisocalendar(row["Year"], row["WeekNumber"], 7), axis=1)

# Count the number of cases per week and create a new DataFrame
n = df["week_started"].value_counts().sort_index().reset_index()
n.columns = ["week_started", "weekly_cases"]
n

Unnamed: 0,week_started,weekly_cases
0,2014-01-05,49
1,2014-04-06,1
2,2014-04-13,1
3,2014-04-20,5
4,2014-04-27,4
5,2014-05-04,12
6,2014-05-11,17
7,2014-05-18,13
8,2014-05-25,18
9,2014-06-01,22


Using `Epiweek` as an alternative to `fromisocalendar`, see https://pypi.org/project/epiweeks/

# Converting dates/time zones

In [None]:
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 10:56:40


In [None]:
import time
t = time.localtime()
current_time = time.strftime("%H:%M:%S", t)
print("Current Time =", current_time)

Current Time = 10:56:40


In [None]:
import pytz

# Get the timezone object for New York
tz_NY = pytz.timezone('America/New_York') 

# Get the current time in New York
datetime_NY = datetime.now(tz_NY)

# Format the time as a string and print it
print("NY time:", datetime_NY.strftime("%H:%M:%S"))

# Get the timezone object for London
tz_London = pytz.timezone('Europe/London')

# Get the current time in London
datetime_London = datetime.now(tz_London)

# Format the time as a string and print it
print("London time:", datetime_London.strftime("%H:%M:%S"))

# Get the timezone object for London
tz_Perth = pytz.timezone('Australia/Perth')

# Get the current time in London
datetime_Perth = datetime.now(tz_Perth)

# Format the time as a string and print it
print("Perth time:", datetime_Perth.strftime("%H:%M:%S"))

NY time: 22:05:47
London time: 03:05:47
Perth time: 11:05:47


In [None]:
time_diff = datetime_London - datetime_Perth

seconds_per_hour = 3600

hours = time_diff.total_seconds() / seconds_per_hour
hours

-5.997222222222222e-07

## Lagging and Leading Calculation

In [None]:
n.head()

Unnamed: 0,week_started,weekly_cases
0,2014-01-05,49
1,2014-04-06,1
2,2014-04-13,1
3,2014-04-20,5
4,2014-04-27,4


In [None]:
n['lagged_week'] = n['weekly_cases'].shift(periods=1)
n['lead_week'] = n['weekly_cases'].shift(periods=-1)
n.head()

Unnamed: 0,week_started,weekly_cases,lagged_week,lead_week
0,2014-01-05,49,,1.0
1,2014-04-06,1,49.0,1.0
2,2014-04-13,1,1.0,5.0
3,2014-04-20,5,1.0,4.0
4,2014-04-27,4,5.0,12.0


In [None]:
#| hide
nbdev_export()