In [12]:
import numpy as np
import pandas as pd

three modules
- `datetime`
- `time`
- `calendar`

# `datetime`

In [13]:
from datetime import datetime
now = datetime.now()
print(now)
print(now.year, now.month, now.day)

2022-07-22 21:08:13.795270
2022 7 22


`timedelta` represents temporal difference

In [20]:
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta

datetime.timedelta(days=926, seconds=56700)

In [21]:
delta.days

926

In [22]:
delta.seconds

56700

In [23]:
from datetime import timedelta
start = datetime(2011, 1, 7)
delta = timedelta(12) # add 12 days
start + delta

datetime.datetime(2011, 1, 19, 0, 0)

In [24]:
start - 2 * delta

datetime.datetime(2010, 12, 14, 0, 0)

- `date` stores date(year, month,day) using Georgian calendar
- `time` stores time of day(hrs, mins, secs, microsecs)
- `datetime` stores both date and time
- `timedelta` difference between two datetime objects as (days, seconds, microsecs)
- `tzinfo` base type for storing time zone info 

converting between string and datetime
using `str` or `strftime`

In [34]:
stamp = datetime(2011, 1, 3, 23)
str(stamp)


'2011-01-03 23:00:00'

In [44]:
stamp.strftime("%Y-%m-%d-%I-%S-%f-%Z")

'2011-01-03-11-00-000000-'

In [46]:
# convert string to datetime
value = "2011-01-03"
datetime.strptime(value, "%Y-%m-%d")

datetime.datetime(2011, 1, 3, 0, 0)

`pandas` datetime

In [50]:
datestrs = ["2011-07-06 12:00:00", "2011-08-06 00:00:00"]
pd.to_datetime(datestrs)


DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [53]:
idx = pd.to_datetime(datestrs + [None])
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT', 'NaT',
               'NaT'],
              dtype='datetime64[ns]', freq=None)

In [56]:
datestrs.append(None)
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT', 'NaT',
               'NaT', 'NaT'],
              dtype='datetime64[ns]', freq=None)

`NaT` is not a time

In [57]:
pd.isna(idx)

array([False, False,  True,  True,  True])

# Pandas TimeSeries Basics

In [58]:
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)
ts

2011-01-02    1.091964
2011-01-05    0.365038
2011-01-07    1.440131
2011-01-08   -0.429243
2011-01-10    0.108160
2011-01-12   -0.636599
dtype: float64

operations on time indexed dataframes aligns on dates

In [60]:
ts + ts[::2] # adds every other value with itself

2011-01-02    2.183928
2011-01-05         NaN
2011-01-07    2.880262
2011-01-08         NaN
2011-01-10    0.216321
2011-01-12         NaN
dtype: float64

In [62]:
ts.index.dtype # stored as numpy's datetime64 at nanoseconds resolution

dtype('<M8[ns]')

`pandas.Timestamp` can be substituted for `datetime` but *may* not the other way around
- Timestamp stores at nanosecond resolution whereas datetime is microseconds
- Timestamp can handle timezone conversions and store frequency information

In [63]:
ts.index[0]

Timestamp('2011-01-02 00:00:00')

Indexing

In [64]:
ts["2011-01-05"]

0.365038271379911

In [67]:
stamp = ts.index[2]
ts.loc[stamp]
ts[stamp]

1.4401309360428818

 default `freq` is days `D` in `date_range`, periods are generated based on frequency

In [71]:
longer_ts = pd.Series(np.random.standard_normal(1000),
                     index=pd.date_range("2000-01-01-09", periods=1000))
                     
longer_ts                

2000-01-01 09:00:00    0.196652
2000-01-02 09:00:00   -0.978825
2000-01-03 09:00:00   -0.227206
2000-01-04 09:00:00    0.078074
2000-01-05 09:00:00    0.450239
                         ...   
2002-09-22 09:00:00    0.635619
2002-09-23 09:00:00   -0.305815
2002-09-24 09:00:00    1.369600
2002-09-25 09:00:00   -1.954314
2002-09-26 09:00:00    0.111125
Freq: D, Length: 1000, dtype: float64

In [72]:
# subset by year
longer_ts["2001"]

2001-01-01 09:00:00    2.751242
2001-01-02 09:00:00   -2.179290
2001-01-03 09:00:00   -0.146192
2001-01-04 09:00:00   -0.055184
2001-01-05 09:00:00   -0.582200
                         ...   
2001-12-27 09:00:00    0.220069
2001-12-28 09:00:00    0.256989
2001-12-29 09:00:00   -1.219508
2001-12-30 09:00:00    1.027289
2001-12-31 09:00:00    1.148620
Freq: D, Length: 365, dtype: float64

In [73]:
# subset by month
longer_ts["2001-05"]

2001-05-01 09:00:00   -0.863711
2001-05-02 09:00:00   -0.911403
2001-05-03 09:00:00   -0.451811
2001-05-04 09:00:00    0.705887
2001-05-05 09:00:00    0.325411
2001-05-06 09:00:00    0.893961
2001-05-07 09:00:00    1.360390
2001-05-08 09:00:00    0.458763
2001-05-09 09:00:00   -0.039651
2001-05-10 09:00:00    0.007843
2001-05-11 09:00:00    1.271037
2001-05-12 09:00:00   -0.893327
2001-05-13 09:00:00    0.515144
2001-05-14 09:00:00   -0.084740
2001-05-15 09:00:00   -0.173309
2001-05-16 09:00:00   -0.802105
2001-05-17 09:00:00   -0.215242
2001-05-18 09:00:00    0.148674
2001-05-19 09:00:00   -0.894670
2001-05-20 09:00:00   -1.493811
2001-05-21 09:00:00    0.504539
2001-05-22 09:00:00   -0.671674
2001-05-23 09:00:00    1.371451
2001-05-24 09:00:00   -0.255703
2001-05-25 09:00:00    0.356486
2001-05-26 09:00:00   -0.290510
2001-05-27 09:00:00   -2.227703
2001-05-28 09:00:00   -0.946760
2001-05-29 09:00:00    0.194913
2001-05-30 09:00:00   -0.952172
2001-05-31 09:00:00    1.036555
Freq: D,

Slicing with `datetime` objects

In [74]:
ts[datetime(2011,1,7):datetime(2011,1,10)]

2011-01-07    1.440131
2011-01-08   -0.429243
2011-01-10    0.108160
dtype: float64

In [78]:
longer_ts[datetime(2001, 5, 25):datetime(2001, 5, 30)]

2001-05-25 09:00:00    0.356486
2001-05-26 09:00:00   -0.290510
2001-05-27 09:00:00   -2.227703
2001-05-28 09:00:00   -0.946760
2001-05-29 09:00:00    0.194913
Freq: D, dtype: float64

In [77]:
longer_ts.index[0]

Timestamp('2000-01-01 09:00:00', freq='D')

In [79]:
longer_ts["2001-05-25":"2001-05-30"]

2001-05-25 09:00:00    0.356486
2001-05-26 09:00:00   -0.290510
2001-05-27 09:00:00   -2.227703
2001-05-28 09:00:00   -0.946760
2001-05-29 09:00:00    0.194913
2001-05-30 09:00:00   -0.952172
Freq: D, dtype: float64

**NOTE** slicing produces views not copies, changes will reflected in original data

`truncate` to slice series

In [80]:
longer_ts.truncate(after="2001-05-30")

2000-01-01 09:00:00    0.196652
2000-01-02 09:00:00   -0.978825
2000-01-03 09:00:00   -0.227206
2000-01-04 09:00:00    0.078074
2000-01-05 09:00:00    0.450239
                         ...   
2001-05-25 09:00:00    0.356486
2001-05-26 09:00:00   -0.290510
2001-05-27 09:00:00   -2.227703
2001-05-28 09:00:00   -0.946760
2001-05-29 09:00:00    0.194913
Freq: D, Length: 515, dtype: float64

In [84]:
dates = pd.date_range("2000-03-11", periods=30, freq="W-WED") # weekly wednesday freq
long_df = pd.DataFrame(np.random.standard_normal((30, 4)),
    index=dates,
    columns=["Colorado", "Texas", "New York", "Ohio"])

In [88]:
long_df.loc["2000-04-12"]

Colorado    1.248780
Texas      -0.657757
New York    0.078938
Ohio        0.006289
Name: 2000-04-12 00:00:00, dtype: float64

Duplicate indices

In [89]:
dates = pd.DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-02",
    "2000-01-02", "2000-01-03"])
dup_ts = pd.Series(np.random.standard_normal(dates.shape[0]), index=dates)    
dup_ts


2000-01-01   -0.748545
2000-01-02    1.721871
2000-01-02   -1.342584
2000-01-02   -0.154013
2000-01-03   -0.517951
dtype: float64

In [90]:
dup_ts.index.is_unique

False

In [91]:
dup_ts["2000-01-02"]

2000-01-02    1.721871
2000-01-02   -1.342584
2000-01-02   -0.154013
dtype: float64

In [93]:
grouped = dup_ts.groupby(level=0) # use grouping by levels to group on duplicate timestamps
grouped.mean()

2000-01-01   -0.748545
2000-01-02    0.075091
2000-01-03   -0.517951
dtype: float64

# Date Ranges in depth