In [1]:
from IPython.display import Image

----------------
## PANDAS - Working with dates and times data
------------


In [2]:
import numpy as np
import pandas as pd

In [3]:
# read a dataset of top-rated IMDb movies into a DataFrame
url = 'http://bit.ly/uforeports'
ufo = pd.read_csv(url)

In [5]:
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [4]:
ufo.shape

(18241, 5)

In [5]:
ufo.dtypes

City               object
Colors Reported    object
Shape Reported     object
State              object
Time               object
dtype: object

convert the Time column to `datatime` format

In [6]:
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo.dtypes

City                       object
Colors Reported            object
Shape Reported             object
State                      object
Time               datetime64[ns]
dtype: object

In [7]:
ufo.Time.dt.hour.head()

0    22
1    20
2    14
3    13
4    19
Name: Time, dtype: int64

In [8]:
ufo.Time.dt.weekday.head()

0    6
1    0
2    6
3    0
4    1
Name: Time, dtype: int64

In [9]:
ufo.Time.dt.day_name().head()

0     Sunday
1     Monday
2     Sunday
3     Monday
4    Tuesday
Name: Time, dtype: object

In [10]:
ufo.Time.dt.dayofyear.head()

0    152
1    181
2     46
3    152
4    108
Name: Time, dtype: int64

In [13]:
ufo.Time.max(), ufo.Time.min()

(Timestamp('2000-12-31 23:59:00'), Timestamp('1930-06-01 22:00:00'))

In [14]:
ufo.Time.max() - ufo.Time.min()

Timedelta('25781 days 01:59:00')

In [15]:
(ufo.Time.max() - ufo.Time.min()).days

25781

In [16]:
(ufo.Time.max() - ufo.Time.min()).seconds

7140

##### Example 

In [11]:
location = r'D:\MYLEARN\datasets\opsd_germany_daily.csv'

In [12]:
opsd_daily = pd.read_csv(location)
opsd_daily.shape

(4383, 5)

In [13]:
opsd_daily.head()

Unnamed: 0,Date,Consumption,Wind,Solar,Wind+Solar
0,2006-01-01,1069.184,,,
1,2006-01-02,1380.521,,,
2,2006-01-03,1442.533,,,
3,2006-01-04,1457.217,,,
4,2006-01-05,1477.131,,,


In [20]:
# check out the data types of each column.
opsd_daily.dtypes

Date            object
Consumption    float64
Wind           float64
Solar          float64
Wind+Solar     float64
dtype: object

In [21]:
format = '%Y-%m-%d'

opsd_daily['Date'] = pd.to_datetime(opsd_daily['Date'], format=format)
opsd_daily.head()

Unnamed: 0,Date,Consumption,Wind,Solar,Wind+Solar
0,2006-01-01,1069.184,,,
1,2006-01-02,1380.521,,,
2,2006-01-03,1442.533,,,
3,2006-01-04,1457.217,,,
4,2006-01-05,1477.131,,,


In [22]:
opsd_daily.set_index('Date', inplace=True)
opsd_daily.head(3)

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-01-01,1069.184,,,
2006-01-02,1380.521,,,
2006-01-03,1442.533,,,


In [23]:
opsd_daily.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4383 entries, 2006-01-01 to 2017-12-31
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Consumption  4383 non-null   float64
 1   Wind         2920 non-null   float64
 2   Solar        2188 non-null   float64
 3   Wind+Solar   2187 non-null   float64
dtypes: float64(4)
memory usage: 171.2 KB


In [24]:
opsd_daily.index

DatetimeIndex(['2006-01-01', '2006-01-02', '2006-01-03', '2006-01-04',
               '2006-01-05', '2006-01-06', '2006-01-07', '2006-01-08',
               '2006-01-09', '2006-01-10',
               ...
               '2017-12-22', '2017-12-23', '2017-12-24', '2017-12-25',
               '2017-12-26', '2017-12-27', '2017-12-28', '2017-12-29',
               '2017-12-30', '2017-12-31'],
              dtype='datetime64[ns]', name='Date', length=4383, freq=None)

In [25]:
pd.DatetimeIndex(opsd_daily.index).year

Int64Index([2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006,
            ...
            2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017],
           dtype='int64', name='Date', length=4383)

Another useful aspect of the DatetimeIndex is that the individual date/time components are all available as attributes such as year, month, day, and so on. 

Let’s add a few more columns to opsd_daily, containing the year, month, and weekday name.

In [26]:
pd.DatetimeIndex(opsd_daily.index).week

  pd.DatetimeIndex(opsd_daily.index).week


Int64Index([52,  1,  1,  1,  1,  1,  1,  1,  2,  2,
            ...
            51, 51, 51, 52, 52, 52, 52, 52, 52, 52],
           dtype='int64', name='Date', length=4383)

In [27]:
# Add columns with year, month, and weekday name
opsd_daily['Year']         = pd.DatetimeIndex(opsd_daily.index).year
opsd_daily['Month']        = pd.DatetimeIndex(opsd_daily.index).month
opsd_daily['Weekday Name'] = pd.DatetimeIndex(opsd_daily.index).weekday

# Display a random sampling of 5 rows
opsd_daily.sample(5, random_state=0)

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar,Year,Month,Weekday Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-08-23,1152.011,,,,2008,8,5
2013-08-08,1291.984,79.666,93.371,173.037,2013,8,3
2009-08-27,1281.057,,,,2009,8,3
2015-10-02,1391.05,81.229,160.641,241.87,2015,10,4
2009-06-02,1201.522,,,,2009,6,1


#### Time-based indexing

In [28]:
# select data for a single day using a string such as '2017-08-10'.
opsd_daily.loc['2017-08-10']

Consumption     1351.491
Wind             100.274
Solar             71.160
Wind+Solar       171.434
Year            2017.000
Month              8.000
Weekday Name       3.000
Name: 2017-08-10 00:00:00, dtype: float64

In [29]:
# select a slice of days, such as '2014-01-20':'2014-01-22'. As with regular label-based indexing with loc, the slice is inclusive of both endpoints.
opsd_daily.loc['2014-01-20':'2014-01-25']

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar,Year,Month,Weekday Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-20,1590.687,78.647,6.371,85.018,2014,1,0
2014-01-21,1624.806,15.643,5.835,21.478,2014,1,1
2014-01-22,1625.155,60.259,11.992,72.251,2014,1,2
2014-01-23,1631.373,125.177,13.661,138.838,2014,1,3
2014-01-24,1617.411,106.527,9.807,116.334,2014,1,4
2014-01-25,1399.124,145.786,19.051,164.837,2014,1,5


#### partial-string indexing

- select all date/times which partially match a given string. 

- For example, 
    - we can select the entire year 2006 with opsd_daily.loc['2006'], 
    - or the entire month of February 2012 with opsd_daily.loc['2012-02'].

In [30]:
# get records of Feb 2012
opsd_daily.loc['2012-02':]

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar,Year,Month,Weekday Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-02-01,1511.86600,199.607,43.502,243.109,2012,2,2
2012-02-02,1563.40700,73.469,44.675,118.144,2012,2,3
2012-02-03,1563.63100,36.352,46.510,82.862,2012,2,4
2012-02-04,1372.61400,20.551,45.225,65.776,2012,2,5
2012-02-05,1279.43200,55.522,54.572,110.094,2012,2,6
...,...,...,...,...,...,...,...
2017-12-27,1263.94091,394.507,16.530,411.037,2017,12,2
2017-12-28,1299.86398,506.424,14.162,520.586,2017,12,3
2017-12-29,1295.08753,584.277,29.854,614.131,2017,12,4
2017-12-30,1215.44897,721.247,7.467,728.714,2017,12,5
