![pandas logo](pandas.png "pandas")

In [6]:
from datetime import datetime
import pandas as pd

# Date and time

We saw that Python has methods to handle dates and times; now let's see what pandas has to offer.

In [7]:
# Build a small test frame: one row per calendar day of August 2023 (31 rows)
data = pd.date_range(start='2023-08-01', periods=31, freq='D')
df = pd.DataFrame({'date': data})
df.head()  # head() shows only the first 5 rows

Unnamed: 0,date
0,2023-08-01
1,2023-08-02
2,2023-08-03
3,2023-08-04
4,2023-08-05


In [8]:
# The .dt accessor exposes the components of a datetime column:
#   - properties: year, day, month, dayofyear, is_leap_year, ...
#     (the ISO week number is available through .dt.isocalendar().week)
#   - methods: day_name(), month_name(), ...
df = df.assign(
    year=df['date'].dt.year,
    day_name=df['date'].dt.day_name(),
)
df.head()

Unnamed: 0,date,year,day_name
0,2023-08-01,2023,Tuesday
1,2023-08-02,2023,Wednesday
2,2023-08-03,2023,Thursday
3,2023-08-04,2023,Friday
4,2023-08-05,2023,Saturday


In [9]:
# Summarize the frame: index range, column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      31 non-null     datetime64[ns]
 1   year      31 non-null     int64         
 2   day_name  31 non-null     object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 872.0+ bytes


In [10]:
# pandas also accepts plain Python datetime objects as input
df = pd.DataFrame({'DateTime': [datetime.now()]})
# .dt.date keeps only the calendar date of each timestamp
df['Date'] = df['DateTime'].dt.date
df

Unnamed: 0,DateTime,Date
0,2023-08-25 08:14:14.745573,2023-08-25


In [11]:
# Note the different dtypes: DateTime is a datetime64 column, whereas Date
# (built with .dt.date) is a generic object column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  1 non-null      datetime64[ns]
 1   Date      1 non-null      object        
dtypes: datetime64[ns](1), object(1)
memory usage: 144.0+ bytes


In [12]:
# Inspect the Python type of the single value stored in each column
print(type(df.loc[0, 'DateTime']))
print(type(df.loc[0, 'Date']))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'datetime.date'>


In [13]:
# The dtype of a column can also be read directly, without inspecting an element:
print(df.dtypes['DateTime'])

datetime64[ns]


In [14]:
# Show the frame again (DateTime and Date columns)
df

Unnamed: 0,DateTime,Date
0,2023-08-25 08:14:14.745573,2023-08-25


In [40]:
# normalize() resets the time part to midnight. Interestingly, the result keeps the
# datetime64 dtype, unlike the Date column, which has object dtype.
df['Normalized'] = df['DateTime'].dt.normalize()
print(df)
for column in ('Normalized', 'Date', 'DateTime'):
    print(df[column].dtype)

                    DateTime        Date Normalized
0 2023-08-14 04:21:39.810505  2023-08-14 2023-08-14
datetime64[ns]
object
datetime64[ns]


## Converting strings to datetime

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html

In [15]:
# Sample date strings written in several layouts, to be converted to datetime
df = pd.DataFrame({
    'raw_date': ['2023-08-01', '2023-08', '2023/01/08', '2023/23/08', '2023-Aug-05'],
})
df

Unnamed: 0,raw_date
0,2023-08-01
1,2023-08
2,2023/01/08
3,2023/23/08
4,2023-Aug-05


In [19]:
# Default parsing already does a nice job, but check the result carefully.
# errors = 'raise' | 'ignore' | 'coerce'
# 'coerce' turns values that cannot be parsed into NaT instead of raising an exception.
# NOTE(review): pandas >= 2.0 infers a single format from the first value, so mixed
# layouts like these may need format='mixed' there - confirm against the installed version.
df = df.assign(DateTime=pd.to_datetime(df['raw_date'], errors='coerce'))
df

Unnamed: 0,raw_date,DateTime
0,2023-08-01,2023-08-01
1,2023-08,2023-08-01
2,2023/01/08,2023-01-08
3,2023/23/08,NaT
4,2023-Aug-05,2023-08-05


In [53]:
# Check the resulting dtypes: raw_date stays object, DateTime is datetime64
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   raw_date  3 non-null      object        
 1   DateTime  3 non-null      datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 176.0+ bytes


In [61]:
# A custom format can be supplied. There is no magic, however: if the raw date is
# ambiguous, the parsed datetime will be too. Values that do not match become NaT.
Myformat = '%Y/%d/%m'  # year/day/month
df = df.assign(DateTime=pd.to_datetime(df['raw_date'], format=Myformat, errors='coerce'))
df

# NaT: Not a Time (the missing-value marker for datetimes)

Unnamed: 0,raw_date,DateTime
0,2023-08-01,NaT
1,2023-08,NaT
2,2023/01/08,2023-08-01
3,2023/23/08,2023-08-23
4,2023-Aug-05,NaT


See the reference on format codes:
https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

## Operations

In [98]:
# Two dates stored as plain strings, used for the date arithmetic below
df = pd.DataFrame({'best_FIFA_worldcups': ['2018-07-15', '1998-07-12']})
df

Unnamed: 0,best_FIFA_worldcups
0,2018-07-15
1,1998-07-12


In [99]:
# A simple subtraction between datetimes yields timedelta values
df['days'] = datetime.now() - pd.to_datetime(df['best_FIFA_worldcups'])

# Timedelta columns have a dt accessor too; use it to get a plain number of days,
# then convert to years rounded to one decimal
df['years'] = df['days'].dt.days.div(365.25).round(1)
df

Unnamed: 0,best_FIFA_worldcups,days,years
0,2018-07-15,1856 days 07:16:07.574944,5.1
1,1998-07-12,9164 days 07:16:07.574944,25.1


In [100]:
# days is a timedelta64 column and years is float64; the original strings remain object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype          
---  ------               --------------  -----          
 0   best_FIFA_worldcups  2 non-null      object         
 1   days                 2 non-null      timedelta64[ns]
 2   years                2 non-null      float64        
dtypes: float64(1), object(1), timedelta64[ns](1)
memory usage: 176.0+ bytes


In [102]:
# Dates can be compared directly: flag the rows dated after 2000-01-01
df['XXI century'] = pd.to_datetime(df['best_FIFA_worldcups']).gt(pd.to_datetime("2000-01-01"))
df

Unnamed: 0,best_FIFA_worldcups,days,years,XXI century
0,2018-07-15,1856 days 07:16:07.574944,5.1,True
1,1998-07-12,9164 days 07:16:07.574944,25.1,False


__________________________________________________
Nicolas Dupuis, Methodology and Innovation (IDAR C&SP), 2020+