# Dealing with Dates

#### by June 3, 2019

Getting dates into a numeric format and extracting features of dates like month and year into new variables can be useful preprocessing steps using pandas.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Creating DataFrame
# Each row stores dates as plain strings, one column per textual layout.

column_names = ['month_day_year', 'day_month_year', 'date_time', 'year_month_day']

date_rows = [
    ['4/22/1996', '22-Apr-96', 'Tue Aug 11 09:50:35 1996', '2007-06-22'],
    ['4/23/1996', '22-Apr-96', 'Tue Aug 12 19:50:35 2016', '2017-01-09'],
    ['5/14/1996', '14-May-96', 'Mon Oct 14 09:50:35 2017', '1998-04-12'],
    ['5/15/1996', '15-May-96', 'Tue Jan 11 09:50:35 2018', '2027-07-22'],
    ['5/16/2001', '16-May-01', 'Fri Mar 11 07:30:36 2019', '1945-11-15'],
    ['5/17/2002', '17-May-02', 'Tue Aug 11 09:50:35 2020', '1942-06-22'],
    ['5/18/2003', '18-May-03', 'Wed Dec 21 09:50:35 2021', '1887-06-13'],
    ['5/19/2004', '19-May-04', 'Tue Jan 11 09:50:35 2022', '1912-01-25'],
    ['5/20/2005', '20-May-05', 'Sun Jul 10 19:40:25 2023', '2007-06-22'],
]

# Row labels run 1..9 (one per row), kept as an explicit list of integers.
row_labels = list(range(1, len(date_rows) + 1))

df = pd.DataFrame(date_rows, index=row_labels, columns=column_names)
df

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day
1,4/22/1996,22-Apr-96,Tue Aug 11 09:50:35 1996,2007-06-22
2,4/23/1996,22-Apr-96,Tue Aug 12 19:50:35 2016,2017-01-09
3,5/14/1996,14-May-96,Mon Oct 14 09:50:35 2017,1998-04-12
4,5/15/1996,15-May-96,Tue Jan 11 09:50:35 2018,2027-07-22
5,5/16/2001,16-May-01,Fri Mar 11 07:30:36 2019,1945-11-15
6,5/17/2002,17-May-02,Tue Aug 11 09:50:35 2020,1942-06-22
7,5/18/2003,18-May-03,Wed Dec 21 09:50:35 2021,1887-06-13
8,5/19/2004,19-May-04,Tue Jan 11 09:50:35 2022,1912-01-25
9,5/20/2005,20-May-05,Sun Jul 10 19:40:25 2023,2007-06-22


In [3]:
# Summarize the frame: index range, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 1 to 9
Data columns (total 4 columns):
month_day_year    9 non-null object
day_month_year    9 non-null object
date_time         9 non-null object
year_month_day    9 non-null object
dtypes: object(4)
memory usage: 360.0+ bytes


In [4]:
# Date data are loaded as strings by default.
# Inspect the first value of each column in turn, so that every column is
# checked rather than the same one on every pass.

for column_name in df:
    print(type(df[column_name].iloc[0]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


### Convert string type into date type

In [5]:
# Preview the first five rows
df.head()

Unnamed: 0,month_day_year,day_month_year,date_time,year_month_day
1,4/22/1996,22-Apr-96,Tue Aug 11 09:50:35 1996,2007-06-22
2,4/23/1996,22-Apr-96,Tue Aug 12 19:50:35 2016,2017-01-09
3,5/14/1996,14-May-96,Mon Oct 14 09:50:35 2017,1998-04-12
4,5/15/1996,15-May-96,Tue Jan 11 09:50:35 2018,2027-07-22
5,5/16/2001,16-May-01,Fri Mar 11 07:30:36 2019,1945-11-15


In [6]:
# Parse each column of date strings and store the result back under the
# same column name, then show the element type after conversion.
for column_name in df.columns:
    parsed_column = pd.to_datetime(df[column_name])
    df[column_name] = parsed_column
    first_value = df[column_name][1]  # label 1 is the first row of this frame
    print(type(first_value))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [7]:
# A timestamp string whose date part is written year-day-month
odd_date = "12:30:15 2015-29-11"
print(type(odd_date))

# Try the default parser on this layout. Any parsing error is caught and
# displayed so the notebook still runs from top to bottom.
try:
    pd.to_datetime(odd_date)
except ValueError as parse_error:
    print(f"Default parser failed: {parse_error}")

<class 'str'>


The default `to_datetime` parser fails on this string because its date part is written year-day-month: the parser reads `2015-29-11` as year-month-day, and there is no month 29. In cases like this, pass the `format` argument explicitly to convert the string to a Timestamp, as shown below.

In [8]:
# Spell out the layout explicitly: time first, then year-day-month
odd_date_format = "%H:%M:%S %Y-%d-%m"
new_odd_date = pd.to_datetime(odd_date, format=odd_date_format)
print(type(new_odd_date))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


## Convert date type into year, month and day

In [10]:
# Break the first column of df (month_day_year) into calendar features.
column_1 = df.iloc[:, 0]

# ISO week number. `.dt.week` and `.dt.weekofyear` no longer exist in
# pandas 2.x; `.dt.isocalendar().week` is the replacement (pandas >= 1.1).
# It returns a nullable UInt32 column, so cast to a plain NumPy integer dtype.
iso_week = column_1.dt.isocalendar().week.astype("int64")

new_df_column_1 = pd.DataFrame(
    {
        "year": column_1.dt.year,
        "month": column_1.dt.month,
        "day": column_1.dt.day,
        "hour": column_1.dt.hour,
        "dayofyear": column_1.dt.dayofyear,
        # Both keys are kept so the resulting columns match the original frame.
        "week": iso_week,
        "weekofyear": iso_week,
        "dayofweek": column_1.dt.weekday,  # numeric weekday, Monday = 0
        "quarter": column_1.dt.quarter,
        # `.dt.weekday_name` was removed from pandas; day_name() returns the
        # same weekday names ("Monday", "Tuesday", ...).
        "day_of_week": column_1.dt.day_name(),
    }
)

new_df_column_1

Unnamed: 0,year,month,day,hour,dayofyear,week,weekofyear,dayofweek,quarter,day_of_week
1,1996,4,22,0,113,17,17,0,2,Monday
2,1996,4,23,0,114,17,17,1,2,Tuesday
3,1996,5,14,0,135,20,20,1,2,Tuesday
4,1996,5,15,0,136,20,20,2,2,Wednesday
5,2001,5,16,0,136,20,20,2,2,Wednesday
6,2002,5,17,0,137,20,20,4,2,Friday
7,2003,5,18,0,138,20,20,6,2,Sunday
8,2004,5,19,0,140,21,21,2,2,Wednesday
9,2005,5,20,0,140,20,20,4,2,Friday


In [11]:
# Count how many rows fall on each weekday name
new_df_column_1.groupby('day_of_week').size()

day_of_week
Friday       2
Monday       1
Sunday       1
Tuesday      2
Wednesday    3
dtype: int64

## Using subtraction operator

In [34]:
# Pick two values from the first column (positions 1 and 3) and print
# them followed by the difference between them.
earlier_date = df.iloc[1, 0]
later_date = df.iloc[3, 0]

print(earlier_date)
print(later_date)
print(later_date - earlier_date)

1996-04-23 00:00:00
1996-05-15 00:00:00
22 days 00:00:00


## Convert date type into unix time (nanoseconds since the epoch)

In [13]:
# Integer nanoseconds since the Unix epoch for each timestamp.
# astype is used instead of Series.view, which is deprecated in recent pandas.
df['date_time'].astype('int64')

1     839757035000000000
2    1471031435000000000
3    1507974635000000000
4    1515664235000000000
5    1552289436000000000
6    1597139435000000000
7    1640080235000000000
8    1641894635000000000
9    1689018025000000000
Name: date_time, dtype: int64