# Pandas: to_datetime()
### One of the most common problems in data analysis is the lack of uniformity in the structure of the input data. 
### For e.g. Jan 12, 2019 can be represented in all the following formats:
### 1. 2019-01-12 
### 2. Jan 12, 2019
### 3. 01/12/2019
### 4. 2019.01.12
### 5. 2019/01/12
### 6. 20190112

## Thus one of the most crucial tasks is data pre-processing, wherein we standardize the date format to a uniform pattern. 


In [1]:
import pandas as pd
# Every layout listed above for Jan 12, 2019
dates = ['2019-01-12', 'Jan 12, 2019', '01/12/2019', '2019.01.12', '2019/01/12', '20190112']

# Standardize the dates using pandas to_datetime().
# Each string is parsed on its own: pandas 2.0 and later infer a single format
# from the first value of a list, so one call over mixed layouts raises there.
# Parsing element by element works on old and new pandas alike
# (on pandas >= 2.0, pd.to_datetime(dates, format='mixed') is the built-in alternative).
standardized = pd.DatetimeIndex([pd.to_datetime(d) for d in dates])
standardized

# Observe that all dates are mapped to a uniform pattern of YYYY-MM-DD

DatetimeIndex(['2019-01-12', '2019-01-12', '2019-01-12', '2019-01-12',
               '2019-01-12', '2019-01-12'],
              dtype='datetime64[ns]', freq=None)

# Standardizing Dates with time

In [2]:
# Let's say we are talking about 12th Jan, 2019 2:30 PM
dates = ['2019-01-12 2:30:00 PM', 'Jan 12, 2019 14:30:00']   # Same instant in 12-hour and 24-hour notation

# Parse each string on its own: pandas 2.0 and later infer one format from the
# first value, so a single call over differently formatted strings raises there.
standardized = pd.DatetimeIndex([pd.to_datetime(d) for d in dates])
standardized

# Observe that the datetime stamps are standardized to the format: YYYY-MM-DD HH:MM:SS


DatetimeIndex(['2019-01-12 14:30:00', '2019-01-12 14:30:00'], dtype='datetime64[ns]', freq=None)

# Standardizing dates as per the country US/ Europe
### For e.g. Jan 12, 2019 is represented in the following ways:
### USA: 01/12/2019    [MM / DD / YYYY - pandas's default]
### UK  : 12/01/2019     [DD / MM / YYYY]


In [3]:
# '12/01/2019' is written the UK way (DD/MM/YYYY).
# dayfirst=True tells pandas to read the leading 12 as the day rather than the month.
uk_date = '12/01/2019'
pd.to_datetime(uk_date, dayfirst=True)


Timestamp('2019-01-12 00:00:00')

# Custom Date format: Assume that your custom date uses # as the date delimiter

In [4]:
# Jan 12, 2019 written with '#' as the delimiter becomes 01#12#2019.
# Spell out the layout through the format argument so pandas can standardize it.
custom_date = '01#12#2019'
custom_layout = '%m#%d#%Y'                                             # month # day # 4-digit year
pd.to_datetime(custom_date, format=custom_layout)


Timestamp('2019-01-12 00:00:00')

# Standardize the dates while ignoring the error

In [5]:
# Let's say our dates come in varied formats, a few of which are absolute junk.
# By default to_datetime() throws an exception for such junk values. If we would
# rather keep going and simply leave the input as it is:
dates = ['2019-01-12', 'Jan 12, 2019', '01/12/2019', '2019.01.12', '2019/01/12', '20190112', 'APPLE']

# Standardize the dates using pandas to_datetime().
# errors='ignore' used to provide this behaviour, but it is deprecated since
# pandas 2.2 in favour of catching the exception explicitly, as done here.
try:
    result = pd.DatetimeIndex([pd.to_datetime(d) for d in dates])
except (ValueError, TypeError):
    # At least one value could not be parsed: hand back the input untouched
    result = pd.Index(dates, dtype=object).values
result

# Observe: the junk value did not raise! But the standardization of the remaining dates was skipped too...

array(['2019-01-12', 'Jan 12, 2019', '01/12/2019', '2019.01.12',
       '2019/01/12', '20190112', 'APPLE'], dtype=object)

In [6]:
# To selectively ignore the junk value while standardizing the remaining values:
dates = ['2019-01-12', 'Jan 12, 2019', '01/12/2019', '2019.01.12', '2019/01/12', '20190112', 'APPLE']

# Standardize the dates using pandas to_datetime() with errors='coerce'.
# The values are coerced one at a time: pandas 2.0 and later infer a single format
# from the first value of a list, so coercing the whole list in one call would turn
# every valid date written in a different layout into NaT as well.
standardized = pd.DatetimeIndex([pd.to_datetime(d, errors='coerce') for d in dates])
standardized

# Observe: The remaining dates are standardized while the junk value is marked as 'NaT' i.e. 'Not a Timestamp'

DatetimeIndex(['2019-01-12', '2019-01-12', '2019-01-12', '2019-01-12',
               '2019-01-12', '2019-01-12',        'NaT'],
              dtype='datetime64[ns]', freq=None)

# Handling UNIX time or epoch time
The UNIX time is the number of seconds that have passed since Jan 01, 1970 00:00:00 UTC. 
The Epoch time 1547551766 is equivalent to [01/15/2019 11:29:26]

In [14]:
t = 1547551766                                                         # Seconds elapsed since Jan 01, 1970

# Epoch -> Timestamp: a scalar input with unit='s' (seconds) gives back a single Timestamp
t_conv = pd.to_datetime(t, unit='s')
print(type(t_conv), t_conv, sep='\n')                                  # [Timestamp] type first, then the value

print()
# Epoch -> DatetimeIndex: simply supply the epoch time inside a list []
t_conv = pd.to_datetime([t], unit='s')
print(type(t_conv), t_conv, sep='\n')                                  # [DatetimeIndex] type first, then the value

print()
# DatetimeIndex -> integers: reinterpret the stored values as int64
epoch_ints = t_conv.view('int64')
print(epoch_ints)                                                      # [1547551766000000000] It is in nanoseconds, not seconds


<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2019-01-15 11:29:26

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
DatetimeIndex(['2019-01-15 11:29:26'], dtype='datetime64[ns]', freq=None)

[1547551766000000000]
