In [2]:
import pandas as pd
import numpy as np

In [11]:
data = pd.read_fwf("http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii", parse_dates = [[0, 1]], infer_datetime_format = True, header = None,)

In [12]:
data.head()

Unnamed: 0,0_1,2
0,1950-01-01,-0.06031
1,1950-02-01,0.62681
2,1950-03-01,-0.008128
3,1950-04-01,0.5551
4,1950-05-01,0.071577


In [13]:
data.columns = ['month', 'value']
data.index = data.month
data = data.drop('month', 1)

In [14]:
data.head()

Unnamed: 0_level_0,value
month,Unnamed: 1_level_1
1950-01-01,-0.06031
1950-02-01,0.62681
1950-03-01,-0.008128
1950-04-01,0.5551
1950-05-01,0.071577


In [15]:
data['1950':'1952'] #What do you notice about the range of dates?

Unnamed: 0_level_0,value
month,Unnamed: 1_level_1
1950-01-01,-0.06031
1950-02-01,0.62681
1950-03-01,-0.008128
1950-04-01,0.5551
1950-05-01,0.071577
1950-06-01,0.53857
1950-07-01,-0.80248
1950-08-01,-0.85101
1950-09-01,0.35797
1950-10-01,-0.3789


In [16]:
# What is the empirical range of dates?
print(min(data.index), max(data.index))

1950-01-01 00:00:00 2018-01-01 00:00:00


In [None]:
# How to visualize?

In [17]:
# What kind of index do we have?
type(data.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [18]:
data['1951-11-11':'1951-11-12']

Unnamed: 0_level_0,value
month,Unnamed: 1_level_1


In [19]:
# What if we want a period index?
data_pd = data.to_period()

In [20]:
data_pd['1951-11-11':'1951-11-12']

Unnamed: 0_level_0,value
month,Unnamed: 1_level_1
1951-11,-0.068519


In [21]:
data_pd['1951-11-11':'1952-01-12']

Unnamed: 0_level_0,value
month,Unnamed: 1_level_1
1951-11,-0.068519
1951-12,1.9872
1952-01,0.36825


In [22]:
# Which is more appropriate for this data?
#period is appropriate for this data since we are dealing with monthly average

In [3]:
import timeit

print("infer_datetime_format = True, no date parser")
%timeit pd.read_fwf("http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii", parse_dates = [[0, 1]], infer_datetime_format = True, header = None,)

infer_datetime_format = True, no date parser
197 ms ± 20.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
# How do various data loads perform?
#infer_datetime_format, no date parser is fast (unless we have something to use)
#connection reset by peer (error) when using python 3. Changed to python 2.
import timeit

print("infer_datetime_format = True, no date parser")
%timeit pd.read_fwf("http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii", parse_dates = [[0, 1]], infer_datetime_format = True, header = None,)

print("infer_datetime_format = False, no date parser")
%timeit pd.read_fwf("http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii", parse_dates = [[0, 1]], infer_datetime_format = False, header = None,)

print("infer_datetime_format = True, date parser provided")
dateparse = lambda x, y: pd.datetime.strptime('%s-%s'%(x,y), '%Y-%m')
%timeit pd.read_fwf("http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii", parse_dates = [[0, 1]], infer_datetime_format = True, date_parser = dateparse,  header = None,)

print("infer_datetime_format = False, date parser provided")
dateparse = lambda x, y: pd.datetime.strptime('%s-%s'%(x,y), '%Y-%m')
%timeit pd.read_fwf("http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii", parse_dates = [[0, 1]], infer_datetime_format = False, date_parser = dateparse,  header = None,)

infer_datetime_format = True, no date parser
485 ms ± 177 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
infer_datetime_format = False, no date parser
312 ms ± 76.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
infer_datetime_format = True, date parser provided
316 ms ± 100 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
infer_datetime_format = False, date parser provided
The slowest run took 9.11 times longer than the fastest. This could mean that an intermediate result is being cached.
462 ms ± 466 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# What if you already have the data frame and want to parse columns?
df = pd.DataFrame({'year': [2015, 2016],'month': [2, 3],'day': [4, 5],'hour': [2, 3]})
df

Unnamed: 0,day,hour,month,year
0,4,2,2,2015
1,5,3,3,2016


In [6]:
pd.to_datetime(df)

0   2015-02-04 02:00:00
1   2016-03-05 03:00:00
dtype: datetime64[ns]

In [7]:
pd.to_datetime(df[['year', 'month', 'day']])

0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]

In [9]:
# Does it work with other column names?
df = pd.DataFrame({'yr': [2015, 2016],'mon': [2, 3],'d': [4, 5],'hr': [2, 3]})
df

Unnamed: 0,d,hr,mon,yr
0,4,2,2,2015
1,5,3,3,2016


In [29]:
# Go get your own time series data, load it in, and see what you can see
# Hint: http://pandas.pydata.org/pandas-docs/stable/remote_data.html
import pandas_datareader as web

import datetime

start = datetime.datetime(2010, 1, 1)

end = datetime.datetime(2013, 1, 27)

gdp=web.DataReader("GDP", "fred", start, end)

In [30]:
inflation = web.DataReader(["CPIAUCSL", "CPILFESL"], "fred", start, end)
inflation.head()

Unnamed: 0_level_0,CPIAUCSL,CPILFESL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,217.488,220.633
2010-02-01,217.281,220.731
2010-03-01,217.353,220.783
2010-04-01,217.403,220.822
2010-05-01,217.29,220.962


In [None]:
# 1. plot 2. get range of dates 3. convert between time and period index

In [32]:
#range of dates
print(min(inflation.index), max(inflation.index))

2010-01-01 00:00:00 2013-01-01 00:00:00


In [35]:
type(inflation.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [33]:
inflation_pd = inflation.to_period()

In [36]:
type(inflation_pd.index)

pandas.core.indexes.period.PeriodIndex

In [12]:
# Let's experiment with truncate convenience function
ts = pd.Series(range(10), index = pd.date_range('7/31/2015', freq = 'M', periods = 10))
ts

2015-07-31    0
2015-08-31    1
2015-09-30    2
2015-10-31    3
2015-11-30    4
2015-12-31    5
2016-01-31    6
2016-02-29    7
2016-03-31    8
2016-04-30    9
Freq: M, dtype: int64

In [13]:
ts.truncate(before='10/31/2015', after='12/31/2015')

2015-10-31    3
2015-11-30    4
2015-12-31    5
Freq: M, dtype: int64

In [14]:
# You can truncate in a way that breaks frequency
ts[[0, 2, 6]].index

DatetimeIndex(['2015-07-31', '2015-09-30', '2016-01-31'], dtype='datetime64[ns]', freq=None)

In [15]:
# It will save you when it can
ts.ix[0:10:2].index

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


DatetimeIndex(['2015-07-31', '2015-09-30', '2015-11-30', '2016-01-31',
               '2016-03-31'],
              dtype='datetime64[ns]', freq='2M')