In [36]:
import os
import re
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
import pytz
from dateutil.parser import parse
from pandas.tseries.offsets import Day, MonthEnd

# binning data

In [2]:
# Sample input for fixed-edge binning: the integers 0 through 9.
binningDataArr = range(0, 10)
binningDataArr

range(0, 10)

In [3]:
# Bucket the values into three left-closed intervals: [0, 3), [3, 7), [7, 10).
binningCategory = pd.cut(
    binningDataArr,
    bins=[0, 3, 7, 10],
    right=False,
    labels=['group_0', 'group_1', 'group_2'],
)
# Number of values that landed in each bucket.
binningCategory.value_counts()

group_0    3
group_1    4
group_2    3
dtype: int64

# quantile binning data

In [4]:
# Sample input for quantile binning: the integers 0 through 9.
quantileBinningDataArr = range(0, 10)
quantileBinningDataArr

range(0, 10)

In [5]:
# Split the values into four quantile-based buckets (quartiles) with named labels.
quantileBinningCategory = pd.qcut(
    quantileBinningDataArr,
    q=4,
    labels=['group_0', 'group_1', 'group_2', 'group_3'],
)
# Number of values assigned to each quartile.
quantileBinningCategory.value_counts()

group_0    3
group_1    2
group_2    2
group_3    3
dtype: int64

# generate datetime index

In [2]:
# Seven consecutive timestamps starting on 1 January 2000 (daily by default).
dates = pd.date_range(start='1/1/2000', periods=7)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

# crosstab

In [3]:
# Ten-row example table: one row per sample with its nationality and handedness.
df = pd.DataFrame({
    'Sample': list(range(1, 11)),
    'Nationality': [
        'USA', 'Japan', 'USA', 'Japan', 'Japan',
        'Japan', 'USA', 'USA', 'Japan', 'USA',
    ],
    'Handedness': [
        'Right-handed', 'Left-handed', 'Right-handed', 'Right-handed', 'Left-handed',
        'Right-handed', 'Right-handed', 'Left-handed', 'Right-handed', 'Right-handed',
    ],
})
df

Unnamed: 0,Sample,Nationality,Handedness
0,1,USA,Right-handed
1,2,Japan,Left-handed
2,3,USA,Right-handed
3,4,Japan,Right-handed
4,5,Japan,Left-handed
5,6,Japan,Right-handed
6,7,USA,Right-handed
7,8,USA,Left-handed
8,9,Japan,Right-handed
9,10,USA,Right-handed


In [4]:
# Count samples per (nationality, handedness) pair; margins=True adds the "All" totals.
pd.crosstab(
    index=df['Nationality'],
    columns=df['Handedness'],
    margins=True,
)

Handedness,Left-handed,Right-handed,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10


In [5]:
# Locate the data directory through an environment variable so the notebook is
# not tied to one machine; the original absolute path remains the fallback, so
# behaviour is unchanged when PANDAS_PYTHON_DATA_DIR is not set.
data_dir = os.environ.get('PANDAS_PYTHON_DATA_DIR', '/home/hsiehpinghan/git/python/pandas-python/data')
tips = pd.read_csv(os.path.join(data_dir, 'tips.csv'))
# Tip expressed as a fraction of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
# Preview the first five rows.
tips[0:5]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [6]:
# Two-level row index (time, day) against smoker status, with "All" totals.
pd.crosstab(
    index=[tips['time'], tips['day']],
    columns=tips['smoker'],
    margins=True,
)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244


# datetime

In [9]:
# Capture the current local time as a naive datetime (no time zone attached).
now = datetime.now(tz=None)
now

datetime.datetime(2018, 7, 5, 5, 56, 33, 149451)

In [16]:
# str() on a datetime renders it as 'YYYY-MM-DD HH:MM:SS.ffffff' text.
str(now)

'2018-07-05 06:00:29.864645'

In [20]:
# Format `now` with an explicit pattern; this pattern omits the microseconds.
nowStr = now.strftime('%Y-%m-%d %H:%M:%S')
nowStr

'2018-07-05 06:00:29'

In [21]:
# Parse the formatted text back into a datetime using the same pattern.
datetime.strptime(nowStr, '%Y-%m-%d %H:%M:%S')

datetime.datetime(2018, 7, 5, 6, 0, 29)

In [12]:
# Subtracting two datetimes yields a timedelta; these are one second apart.
later = datetime(year=2018, month=7, day=1, hour=0, minute=0, second=1)
earlier = datetime(year=2018, month=7, day=1, hour=0, minute=0, second=0)
delta = later - earlier
delta

datetime.timedelta(0, 1)

In [26]:
# Convert a list of date-time strings into a DatetimeIndex in one call.
dateStrs = [
    '2011-07-06 12:00:00',
    '2011-08-06 00:00:00',
]
pd.to_datetime(dateStrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

# timedelta

In [14]:
# Take a fresh naive local timestamp to use as the base for timedelta arithmetic.
now = datetime.now(tz=None)
now

datetime.datetime(2018, 7, 5, 6, 0, 29, 864645)

In [15]:
# The first positional argument of timedelta is days, so this is "one day later".
now + timedelta(days=1)

datetime.datetime(2018, 7, 6, 6, 0, 29, 864645)

# date parser

In [23]:
# dateutil infers the layout of the text; no format pattern is supplied.
parse('2018-07-05 06:00:29')

datetime.datetime(2018, 7, 5, 6, 0, 29)

In [24]:
# Free-form text with a month name and a 12-hour clock is parsed as well.
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [25]:
# dayfirst=True reads the leading number as the day, so this is 6 December 2011.
parse('6/12/2011', dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

# shift date

In [28]:
# Fixed reference date for the offset examples, then shift it forward three days.
now = datetime(year=2001, month=1, day=1)
now + Day(3)

Timestamp('2001-01-04 00:00:00')

In [32]:
# Advance by two month-end anchors from `now`.
now + MonthEnd(2)

Timestamp('2001-02-28 00:00:00')

In [34]:
# Month-end offset used as the anchor for rolling dates.
offset = MonthEnd()
# Roll `now` forward to a month end.
offset.rollforward(now)

Timestamp('2001-01-31 00:00:00')

In [35]:
# Roll `now` back to the preceding month end.
offset.rollback(now)

Timestamp('2000-12-31 00:00:00')

# timezone

In [40]:
# Last five entries of pytz's list of common time-zone names.
pytz.common_timezones[-5:]

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

In [41]:
# Look up the tzinfo object for a zone by its name.
pytz.timezone('Asia/Taipei')

<DstTzInfo 'Asia/Taipei' LMT+8:06:00 STD>

In [46]:
# Ten naive daily timestamps beginning at midnight on 1 January 2018.
ts = pd.date_range(
    start='2018-1-1 00:00:00',
    periods=10,
    freq='D',
)
ts

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10'],
              dtype='datetime64[ns]', freq='D')

In [47]:
# Attach the Asia/Taipei zone to the naive timestamps (wall-clock values are kept).
ts_taipei = ts.tz_localize(tz='Asia/Taipei')
ts_taipei

DatetimeIndex(['2018-01-01 00:00:00+08:00', '2018-01-02 00:00:00+08:00',
               '2018-01-03 00:00:00+08:00', '2018-01-04 00:00:00+08:00',
               '2018-01-05 00:00:00+08:00', '2018-01-06 00:00:00+08:00',
               '2018-01-07 00:00:00+08:00', '2018-01-08 00:00:00+08:00',
               '2018-01-09 00:00:00+08:00', '2018-01-10 00:00:00+08:00'],
              dtype='datetime64[ns, Asia/Taipei]', freq='D')

In [48]:
# Re-express the Taipei-aware timestamps in UTC (same instants, different clock).
ts_utc = ts_taipei.tz_convert(tz='UTC')
ts_utc

DatetimeIndex(['2017-12-31 16:00:00+00:00', '2018-01-01 16:00:00+00:00',
               '2018-01-02 16:00:00+00:00', '2018-01-03 16:00:00+00:00',
               '2018-01-04 16:00:00+00:00', '2018-01-05 16:00:00+00:00',
               '2018-01-06 16:00:00+00:00', '2018-01-07 16:00:00+00:00',
               '2018-01-08 16:00:00+00:00', '2018-01-09 16:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

# period

In [55]:
# Annual period labelled 2018 using the 'A-JUN' frequency (annual, anchored on June).
# NOTE(review): recent pandas releases deprecate the 'A-...' annual aliases in
# favour of 'Y-...'; confirm against the pandas version this notebook targets.
p = pd.Period(2018, freq='A-JUN')
p

Period('2018', 'A-JUN')

In [56]:
# Adding an integer shifts the period by that many units of its own frequency.
p + 5

Period('2023', 'A-JUN')

In [57]:
# Difference between two periods that share the same frequency.
# NOTE(review): the recorded output is a plain integer; newer pandas may return
# an offset object instead — confirm against the installed version.
p - pd.Period(2000, freq='A-JUN')

18

In [58]:
# Convert to monthly frequency, taking the first month of the annual span.
p.asfreq('M', how='start')

Period('2017-07', 'M')

In [61]:
# First quarter of 2018 under a quarterly frequency whose year ends in December.
p = pd.Period(value='2018Q1', freq='Q-DEC')
p

Period('2018Q1', 'Q-DEC')

# period range

In [52]:
# Monthly periods spanning an explicit start date and end date.
pr = pd.period_range(
    start='2000-01-01',
    end='2000-06-30',
    freq='M',
)
pr

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')

In [67]:
# The same monthly range, defined by a start date and a count instead of an end date.
pr = pd.period_range(
    start='2000-01-01',
    periods=6,
    freq='M',
)
pr

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')

# period index

In [53]:
# Build a quarterly PeriodIndex directly from quarter labels given as strings.
values = [
    '2001Q3',
    '2002Q2',
    '2003Q1',
]
index = pd.PeriodIndex(data=values, freq='Q-DEC')
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]', freq='Q-DEC')

In [77]:
# Locate the data directory through an environment variable so the notebook is
# not tied to one machine; the original absolute path remains the fallback, so
# behaviour is unchanged when PANDAS_PYTHON_DATA_DIR is not set.
data_dir = os.environ.get('PANDAS_PYTHON_DATA_DIR', '/home/hsiehpinghan/git/python/pandas-python/data')
data = pd.read_csv(os.path.join(data_dir, 'macrodata.csv'))
# Preview the first five rows.
data[:5]

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959,1,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959,2,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959,3,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959,4,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960,1,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [78]:
# Assemble a quarterly PeriodIndex from the `year` and `quarter` columns of `data`.
# NOTE(review): the year=/quarter= keyword form is reported as deprecated in
# recent pandas in favour of PeriodIndex.from_fields — confirm before upgrading.
index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')
index

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', length=203, freq='Q-DEC')