# Pandas

# Object Creation

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Series: a one-dimensional labelled array; the np.nan entry forces a float dtype
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:
# Six consecutive daily timestamps, used below as the DataFrame's row index
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# DataFrame: a 6x4 block of standard-normal draws, indexed by `dates`, columns A-D
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.18641,-2.030437,0.60914,-0.318064
2013-01-02,0.103963,0.793337,-0.952406,-0.820519
2013-01-03,1.491392,-1.282663,1.20797,-0.832034
2013-01-04,3.063255,0.351649,-0.700836,-0.388052
2013-01-05,-0.386104,-0.150433,-1.173677,1.888422
2013-01-06,-1.925189,-0.943621,-0.174591,-0.562941


In [10]:
# A DataFrame can also be built from a dict of column-like objects;
# scalars ('A', 'B', 'F') are broadcast to the length of the longest column.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [12]:
df2.dtypes  # Each column keeps its own dtype

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [13]:
df.head()  # View the top rows of the frame (5 when no count is given)

Unnamed: 0,A,B,C,D
2013-01-01,1.18641,-2.030437,0.60914,-0.318064
2013-01-02,0.103963,0.793337,-0.952406,-0.820519
2013-01-03,1.491392,-1.282663,1.20797,-0.832034
2013-01-04,3.063255,0.351649,-0.700836,-0.388052
2013-01-05,-0.386104,-0.150433,-1.173677,1.888422


In [14]:
df.tail()  # View the bottom rows of the frame (5 when no count is given)

Unnamed: 0,A,B,C,D
2013-01-02,0.103963,0.793337,-0.952406,-0.820519
2013-01-03,1.491392,-1.282663,1.20797,-0.832034
2013-01-04,3.063255,0.351649,-0.700836,-0.388052
2013-01-05,-0.386104,-0.150433,-1.173677,1.888422
2013-01-06,-1.925189,-0.943621,-0.174591,-0.562941


In [18]:
df.index  # The row labels; df.columns and df.values expose the column labels and the underlying NumPy data

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [19]:
df.describe()  # Quick statistical summary (count, mean, std, min, quartiles, max) of each column

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.588954,-0.543695,-0.1974,-0.172198
std,1.720488,1.063989,0.93837,1.031737
min,-1.925189,-2.030437,-1.173677,-0.832034
25%,-0.263587,-1.197902,-0.889513,-0.756124
50%,0.645187,-0.547027,-0.437714,-0.475497
75%,1.415146,0.226128,0.413207,-0.335561
max,3.063255,0.793337,1.20797,1.888422


In [20]:
df.T  # Transposing the data: the dates become columns and A-D become rows

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.18641,0.103963,1.491392,3.063255,-0.386104,-1.925189
B,-2.030437,0.793337,-1.282663,0.351649,-0.150433,-0.943621
C,0.60914,-0.952406,1.20797,-0.700836,-1.173677,-0.174591
D,-0.318064,-0.820519,-0.832034,-0.388052,1.888422,-0.562941


In [21]:
df.sort_index(axis=1, ascending=False)  # Sorting by an axis: axis=1 orders the column labels, here descending (D, C, B, A)

Unnamed: 0,D,C,B,A
2013-01-01,-0.318064,0.60914,-2.030437,1.18641
2013-01-02,-0.820519,-0.952406,0.793337,0.103963
2013-01-03,-0.832034,1.20797,-1.282663,1.491392
2013-01-04,-0.388052,-0.700836,0.351649,3.063255
2013-01-05,1.888422,-1.173677,-0.150433,-0.386104
2013-01-06,-0.562941,-0.174591,-0.943621,-1.925189


In [22]:
df.sort_values(by='B')  # Sorting the rows by the values in column B (ascending)

Unnamed: 0,A,B,C,D
2013-01-01,1.18641,-2.030437,0.60914,-0.318064
2013-01-03,1.491392,-1.282663,1.20797,-0.832034
2013-01-06,-1.925189,-0.943621,-0.174591,-0.562941
2013-01-05,-0.386104,-0.150433,-1.173677,1.888422
2013-01-04,3.063255,0.351649,-0.700836,-0.388052
2013-01-02,0.103963,0.793337,-0.952406,-0.820519


# Selection: Getting with [] and by Label .loc

In [23]:
df['A']  # Selecting a single column with [] yields a Series

2013-01-01    1.186410
2013-01-02    0.103963
2013-01-03    1.491392
2013-01-04    3.063255
2013-01-05   -0.386104
2013-01-06   -1.925189
Freq: D, Name: A, dtype: float64

In [24]:
df[0:3]  # Slicing with [] selects rows by position; the end position is excluded

Unnamed: 0,A,B,C,D
2013-01-01,1.18641,-2.030437,0.60914,-0.318064
2013-01-02,0.103963,0.793337,-0.952406,-0.820519
2013-01-03,1.491392,-1.282663,1.20797,-0.832034


In [27]:
# .loc selects by label: here the row labelled with the first date, returned as a Series
df.loc[dates[0]]

A    1.186410
B   -2.030437
C    0.609140
D   -0.318064
Name: 2013-01-01 00:00:00, dtype: float64

In [28]:
df.loc[:,['A','B']]  # Selecting on both axes by label: every row, columns A and B

Unnamed: 0,A,B
2013-01-01,1.18641,-2.030437
2013-01-02,0.103963,0.793337
2013-01-03,1.491392,-1.282663
2013-01-04,3.063255,0.351649
2013-01-05,-0.386104,-0.150433
2013-01-06,-1.925189,-0.943621


In [29]:
df.loc['20130102':'20130104',['A','B']]  # Label slicing: unlike positional slices, both endpoints are included

Unnamed: 0,A,B
2013-01-02,0.103963,0.793337
2013-01-03,1.491392,-1.282663
2013-01-04,3.063255,0.351649


In [30]:
df.loc['20130102',['A','B']]  # A single row label reduces the result from a DataFrame to a Series

A    0.103963
B    0.793337
Name: 2013-01-02 00:00:00, dtype: float64

In [31]:
df.loc[dates[0],'A']  # A single row label plus a single column label gives a scalar value

1.1864103896178819

In [33]:
df.at[dates[0],'A']  # Fast scalar access by label (same value as df.loc[dates[0],'A'])

1.1864103896178819

# Selecting by Position .iloc

In [34]:
df.iloc[3]  # Selecting by integer position: the fourth row, returned as a Series

A    3.063255
B    0.351649
C   -0.700836
D   -0.388052
Name: 2013-01-04 00:00:00, dtype: float64

In [35]:
df.iloc[3:5,0:2]  # Integer slices on both axes; as with Python slices, the end position is excluded

Unnamed: 0,A,B
2013-01-04,3.063255,0.351649
2013-01-05,-0.386104,-0.150433


In [36]:
df.iloc[[1,2,4],[0,2]]  # Lists of integer positions pick specific rows and columns

Unnamed: 0,A,C
2013-01-02,0.103963,-0.952406
2013-01-03,1.491392,1.20797
2013-01-05,-0.386104,-1.173677


In [37]:
df.iloc[1:3,:]  # Slicing rows explicitly while keeping every column

Unnamed: 0,A,B,C,D
2013-01-02,0.103963,0.793337,-0.952406,-0.820519
2013-01-03,1.491392,-1.282663,1.20797,-0.832034


In [38]:
df.iloc[:,1:3]  # Slicing columns explicitly while keeping every row

Unnamed: 0,B,C
2013-01-01,-2.030437,0.60914
2013-01-02,0.793337,-0.952406
2013-01-03,-1.282663,1.20797
2013-01-04,0.351649,-0.700836
2013-01-05,-0.150433,-1.173677
2013-01-06,-0.943621,-0.174591


In [39]:
df.iloc[1,1]  # A single row position plus a single column position gives a scalar value

0.79333691212982826

In [40]:
df.iat[1,1]  # Fast scalar access by position (same value as df.iloc[1,1])

0.79333691212982826

# Boolean Indexing

In [42]:
df[df.A > 0]  # Using a single column's values to select rows: keep the rows where A is positive

Unnamed: 0,A,B,C,D
2013-01-01,1.18641,-2.030437,0.60914,-0.318064
2013-01-02,0.103963,0.793337,-0.952406,-0.820519
2013-01-03,1.491392,-1.282663,1.20797,-0.832034
2013-01-04,3.063255,0.351649,-0.700836,-0.388052


In [43]:
df[df > 0]  # Selecting values where a boolean condition is met; entries that fail it become NaN

Unnamed: 0,A,B,C,D
2013-01-01,1.18641,,0.60914,
2013-01-02,0.103963,0.793337,,
2013-01-03,1.491392,,1.20797,
2013-01-04,3.063255,0.351649,,
2013-01-05,,,,1.888422
2013-01-06,,,,


In [44]:
# Filtering with isin(): keep the rows whose label in E is one of the listed values

# assign() returns a new frame, so df itself is left untouched
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])

df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.491392,-1.282663,1.20797,-0.832034,two
2013-01-05,-0.386104,-0.150433,-1.173677,1.888422,four


# Missing Data - N/A

In [46]:
# Reindexing returns a copy whose index/columns are changed, extended or trimmed on the given axes
df1 = df.reindex(index=dates[:4], columns=df.columns.tolist() + ['E'])

# Only the first two dates receive a value, so the rest of the new column E stays NaN
df1.loc[dates[0]:dates[1], 'E'] = 1

df1

Unnamed: 0,A,B,C,D,E
2013-01-01,1.18641,-2.030437,0.60914,-0.318064,1.0
2013-01-02,0.103963,0.793337,-0.952406,-0.820519,1.0
2013-01-03,1.491392,-1.282663,1.20797,-0.832034,
2013-01-04,3.063255,0.351649,-0.700836,-0.388052,


In [47]:
df1.dropna(how='any')  # Drop every row that has at least one missing value

Unnamed: 0,A,B,C,D,E
2013-01-01,1.18641,-2.030437,0.60914,-0.318064,1.0
2013-01-02,0.103963,0.793337,-0.952406,-0.820519,1.0


In [48]:
df1.fillna(value=5)  # Fill the missing entries with 5

Unnamed: 0,A,B,C,D,E
2013-01-01,1.18641,-2.030437,0.60914,-0.318064,1.0
2013-01-02,0.103963,0.793337,-0.952406,-0.820519,1.0
2013-01-03,1.491392,-1.282663,1.20797,-0.832034,5.0
2013-01-04,3.063255,0.351649,-0.700836,-0.388052,5.0


In [49]:
df1.isnull()  # Boolean mask that is True wherever a value is NaN

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


# Apply

In [50]:
# Applying functions to the data: apply() calls the function on each column,
# so this is the running total down every column
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,1.18641,-2.030437,0.60914,-0.318064
2013-01-02,1.290373,-1.2371,-0.343266,-1.138582
2013-01-03,2.781765,-2.519763,0.864704,-1.970616
2013-01-04,5.84502,-2.168114,0.163868,-2.358669
2013-01-05,5.458916,-2.318548,-1.00981,-0.470247
2013-01-06,3.533727,-3.262168,-1.184401,-1.033188


In [51]:
df.apply(lambda x: x.max() - x.min())  # Spread (max minus min) of each column

A    4.988445
B    2.823774
C    2.381647
D    2.720456
dtype: float64

# String Method

In [52]:
# The .str accessor applies string methods element-wise; the NaN entry stays NaN
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
)

s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# Concat

In [54]:
# Ten rows of random data with default integer row and column labels
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-1.8311,-1.698392,0.428465,-1.402597
1,1.013522,0.409826,-0.016556,-0.77826
2,0.466544,0.863051,-0.446702,1.120851
3,-0.187634,-1.060949,1.644218,0.193328
4,-1.294702,-0.401302,-0.541305,0.873059
5,-1.557999,-0.668212,-0.936609,0.089707
6,0.286813,0.339061,-1.220601,0.320429
7,0.789188,-0.477588,-0.957007,-0.033991
8,-0.173521,0.302503,0.841213,0.487697
9,-0.753279,-0.142951,-0.087861,0.346232


In [57]:
# Break the frame into three consecutive row blocks (rows 0-2, 3-6 and 7-9)
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces

[          0         1         2         3
 0 -1.831100 -1.698392  0.428465 -1.402597
 1  1.013522  0.409826 -0.016556 -0.778260
 2  0.466544  0.863051 -0.446702  1.120851,
           0         1         2         3
 3 -0.187634 -1.060949  1.644218  0.193328
 4 -1.294702 -0.401302 -0.541305  0.873059
 5 -1.557999 -0.668212 -0.936609  0.089707
 6  0.286813  0.339061 -1.220601  0.320429,
           0         1         2         3
 7  0.789188 -0.477588 -0.957007 -0.033991
 8 -0.173521  0.302503  0.841213  0.487697
 9 -0.753279 -0.142951 -0.087861  0.346232]

In [59]:
pd.concat(pieces)  # Concatenate the pieces row-wise, which rebuilds the original frame

Unnamed: 0,0,1,2,3
0,-1.8311,-1.698392,0.428465,-1.402597
1,1.013522,0.409826,-0.016556,-0.77826
2,0.466544,0.863051,-0.446702,1.120851
3,-0.187634,-1.060949,1.644218,0.193328
4,-1.294702,-0.401302,-0.541305,0.873059
5,-1.557999,-0.668212,-0.936609,0.089707
6,0.286813,0.339061,-1.220601,0.320429
7,0.789188,-0.477588,-0.957007,-0.033991
8,-0.173521,0.302503,0.841213,0.487697
9,-0.753279,-0.142951,-0.087861,0.346232


# Append

In [60]:
# Eight rows of random data under the column labels A-D
df = pd.DataFrame(data=np.random.randn(8, 4), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,-2.541782,-1.799927,0.769105,1.185652
1,-0.444851,0.736044,-1.160132,0.475277
2,0.280573,-0.186498,0.839501,0.382112
3,-1.315426,-1.753241,0.779684,-0.215327
4,0.482174,1.814365,-0.34913,-0.936462
5,0.01478,-0.578703,-1.149891,-1.271338
6,-0.089566,-1.615522,0.638443,0.419746
7,-0.172184,1.952367,-1.801034,0.116279


In [62]:
# Append a copy of row 3 to the end of the frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# (a Series) is turned into a one-row frame and joined with pd.concat instead.
s = df.iloc[3]

# ignore_index=True renumbers the result 0..n, so the added row gets the next label
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,-2.541782,-1.799927,0.769105,1.185652
1,-0.444851,0.736044,-1.160132,0.475277
2,0.280573,-0.186498,0.839501,0.382112
3,-1.315426,-1.753241,0.779684,-0.215327
4,0.482174,1.814365,-0.34913,-0.936462
5,0.01478,-0.578703,-1.149891,-1.271338
6,-0.089566,-1.615522,0.638443,0.419746
7,-0.172184,1.952367,-1.801034,0.116279
8,-1.315426,-1.753241,0.779684,-0.215327


# Merge

pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False)

left: A DataFrame object

right: Another DataFrame object

on: Columns (names) to join on. Must be found in both the left and right DataFrame objects. If not passed and left_index and right_index are False, the intersection of the columns in the DataFrames will be inferred to be the join keys

left_on: Columns from the left DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame

right_on: Columns from the right DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame

left_index: If True, use the index (row labels) from the left DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys from the right DataFrame

right_index: Same usage as left_index for the right DataFrame

how: One of 'left', 'right', 'outer', 'inner'. Defaults to inner. See below for more detailed description of each method

sort: Sort the result DataFrame by the join keys in lexicographical order. Defaults to True, setting to False will improve performance substantially in many cases

suffixes: A tuple of string suffixes to apply to overlapping columns. Defaults to ('_x', '_y').

copy: Always copy data (default True) from the passed DataFrame objects, even when reindexing is not necessary. Cannot be avoided in many cases but may improve performance / memory usage. The cases where copying can be avoided are somewhat pathological but this option is provided nonetheless.

indicator: Add a column to the output DataFrame called _merge with information on the source of each row. _merge is Categorical-type and takes on a value of left_only for observations whose merge key only appears in 'left' DataFrame, right_only for observations whose merge key only appears in 'right' DataFrame, and both if the observation’s merge key is found in both.

In [63]:
# Merge method   SQL Join Name      Description
# left           LEFT OUTER JOIN    Use keys from left frame only
# right          RIGHT OUTER JOIN   Use keys from right frame only
# outer          FULL OUTER JOIN    Use union of keys from both frames
# inner          INNER JOIN         Use intersection of keys from both frames

In [68]:
df1  # Left input of the merge: the four-row frame whose column E is only partly filled

Unnamed: 0,A,B,C,D,E
2013-01-01,1.18641,-2.030437,0.60914,-0.318064,1.0
2013-01-02,0.103963,0.793337,-0.952406,-0.820519,1.0
2013-01-03,1.491392,-1.282663,1.20797,-0.832034,
2013-01-04,3.063255,0.351649,-0.700836,-0.388052,


In [69]:
df2  # Right input of the merge: the six-row frame with the text labels in column E

Unnamed: 0,A,B,C,D,E
2013-01-01,1.18641,-2.030437,0.60914,-0.318064,one
2013-01-02,0.103963,0.793337,-0.952406,-0.820519,one
2013-01-03,1.491392,-1.282663,1.20797,-0.832034,two
2013-01-04,3.063255,0.351649,-0.700836,-0.388052,three
2013-01-05,-0.386104,-0.150433,-1.173677,1.888422,four
2013-01-06,-1.925189,-0.943621,-0.174591,-0.562941,three


In [67]:
# Outer join on column A; the overlapping columns B-E get the _x/_y suffixes and
# 'indicator_column' records whether each key came from both frames or only one
df1.merge(df2, how='outer', on='A', indicator='indicator_column')

Unnamed: 0,A,B_x,C_x,D_x,E_x,B_y,C_y,D_y,E_y,indicator_column
0,1.18641,-2.030437,0.60914,-0.318064,1.0,-2.030437,0.60914,-0.318064,one,both
1,0.103963,0.793337,-0.952406,-0.820519,1.0,0.793337,-0.952406,-0.820519,one,both
2,1.491392,-1.282663,1.20797,-0.832034,,-1.282663,1.20797,-0.832034,two,both
3,3.063255,0.351649,-0.700836,-0.388052,,0.351649,-0.700836,-0.388052,three,both
4,-0.386104,,,,,-0.150433,-1.173677,1.888422,four,right_only
5,-1.925189,,,,,-0.943621,-0.174591,-0.562941,three,right_only


# Grouping

By "group by" we mean a process involving one or more of the following steps:

- Splitting the data into groups based on some criteria
- Applying a function to each group independently
- Combining the results into a data structure

In [70]:
# Two label columns (A, B) to group on and two random numeric columns (C, D) to aggregate
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.448879,1.655506
1,bar,one,0.9691,-0.420388
2,foo,two,-1.666621,-1.018367
3,bar,three,-1.813887,-1.083505
4,foo,two,0.005703,1.692501
5,bar,two,0.894096,2.025275
6,foo,one,0.375461,-0.883267
7,foo,three,0.172936,-0.692799


In [71]:
# Grouping and then applying sum to the resulting groups.
# The numeric columns C and D are selected explicitly: how the string column B is
# treated by sum() depends on the pandas version (dropped in older releases,
# concatenated in newer ones), so selecting keeps the result stable.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.049309,0.521382
foo,-1.561401,0.753575


In [72]:
df.groupby(['A','B']).sum()  # Grouping by multiple columns forms a hierarchical index; sum is then applied to each (A, B) group

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.9691,-0.420388
bar,three,-1.813887,-1.083505
bar,two,0.894096,2.025275
foo,one,-0.073418,0.772239
foo,three,0.172936,-0.692799
foo,two,-1.660918,0.674134


# Stack

In [73]:
df2  # The dated frame with the label column E, shown again before stacking

Unnamed: 0,A,B,C,D,E
2013-01-01,1.18641,-2.030437,0.60914,-0.318064,one
2013-01-02,0.103963,0.793337,-0.952406,-0.820519,one
2013-01-03,1.491392,-1.282663,1.20797,-0.832034,two
2013-01-04,3.063255,0.351649,-0.700836,-0.388052,three
2013-01-05,-0.386104,-0.150433,-1.173677,1.888422,four
2013-01-06,-1.925189,-0.943621,-0.174591,-0.562941,three


In [78]:
stacked = df2.stack()  # The stack() method "compresses" a level in the DataFrame's columns into the row index

stacked

2013-01-01  A     1.18641
            B    -2.03044
            C     0.60914
            D   -0.318064
            E         one
2013-01-02  A    0.103963
            B    0.793337
            C   -0.952406
            D   -0.820519
            E         one
2013-01-03  A     1.49139
            B    -1.28266
            C     1.20797
            D   -0.832034
            E         two
2013-01-04  A     3.06326
            B    0.351649
            C   -0.700836
            D   -0.388052
            E       three
2013-01-05  A   -0.386104
            B   -0.150433
            C    -1.17368
            D     1.88842
            E        four
2013-01-06  A    -1.92519
            B   -0.943621
            C   -0.174591
            D   -0.562941
            E       three
dtype: object

In [80]:
stacked.unstack(0)  # Move index level 0 (the dates) into the columns

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.18641,0.103963,1.49139,3.06326,-0.386104,-1.92519
B,-2.03044,0.793337,-1.28266,0.351649,-0.150433,-0.943621
C,0.60914,-0.952406,1.20797,-0.700836,-1.17368,-0.174591
D,-0.318064,-0.820519,-0.832034,-0.388052,1.88842,-0.562941
E,one,one,two,three,four,three


In [81]:
stacked.unstack(1)  # Move index level 1 (the labels A-E) back into the columns

Unnamed: 0,A,B,C,D,E
2013-01-01,1.18641,-2.03044,0.60914,-0.318064,one
2013-01-02,0.103963,0.793337,-0.952406,-0.820519,one
2013-01-03,1.49139,-1.28266,1.20797,-0.832034,two
2013-01-04,3.06326,0.351649,-0.700836,-0.388052,three
2013-01-05,-0.386104,-0.150433,-1.17368,1.88842,four
2013-01-06,-1.92519,-0.943621,-0.174591,-0.562941,three


# Pivot Table

In [83]:
df  # The foo/bar frame built in the Grouping section

Unnamed: 0,A,B,C,D
0,foo,one,-0.448879,1.655506
1,bar,one,0.9691,-0.420388
2,foo,two,-1.666621,-1.018367
3,bar,three,-1.813887,-1.083505
4,foo,two,0.005703,1.692501
5,bar,two,0.894096,2.025275
6,foo,one,0.375461,-0.883267
7,foo,three,0.172936,-0.692799


In [87]:
# Pivot C over the (A, B) row groups with one output column per distinct value of D.
# D holds continuous random floats, so every value gets its own column and the
# resulting table is almost entirely NaN.
df.pivot_table(values='C', index=['A', 'B'], columns=['D'])

Unnamed: 0_level_0,D,-1.08350537184,-1.01836651467,-0.883266699043,-0.692798903248,-0.420388077554,1.65550593708,1.69250098123,2.02527509715
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bar,one,,,,,0.9691,,,
bar,three,-1.813887,,,,,,,
bar,two,,,,,,,,0.894096
foo,one,,,0.375461,,,-0.448879,,
foo,three,,,,0.172936,,,,
foo,two,,-1.666621,,,,,0.005703,


# Time Series

pandas has simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications.

In [88]:
# 100 one-second samples, summed into 5-minute bins.
# The lowercase aliases 's' and 'min' are used because the uppercase 'S' (and 'T')
# frequency aliases are deprecated since pandas 2.2.
rng = pd.date_range('1/1/2012', periods=100, freq='s')

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

# All 100 seconds fall inside the first 5-minute bin, so a single total comes back
ts.resample('5min').sum()

2012-01-01    23966
Freq: 5T, dtype: int32

In [90]:
# Time zone representation: start from a daily series that has no time zone attached
rng = pd.date_range(start='3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(data=np.random.randn(len(rng)), index=rng)
ts

2012-03-06   -0.835197
2012-03-07   -1.286767
2012-03-08   -0.645936
2012-03-09    0.642472
2012-03-10    1.445010
Freq: D, dtype: float64

In [92]:
ts_utc = ts.tz_localize('UTC')  # Attach the UTC time zone to the naive timestamps

ts_utc

2012-03-06 00:00:00+00:00   -0.835197
2012-03-07 00:00:00+00:00   -1.286767
2012-03-08 00:00:00+00:00   -0.645936
2012-03-09 00:00:00+00:00    0.642472
2012-03-10 00:00:00+00:00    1.445010
Freq: D, dtype: float64

In [93]:
# Convert to another time zone: the same instants expressed in US/Eastern local time
ts_utc.tz_convert('US/Eastern')

2012-03-05 19:00:00-05:00   -0.835197
2012-03-06 19:00:00-05:00   -1.286767
2012-03-07 19:00:00-05:00   -0.645936
2012-03-08 19:00:00-05:00    0.642472
2012-03-09 19:00:00-05:00    1.445010
Freq: D, dtype: float64

In [94]:
# Converting between time span representations
# Month-end stamps are requested with an offset object rather than a string alias:
# 'M' is deprecated for date_range since pandas 2.2, and its replacement 'ME' is
# not recognised by older versions.
rng = pd.date_range('1/1/2012', periods=5, freq=pd.offsets.MonthEnd())

ts = pd.Series(np.random.randn(len(rng)), index=rng)

ts

2012-01-31    0.155968
2012-02-29   -1.622862
2012-03-31    1.194893
2012-04-30   -1.450909
2012-05-31    0.106971
Freq: M, dtype: float64

In [95]:
ps = ts.to_period()  # Month-end timestamps become monthly periods

ps

2012-01    0.155968
2012-02   -1.622862
2012-03    1.194893
2012-04   -1.450909
2012-05    0.106971
Freq: M, dtype: float64

In [96]:
ps.to_timestamp()  # Back to timestamps, placed at the start of each period

2012-01-01    0.155968
2012-02-01   -1.622862
2012-03-01    1.194893
2012-04-01   -1.450909
2012-05-01    0.106971
Freq: MS, dtype: float64

In [97]:
# Converting between period and timestamp enables some convenient arithmetic.
# Here a quarterly frequency with the year ending in November is converted to
# 9am on the first day of the month following each quarter end.
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')

ts = pd.Series(np.random.randn(len(prng)), prng)

# Step by step: last month of the quarter -> next month -> its first hour -> plus 9 hours.
# Lowercase 'h' is used because the 'H' alias is deprecated since pandas 2.2.
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('h', 's') + 9

ts.head()

1990-03-01 09:00    0.661906
1990-06-01 09:00    0.219377
1990-09-01 09:00   -1.614897
1990-12-01 09:00    1.154671
1991-03-01 09:00   -0.207701
Freq: H, dtype: float64

# Categoricals

In [99]:
# Six students, each with an id and a raw letter grade
df = pd.DataFrame({
    "id": list(range(1, 7)),
    "raw_grade": list("abbaae"),
})
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [100]:
# Convert the raw grades to a categorical data type; the categories are taken from the distinct values present

df["grade"] = df["raw_grade"].astype("category")

df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

In [102]:
# Rename the categories to more meaningful names.
# Assigning to .cat.categories is no longer possible as of pandas 2.0;
# rename_categories returns a new Series, which is written back to the column.
# The new names are matched to the existing categories by position (a, b, e).
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])

df["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]

In [112]:
df.sort_values(by="grade")  # Sorting is per order in the categories, not lexical order

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
3,4,a,very good
4,5,a,very good
1,2,b,good
2,3,b,good
5,6,e,very bad


In [113]:
# Grouping by a categorical column reports every category, including empty ones
# (none are empty here). observed=False pins that behaviour explicitly, because
# newer pandas versions warn that the default is changing to observed=True.
df.groupby("grade", observed=False).size()

grade
very good    3
good         2
very bad     1
dtype: int64

# Getting Data In/Out

In [103]:
# CSV

df.to_csv('foo.csv')  # Writing to a CSV file; the row index is written as an unnamed first column

In [104]:
pd.read_csv('foo.csv')  # Reading it back: the saved index returns as an ordinary 'Unnamed: 0' column

Unnamed: 0.1,Unnamed: 0,id,raw_grade,grade
0,0,1,a,very good
1,1,2,b,good
2,2,3,b,good
3,3,4,a,very good
4,4,5,a,very good
5,5,6,e,very bad


In [105]:
df.to_excel('foo.xlsx', sheet_name='Sheet1')  # Writing to an Excel file

In [106]:
# Reading the sheet back; cells containing the text 'NA' are treated as missing values
pd.read_excel('foo.xlsx', sheet_name='Sheet1', index_col=None, na_values=['NA'])

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad
