In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# The single missing value (NaN) makes pandas store the whole Series as float64.
s = pd.Series(data=[1, 3, 4, np.nan, 6, 8])

In [3]:
s

0     1
1     3
2     4
3   NaN
4     6
5     8
dtype: float64

Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:

In [4]:
# Six consecutive calendar days beginning 2013-01-01 (daily frequency is the default).
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# 6 rows x 4 columns of standard-normal draws, indexed by `dates`.
# No random seed is set, so the values differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

In [10]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.868362,0.156558,0.263619,0.041226
2013-01-02,-0.48003,-0.229957,0.003774,-0.84304
2013-01-03,-1.730713,-1.775864,-0.417897,-0.90608
2013-01-04,1.151656,0.600024,1.286908,0.470892
2013-01-05,1.481628,-0.868354,-0.058743,0.298252
2013-01-06,0.899309,-0.365555,-1.714416,-1.175684


Creating a DataFrame by passing a dict of objects that can each be converted to a series-like structure.

In [16]:
# Build a frame from a dict that mixes scalars and array-likes: the scalar
# entries ('A', 'B', 'F') are repeated to match the four-element columns.
df2 = pd.DataFrame({
    'A': 1,  # integer scalar -> int64 column
    'B': pd.Timestamp('20130101'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(['test', 'train'] * 2),
    'F': 'foo',  # string scalar -> object column
})

In [17]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-01,1,3,test,foo
1,1,2013-01-01,1,3,train,foo
2,1,2013-01-01,1,3,test,foo
3,1,2013-01-01,1,3,train,foo


In [18]:
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

Viewing the top and bottom rows of the frame

In [20]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.868362,0.156558,0.263619,0.041226
2013-01-02,-0.48003,-0.229957,0.003774,-0.84304
2013-01-03,-1.730713,-1.775864,-0.417897,-0.90608
2013-01-04,1.151656,0.600024,1.286908,0.470892
2013-01-05,1.481628,-0.868354,-0.058743,0.298252


In [21]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,1.151656,0.600024,1.286908,0.470892
2013-01-05,1.481628,-0.868354,-0.058743,0.298252
2013-01-06,0.899309,-0.365555,-1.714416,-1.175684


Display the index, columns, and the underlying numpy data

In [22]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [23]:
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [29]:
df.values

array([[ 0.86836213,  0.15655762,  0.26361913,  0.04122574],
       [-0.48002976, -0.22995721,  0.00377381, -0.84303953],
       [-1.73071298, -1.77586369, -0.4178968 , -0.90608008],
       [ 1.15165613,  0.60002405,  1.2869078 ,  0.47089231],
       [ 1.48162783, -0.86835384, -0.05874288,  0.29825207],
       [ 0.89930881, -0.36555453, -1.7144156 , -1.1756839 ]])

In [31]:
type(df.values)

numpy.ndarray

In [27]:
# Month-based frequency instead of daily: each entry is the last calendar day
# of a month. The MonthEnd offset object is passed rather than the 'M' string
# alias, which newer pandas releases deprecate in favour of 'ME'.
# Note: despite its name, df4 is a DatetimeIndex, not a DataFrame.
df4 = pd.date_range('20140101', freq=pd.offsets.MonthEnd(), periods=5)

In [28]:
df4

DatetimeIndex(['2014-01-31', '2014-02-28', '2014-03-31', '2014-04-30',
               '2014-05-31'],
              dtype='datetime64[ns]', freq='M')

`describe()` shows a quick statistical summary of your data

In [32]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.365035,-0.413858,-0.106126,-0.352406
std,1.225613,0.830796,0.977279,0.704441
min,-1.730713,-1.775864,-1.714416,-1.175684
25%,-0.142932,-0.742654,-0.328108,-0.89032
50%,0.883835,-0.297756,-0.027485,-0.400907
75%,1.088569,0.059929,0.198658,0.233995
max,1.481628,0.600024,1.286908,0.470892


Transposing your data

In [33]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.868362,-0.48003,-1.730713,1.151656,1.481628,0.899309
B,0.156558,-0.229957,-1.775864,0.600024,-0.868354,-0.365555
C,0.263619,0.003774,-0.417897,1.286908,-0.058743,-1.714416
D,0.041226,-0.84304,-0.90608,0.470892,0.298252,-1.175684


Sorting by an axis

In [35]:
df.sort_index(axis=1, ascending=False) # orders the column labels in descending order ('D', 'C', 'B', 'A'); the cell values play no part in the sort

Unnamed: 0,D,C,B,A
2013-01-01,0.041226,0.263619,0.156558,0.868362
2013-01-02,-0.84304,0.003774,-0.229957,-0.48003
2013-01-03,-0.90608,-0.417897,-1.775864,-1.730713
2013-01-04,0.470892,1.286908,0.600024,1.151656
2013-01-05,0.298252,-0.058743,-0.868354,1.481628
2013-01-06,-1.175684,-1.714416,-0.365555,0.899309


Sorting by values

In [39]:
df.sort_values(by='B') # rows are reordered by the values in column 'B'; each row moves as a unit, keeping its index label and its other columns

Unnamed: 0,A,B,C,D
2013-01-03,-1.730713,-1.775864,-0.417897,-0.90608
2013-01-05,1.481628,-0.868354,-0.058743,0.298252
2013-01-06,0.899309,-0.365555,-1.714416,-1.175684
2013-01-02,-0.48003,-0.229957,0.003774,-0.84304
2013-01-01,0.868362,0.156558,0.263619,0.041226
2013-01-04,1.151656,0.600024,1.286908,0.470892


Selecting a single column, which yields a Series, equivalent to df.A

In [40]:
df['A']

2013-01-01    0.868362
2013-01-02   -0.480030
2013-01-03   -1.730713
2013-01-04    1.151656
2013-01-05    1.481628
2013-01-06    0.899309
Freq: D, Name: A, dtype: float64

In [41]:
df.A

2013-01-01    0.868362
2013-01-02   -0.480030
2013-01-03   -1.730713
2013-01-04    1.151656
2013-01-05    1.481628
2013-01-06    0.899309
Freq: D, Name: A, dtype: float64

Selecting via [], which slices the rows.

In [42]:
df[0:3] # positional row slice: the end position (3) is excluded, so rows 0, 1 and 2 are returned

Unnamed: 0,A,B,C,D
2013-01-01,0.868362,0.156558,0.263619,0.041226
2013-01-02,-0.48003,-0.229957,0.003774,-0.84304
2013-01-03,-1.730713,-1.775864,-0.417897,-0.90608


In [43]:
df['20130102':'20130104'] # label-based row slice: both endpoints are included, so the 2013-01-04 row is returned

Unnamed: 0,A,B,C,D
2013-01-02,-0.48003,-0.229957,0.003774,-0.84304
2013-01-03,-1.730713,-1.775864,-0.417897,-0.90608
2013-01-04,1.151656,0.600024,1.286908,0.470892


#### Selection by label

For getting a cross section using a label