http://pandas.pydata.org/pandas-docs/stable/dsintro.html

In [1]:
import numpy as np
import pandas as pd

In [5]:
# Series are one dimensional arrays that can hold any data type.
data = {'label1': 1, 'label2': 2, 'label3': 3}
index = ['Label 1', 'Label 2', 'Label 3']
s = pd.Series(data, index=index)
print(s)

Label 1   NaN
Label 2   NaN
Label 3   NaN
dtype: float64


In [6]:
# Populating a Series from an ndarray
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)

a    1.643688
b    1.388095
c    1.323787
d   -0.065111
e    0.038684
dtype: float64


In [7]:
print(s.index)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


In [8]:
pd.Series(np.random.randn(5))

0    0.414331
1    0.606476
2   -0.581734
3   -0.405868
4   -1.650096
dtype: float64

In [9]:
# Populating a series from a dictionary
d = {'a': 0., 'b': 1., 'c': 2.}
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [10]:
# Here d is not in the Series, so it's value is NaN
pd.Series(d, index=['b', 'c', 'd', 'a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [11]:
# Populating a Series from one number
pd.Series(5., index=['a', 'b', 'c'])

a    5.0
b    5.0
c    5.0
dtype: float64

In [15]:
# Series can be treated like an ndarray
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print("First Value:")
print(s[0])
print("First 3 Values")
print(s[:3])
print("Values > median value")
print(s[s > s.median()])
print("Specified indices:")
print(s[[4,3,1]])
print("Exponential on all values:")
print(np.exp(s))

First Value:
-0.922673878915
First 3 Values
a   -0.922674
b   -0.203192
c    0.728118
dtype: float64
Values > median value
c    0.728118
d    0.511339
dtype: float64
Specified indices:
e   -1.139792
d    0.511339
b   -0.203192
dtype: float64
Exponential on all values:
a    0.397455
b    0.816122
c    2.071178
d    1.667523
e    0.319886
dtype: float64


In [16]:
# Series are also similar to a dictionary
print('Index of a:')
print(s['a'])
print("index of e:")
print(s['e'])
print('e' in s)
print('f' in s)

Index of a:
-0.922673878915
index of e:
-1.13979159595
True
False


In [17]:
# Looping through Series isn't usually necessary
# Here we add each element to itself
s + s

a   -1.845348
b   -0.406384
c    1.456235
d    1.022679
e   -2.279583
dtype: float64

In [18]:
# Multiplying each element by 2 (same as above)
s * 2

a   -1.845348
b   -0.406384
c    1.456235
d    1.022679
e   -2.279583
dtype: float64

In [19]:
# Calculating the exponential of each item
np.exp(s)

a    0.397455
b    0.816122
c    2.071178
d    1.667523
e    0.319886
dtype: float64

In [20]:
# A Series can also be given a name for later references
s = pd.Series(np.random.randn(5), name='something')
print(s)
print(s.name)

0   -1.828762
1    0.260855
2   -0.713340
3    1.011754
4   -0.703448
Name: something, dtype: float64
something


In [21]:
# DataFrames are 2-dimensional strutures, similar to a SQL table

# Dictionary of Series to DataFrame
d = {'one': pd.Series([1., 2., 3.], index=['a','b','c']),
    'two': pd.Series([1., 2., 3., 4.], index=['a','b','c','d'])}

df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [22]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [23]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [25]:
print('DataFrame indices:')
print(df.index)
print('DataFrame columns:')
print(df.columns)

DataFrame indices:
Index(['a', 'b', 'c', 'd'], dtype='object')
DataFrame columns:
Index(['one', 'two'], dtype='object')


In [28]:
# Selecting columns to add or delete
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [30]:
# Creating a new column
df['three'] = df['one'] * df['two']
df['three']

a    1.0
b    4.0
c    9.0
d    NaN
Name: three, dtype: float64

In [31]:
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [32]:
# Deleting a column
del df['two']
three = df.pop('three')
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [33]:
# Querying the data
df.query('one > 2.5')

Unnamed: 0,one,flag
c,3.0,True


In [36]:
# Adding back the column we popped earlier
df['three'] = three
df

Unnamed: 0,one,flag,three
a,1.0,False,1.0
b,2.0,False,4.0
c,3.0,True,9.0
d,,False,


In [40]:
# Assign does not modify the DataFrame
print(df.assign(ratio = lambda x: (x['one'] / x['three'])))
print(df)

   one   flag  three     ratio
a  1.0  False    1.0  1.000000
b  2.0  False    4.0  0.500000
c  3.0   True    9.0  0.333333
d  NaN  False    NaN       NaN
   one   flag  three
a  1.0  False    1.0
b  2.0  False    4.0
c  3.0   True    9.0
d  NaN  False    NaN


In [41]:
# Getting a row by name
df.loc['b']

one          2
flag     False
three        4
Name: b, dtype: object

In [42]:
# Getting a row by index
df.iloc[2]

one         3
flag     True
three       9
Name: c, dtype: object

In [43]:
# Working with dates
index = pd.date_range('1/1/2000', periods=8)
index

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [44]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
df

Unnamed: 0,A,B,C
2000-01-01,-0.122818,1.309066,-0.986626
2000-01-02,-2.320383,-1.639862,-0.265797
2000-01-03,-0.040176,-1.880937,-1.3764
2000-01-04,0.241443,0.653584,0.354248
2000-01-05,-0.972494,-0.730601,0.228739
2000-01-06,0.626289,0.405281,1.324919
2000-01-07,0.563001,-0.848082,1.122594
2000-01-08,-1.455242,1.633347,-0.250521


In [45]:
type(df['A'])

pandas.core.series.Series

In [47]:
df.sub(df['A'], axis=0)

Unnamed: 0,A,B,C
2000-01-01,0.0,1.431885,-0.863807
2000-01-02,0.0,0.680521,2.054587
2000-01-03,0.0,-1.840761,-1.336225
2000-01-04,0.0,0.412141,0.112805
2000-01-05,0.0,0.241894,1.201233
2000-01-06,0.0,-0.221008,0.69863
2000-01-07,0.0,-1.411083,0.559594
2000-01-08,0.0,3.088589,1.204721


In [48]:
df * 5 + 2

Unnamed: 0,A,B,C
2000-01-01,1.385908,8.545332,-2.933129
2000-01-02,-9.601917,-6.199311,0.671017
2000-01-03,1.799122,-7.404683,-4.882002
2000-01-04,3.207216,5.267921,3.771242
2000-01-05,-2.862471,-1.653003,3.143694
2000-01-06,5.131446,4.026405,8.624597
2000-01-07,4.815004,-2.24041,7.612972
2000-01-08,-5.276211,10.166735,0.747393


In [49]:
# Transposing records (first 5)
df[:5].T

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00
A,-0.122818,-2.320383,-0.040176,0.241443,-0.972494
B,1.309066,-1.639862,-1.880937,0.653584,-0.730601
C,-0.986626,-0.265797,-1.3764,0.354248,0.228739


In [50]:
# Working with numpy
np.exp(df)

Unnamed: 0,A,B,C
2000-01-01,0.884424,3.702715,0.372833
2000-01-02,0.098236,0.194007,0.766595
2000-01-03,0.960621,0.152447,0.252486
2000-01-04,1.273085,1.922419,1.425109
2000-01-05,0.378139,0.48162,1.257014
2000-01-06,1.870656,1.499724,3.761882
2000-01-07,1.755934,0.428235,3.072816
2000-01-08,0.233344,5.120986,0.778395


In [51]:
np.asarray(df)

array([[-0.12281837,  1.30906631, -0.98662579],
       [-2.32038345, -1.63986218, -0.26579651],
       [-0.04017555, -1.88093669, -1.37640046],
       [ 0.24144327,  0.65358428,  0.35424841],
       [-0.97249428, -0.73060057,  0.22873887],
       [ 0.62628915,  0.405281  ,  1.32491943],
       [ 0.56300081, -0.84808209,  1.12259446],
       [-1.45524214,  1.63334697, -0.25052142]])

In [52]:
# Matrix multiplication
df.T.dot(df)

Unnamed: 0,A,B,C
A,9.231855,1.987645,2.48268
B,1.987645,12.452993,0.973372
C,2.48268,0.973372,6.194761


In [53]:
# Useful for large data sets, this will give an overview of the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8 entries, 2000-01-01 to 2000-01-08
Freq: D
Data columns (total 3 columns):
A    8 non-null float64
B    8 non-null float64
C    8 non-null float64
dtypes: float64(3)
memory usage: 576.0 bytes
