# pandas

In [12]:
import pandas as pd
import numpy as np

### Series

#### Basics

In [3]:
s = pd.Series([0,1,2,3])
s

0    0
1    1
2    2
3    3
dtype: int64

In [4]:
s = pd.Series([0,1,2], index=['a','b','c'])
s

a    0
b    1
c    2
dtype: int64

In [5]:
s.values, type(s.values)

(array([0, 1, 2]), numpy.ndarray)

In [6]:
s.index

Index([u'a', u'b', u'c'], dtype='object')

#### Creation

In [8]:
s = pd.Series(['this', 'is', 'summer'])
s

0      this
1        is
2    summer
dtype: object

In [9]:
s = pd.Series({'one': 'this', 'two': 'is', 'three': 'summer'})
s

one        this
three    summer
two          is
dtype: object

In [13]:
a = np.random.randn(4)
print a
s = pd.Series(a, index = ['winter','spring','summer','autumn'])
s


[ 0.85035727 -1.43380003 -0.14355135  0.54463934]


winter    0.850357
spring   -1.433800
summer   -0.143551
autumn    0.544639
dtype: float64

#### numpy array operations

In [14]:
s1 =  s[s < 0]
s1

spring   -1.433800
summer   -0.143551
dtype: float64

In [15]:
s2 = s1 + 10
print s2
s1 + s2

spring    8.566200
summer    9.856449
dtype: float64


spring    7.132400
summer    9.712897
dtype: float64

In [17]:
s = pd.Series(np.random.randn(5))
s.where(s < 0, 1)

0    1.000000
1   -1.417048
2    1.000000
3    1.000000
4    1.000000
dtype: float64

#### Function application

In [16]:
s = pd.Series(np.random.randn(5))
s.map(np.square)

0    0.405248
1    0.000172
2    1.349563
3    0.052274
4    0.417280
dtype: float64

#### Sorting

In [18]:
s = pd.Series('Happy families are all alike'.split(), index = ['z', 'y', 'x', 'w', 'v'])
s

z       Happy
y    families
x         are
w         all
v       alike
dtype: object

In [19]:
s2 = s.sort_values()
s2

z       Happy
v       alike
w         all
x         are
y    families
dtype: object

In [20]:
s3 = s.sort_index()
s3

v       alike
w         all
x         are
y    families
z       Happy
dtype: object

### DataFrame

#### Creation

In [21]:
data = {'feature_1': ['a', 'a', 'b', 'c', 'c'],
        'feature_2': np.random.randn(5),
        'feature_3': 0,
        'feature_4': map(round, np.random.randn(5))}
df = pd.DataFrame(data)
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4
0,a,0.817237,0,-0.0
1,a,1.31716,0,1.0
2,b,-0.413032,0,-1.0
3,c,-0.578524,0,0.0
4,c,-1.199199,0,0.0


In [22]:
data = {'feature_1': ['a', 'a', 'b', 'c', 'c'],
        'feature_2': np.random.randn(5),
        'feature_3': 0,
        'feature_4': map(round, np.random.randn(5))}
df = pd.DataFrame(data, index=['Bob', 'Marie', 'Emmy', 'George', 'Leopold'])
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4
Bob,a,0.55274,0,0.0
Marie,a,-1.459436,0,-1.0
Emmy,b,-0.595732,0,-0.0
George,c,1.999883,0,0.0
Leopold,c,-0.300942,0,0.0


In [23]:
heights_dict = {'Bob': 178, 'Marie': 167, 'Emmy': 180, 'George': 186, 'Leopold': 170} 
heights_df = pd.Series(heights_dict) 

weights_dict = {'Bob': 60, 'Marie': 58, 'Emmy': 10, 'George': 80, 'Leopold': 55} 
weights_df = pd.Series(weights_dict)

heights_vs_weights = pd.DataFrame({'height': heights_df, 'weight': weights_df})
heights_vs_weights

Unnamed: 0,height,weight
Bob,178,60
Emmy,180,10
George,186,80
Leopold,170,55
Marie,167,58


In [25]:
heights_dict = {'Bob': 178, 'Marie': 167, 'Emmy': 180} 
heights_df = pd.Series(heights_dict)
weights_dict = {'Jim': 78, 'Marie': 67, 'Emmy': 80} 
weights_df = pd.Series(weights_dict)

heights_vs_weights = pd.DataFrame({'height': heights_df, 'weight': weights_df})
heights_vs_weights

Unnamed: 0,height,weight
Bob,178.0,
Emmy,180.0,80.0
Jim,,78.0
Marie,167.0,67.0


In [26]:
pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.167986,0.859284
b,0.484668,0.491095
c,0.920254,0.688447


In [27]:
#from_csv = pd.read_csv('csv', header=None)
#from_csv = pd.read_csv('csv', names=['a','b','c'])

#### Summary information

In [28]:
heights_vs_weights.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Bob to Marie
Data columns (total 2 columns):
height    3 non-null float64
weight    3 non-null float64
dtypes: float64(2)
memory usage: 96.0+ bytes


In [29]:
heights_vs_weights.describe()

Unnamed: 0,height,weight
count,3.0,3.0
mean,175.0,75.0
std,7.0,7.0
min,167.0,67.0
25%,172.5,72.5
50%,178.0,78.0
75%,179.0,79.0
max,180.0,80.0


#### Indexing and selection

In [31]:
df


Unnamed: 0,feature_1,feature_2,feature_3,feature_4
Bob,a,0.55274,0,0.0
Marie,a,-1.459436,0,-1.0
Emmy,b,-0.595732,0,-0.0
George,c,1.999883,0,0.0
Leopold,c,-0.300942,0,0.0


In [37]:
# access by label
df['feature_1']

Bob        a
Marie      a
Emmy       b
George     c
Leopold    c
Name: feature_1, dtype: object

In [34]:
df[['feature_1','feature_2']]

Unnamed: 0,feature_1,feature_2
Bob,a,0.55274
Marie,a,-1.459436
Emmy,b,-0.595732
George,c,1.999883
Leopold,c,-0.300942


In [38]:
# Slicing
df[0:2]

Unnamed: 0,feature_1,feature_2,feature_3,feature_4
Bob,a,0.55274,0,0.0
Marie,a,-1.459436,0,-1.0


In [39]:
# does NOT work
# df[0]
# KeyError: 0
df[0:1]

Unnamed: 0,feature_1,feature_2,feature_3,feature_4
Bob,a,0.55274,0,0.0


In [40]:
# iloc: preferred way for index-based access
df.iloc[1]

feature_1          a
feature_2   -1.45944
feature_3          0
feature_4         -1
Name: Marie, dtype: object

In [41]:
df.iloc[1:3,0:2]

Unnamed: 0,feature_1,feature_2
Marie,a,-1.459436
Emmy,b,-0.595732


In [42]:
df.iloc[np.array([True,False,True,False]),1:4]   

Unnamed: 0,feature_2,feature_3,feature_4
Bob,0.55274,0,0.0
Emmy,-0.595732,0,-0.0


In [43]:
# loc: preferred way for label-based access

In [46]:
df.loc['Emmy','feature_1':'feature_3']

feature_1           b
feature_2   -0.595732
feature_3           0
Name: Emmy, dtype: object

In [47]:
# ix: allows "mixed" access
df.ix['Emmy',0]

'b'

#### pandas -> numpy 

In [50]:
df = pd.DataFrame([222,333,444])
print df.values
type(df.values)

[[222]
 [333]
 [444]]


numpy.ndarray

#### numpy -> pandas

In [54]:
df = pd.DataFrame(np.arange(-2,6).reshape(2,4))
df

Unnamed: 0,0,1,2,3
0,-2,-1,0,1
1,2,3,4,5


### Function application

In [55]:
# numpy ufuncs may be used directly
df.abs()

Unnamed: 0,0,1,2,3
0,2,1,0,1
1,2,3,4,5


In [56]:
df.sum()

0    0
1    2
2    4
3    6
dtype: int64

In [57]:
df.sum(axis=1)

0    -2
1    14
dtype: int64

In [60]:
# use applymap() to apply custom functions to every element of a DataFrame
import math
def f(x): return math.factorial(abs(x))/x**2.
df.applymap(f)

Unnamed: 0,0,1,2,3
0,0.5,1.0,inf,1.0
1,0.5,0.666667,1.5,4.8


In [63]:
# use apply() to apply a function row-wise or column-wise:
def f(x): return x.sum()/len(x)
df.apply(f)                     

0    0
1    1
2    2
3    3
dtype: int64

In [64]:
df.apply(f, axis=1)

0   -1
1    3
dtype: int64

### Joins

In [73]:
frame1 = pd.DataFrame({'key1': range(3), 'val1': ['a', 'b', 'c']}, index=['blue','red','green'])
frame1

Unnamed: 0,key1,val1
blue,0,a
red,1,b
green,2,c


In [74]:
frame2 = pd.DataFrame({'key2': range(1, 5), 'val2': ['f', 'g', 'h', 'i']}, index=['water', 'fire', 'air', 'earth'])
frame2

Unnamed: 0,key2,val2
water,1,f
fire,2,g
air,3,h
earth,4,i


In [75]:
# merge joins on column names
pd.merge(frame1, frame2, left_on='key1', right_on='key2', how='inner')

Unnamed: 0,key1,val1,key2,val2
0,1,b,1,f
1,2,c,2,g
