In [None]:
import pandas as pd
import numpy as np

# Show the installed pandas version
pd.__version__

In [39]:
# Example frame used by every cell below: two string key columns (A, B) and
# two numeric columns (C, D), each numeric column holding one missing value.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': [1, 3, 5, 7, 9, 11, np.nan, 15],
    'D': [2, 4, 6, 8, np.nan, 12, 14, 16],
})

## Inspect

In [None]:
# NOTE: a notebook cell only echoes its final expression, and this cell ends
# with an assignment, so nothing is displayed when it is run as a whole.
# Run the lines individually to see each result.

# Number of rows (two equivalent spellings)
len(df)
df.shape[0]

# Number of columns
df.shape[1]

# First/last few rows (up to 10 each; df has only 8 rows)
df.head(10)
df.tail(10)

# Limit number of rows to show
# NOTE: this is a global pandas display option; it stays in effect for every
# cell executed afterwards.
pd.options.display.max_rows = 5

## Frequency Counts

In [40]:
# Single variable: occurrences of each distinct value of A, most frequent first
df['A'].value_counts()

foo    5
bar    3
Name: A, dtype: int64

In [41]:
# Single variable, ordered by the value labels (the index) rather than by count
df['A'].value_counts().sort_index()

bar    3
foo    5
Name: A, dtype: int64

In [42]:
# Single variable, using groupby() instead; size() counts the rows in each group
df.groupby('A').size()

A
bar    3
foo    5
dtype: int64

In [43]:
# Frequency of a boolean condition: rows where A is 'foo' versus all others
df.groupby(df['A'].eq('foo')).size()

A
False    3
True     5
dtype: int64

In [44]:
# Single variable, across all columns: count() reports, for each group of A,
# the number of non-missing values in every other column. This differs from
# size(), which counts rows: foo has 5 rows but only 4 values in C and in D.
df.groupby('A').count()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,3,3,3
foo,5,4,4


In [45]:
# Multiple variables: number of rows for each (A, B) combination
df.groupby(['A', 'B']).size()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

## Averages

In [46]:
# Mean of every numeric column; missing values are skipped.
# numeric_only=True is needed on pandas >= 2.0, where the string columns
# A and B are no longer dropped silently and the call raises TypeError.
df.mean(numeric_only=True)

C    7.285714
D    8.857143
dtype: float64

In [47]:
# Row-wise mean across the numeric columns (C and D). Missing values are
# skipped, so a row with one NaN simply returns its other value.
# axis is passed by keyword for readability; numeric_only=True keeps the
# string columns out of the calculation on pandas >= 2.0.
df.mean(axis=1, numeric_only=True)

0     1.5
1     3.5
2     5.5
3     7.5
4     9.0
5    11.5
6    14.0
7    15.5
dtype: float64

In [48]:
# Mean of each numeric column within each group of A; missing values are
# skipped. numeric_only=True excludes the string column B, which otherwise
# makes the aggregation raise TypeError on pandas >= 2.0.
df.groupby('A').mean(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,7.0,8.0
foo,7.5,9.5


In [49]:
# Multiple variables: mean of C and D for each (A, B) combination.
# Missing values are skipped, e.g. foo/one has C values [1, NaN] and reports 1.0.
df.groupby(['A', 'B']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,3.0,4.0
bar,three,7.0,8.0
bar,two,11.0,12.0
foo,one,1.0,8.0
foo,three,15.0,16.0
foo,two,7.0,6.0


## Apply Function

In [57]:
# Cumulative sum down each column. The string columns A and B are
# cumulatively concatenated; in the numeric columns a NaN stays NaN at its own
# row while the running total carries on past it.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
0,foo,one,1.0,2.0
1,foobar,oneone,4.0,6.0
2,foobarfoo,oneonetwo,9.0,12.0
3,foobarfoobar,oneonetwothree,16.0,20.0
4,foobarfoobarfoo,oneonetwothreetwo,25.0,
5,foobarfoobarfoobar,oneonetwothreetwotwo,36.0,32.0
6,foobarfoobarfoobarfoo,oneonetwothreetwotwoone,,46.0
7,foobarfoobarfoobarfoofoo,oneonetwothreetwotwoonethree,51.0,62.0


In [59]:
# Normalise: rescale C and D column by column to z-scores
# (subtract the column mean, divide by the column standard deviation)
df[['C', 'D']].apply(lambda col: (col - col.mean()) / col.std())

Unnamed: 0,C,D
0,-1.30393,-1.300309
1,-0.889043,-0.921052
2,-0.474156,-0.541795
3,-0.05927,-0.162539
4,0.355617,
5,0.770504,0.595975
6,,0.975231
7,1.600278,1.354488


## Aggregation Function

In [50]:
# Apply different functions to each variable - pass in a dictionary mapping
# column name to aggregation. String aliases are used instead of NumPy
# callables: passing np.mean to agg() is deprecated since pandas 2.1.
# 'mean' skips missing values; 'size' counts rows, including those with NaN.
df.groupby('A').agg({'C': 'mean', 'D': 'size'})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,7.0,3.0
foo,7.5,5.0


In [51]:
# Apply multiple functions to the same variable - pass a list of aggregations
# for that column; the result gets one sub-column per aggregation.
# String aliases replace the NumPy callables (np.mean in agg() is deprecated
# since pandas 2.1). 'size' counts rows including NaN; 'mean' skips NaN.
df.groupby(['A', 'B']).agg({'C': ['size', 'mean']})

Unnamed: 0_level_0,Unnamed: 1_level_0,C,C
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,one,1.0,3.0
bar,three,1.0,7.0
bar,two,1.0,11.0
foo,one,2.0,1.0
foo,three,1.0,15.0
foo,two,2.0,7.0
