In [1]:
# Import NumPy package and load pandas
import numpy as np
import pandas as pd

# Arithmetic Operations

In [2]:
# 3x3 frame holding the integers 0..8, rows labelled r1-r3, columns col1-col3.
df1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=('r1', 'r2', 'r3'),
    columns=['col1', 'col2', 'col3'],
)
df1

Unnamed: 0,col1,col2,col3
r1,0,1,2
r2,3,4,5
r3,6,7,8


In [3]:
# Second 3x3 frame with the same values and labels as df1 (operand for the addition below).
df2 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=('r1', 'r2', 'r3'),
    columns=['col1', 'col2', 'col3'],
)
df2

Unnamed: 0,col1,col2,col3
r1,0,1,2
r2,3,4,5
r3,6,7,8


In [4]:
# Element-wise addition; values are paired up by row and column label.
df1+df2

Unnamed: 0,col1,col2,col3
r1,0,2,4
r2,6,8,10
r3,12,14,16


In [5]:
# 4x3 frame holding 0..11; it has one more row (r4) than df1 and df2.
df3 = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=('r1', 'r2', 'r3', 'r4'),
    columns=['col1', 'col2', 'col3'],
)
df3

Unnamed: 0,col1,col2,col3
r1,0,1,2
r2,3,4,5
r3,6,7,8
r4,9,10,11


In [6]:
# df3 has a row (r4) that df1 lacks: the result covers the union of the row
# labels, r4 comes out as NaN, and the values are shown as floats.
df1+df3

Unnamed: 0,col1,col2,col3
r1,0.0,2.0,4.0
r2,6.0,8.0,10.0
r3,12.0,14.0,16.0
r4,,,


Note: 
- By default, any label that exists in only one of the two DataFrames produces 'NaN' in the result.

In [7]:
df1.add(df3, fill_value=0) # Use 0 for labels missing from one of the frames, so r4 keeps df3's values

Unnamed: 0,col1,col2,col3
r1,0.0,2.0,4.0
r2,6.0,8.0,10.0
r3,12.0,14.0,16.0
r4,9.0,10.0,11.0


# Summarizing and Computing

In [8]:
# Small float frame with missing entries (NaN) in both columns, including one
# row (r3) that is entirely NaN.
df = pd.DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['r1', 'r2', 'r3', 'r4'],
    columns=['col1', 'col2'],
)
df

Unnamed: 0,col1,col2
r1,1.4,
r2,7.1,-4.5
r3,,
r4,0.75,-1.3


In [9]:
df.sum() # Returns one sum per column; NaN entries are skipped

col1    9.25
col2   -5.80
dtype: float64

Passing axis='columns' or axis=1 sums across the columns, giving one total per row.

In [10]:
df.sum(axis='columns') # NaN entries are skipped, so the all-NaN row r3 sums to 0

r1    1.40
r2    2.60
r3    0.00
r4   -0.55
dtype: float64

In [11]:
df.mean(axis='columns', skipna=False) # skipna=False: do NOT skip NaN, so any row containing a NaN yields NaN

r1      NaN
r2    1.300
r3      NaN
r4   -0.275
dtype: float64

In [12]:
df.idxmax() # Get the row label of the maximum value in each column

col1    r2
col2    r4
dtype: object

In [13]:
# Running total down each column; NaN positions stay NaN and do not reset the total.
df.cumsum()

Unnamed: 0,col1,col2
r1,1.4,
r2,8.5,-4.5
r3,,
r4,9.25,-5.8


In [14]:
# Running minimum down each column; NaN positions stay NaN in the output.
df.cummin()

Unnamed: 0,col1,col2
r1,1.4,
r2,1.4,-4.5
r3,,
r4,0.75,-4.5


In [15]:
df.describe() # Summary statistics per column: count, mean, std, min, quartiles, max

Unnamed: 0,col1,col2
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


# Permutation and Random Sampling

In [16]:
# 5x4 frame holding 0..19 with the default integer row and column labels.
df = pd.DataFrame(data=np.arange(20).reshape(5, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [28]:
# Random ordering of the row positions 0..4.
# A locally seeded RandomState makes the ordering identical on every
# Restart & Run All without touching NumPy's global random state.
# The saved output of this cell predates the seed; re-run to refresh it.
sampler = np.random.RandomState(42).permutation(5)
sampler

array([0, 4, 2, 3, 1])

In [29]:
# Reorder the rows by integer position, in the order given by sampler.
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
3,12,13,14,15
1,4,5,6,7


In [30]:
# Select a random subset of 3 rows without replacement.
# random_state pins the draw so the same rows come back on every run;
# the saved output of this cell predates the seed, so re-run to refresh it.
df.sample(3, random_state=42)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
0,0,1,2,3


In [31]:
# Two-column frame: a string 'key' column with repeated values and a 'data1'
# column holding 0..5.
pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5
