In [1]:
import numpy as np
import pandas as pd

In [2]:
#arithmetic between objects with different indexes aligns on labels:
#the index of the result is the union of the two indexes
#1a) a Series built without an explicit index gets the default integer index (0..3)
pd.Series([7.3, -2.5, 3.5, 1.5])

0    7.3
1   -2.5
2    3.5
3    1.5
dtype: float64

In [3]:
#1b) the same four values, this time with explicit string labels
#s1 is the left-hand operand of the alignment examples that follow
s1 = pd.Series(data=[7.3, -2.5, 3.5, 1.5], index=list('acde'))
s1

a    7.3
c   -2.5
d    3.5
e    1.5
dtype: float64

In [4]:
#second set of values, again on the default integer index (0..4)
#the integer 4 is stored as 4.0 because the other values are floats
pd.Series([-2.1, 3.6, -1.5, 4, 3.1])

0   -2.1
1    3.6
2   -1.5
3    4.0
4    3.1
dtype: float64

In [5]:
#labels for the second series, wrapped explicitly in a pd.Index
#(only displayed here; the result is not stored)
pd.Index(['a', 'c', 'e', 'f', 'g'])

Index(['a', 'c', 'e', 'f', 'g'], dtype='object')

In [6]:
#second operand: five values labelled 'a', 'c', 'e', 'f', 'g'
#(shares 'a', 'c', 'e' with s1; 'f' and 'g' exist only here)
s2 = pd.Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))

In [7]:
#raw values of both series as NumPy arrays (labels are dropped), one item per line
print('s1', s1.to_numpy(), 's2', s2.to_numpy(), sep='\n')

s1
[ 7.3 -2.5  3.5  1.5]
s2
[-2.1  3.6 -1.5  4.   3.1]


In [8]:
#addition aligns on labels: 'a', 'c', 'e' exist in both series and are summed;
#'d' (only in s1) and 'f', 'g' (only in s2) become NaN
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [9]:
#multiplication follows the same label alignment: NaN for 'd', 'f', 'g'
s1 * s2

a   -15.33
c    -9.00
d      NaN
e    -2.25
f      NaN
g      NaN
dtype: float64

In [10]:
#subtraction follows the same label alignment: NaN for 'd', 'f', 'g'
s1 - s2

a    9.4
c   -6.1
d    NaN
e    3.0
f    NaN
g    NaN
dtype: float64

In [11]:
#the internal alignment creates missing values wherever the labels do not overlap
#if we first reindex onto a wider set of labels and forward-fill the gaps
#(each gap takes the previous value, at most one consecutive gap: limit=1),
#then fewer positions end up empty
#.ffill(limit=1) replaces the deprecated fillna(method='ffill', limit=1)
s3 = (
    pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
    .reindex(list('abcdefg'))   #adds 'b', 'f', 'g' as NaN
    .ffill(limit=1)             #'b' and 'f' are filled; 'g' stays NaN (second gap in a row)
)
s3

a    7.3
b    7.3
c   -2.5
d    3.4
e    1.5
f    1.5
g    NaN
dtype: float64

In [12]:
#same treatment for the second operand: reindex onto 'a'..'k', then forward-fill
#at most one consecutive gap; .ffill(limit=1) replaces the deprecated
#fillna(method='ffill', limit=1)
s4 = (
    pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
    .reindex(list('abcdefghijk'))   #adds 'b', 'd', 'h', 'i', 'j', 'k' as NaN
    .ffill(limit=1)                 #'b', 'd', 'h' are filled; 'i', 'j', 'k' stay NaN
)
s4

a   -2.1
b   -2.1
c    3.6
d    3.6
e   -1.5
f    4.0
g    3.1
h    3.1
i    NaN
j    NaN
k    NaN
dtype: float64

In [13]:
#after the forward fill both series have values for 'a'..'f', so only 'g'..'k' are NaN
#('g' is still NaN in s3; 'h'..'k' do not exist in s3 at all)
s3 + s4

a    5.2
b    5.2
c    1.1
d    7.0
e    0.0
f    5.5
g    NaN
h    NaN
i    NaN
j    NaN
k    NaN
dtype: float64

In [14]:
#True where both operands hold a value, i.e. the labels where s3 + s4 is not NaN
s3.notnull() & s4.notnull()

a     True
b     True
c     True
d     True
e     True
f     True
g    False
h    False
i    False
j    False
k    False
dtype: bool

In [15]:
#pd.DataFrame: alignment is performed on both the rows and the columns
#building block for the example frames: nine consecutive floats (9. is a float argument)
np.arange(9.)

array([0., 1., 2., 3., 4., 5., 6., 7., 8.])

In [16]:
#reshape the flat array into 3 rows x 3 columns; here the shape is passed as a tuple
np.arange(9.).reshape((3, 3))

array([[0., 1., 2.],
       [3., 4., 5.],
       [6., 7., 8.]])

In [17]:
#same result: reshape also accepts the dimensions as separate arguments
np.arange(9.).reshape(3, 3)

array([[0., 1., 2.],
       [3., 4., 5.],
       [6., 7., 8.]])

In [18]:
#same result: unpacking the tuple with * passes 3, 3 as separate arguments
np.arange(9.).reshape(*(3, 3))

array([[0., 1., 2.],
       [3., 4., 5.],
       [6., 7., 8.]])

In [19]:
#wrap the 3x3 array in a DataFrame with column labels 'b', 'c', 'd';
#no index is given, so the rows get the default integer labels
pd.DataFrame(np.arange(9.).reshape(3, 3), columns=list('bcd'))

Unnamed: 0,b,c,d
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0


In [20]:
#first frame for the alignment example: 3 states x columns 'b', 'c', 'd'
df1 = pd.DataFrame(
    data=np.arange(9, dtype=float).reshape(3, 3),
    index=['Ohio', 'Texas', 'Oregon'],
    columns=['b', 'c', 'd'],
)

In [21]:
#second frame for the alignment example: 4 states x columns 'b', 'd', 'e'
#(adds the row 'Utah' and the column 'e'; has no column 'c')
df2 = pd.DataFrame(
    data=np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [22]:
#show df1: rows Ohio/Texas/Oregon, columns b/c/d
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Oregon,6.0,7.0,8.0


In [23]:
#show df2: rows Utah/Ohio/Texas/Oregon, columns b/d/e
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [24]:
#both axes are aligned: the result carries the union of the row and column labels;
#only cells present in both frames (rows Ohio/Texas/Oregon, columns b/d) get a value,
#everything else (row Utah, columns c and e) is NaN
df1 + df2

Unnamed: 0,b,c,d,e
Ohio,3.0,,6.0,
Oregon,15.0,,18.0,
Texas,9.0,,12.0,
Utah,,,,


In [25]:
#frames with no column label in common: this one only has column 'A'
#(the row labels 0 and 1 are the default integer index)
#NOTE(review): df1 is reassigned a few cells further down before it is used in any arithmetic
df1 = pd.DataFrame([1, 2], columns=['A'])
df1

Unnamed: 0,A
0,1
1,2


In [26]:
#frames with no column label in common: this one only has column 'B'
#(the row labels 0 and 1 are the default integer index)
#NOTE(review): df2 is reassigned a few cells further down before it is used in any arithmetic
df2 = pd.DataFrame([3, 4], columns=['B'])
df2

Unnamed: 0,B
0,3
1,4


In [27]:
#frames for the fill_value examples: a fill value can stand in when only one
#of the two frames has a value at a given position
#df1: 3 rows x columns 'a'..'d', holding the floats 0..11
df1 = pd.DataFrame(
    data=np.arange(12, dtype=float).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)

In [28]:
#df2: 4 rows x columns 'a'..'e', holding the floats 0..19
#(one more row and one more column than the 3x4 df1)
df2 = pd.DataFrame(
    data=np.arange(20, dtype=float).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [29]:
#show df1: 3 rows, columns a..d
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [30]:
#show df2: 4 rows, columns a..e
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [31]:
#set one specific cell (row 1, column 'b') to NaN so that df2 itself has a missing value
#note: this modifies df2 in place
df2.loc[1, 'b'] = np.nan

In [32]:
#show df2 again: the value at (1, 'b') is now missing
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [33]:
#adding df1 and df2: NaN wherever a label exists in only one frame (column 'e', row 3)
#and also at (1, 'b'), where df2 itself holds NaN
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [34]:
#the add method without a fill value gives the same result as the + operator
df1.add(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [35]:
#the add method with fill_value=0: a value missing from one frame is treated as 0,
#so the other frame's value passes through (column 'e' and row 3 come from df2,
#and (1, 'b') becomes 5.0, the value from df1)
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [36]:
#operands swapped: same result as df1.add(df2)
df2.add(df1)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [37]:
#operands swapped, with a fill value: same result as df1.add(df2, fill_value=0)
df2.add(df1, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [38]:
#Series and DataFrame arithmetic methods each have a reversed counterpart prefixed
#with 'r' (radd, rsub, rdiv, ...) in which the operands are flipped
#here the scalar is on the left: 1 is divided by each element of df1
#(the 0.0 at (0, 'a') gives inf)
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [39]:
#reindexing df1 onto df2's columns adds the column 'e';
#without a fill value the new column is NaN
df1.reindex(columns=df2.columns)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,
1,4.0,5.0,6.0,7.0,
2,8.0,9.0,10.0,11.0,


In [40]:
#note: we can specify a fill value when we reindex a Series or DataFrame;
#the new column 'e' is filled with 0 instead of NaN
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [41]:
#flexible arithmetic methods: each method below is shown next to its reversed ('r') form
#df.add: df1 + df2
df1.add(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [42]:
#df.radd: reversed addition, i.e. df2 + df1 (same values as df1.add(df2))
df1.radd(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [43]:
#df.div: element-wise df1 / df2 (e.g. 4.0 / 5.0 = 0.8 at (1, 'a'));
#(0, 'a') is 0.0 / 0.0 and therefore NaN
df1.div(df2)

Unnamed: 0,a,b,c,d,e
0,,1.0,1.0,1.0,
1,0.8,,0.857143,0.875,
2,0.8,0.818182,0.833333,0.846154,
3,,,,,


In [44]:
#df.rdiv: reversed division, i.e. df2 / df1 (e.g. 5.0 / 4.0 = 1.25 at (1, 'a'))
df1.rdiv(df2)

Unnamed: 0,a,b,c,d,e
0,,1.0,1.0,1.0,
1,1.25,,1.166667,1.142857,
2,1.25,1.222222,1.2,1.181818,
3,,,,,


In [45]:
#df.floordiv: element-wise floor division, df1 // df2
df1.floordiv(df2)

Unnamed: 0,a,b,c,d,e
0,,1.0,1.0,1.0,
1,0.0,,0.0,0.0,
2,0.0,0.0,0.0,0.0,
3,,,,,


In [46]:
#df.rfloordiv: reversed floor division, i.e. df2 // df1
df1.rfloordiv(df2)

Unnamed: 0,a,b,c,d,e
0,,1.0,1.0,1.0,
1,1.0,,1.0,1.0,
2,1.0,1.0,1.0,1.0,
3,,,,,


In [47]:
#reminder of the left operand before the multiplication and power examples
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [48]:
#reminder of the right operand; it still has the NaN at (1, 'b')
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [49]:
#df.mul: element-wise df1 * df2
df1.mul(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,4.0,9.0,
1,20.0,,42.0,56.0,
2,80.0,99.0,120.0,143.0,
3,,,,,


In [50]:
#df.rmul: reversed multiplication, i.e. df2 * df1 (same values as df1.mul(df2))
df1.rmul(df2)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,4.0,9.0,
1,20.0,,42.0,56.0,
2,80.0,99.0,120.0,143.0,
3,,,,,


In [51]:
#df.pow: element-wise df1 ** df2 (e.g. 4.0 ** 5.0 = 1024.0 at (1, 'a'))
df1.pow(df2)

Unnamed: 0,a,b,c,d,e
0,1.0,1.0,4.0,27.0,
1,1024.0,,279936.0,5764801.0,
2,1073742000.0,31381060000.0,1000000000000.0,34522710000000.0,
3,,,,,


In [52]:
#df.rpow: reversed power, i.e. df2 ** df1 (e.g. 5.0 ** 4.0 = 625.0 at (1, 'a'))
df1.rpow(df2)

Unnamed: 0,a,b,c,d,e
0,1.0,1.0,4.0,27.0,
1,625.0,,117649.0,2097152.0,
2,100000000.0,2357948000.0,61917360000.0,1792160000000.0,
3,,,,,


In [53]:
df1.rpow(2) #reversed power with a scalar: 2 raised to the power of each element in df1

Unnamed: 0,a,b,c,d
0,1.0,2.0,4.0,8.0
1,16.0,32.0,64.0,128.0
2,256.0,512.0,1024.0,2048.0


In [54]:
df1.pow(2) #power with a scalar: each element of df1 raised to the power of 2

Unnamed: 0,a,b,c,d
0,0.0,1.0,4.0,9.0
1,16.0,25.0,36.0,49.0
2,64.0,81.0,100.0,121.0


In [55]:
#operations between DataFrames and Series: arithmetic between a DataFrame and a
#Series is also defined; the NumPy analogue is shown first on a plain
#two-dimensional array (3 rows x 4 columns)
arr = np.arange(12, dtype=float).reshape(3, 4)
arr #raw numpy array

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [56]:
#first row of arr
arr[0]

array([0., 1., 2., 3.])

In [57]:
arr - arr[0] #broadcasting: the first row is subtracted from every row of arr

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [58]:
#arr - arr[0] spelled out by hand: subtract arr[0] from each row separately
a0 = arr[0] - arr[0] #first row minus itself: all zeros
a1 = arr[1] - arr[0]
a2 = arr[2] - arr[0]

In [59]:
#stacking the three row differences produces the same result as arr - arr[0]
np.vstack([a0, a1, a2])

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [60]:
#operations between DataFrame and Series objects follow a similar pattern
#in the next few cells we will construct the DataFrame step by step
#step 1: a flat float array becomes a single column labelled 0
#NOTE(review): arange(12.) has 12 elements but the saved output below lists only
#rows 0-9 -- the output looks stale or truncated; re-run the cell to confirm
pd.DataFrame(np.arange(12.))

Unnamed: 0,0
0,0.0
1,1.0
2,2.0
3,3.0
4,4.0
5,5.0
6,6.0
7,7.0
8,8.0
9,9.0


In [61]:
#step 2: integers 0..11 reshaped to 4 rows x 3 columns
pd.DataFrame(np.arange(12).reshape((4, 3))) #DataFrame with default integer columns

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [62]:
#step 3: the same data with column labels 'b', 'd', 'e' (rows keep the default integer index)
pd.DataFrame(np.arange(12).reshape((4, 3)), columns=list('bde'))

Unnamed: 0,b,d,e
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [63]:
#step 4: row labels passed as an explicit pd.Index (columns keep the default integers)
#a plain list would work as well: pandas wraps the labels in an Index object if we do not
pd.DataFrame(np.arange(12).reshape((4, 3)), index=pd.Index(['Utah', 'Ohio', 'Texas', 'Oregon']))

Unnamed: 0,0,1,2
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [64]:
#the frame used in the DataFrame/Series examples: integers 0..11 in 4 rows x 3 columns,
#with both the column labels and the row labels given as plain lists
frame = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [65]:
#first row of frame (the 'Utah' row) as a Series; its index is frame's column labels
series = frame.iloc[0]

In [66]:
#show the frame (left operand of the examples below)
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [67]:
#show the series (right operand): labels b/d/e, named after the row it came from
series

b    0
d    1
e    2
Name: Utah, dtype: int64

In [68]:
frame - series #the Series index is matched against the DataFrame columns; the subtraction is repeated for every row

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [69]:
#same as above, written with the sub method instead of the - operator
frame.sub(series)

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [70]:
#reversed subtraction called on the Series: evaluates frame - series,
#so the output matches frame.sub(series)
series.rsub(frame)

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [71]:
#reversed subtraction called on the frame: evaluates series - frame, hence the flipped signs
frame.rsub(series)

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,-3,-3,-3
Texas,-6,-6,-6
Oregon,-9,-9,-9


In [72]:
#scenario: a label is missing from either the DataFrame columns or the Series index
#result: both objects are reindexed onto the union of the labels, with missing values
#filled with np.nan
#series2 has 'b' and 'e' in common with frame's columns, lacks 'd' and adds 'f'
series2 = pd.Series(data=np.arange(3), index=list('bef'))
series2

b    0
e    1
f    2
dtype: int64

In [73]:
#the columns of the result are the union b/d/e/f: 'd' (not in series2) and
#'f' (not in frame) are entirely NaN; 'b' and 'e' are summed row by row
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [74]:
#operands swapped: same result as frame + series2
series2 + frame

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [75]:
#reversed addition called on the Series: evaluates frame + series2, same result again
series2.radd(frame)

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [76]:
#if we want to match on the row labels instead and broadcast over the columns,
#we have to use one of the arithmetic methods and pass an axis
#series3 is the 'd' column of frame, so its index is frame's row labels
series3 = frame['d']
series3

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int64

In [77]:
#show the frame again for reference
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [78]:
#show series3 again for reference (indexed by the row labels Utah/Ohio/Texas/Oregon)
series3

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int64

In [81]:
frame.sub(series3) #all np.nan values: by default the series index (row labels here) is matched against the frame's columns (b/d/e) and no label coincides

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [83]:
frame.sub(series3, axis='index') #matches the series index against the row labels and broadcasts across the columns

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1
