# Mod11 Ufuncs in Pandas

## Ufuncs: Index Preservation

In [15]:
import pandas as pd
import numpy as np

In [16]:
# Record the pandas version that produced the outputs in this notebook.
pd.__version__

'1.0.5'

In [17]:
# Record the NumPy version that produced the outputs in this notebook.
np.__version__

'1.19.1'

In [18]:
# Seeded generator so the sample values below are reproducible.
rng = np.random.RandomState(seed=42)
# Four random single-digit integers in a Series with the default integer index.
ser = pd.Series(data=rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [19]:
# The underlying NumPy array of the Series, without its index.
ser.values

array([6, 3, 7, 4])

In [20]:
# Re-seed the generator so this frame is reproducible on its own.
rng = np.random.RandomState(seed=42)
# 3 rows x 4 columns of random single-digit integers, columns labelled A-D.
df = pd.DataFrame(
    data=rng.randint(low=0, high=10, size=(3, 4)),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [21]:
# The underlying 2-D NumPy array of the frame, without row or column labels.
df.values

array([[6, 3, 7, 4],
       [6, 9, 2, 6],
       [7, 4, 3, 7]])

In [22]:
np.exp(ser)  # exponential; the result is a Series that keeps the same index

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [23]:
np.exp(ser.values)  # exponential on the raw array; the result is a plain ndarray

array([ 403.42879349,   20.08553692, 1096.63315843,   54.59815003])

In [24]:
# A ufunc applied to a DataFrame returns a DataFrame with the same index and columns.
np.exp(df)

Unnamed: 0,A,B,C,D
0,403.428793,20.085537,1096.633158,54.59815
1,403.428793,8103.083928,7.389056,403.428793
2,1096.633158,54.59815,20.085537,1096.633158


In [25]:
# Single column (a Series): the result keeps the row index and the name 'A'.
np.exp(df['A'])

0     403.428793
1     403.428793
2    1096.633158
Name: A, dtype: float64

In [26]:
# Column subset selected with a list of labels: the result keeps those column labels.
np.exp(df[['B','D']])

Unnamed: 0,B,D
0,20.085537,54.59815
1,8103.083928,403.428793
2,54.59815,1096.633158


In [27]:
# Show the original frame again for comparison with the row selections below.
df

Unnamed: 0,A,B,C,D
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [28]:
# df[0:2] slices rows 0 and 1; the ufunc result keeps their row labels.
np.exp(df[0:2])

Unnamed: 0,A,B,C,D
0,403.428793,20.085537,1096.633158,54.59815
1,403.428793,8103.083928,7.389056,403.428793


In [29]:
# The row at position 1, returned as a Series indexed by the column labels.
df.iloc[1]

A    6
B    9
C    2
D    6
Name: 1, dtype: int64

In [30]:
# Ufunc on a single row: the column labels remain the index of the result.
np.exp(df.iloc[1])

A     403.428793
B    8103.083928
C       7.389056
D     403.428793
Name: 1, dtype: float64

In [31]:
# The index of a row Series is the frame's column labels.
df.iloc[1].index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [32]:
# A compound expression: the arithmetic and np.sin both preserve index and columns.
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,-0.707107,1.224647e-16
1,-1.0,0.7071068,1.0,-1.0
2,-0.707107,1.224647e-16,0.707107,-0.7071068


Review NumPy ufunc

In [33]:
# Small integer array used to review plain NumPy ufunc behaviour.
arr = np.arange(0, 5)
arr

array([0, 1, 2, 3, 4])

In [34]:
# Plain ufunc call: returns a new array with the exponential of each element.
np.exp(arr)

array([ 1.        ,  2.71828183,  7.3890561 , 20.08553692, 54.59815003])

In [35]:
# Pre-filled output buffer: positions the mask skips keep their initial 0.
ar2 = np.zeros(5)
# Compute exp only where arr > 3 and write the results into ar2 in place.
np.exp(
    arr,
    out=ar2,
    where=(arr > 3),
)

array([ 0.        ,  0.        ,  0.        ,  0.        , 54.59815003])

In [36]:
# ar2 itself was filled in place through the out= argument of the previous call.
ar2

array([ 0.        ,  0.        ,  0.        ,  0.        , 54.59815003])

## UFuncs: Index Alignment

### Index alignment in Series

In [37]:
# Two Series keyed by state name; their label sets only partly overlap
# ('Alaska' appears only in area, 'New York' only in population).
area = pd.Series(
    {
        'Alaska': 1723337,
        'Texas': 695662,
        'California': 423967,
    },
    name='area',
)
population = pd.Series(
    {
        'California': 38332521,
        'Texas': 26448193,
        'New York': 19651127,
    },
    name='population',
)

In [38]:
# Area values indexed by state name.
area

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64

In [39]:
# Population values indexed by state name.
population

California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [40]:
population / area
# The result's index is the union of the indices of the two inputs.
# Labels missing from either input are filled with NaN by default,
# so missing values appear in the result.

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [41]:
# Two integer Series whose indices overlap only on labels 1 and 2.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])

In [42]:
# Series A, indexed 0-2.
A

0    2
1    4
2    6
dtype: int64

In [43]:
# Series B, indexed 1-3.
B

1    1
2    3
3    5
dtype: int64

In [44]:
# Values are aligned by index label; labels present in only one Series (0 and 3) give NaN.
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

Using the `add()` method instead of the `+` operator allows explicit specification of the fill value for any elements in ``A`` or ``B`` that might be missing:

In [45]:
A.add(B, fill_value=0)     # use 0 in place of the missing values

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

In [46]:
# Re-seed the generator so this frame is reproducible on its own.
rng = np.random.RandomState(seed=42)
# 2x2 frame of random integers below 20, with columns A and B.
A = pd.DataFrame(
    data=rng.randint(low=0, high=20, size=(2, 2)),
    columns=['A', 'B'],
)
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [47]:
# Re-seed the generator so this frame is reproducible on its own.
rng = np.random.RandomState(seed=42)
# 3x3 frame of random single-digit integers; the columns are deliberately
# ordered B, A, C so they do not line up positionally with frame A.
B = pd.DataFrame(
    data=rng.randint(low=0, high=10, size=(3, 3)),
    columns=['B', 'A', 'C'],
)
B

Unnamed: 0,B,A,C
0,6,3,7
1,4,6,9
2,2,6,7


In [48]:
# Rows and columns are both aligned by label, regardless of their order in the two frames;
# positions missing from either frame become NaN.
A + B

Unnamed: 0,A,B,C
0,9.0,25.0,
1,20.0,14.0,
2,,,


In [49]:
# Same label-aligned addition, with 0 standing in for entries missing from either frame.
A.add(other=B, fill_value=0)

Unnamed: 0,A,B,C
0,9.0,25.0,7.0
1,20.0,14.0,9.0
2,6.0,2.0,7.0


Here we fill with the mean of all values in ``A`` (computed by first stacking the rows of ``A``):

In [50]:
# Small 2x3 array for checking how NumPy's mean behaves on 2-D data.
ar = np.array([
    [2, 7, 3],
    [1, 9, 2],
])

In [51]:
# With no axis argument, the ndarray mean is taken over every element.
ar.mean()

4.0

In [52]:
# Frame A again, for comparison with the means computed below.
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [53]:
A.mean()   # by default, the mean of each column

A    10.0
B    14.5
dtype: float64

In [54]:
A.mean(axis='columns')   # same as axis=1: the mean of each row instead

0    12.5
1    12.0
dtype: float64

In [55]:
# Frame A once more, before reshaping it.
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [77]:
# Confirm that A is a DataFrame.
type(A)

pandas.core.frame.DataFrame

In [56]:
# Time the mean taken over the frame's underlying NumPy array.
%timeit A.values.mean()

9.86 µs ± 95.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [85]:
A.stack()  # reshape the 2-D frame into a 1-D Series indexed by (row, column)

0  A     6
   B    19
1  A    14
   B    10
dtype: int64

In [59]:
# unstack() also gives a 1-D Series here, but indexed by (column, row).
A.unstack()

A  0     6
   1    14
B  0    19
   1    10
dtype: int64

In [60]:
# Mean of all values in A: stack the frame into one Series, then average it.
fill = A.stack().mean()
fill

12.25

In [61]:
%%timeit
# Time the stack-then-mean approach, for comparison with A.values.mean().
fill = A.stack().mean()
fill

352 µs ± 17.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [62]:
# Use the overall mean of A as the stand-in for entries missing from either frame.
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,9.0,25.0,19.25
1,20.0,14.0,21.25
2,18.25,14.25,19.25


In [63]:
# IPython help lookup: show the signature and docstring of the stack method.
A.stack?

Signature: A.stack(level=-1, dropna=True)
Docstring:
Stack the prescribed level(s) from columns to index.

Return a reshaped DataFrame or Series having a multi-level
index with one or more new inner-most levels compared to the current
DataFrame. The new inner-most levels are created by pivoting the
columns of the current dataframe:

  - if the columns have a single level, the output is a Series;
  - if the columns have multiple levels, the new index
    level(s) is (are) taken from the prescribed level(s) and
    the output is a DataFrame.

The new index levels are sorted.

Parameters
----------
level : int, str, list, default -1
    Level(s) to stack from the column axis onto the index
    axis, defined as one index or label, or a list of indices
    or labels.
dropna : bool, default True
    Whether to drop rows in the resulting Frame/Series with
    missing values.

## Lab

<b>有兩個 Series 如下，求兩個 Series 的和，遇到 NaN 以零代替</b>

In [64]:
# Two float Series whose labels only partly overlap ('a', 'c' and 'e' are shared).
s1 = pd.Series(data=[8.6, -3.2, 6.1, 2.4], index=list('acde'))
s2 = pd.Series(data=[-4.2, 7.3, -2.7, 5.0, 1.8], index=list('acefg'))

In [65]:
# First input Series for the exercise.
s1

a    8.6
c   -3.2
d    6.1
e    2.4
dtype: float64

In [66]:
# Second input Series for the exercise.
s2

a   -4.2
c    7.3
e   -2.7
f    5.0
g    1.8
dtype: float64

<b>有兩個 DataFrame 如下，求兩個 DataFrame 的乘積，遇到 NaN 以 1 代替</b>

In [67]:
# Fix the global seed so both frames are reproducible.
np.random.seed(61)
# 3x4 frame of random integers below 10, columns a-d.
df1 = pd.DataFrame(
    np.random.randint(10, size=(3, 4)),
    columns=['a', 'b', 'c', 'd'],
)
# 4x5 frame of random integers below 20, columns a-e (one extra row and column).
df2 = pd.DataFrame(
    np.random.randint(20, size=(4, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [68]:
# First input frame for the exercise.
df1

Unnamed: 0,a,b,c,d
0,3,2,2,0
1,2,1,7,5
2,3,7,0,9


In [69]:
# Second input frame for the exercise.
df2

Unnamed: 0,a,b,c,d,e
0,10,19,15,4,2
1,19,10,5,14,8
2,2,13,6,3,2
3,15,13,14,15,9
