# Operating on Data in Pandas

### Ufuncs: Index Preservation

In [4]:
import pandas as pd
import numpy as np

# One seeded generator feeds every random draw below, so results are repeatable.
rng = np.random.default_rng(seed=42)
ser = pd.Series(rng.integers(low=0, high=10, size=4))
ser

0    0
1    7
2    6
3    4
dtype: int64

In [5]:
# 3x4 frame of integers in [0, 10) with columns labelled A..D.
df = pd.DataFrame(
    data=rng.integers(low=0, high=10, size=(3, 4)),
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
0,4,8,0,6
1,2,0,5,9
2,7,7,7,7


**Applying a NumPy ufunc to a pandas object returns the same kind of pandas object, with its index preserved**

In [6]:
# Applying a NumPy ufunc to a Series returns a Series with the same index.
np.exp(ser)

0       1.000000
1    1096.633158
2     403.428793
3      54.598150
dtype: float64

In [7]:
# More involved expressions behave the same way: the result is a DataFrame
# that keeps df's row index and column labels.
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,1.224647e-16,-2.449294e-16,0.0,-1.0
1,1.0,0.0,-0.707107,0.707107
2,-0.7071068,-0.7071068,-0.707107,-0.707107


### Ufuncs: Index Alignment

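**Index Alignment in Series**

Suppose we are combining two different data sources and find only the top three US states by area and the top three US states by population.
Dividing one by the other gives the population density, and any state missing from either source comes out as `NaN`: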

In [8]:
# Two sources whose state labels only partially overlap.
area = pd.Series(
    [1723337, 695662, 423967],
    index=["Alaska", "Texas", "California"],
    name="area",
)
population = pd.Series(
    [39538223, 29145505, 21538187],
    index=["California", "Texas", "Florida"],
    name="population",
)

# Division aligns on the state labels; a state missing from either
# Series has no partner value and ends up as NaN.
density = population.div(area)
density

Alaska              NaN
California    93.257784
Florida             NaN
Texas         41.896072
dtype: float64

In [10]:
# The index of `density` is the union of the indices of the two input Series.
area.index.union(population.index)

Index(['Alaska', 'California', 'Florida', 'Texas'], dtype='object')

In [11]:
# Labels 1 and 2 are shared; label 0 exists only in A and label 3 only in B.
A = pd.Series({0: 2, 1: 4, 2: 6})
B = pd.Series({1: 1, 2: 3, 3: 5})
A.add(B)

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [13]:
# add() accepts fill_value, which stands in for a label that is missing
# from one of the two Series instead of producing NaN:
#   label 0 -> A[0] + 0
#   label 3 -> 0 + B[3]
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

**Index Alignment in DataFrames**

In [18]:
# 2x2 frame of integers in [0, 20) with columns a, b.
A = pd.DataFrame(
    data=rng.integers(low=0, high=20, size=(2, 2)),
    columns=list("ab"),
)
A

Unnamed: 0,a,b
0,1,9
1,15,3


In [19]:

# 3x3 frame of integers in [0, 10); columns are deliberately ordered b, a, c.
B = pd.DataFrame(
    data=rng.integers(low=0, high=10, size=(3, 3)),
    columns=list("bac"),
)
B

Unnamed: 0,b,a,c
0,4,1,6
1,4,3,2
2,5,6,9


In [20]:
# Rows and columns are both aligned by label (the result's columns are a, b, c
# even though B lists them as b, a, c); labels found in only one frame give NaN.
A + B

Unnamed: 0,a,b,c
0,2.0,13.0,
1,18.0,7.0,
2,,,


In [21]:
# Wherever only one frame has a value, the other side is filled with the
# mean of all entries of A (7.0 for the values shown above) before adding.
A.add(B, fill_value=A.to_numpy().mean())

Unnamed: 0,a,b,c
0,2.0,13.0,13.0
1,18.0,7.0,9.0
2,13.0,12.0,16.0


### Ufuncs: Operations Between DataFrames and Series

In [22]:
# Draw a 3x4 block of integers in [0, 10) and label its columns Q..T.
A = rng.integers(low=0, high=10, size=(3, 4))
df = pd.DataFrame(data=A, columns=list("QRST"))
df

Unnamed: 0,Q,R,S,T
0,4,1,8,6
1,7,0,3,7
2,8,4,8,8


In [24]:
# Row-wise: the Series df.iloc[0] is matched to df's columns by label and
# subtracted from every row, so row 0 becomes all zeros.
df - df.iloc[0]  # subtract the first row

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,3,-1,-5,1
2,4,3,0,2


In [25]:
# Column-wise: use the method form and pass axis=0 so that column R is
# matched against the row index and subtracted from every column.
df.subtract(df["R"], axis=0)

Unnamed: 0,Q,R,S,T
0,3,0,7,5
1,7,0,3,7
2,4,0,4,4


In [26]:
# Row 0, every second column starting from the first: positions 0 and 2 (Q and S).
halfrow = df.iloc[0, ::2]
halfrow

Q    4
S    8
Name: 0, dtype: int64

In [29]:
# halfrow holds only Q = 4 and S = 8, so the subtraction is aligned by
# column label: Q - 4 and S - 8, while R and T (absent from halfrow) become NaN.
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,3.0,,-5.0,
2,4.0,,0.0,
