In [1]:
import pandas as pd

In [2]:
# Small sample Series mixing positive and negative integers; used by the
# filtering, mask and where examples below.
data = pd.Series([10, -5, 3, -1, 7])

In [3]:
# Sample Series of first names (reassigned with different values in a later cell).
names = pd.Series(["Alice", "Bob", "Charlie", "David", "Eve", "Frank"])

In [4]:
# Plain Python list of the names treated as valid.
validNames = ["Alice", "Eve", "Charlie"]

In [5]:
# Boolean filtering: keep only the strictly positive entries.
# The surviving rows keep their original index labels.
isPositive = data > 0
cleanedData = data[isPositive]
cleanedData

Unnamed: 0,0
0,10
2,3
4,7


In [6]:
# Mean of the filtered (strictly positive) values only.
cleanedData.mean()

np.float64(6.666666666666667)

In [7]:
import numpy as np
nonPositive = data <= 0
positiveData = data.mask(nonPositive, np.nan)
positiveData

# Series.mask(cond, other=nan, inplace=False)
# mask is the opposite of boolean filtering:
#   filtering keeps an entry when the condition is True,
#   mask replaces an entry (here with NaN) when the condition is True.

Unnamed: 0,0
0,10.0
1,
2,3.0
3,
4,7.0


In [8]:
nonPositive = data <= 0
nonPositiveData = data.where(nonPositive, np.nan)
nonPositiveData

# Series.where(cond, other=nan, inplace=False)
# Always use np.nan as the replacement, not some arbitrary number.
# where is the inverse of mask: it keeps entries where the condition is True.

Unnamed: 0,0
0,
1,-5.0
2,
3,-1.0
4,


Series.mask parameters:
• cond : Where cond is False, keep the
original value. Where True, replace with
corresponding value from other.
• other : Entries where cond is True are
replaced with corresponding value from
other.
• inplace : Whether to perform the operation
in place on the data.


In [9]:
# Explicitly drop the NaN placeholders that mask() left behind;
# this returns a new Series and leaves positiveData itself unchanged.
positiveData.dropna()

Unnamed: 0,0
0,10.0
2,3.0
4,7.0


Series.where parameters:
• cond : Where cond is True, keep the
original value. Where False, replace with
corresponding value from other.
• other : Entries where cond is False are
replaced with corresponding value from
other.
• inplace : Whether to perform the operation
in place on the data.


In [10]:
names = pd.Series(["Alice Johnson", "Bob", "Charlie", "David", "Eve"])
validNames = ["Alice Johnson", "Eve", "Charlie"]

# isin() builds a boolean Series: True where the name appears in validNames
mask = names.isin(validNames)
mask

Unnamed: 0,0
0,True
1,False
2,True
3,False
4,True


A vectorized function in Pandas (or NumPy) operates on entire Series or
DataFrame objects at once, rather than looping through elements
individually in Python. This makes it much faster and more efficient by
leveraging low-level optimizations.

In [11]:
data2 = pd.Series([10, 20, 30, 40, 50])

# Vectorized arithmetic: each operator is applied to every element at once
add_5 = data2 + 5
square = data2 ** 2

In [12]:
# Display the Series with 5 added to every element.
add_5

Unnamed: 0,0
0,15
1,25
2,35
3,45
4,55


In [13]:
# Display the Series with every element squared.
square

Unnamed: 0,0
0,100
1,400
2,900
3,1600
4,2500


In [14]:
# Square root of each element, computed by calling a Python lambda per element
# (non-vectorized).
# NOTE(review): `data` contains negative entries, which have no real square
# root - check the resulting values/dtype before relying on them.
sqrtValues = data.apply(lambda x: x ** 0.5)  # non-vectorized

# Legacy name kept so any cell that still refers to it keeps working;
# the values are square roots, not logarithms.
log_values = sqrtValues

In [15]:
# Count the spaces in each name with a per-element Python lambda (non-vectorized)
spaceInName = names.map(lambda fullName: fullName.count(" "))
spaceInName

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0


In [16]:
# Same count as the map/lambda version, but through the vectorized .str accessor.
spaceInName2 = names.str.count(" ")  # vectorized
# Display the result computed in this cell (not the earlier spaceInName).
spaceInName2

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0


In [17]:
import re

# Full names with deliberately messy formatting
messyNames = [
    "Alice Bob Johnson",
    "Charlie D. Brown",
    " David   Smith",             # extra spaces
    "Eve@ Adams",                 # special character
    "Franklin Delano Roosevelt",
    "  Grace Hopper  ",           # leading & trailing spaces
]
names = pd.Series(messyNames)

# The built-in vectorized functions are not flexible enough for this clean-up

In [18]:
def extractInitials(name):
    """Return the initials of ``name`` as one string.

    Surrounding whitespace is stripped, every character that is neither an
    ASCII letter nor whitespace is removed, and the first letter of each
    remaining word is concatenated in order (original case is kept).
    """
    # Trim the ends, then drop everything except letters and whitespace
    lettersOnly = re.sub(r"[^a-zA-Z\s]", "", name.strip())

    # \b anchors each match at a word boundary, i.e. the first letter of a word
    return "".join(match.group() for match in re.finditer(r'\b[A-Za-z]', lettersOnly))

# Apply the custom function to the Series: extractInitials receives each
# name string in turn and the returned initials form a new Series.
initialsSe = names.apply(extractInitials)
initialsSe

Unnamed: 0,0
0,ABJ
1,CDB
2,DS
3,EA
4,FDR
5,GH


In [19]:
# Rename values

animals = pd.Series(["Dog", "Cat", "Bird", "Fish"])

# Mapping from current value to replacement value
animalNameMap = dict(Cat="feline", Dog="canine", Bird="avian")

# Values without an entry in the mapping ("Fish") are left unchanged
animalsNameChanged = animals.replace(animalNameMap)
animalsNameChanged

Unnamed: 0,0
0,canine
1,feline
2,avian
3,Fish


• The groupby() function on a Pandas Series is used to group values (here,
based on their index labels) and perform aggregations or transformations on
each group.


In [20]:
import pandas as pd

# Each record pairs a region label with one sales value, in row order
salesRecords = [
    ("East", 3200), ("West", 4500), ("East", 3900), ("West", 4700), ("South", 1500),
    ("South", 1800), ("North", 2200), ("East", 5100), ("North", 3000), ("West", 4900),
    ("East", 4100), ("West", 5200), ("South", 1600), ("South", 1400), ("North", 2300),
    ("West", 5000), ("East", 3300), ("West", 4600), ("South", 1700), ("North", 2000),
]
regionLabels = [region for region, _ in salesRecords]
salesValues = [amount for _, amount in salesRecords]

# Sales values indexed by (repeating) region labels
salesSe = pd.Series(salesValues, index=regionLabels)
salesSe.index.name = "Region"
salesSe.name = "Sales"
salesSe.head()

Unnamed: 0_level_0,Sales
Region,Unnamed: 1_level_1
East,3200
West,4500
East,3900
West,4700
South,1500


groupby() returns a SeriesGroupBy object, which is an
intermediate grouped object that needs
further operations to extract meaningful data.
SeriesGroupBy is iterable: each iteration yields a
(group key, sub-Series) pair.


In [21]:
# Group by the index level named "Region"; iterating the grouped object
# yields one (group key, sub-Series) pair per region.
results = salesSe.groupby("Region")
for region, regionSales in results:
    # The loop variable is deliberately not called `data`, so the `data`
    # Series defined at the top of the notebook is not overwritten.
    print(f"Region: {region}")
    print(regionSales)
    print()

Region: East
Region
East    3200
East    3900
East    5100
East    4100
East    3300
Name: Sales, dtype: int64

Region: North
Region
North    2200
North    3000
North    2300
North    2000
Name: Sales, dtype: int64

Region: South
Region
South    1500
South    1800
South    1600
South    1400
South    1700
Name: Sales, dtype: int64

Region: West
Region
West    4500
West    4700
West    4900
West    5200
West    5000
West    4600
Name: Sales, dtype: int64



In [22]:
# Directly using the index object as the grouping key
results = salesSe.groupby(salesSe.index)
for region, regionSales in results:
    # The loop variable is deliberately not called `data`, so the `data`
    # Series defined at the top of the notebook is not overwritten.
    print(f"Region: {region}")
    print(regionSales)
    print()

Region: East
Region
East    3200
East    3900
East    5100
East    4100
East    3300
Name: Sales, dtype: int64

Region: North
Region
North    2200
North    3000
North    2300
North    2000
Name: Sales, dtype: int64

Region: South
Region
South    1500
South    1800
South    1600
South    1400
South    1700
Name: Sales, dtype: int64

Region: West
Region
West    4500
West    4700
West    4900
West    5200
West    5000
West    4600
Name: Sales, dtype: int64



Aggregation function:
a function that computes a summary statistic over a group of
values, reducing multiple values into a single scalar value.


In [23]:
# Group once by region, then reduce each group's sales to its mean.
results=salesSe.groupby("Region")
results.mean()

Unnamed: 0_level_0,Sales
Region,Unnamed: 1_level_1
East,3920.0
North,2375.0
South,1600.0
West,4816.666667


In [24]:
# Total sales per region.
results.sum()

Unnamed: 0_level_0,Sales
Region,Unnamed: 1_level_1
East,19600
North,9500
South,8000
West,28900


In [25]:
# Smallest single sale per region.
results.min()

Unnamed: 0_level_0,Sales
Region,Unnamed: 1_level_1
East,3200
North,2000
South,1400
West,4500


In [26]:
# Several aggregations in one call: each name in the list becomes one output column.
results.agg(["mean", "sum", "min"]) # .agg method

Unnamed: 0_level_0,mean,sum,min
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,3920.0,19600,3200
North,2375.0,9500,2000
South,1600.0,8000,1400
West,4816.666667,28900,4500


In [27]:
# You can define your own aggregation function
def range(series):
    """Return the spread of ``series``: its maximum minus its minimum.

    NOTE(review): this name shadows Python's built-in ``range`` in every
    later cell; renaming it would also require updating the ``agg`` calls
    that pass ``range``.
    """
    highest = series.max()
    lowest = series.min()
    return highest - lowest

# Pass the custom function object itself to .agg; the result has one value per region.
results.agg(range)

Unnamed: 0_level_0,Sales
Region,Unnamed: 1_level_1
East,1900
North,1000
South,400
West,700


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for each region.
results.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
East,5.0,3920.0,762.889245,3200.0,3300.0,3900.0,4100.0,5100.0
North,4.0,2375.0,434.932945,2000.0,2150.0,2250.0,2475.0,3000.0
South,5.0,1600.0,158.113883,1400.0,1500.0,1600.0,1700.0,1800.0
West,6.0,4816.666667,263.944439,4500.0,4625.0,4800.0,4975.0,5200.0


In [29]:
# Built-in aggregation names (strings) and a custom function can be mixed in one list.
results.agg(["mean", "count", range]) # can mix in your own aggregation function

Unnamed: 0_level_0,mean,count,range
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,3920.0,5,1900
North,2375.0,4,1000
South,1600.0,5,400
West,4816.666667,6,700


In [30]:
# Is "apply" an aggregation function?
def double(x):
    """Return ``x`` multiplied by two."""
    return x * 2

# The function receives each region's sub-Series
results.apply(double)

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales
Region,Region,Unnamed: 2_level_1
East,East,6400
East,East,7800
East,East,10200
East,East,8200
East,East,6600
North,North,4400
North,North,6000
North,North,4600
North,North,4000
South,South,3000


In pandas, the transform function is used to perform some group-specific
computations and return a result that has the same size as the original Series.
Unlike aggregation functions, which reduce the size of the data (like sum, mean),
transform keeps the original structure of the Series, aligning the result back to
the original indices.


In [31]:
data3 = pd.Series(
    [10, 20, 40, 30, 40, 10, 50, 60, 1],
    index=['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'],
)


def getZscore(se):
    """Standardize ``se``: subtract its mean, then divide by its standard deviation."""
    centered = se - se.mean()
    return centered / se.std()


# transform runs getZscore on each index-label group and returns a result
# with the same length and index as data3
zScore = data3.groupby(data3.index).transform(getZscore)
print(zScore)

A   -0.872872
A   -0.218218
A    1.091089
B    0.218218
B    0.872872
B   -1.091089
C    0.411714
C    0.728417
C   -1.140131
dtype: float64


In [32]:
# Same per-group z-score, with the standardization written inline as a lambda
zScore = (
    data3
    .groupby(data3.index)
    .transform(lambda group: (group - group.mean()) / group.std())
)
print(zScore)

A   -0.872872
A   -0.218218
A    1.091089
B    0.218218
B    0.872872
B   -1.091089
C    0.411714
C    0.728417
C   -1.140131
dtype: float64
