In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

## NumPy

- Stands for Numerical Python.
- Collection of data structures and efficient operations on them.
- Building block for most data science packages in Python.

Why is NumPy better than using regular Python data structures (lists, dictionaries, etc.)? Efficiency!

- Broadcasting.
- Vectorization.

In [None]:
# Build a one-dimensional ndarray from a plain Python list.
x = np.array([1, 2, 3, 4, 5])

# Broadcasting: the scalar 1 is applied to every element, so no explicit loop is needed.
x += 1
print(f"{x} {x.dtype}")

In [3]:
# Two equal-length integer ndarrays to be added element by element.
x = np.array([1, 2, 3, 4, 5])
y = np.array([5, 4, 3, 2, 1])

# Vectorized: one expression adds every pair of elements inside NumPy.
z = x + y
print(z)

# Iterative (don't do this!): same result, but each element is handled
# one at a time by the Python interpreter.
z = np.zeros_like(x)
for i in range(x.size):
    z[i] = x[i] + y[i]
print(z)

[6 6 6 6 6]
[6 6 6 6 6]


In [12]:
# Two 500 x 500 matrices of standard-normal draws, used by the timing cells below.
# A seeded Generator makes the values identical on every Restart & Run All,
# unlike the unseeded global np.random state.
rng = np.random.default_rng(0)
X = rng.standard_normal((500, 500))
Y = rng.standard_normal((500, 500))

In [13]:
%%time 
# Matrix product via the @ operator, which for ndarrays has the semantics of np.matmul.
Z = X @ Y

CPU times: user 15.8 ms, sys: 14.1 ms, total: 29.9 ms
Wall time: 35.3 ms


In [14]:
%%time
# Naive matrix product kept as a timing baseline: Z[i, j] accumulates the dot
# product of row i of X with column j of Y. All 500**3 multiply-adds run in the
# Python interpreter, which is why this is so much slower than the NumPy call.
n = 500
Z = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        for k in range(n):
            Z[i, j] = Z[i, j] + X[i, k] * Y[k, j]

CPU times: user 1min 59s, sys: 2.36 s, total: 2min 1s
Wall time: 2min 21s


## Pandas

- Wrapper around NumPy for structured tabular data.
- Good for organizing and manipulating data.

Read about the titanic data:
https://www.kaggle.com/c/titanic/data/

In [3]:
# Location of the Titanic passenger CSV (raw file hosted on GitHub).
TITANIC_CSV_URL = (
    "https://raw.githubusercontent.com/stanford-mse-125-2025/"
    "mse-125-2025-public/refs/heads/main/demos/titanic.csv"
)

# Download and parse the CSV into a DataFrame, then show it as the cell output.
data = pd.read_csv(TITANIC_CSV_URL)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [8]:
# A DataFrame can hand its contents back as a NumPy array. `to_numpy()` is the
# accessor pandas recommends over the older `.values` attribute. Because the
# columns mix numbers and strings, the combined array falls back to dtype=object.
data.to_numpy()

array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ...,
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], shape=(891, 12), dtype=object)

### Filter

Select a subset of rows and/or columns from the DataFrame.

Ways to subset rows:
- A boolean array/series.
- A list of indices.

Ways to subset columns:
- A single or list of column names.
- A single or list of booleans.
- A single or list of indices.

General rules:
- Use `[]` when subsetting columns with names or booleans, or rows with booleans.
- Use `.loc` when subsetting *both* rows and columns by labels or booleans (not by integer positions).
- Use `.iloc` when subsetting by integer positions.

In [None]:
# Only the final expression of a cell is rendered; the earlier lines show the syntax.

# A single column selected by name
data["Sex"]

# Rows selected with a boolean Series
data[data["Sex"].eq("female")]

# Rows selected with booleans and columns selected by name, in one .loc call
data.loc[data["Sex"].eq("female"), ["Age", "Fare"]]

# Second row (position 1) and the 3rd-6th columns (positions 2-5; the stop index 6 is excluded)
data.iloc[1, 2:6]

Pclass                                                    1
Name      Cumings, Mrs. John Bradley (Florence Briggs Th...
Sex                                                  female
Age                                                    38.0
Name: 1, dtype: object

### Aggregate

Get aggregated statistics of the data.

In [15]:
# Summary statistics of the Age column: count, mean, std, min, quartiles and max.
data["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [17]:
# Fraction of missing values per column: isna() marks each missing cell as True,
# and the column-wise mean of those booleans is the share of missing entries.
data.isna().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

### Transform

- Replace a column or make a new column using existing columns.

In [None]:
# Element-wise transforms of Age, stored as two new columns on `data`.
data["log_age"] = np.log(data["Age"])  # natural logarithm of each age
data["age_squared"] = data["Age"].pow(2)  # each age raised to the power 2

### Sort

In [17]:
# Ten youngest passengers: sort the rows by Age in ascending order and keep the first ten.
data.sort_values("Age").head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S
831,832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S
305,306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
386,387,0,3,"Goodwin, Master. Sidney Leonard",male,1.0,5,2,CA 2144,46.9,,S
172,173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S
183,184,1,2,"Becker, Master. Richard F",male,1.0,2,1,230136,39.0,F4,S


### Groupby

A convenient way to apply aggregate functions to specific subgroups.

<center>
<img src="split_apply_combine.png" alt="Split-apply-combine diagram" width="500"/>
</center>

In [18]:
# Split the rows by Sex, then compute the Age summary statistics within each group.
data.groupby("Sex")["Age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,261.0,27.915709,14.110146,0.75,18.0,27.0,37.0,63.0
male,453.0,30.726645,14.678201,0.42,21.0,29.0,39.0,80.0


### Pivot/Melt

In [8]:
# Mean Age and Fare for every (Sex, Pclass) combination; as_index=False keeps
# the group keys as ordinary columns instead of moving them into the index.
data.groupby(["Sex", "Pclass"], as_index=False)[["Age", "Fare"]].mean()

Unnamed: 0,Sex,Pclass,Age,Fare
0,female,1,34.611765,106.125798
1,female,2,28.722973,21.970121
2,female,3,21.75,16.11881
3,male,1,41.281386,67.226127
4,male,2,30.740707,19.741782
5,male,3,26.507589,12.661633


In [10]:
# Long format: one row per (Sex, Pclass) pair holding the mean Age and Fare.
means = (
    data
    .groupby(["Sex", "Pclass"])[["Age", "Fare"]]
    .agg("mean")
    .reset_index()
)

# Wide format: Pclass becomes the row index and each value column is split by Sex,
# giving two-level column labels such as ("Age", "female").
means_wide = means.pivot(index="Pclass", columns="Sex")
means_wide

Unnamed: 0_level_0,Age,Age,Fare,Fare
Sex,female,male,female,male
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,34.611765,41.281386,106.125798,67.226127
2,28.722973,30.740707,21.970121,19.741782
3,21.75,26.507589,16.11881,12.661633


In [11]:
# With two-level column labels, one column is selected by passing a (value, Sex) tuple.
means_wide[("Age", "female")]

Pclass
1    34.611765
2    28.722973
3    21.750000
Name: (Age, female), dtype: float64

### Merging

In [None]:
# Extra per-passenger attributes, defined for the first five PassengerIds only.
additional_info = pd.DataFrame(
    {
        "PassengerId": [1, 2, 3, 4, 5],
        "Check_In_Gate": ["Gate A", "Gate B", "Gate A", "Gate A", "Gate C"],
        "Room_Quality": ["Luxury", "Standard", "Standard", "Luxury", "Economy"],
    }
)

# Inner join on PassengerId (the default join type): only passengers present
# in both frames are kept, so the result has five rows.
data.merge(additional_info, how="inner", on="PassengerId")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Check_In_Gate,Room_Quality
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Gate A,Luxury
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Gate B,Standard
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Gate A,Standard
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Gate A,Luxury
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Gate C,Economy
