#### Filling Missing Values

In [1]:
import numpy as np
import pandas as pd
# Seed NumPy's global RNG so that later np.random calls are reproducible.
np.random.seed(1)

# 7x3 frame holding the integers 0..20. Columns 1 and 2 are cast to float
# *before* NaN is written into them: assigning NaN to an integer column relies
# on silent upcasting, which pandas >= 2.1 deprecates (FutureWarning) and plans
# to turn into an error. Column 0 never receives NaN and stays integer.
df = pd.DataFrame(np.arange(21).reshape(7, 3)).astype({1: "float64", 2: "float64"})
df.iloc[1:4, 1] = np.nan  # rows 1-3 of column 1
df.iloc[-2:, 2] = np.nan  # last two rows of column 2
df

Unnamed: 0,0,1,2
0,0,1.0,2.0
1,3,,5.0
2,6,,8.0
3,9,,11.0
4,12,13.0,14.0
5,15,16.0,
6,18,19.0,


In [2]:
# Replace missing values column by column:
# NaNs in column 1 become 0.5, NaNs in column 2 become 0.25.
df.fillna(
    value={
        1: 0.5,
        2: 0.25,
    }
)


Unnamed: 0,0,1,2
0,0,1.0,2.0
1,3,0.5,5.0
2,6,0.5,8.0
3,9,0.5,11.0
4,12,13.0,14.0
5,15,16.0,0.25
6,18,19.0,0.25


In [12]:
# Fill the missing values of each column with that column's median.
# df.median() yields one median per column (NaNs are skipped by default), and
# fillna aligns those values on the column labels, so columns do not have to
# be listed by hand. Columns without missing values are left as they are.
df.fillna(df.median(numeric_only=True))

Unnamed: 0,0,1,2
0,0,1.0,2.0
1,3,14.5,5.0
2,6,14.5,8.0
3,9,14.5,11.0
4,12,13.0,14.0
5,15,16.0,8.0
6,18,19.0,8.0


#### GroupBy: Split-Apply-Combine

In [4]:
# Building blocks for a synthetic sales log: every day of January 2024,
# three stores and three products.
dates = pd.date_range('2024-01-01', '2024-01-31')
stores = ['Littleton', 'Aurora', 'Highlands']
products = ['Airmax', 'Airforce', 'Jordans']

# Number of randomly generated sales records.
n_rows = 300

# Each row pairs a random date, store and product with a random sales figure
# in [10, 100). The columns are drawn in the order date, store, product, sales;
# that order fixes the values produced by NumPy's global RNG for a given seed.
sales_data = pd.DataFrame(
    {
        'date': np.random.choice(dates, size=n_rows),
        'store': np.random.choice(stores, size=n_rows),
        'product': np.random.choice(products, size=n_rows),
        'sales': np.random.randint(10, 100, size=n_rows),
    }
)

In [14]:
# Display the generated sales records.
sales_data

Unnamed: 0,date,store,product,sales
0,2024-01-06,Littleton,Jordans,25
1,2024-01-12,Highlands,Airforce,50
2,2024-01-13,Highlands,Jordans,70
3,2024-01-09,Highlands,Jordans,80
4,2024-01-10,Highlands,Airforce,59
...,...,...,...,...
295,2024-01-20,Highlands,Airforce,77
296,2024-01-25,Highlands,Airforce,88
297,2024-01-18,Littleton,Airmax,73
298,2024-01-11,Highlands,Airforce,30


In [8]:
# Total sales per store: split the sales column by store, then sum each group.
sales_data['sales'].groupby(sales_data['store']).sum()

store
Aurora       4473
Highlands    6196
Littleton    6329
Name: sales, dtype: int64

In [10]:
# Average sale amount for every (store, product) combination.
(
    sales_data
    .groupby(by=['store', 'product'])['sales']
    .agg('mean')
)

store      product 
Aurora     Airforce    65.233333
           Airmax      52.578947
           Jordans     48.935484
Highlands  Airforce    51.840909
           Airmax      56.289474
           Jordans     53.818182
Littleton  Airforce    63.655172
           Airmax      55.684211
           Jordans     62.289474
Name: sales, dtype: float64

### Reshaping

In [16]:
# Reshape the sales log into a date x store table of total sales:
# one row per date, one column per store, each cell holding the sum of the
# sales recorded for that store on that date.
pivot_total_sales_date_store = pd.pivot_table(
    sales_data,
    values='sales',
    index='date',
    columns='store',
    aggfunc='sum',
)

# Show only the first ten dates.
pivot_total_sales_date_store.head(10)

store,Aurora,Highlands,Littleton
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-01-01,268.0,555.0,172.0
2024-01-02,74.0,102.0,285.0
2024-01-03,56.0,257.0,280.0
2024-01-04,76.0,94.0,372.0
2024-01-05,188.0,129.0,88.0
2024-01-06,188.0,53.0,98.0
2024-01-07,136.0,45.0,113.0
2024-01-08,389.0,309.0,335.0
2024-01-09,67.0,350.0,220.0
2024-01-10,162.0,320.0,11.0
