In [44]:
import pandas as pd
import numpy as np
import os

Data Structures

In [45]:
# Create Series
# Draw from a seeded generator so the printed values are the same after every
# kernel restart (np.random.randn reads unseeded global state).
rng = np.random.default_rng(seed=0)
a = [1, 2, 3, 4, 5]
b = pd.Series(data=rng.standard_normal(5),
              index=a)
print(b)

# A dict turns into a Series whose index is the dict's keys
simple_dict1 = {
    "library": "pandas",
    "version": "1.3.4",
    "year": 2023}
simple_series = pd.Series(simple_dict1)
print(simple_series, simple_series["year"], sep="\n")

# Modify Series
# Assigning through an existing label overwrites that entry
simple_series["year"] = 2024
print(simple_series)

1   -0.789990
2   -0.883346
3    0.430660
4    1.874114
5    0.472657
dtype: float64
library    pandas
version     1.3.4
year         2023
dtype: object
2023
library    pandas
version     1.3.4
year         2024
dtype: object


In [46]:
# Create DataFrame
# Every dict key becomes a column; the explicit index supplies the row labels.
data = dict(i=[121, 144, 169],
            j=[11, 12, 13])
row_labels = ["a", "b", "c"]
df_data = pd.DataFrame(data=data, index=row_labels)
print(df_data)

     i   j
a  121  11
b  144  12
c  169  13


Indexing

In [51]:
# load dataset
# Only the seven columns listed below are read from the CSV.
data_path = os.path.join("data", "winequality-red.csv")
wine_columns = [
    "residual sugar",
    "chlorides",
    "total sulfur dioxide",
    "density",
    "pH",
    "alcohol",
    "quality",
]
df = pd.read_csv(data_path, sep=",", header=0, usecols=wine_columns)

# rename columns
short_names = {
    "residual sugar": "rSugar",
    "total sulfur dioxide": "total_so2",
}
df = df.rename(columns=short_names)
print(df.columns)

# return single row
# A single integer position yields that row as a Series
sixth_row = df.iloc[5]
print(sixth_row)

# return multiple rows
# A list of positions yields a DataFrame with just those rows
picked_rows = df.iloc[[5, 9, 10]]
print(picked_rows)

# return a range of rows
# Positional slice: the end position (10) is excluded
row_window = df.iloc[5:10]
print(row_window)

# conditional filtering
# Boolean mask: keep rows whose quality is above 7
is_high_quality = df["quality"] > 7
hq_wine = df.loc[is_high_quality]
print(hq_wine)

# Among those, keep rows where residual sugar is below pH
sugar_below_ph = hq_wine["rSugar"] < hq_wine["pH"]
sugar_low_ph = hq_wine.loc[sugar_below_ph]

# index row selection
# The integer labels deliberately differ from the positions, so the same
# number selects different elements through .loc (label) and .iloc (position).
fruit_values = ["peach", "caramel", "apples",
                "melon", "orange", "grapes"]
fruit_labels = [75, 43, 23, 1, 2, 3]
idx_series = pd.Series(data=fruit_values, index=fruit_labels)

by_label = idx_series.loc[1]       # entry whose label is 1
by_position = idx_series.iloc[1]   # second entry, whatever its label
print(by_label, by_position)

Index(['rSugar', 'chlorides', 'total_so2', 'density', 'pH', 'alcohol',
       'quality'],
      dtype='object')
rSugar        1.8000
chlorides     0.0750
total_so2    40.0000
density       0.9978
pH            3.5100
alcohol       9.4000
quality       5.0000
Name: 5, dtype: float64
    rSugar  chlorides  total_so2  density    pH  alcohol  quality
5      1.8      0.075       40.0   0.9978  3.51      9.4        5
9      6.1      0.071      102.0   0.9978  3.35     10.5        5
10     1.8      0.097       65.0   0.9959  3.28      9.2        5
   rSugar  chlorides  total_so2  density    pH  alcohol  quality
5     1.8      0.075       40.0   0.9978  3.51      9.4        5
6     1.6      0.069       59.0   0.9964  3.30      9.4        5
7     1.2      0.065       21.0   0.9946  3.39     10.0        7
8     2.0      0.073       18.0   0.9968  3.36      9.5        7
9     6.1      0.071      102.0   0.9978  3.35     10.5        5
      rSugar  chlorides  total_so2  density    pH  alcohol  qua

Multi-Indexing

In [58]:
# multi index indexing
# Seeded generator: np.random.randn draws from unseeded global state, so the
# printed frames would otherwise differ on every run.
rng = np.random.default_rng(seed=0)

# Two parallel arrays give a two-level row index (outer chunk, inner letter)
multi_index_arr = [
    np.array(
        ['chunk1', 'chunk1', 'chunk1', 'chunk1', 'chunk1',
         'chunk2', 'chunk2', 'chunk2', 'chunk2', 'chunk2']),
    np.array(['a', 'b', 'c', 'd', 'e',
              'f', 'g', 'h', 'i', 'j'])]
df = pd.DataFrame(rng.standard_normal((10, 4)),
                  index=multi_index_arr)
print(df)

# multi column indexing
# Same construction for the columns: (outer chunk, inner letter)
multi_index_col = [
    np.array(
        ['chunk1a', 'chunk1a', 'chunk1a',
         'chunk2a', 'chunk2a', 'chunk2a']),
    np.array(['k', 'l', 'm',
              'n', 'o', 'p'])]
df = pd.DataFrame(rng.standard_normal((10, 6)),
                  index=multi_index_arr,
                  columns=multi_index_col)
print(df)

# retrieve chunks
# A single .loc call takes the row key and the column key together, instead
# of chaining two separate lookups (df.loc[...][...]).
chunk1_block = df.loc["chunk1", "chunk1a"]
print(chunk1_block)

# Tuples address every level of an axis: row ('chunk1', 'a'),
# column ('chunk2a', 'p') -> one scalar.
chunk1a_p = df.loc[("chunk1", "a"), ("chunk2a", "p")]
print(chunk1a_p)

                 0         1         2         3
chunk1 a -0.799218  0.401604 -0.247161  0.335426
       b  0.910829 -1.302787  1.937504  0.543357
       c -0.251698 -2.598736  0.313187 -0.337458
       d -2.385283  0.732763 -0.415969 -1.957286
       e  1.200936  1.504765 -0.702466  1.760458
chunk2 f  0.312511  0.619080 -0.949184  0.675571
       g -0.414630 -0.891160 -1.691088  0.994845
       h  0.031977  0.450205 -0.104500 -0.254625
       i  0.418491 -0.672941  0.050899 -0.554301
       j -0.284222  0.878741  0.261171 -0.480562
           chunk1a                       chunk2a                    
                 k         l         m         n         o         p
chunk1 a -0.959872  1.253158  1.526952  0.031791  1.194948  0.274431
       b  1.749856  0.354019 -0.048939  0.279909  1.659103 -0.923058
       c -0.462409  0.539482 -0.489243 -0.472929 -0.504509 -0.847867
       d -0.748435  0.715075  1.028249 -0.705451 -1.051276 -2.054669
       e -0.001426  0.335280 -0.153502  0.31033

Time Delta Index

In [64]:
# create time delta index
# 50 evenly spaced offsets starting at zero, one every 36 seconds
n_steps = 50
timedelta_index = pd.timedelta_range(
    start='0 minutes', freq='36s', periods=n_steps)
print(timedelta_index)

# One consecutive integer per offset, indexed by the offsets themselves
series_a = pd.Series(data=np.arange(n_steps), index=timedelta_index)
print(series_a)

TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:36', '0 days 00:01:12',
                '0 days 00:01:48', '0 days 00:02:24', '0 days 00:03:00',
                '0 days 00:03:36', '0 days 00:04:12', '0 days 00:04:48',
                '0 days 00:05:24', '0 days 00:06:00', '0 days 00:06:36',
                '0 days 00:07:12', '0 days 00:07:48', '0 days 00:08:24',
                '0 days 00:09:00', '0 days 00:09:36', '0 days 00:10:12',
                '0 days 00:10:48', '0 days 00:11:24', '0 days 00:12:00',
                '0 days 00:12:36', '0 days 00:13:12', '0 days 00:13:48',
                '0 days 00:14:24', '0 days 00:15:00', '0 days 00:15:36',
                '0 days 00:16:12', '0 days 00:16:48', '0 days 00:17:24',
                '0 days 00:18:00', '0 days 00:18:36', '0 days 00:19:12',
                '0 days 00:19:48', '0 days 00:20:24', '0 days 00:21:00',
                '0 days 00:21:36', '0 days 00:22:12', '0 days 00:22:48',
                '0 days 00:23:24', '0 days 00:24:00

Missing Values

In [72]:
# showing NaNs
# Seeded generator so the two populated rows are identical on every run
# (np.random.randn draws from unseeded global state).
rng = np.random.default_rng(seed=0)
df = pd.DataFrame(rng.standard_normal((2, 3)),
                  index=[1, 2,],
                  columns=["a", "b", "c"])
# Labels 3, 4 and 5 do not exist yet, so reindexing adds them as all-NaN rows
df = df.reindex([1, 2, 3, 4, 5])
print(df)

# show NaNs as mask
print(df.isna())

# show non-NaNs as mask
print(df.notna())

# fill NaN values
# Constant fill: every NaN becomes 0 in a new frame; df itself is unchanged
df_fill = df.fillna(0)
print(df_fill)

# Linear interpolation down the rows. Rows 3-5 have no valid value after
# them, so they come out as copies of row 2 rather than as new values.
df_interpolate = df.interpolate(method='linear',
                                axis=0)
print(df_interpolate)

          a         b         c
1 -0.606153  2.171682 -0.161026
2 -1.308022 -1.019307 -0.052178
3       NaN       NaN       NaN
4       NaN       NaN       NaN
5       NaN       NaN       NaN
       a      b      c
1  False  False  False
2  False  False  False
3   True   True   True
4   True   True   True
5   True   True   True
       a      b      c
1   True   True   True
2   True   True   True
3  False  False  False
4  False  False  False
5  False  False  False
          a         b         c
1 -0.606153  2.171682 -0.161026
2 -1.308022 -1.019307 -0.052178
3  0.000000  0.000000  0.000000
4  0.000000  0.000000  0.000000
5  0.000000  0.000000  0.000000
          a         b         c
1 -0.606153  2.171682 -0.161026
2 -1.308022 -1.019307 -0.052178
3 -1.308022 -1.019307 -0.052178
4 -1.308022 -1.019307 -0.052178
5 -1.308022 -1.019307 -0.052178


Data Transformation

In [80]:
# Small product table used for the transformations below
data = {
    'product': ['Apple', 'Orange', 'Melon', 'Peach', 'Pineapple'],
    'price': [22, 27, 25, 29, 35],
    'quantity': [5, 7, 4, 9, 3]}
df = pd.DataFrame(data)

# show statistics
# On the frame, describe() summarises the numeric columns; on the text
# column it reports count/unique/top/freq instead.
print(df.describe())
print(df['product'].describe())

# dataframe info
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df.info()

# creating a new variable
# Element-wise product of two columns, stored as a new column on df
df['total_cost'] = df['price'] * df['quantity']
print(df)

# sorting values
# sort_values returns a new frame ordered by price; df keeps its row order
df_sorted = df.sort_values(by='price')
print(df_sorted)

           price  quantity
count   5.000000  5.000000
mean   27.600000  5.600000
std     4.878524  2.408319
min    22.000000  3.000000
25%    25.000000  4.000000
50%    27.000000  5.000000
75%    29.000000  7.000000
max    35.000000  9.000000
count         5
unique        5
top       Apple
freq          1
Name: product, dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   product   5 non-null      object
 1   price     5 non-null      int64 
 2   quantity  5 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 252.0+ bytes
None
     product  price  quantity  total_cost
0      Apple     22         5         110
1     Orange     27         7         189
2      Melon     25         4         100
3      Peach     29         9         261
4  Pineapple     35         3         105
     product  price  quantity  total_cost
0      Apple     22 

Combining Multiple Pandas Objects

In [85]:
# Employee table; DeptID 104 and 105 have no counterpart in right_df
left_df = pd.DataFrame(dict(
    ID=[1, 2, 3, 4, 5],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'John'],
    DeptID=[101, 102, 103, 104, 105]))

# Department lookup table
right_df = pd.DataFrame(dict(
    DeptID=[101, 102, 103],
    DeptName=['IT', 'Finance', 'HR']))

# left join with merge
# Every row of left_df is kept; unmatched DeptIDs get NaN for DeptName
left_join = pd.merge(left_df, right_df, how="left", on="DeptID")
print(left_join)

# left join with join
# join() matches the caller's "DeptID" column against the other frame's
# index, so the lookup table is indexed by DeptID first
dept_by_id = right_df.set_index("DeptID")
left_join = left_df.join(dept_by_id, how="left", on="DeptID")
print(left_join)

   ID     Name  DeptID DeptName
0   1    Alice     101       IT
1   2      Bob     102  Finance
2   3  Charlie     103       HR
3   4    David     104      NaN
4   5     John     105      NaN
   ID     Name  DeptID DeptName
0   1    Alice     101       IT
1   2      Bob     102  Finance
2   3  Charlie     103       HR
3   4    David     104      NaN
4   5     John     105      NaN


Data Reshaping

In [100]:
# Long-format readings: one row per (date, city) pair
dates = ['2023-06-01', '2023-06-02']
cities = ['Cupertino', 'Portland', 'Fort Worth']
data = {
    'Date': [day for day in dates for _ in cities],
    'City': cities * len(dates),
    'Temperature': [56, 65, 69, 59, 61, 66]}
df = pd.DataFrame(data)
print(df)

# pivots
# Reshape long -> wide: one row per date, one column per city,
# temperatures as the cell values
pivoted_df = df.pivot(
    values='Temperature',
    index="Date",
    columns='City')
print(pivoted_df)

# Monthly sales: four regions per month, product A in January and
# product B in February
months = ['January', 'February']
regions = ['North', 'East', 'West', 'South']
data = {
    'Month': [month for month in months for _ in regions],
    'Region': regions * len(months),
    'Product': ['Product A'] * 4 + ['Product B'] * 4,
    'Sales': [200, 150, 180, 210, 190, 175, 225, 250]}
df = pd.DataFrame(data)
print(df)

# pivot table (agg func)
# One row per product, one column per region; cells hold the mean of Sales
pivot_table_df = df.pivot_table(
    values='Sales',
    index='Product',
    columns='Region',
    aggfunc='mean')
print(pivot_table_df)

# Wide table: one row per product, one column per quarter.
# Quarter columns are generated in calendar order; each quarter adds 50 to
# the previous one, starting at 100 (Product A) and 80 (Product B).
quarter_labels = [f'{year}-Q{quarter}'
                  for year in (2023, 2024)
                  for quarter in range(1, 5)]
data1 = {'Product': ['Product A', 'Product B']}
for offset, label in enumerate(quarter_labels):
    data1[label] = [100 + 50 * offset, 80 + 50 * offset]
df = pd.DataFrame(data1)
print(df)

# stacks
# stack() moves the column labels into an inner index level, giving one
# value per (row, column) pair. The 'Product' column is stacked along with
# the quarter columns because it is a regular column here.
quart_sales = df.stack()
print(quart_sales)

# unstacks
# unstack() pivots the inner index level back out into columns
unstacked_df = quart_sales.unstack()
print(unstacked_df)

# Wide table: one row per product, one sales column per month
melt_data = {
    'Product': ['Product A', 'Product B', 'Product C'],
    'January': [300, 200, 400],
    'February': [350, 250, 450],
    'March': [400, 300, 500],
}
melt_df = pd.DataFrame(data=melt_data)
print(melt_df)

# melting (wide data -> narrow data)
# 'Product' stays as an identifier column; every other column name goes
# into 'Month' and its value into 'Sales'
melted_df = melt_df.melt(
    value_name='Sales',
    var_name='Month',
    id_vars=['Product'])
print(melted_df)

# Survey answers: nine people, their state, and what they like most
ctdata = {
    'Person': list(range(1, 10)),
    'State': ['NY', 'WA', 'CO'] * 3,
    'Likes': ['Mountains'] * 5 + ['Oceans'] + ['StateParks'] * 3}
ct_df = pd.DataFrame(data=ctdata)
print(ct_df)

# crosstabs
# Frequency table of State against Likes. normalize=True expresses each cell
# as a fraction of all answers; margins=True adds the 'All' row and column.
crosstab_df = pd.crosstab(
    index=ct_df['State'],
    columns=ct_df['Likes'],
    rownames=['Region'],
    colnames=['FavActivity'],
    margins=True,
    normalize=True)
print(crosstab_df)

# Five people and the (repeated) text category each one likes
factdata = {
    'Person': list(range(1, 6)),
    'Likes': ['Mountains'] + ['Oceans'] * 2 + ['Parks'] * 2}
fact_df = pd.DataFrame(data=factdata)
print(fact_df)

# factorize (string -> integer)
# factorize returns a pair: an integer code per row, and the distinct
# values those codes refer to
factorized = pd.factorize(fact_df['Likes'])
labels, uniques = factorized
print(labels, uniques)

# Keep the codes next to the original text column
fact_df['Likes_Coded'] = labels
print(fact_df)

# Per-person sales together with the category each person likes
groupby_data = {
    'Person': list(range(1, 6)),
    'Likes': ['Mountains'] + ['Oceans'] * 2 + ['Parks'] * 2,
    'Sales': [1000, 2000, 1500, 3000, 2500]}
groupby_df = pd.DataFrame(data=groupby_data)
print(groupby_df)

# groupby and aggregate
# One row per 'Likes' value: Sales are summed, Person entries are counted
per_column_agg = {'Sales': 'sum', 'Person': 'count'}
grouped_df = groupby_df.groupby('Likes').agg(per_column_agg)
print(grouped_df)


         Date        City  Temperature
0  2023-06-01   Cupertino           56
1  2023-06-01    Portland           65
2  2023-06-01  Fort Worth           69
3  2023-06-02   Cupertino           59
4  2023-06-02    Portland           61
5  2023-06-02  Fort Worth           66
City        Cupertino  Fort Worth  Portland
Date                                       
2023-06-01         56          69        65
2023-06-02         59          66        61
      Month Region    Product  Sales
0   January  North  Product A    200
1   January   East  Product A    150
2   January   West  Product A    180
3   January  South  Product A    210
4  February  North  Product B    190
5  February   East  Product B    175
6  February   West  Product B    225
7  February  South  Product B    250
Region      East  North  South   West
Product                              
Product A  150.0  200.0  210.0  180.0
Product B  175.0  190.0  250.0  225.0
     Product  2023-Q1  2023-Q2  2023-Q3  2023-Q4  2024-Q1  2024-Q2