# Chapter 6 - Series Introduction

In [168]:
import pandas as pd

# First example Series, built from a plain list. No index is supplied, so
# pandas labels the rows 0..3 on its own.
songs2 = pd.Series(
    data=[145, 142, 38, 13],
    name='counts',
)
songs2

0    145
1    142
2     38
3     13
Name: counts, dtype: int64

In [169]:
# Same counts as songs2, but labelled by name and stored with the
# pyarrow-backed integer dtype instead of the NumPy int64 default.
songs3 = pd.Series([145, 142, 38, 13],
                   name='counts',
                   index=['Paul', 'John', 'George', 'Ringo'],
                   dtype='int64[pyarrow]')
songs3

Paul      145
John      142
George     38
Ringo      13
Name: counts, dtype: int64[pyarrow]

In [170]:
# songs2 was created without an index, so pandas supplied a RangeIndex.
songs2.index

RangeIndex(start=0, stop=4, step=1)

In [171]:
# songs3 was given explicit string labels, which are held in a regular Index.
songs3.index

Index(['Paul', 'John', 'George', 'Ringo'], dtype='object')

In [172]:
import numpy as np

# With the default NumPy backend the np.nan entry turns the whole Series
# into float64, which is why the 2 is displayed as 2.0.
nan_series = pd.Series({'Ono': 2, 'Clapton': np.nan})
nan_series

Ono        2.0
Clapton    NaN
dtype: float64

In [173]:
# Same data with the pyarrow integer dtype: the missing entry is shown as <NA>
# and the Series stays integer instead of being upcast to float64.
nan_series2 = pd.Series([2, np.nan], index = ['Ono', 'Clapton'], dtype='int64[pyarrow]')
nan_series2

Ono           2
Clapton    <NA>
dtype: int64[pyarrow]

In [174]:
# .count() tallies only the non-missing entries.
nan_series2.count()

1

In [175]:
# .size is the total number of entries, missing ones included.
nan_series2.size

2

In [176]:
# The same numbers in a bare NumPy array, for side-by-side comparison.
numpy_ser = np.array([145, 142, 38, 13])
# Position-based lookup on the Series; .iloc ignores the string labels.
songs3.iloc[1]

142

In [177]:
# Plain integer indexing on the array returns the same element.
numpy_ser[1]

142

In [178]:
# Aggregations are available as methods on the Series ...
songs3.mean()

84.5

In [179]:
# ... and the NumPy array exposes a method of the same name.
numpy_ser.mean()

84.5

In [180]:
# Number of attribute names that the array and the Series have in common.
len(set(dir(numpy_ser)) & set(dir(songs3)))

112

In [181]:
# Comparing against a scalar gives a boolean Series carrying the same labels.
mask = songs3 > songs3.median()
mask

Paul       True
John       True
George    False
Ringo     False
Name: counts, dtype: bool[pyarrow]

In [182]:
# Indexing with the boolean Series keeps only the rows where it is True.
songs3[mask]

Paul    145
John    142
Name: counts, dtype: int64[pyarrow]

# Chapter 7 - Series Deep Dive

In [183]:
import pandas as pd

# Vehicles dataset, read straight from a zipped CSV on GitHub (needs network access).
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
# engine='pyarrow' selects the pyarrow CSV parser; dtype_backend='pyarrow' asks
# for pyarrow-backed column dtypes (visible in the reprs of the cells below).
df = pd.read_csv(url, dtype_backend='pyarrow', engine='pyarrow')

In [184]:
# Keep handles on the two mileage columns used in the cells below.
city_mpg = df.city08
highway_mpg = df.highway08

In [185]:
# Element-wise arithmetic: average the city and highway figures row by row.
(city_mpg + highway_mpg) / 2

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: double[pyarrow]

In [186]:
# Other summaries worth trying on this column:
# city_mpg.mean()
# city_mpg.is_unique
# city_mpg.is_monotonic_increasing
# Passing a list of quantiles returns a Series indexed by the requested levels.
city_mpg.quantile([.1, .5, .9])

0.1    13.0
0.5    17.0
0.9    24.0
Name: city08, dtype: double[pyarrow]

In [187]:
# Summing the boolean result counts the rows where the comparison holds.
city_mpg.gt(20).sum()

10272

In [188]:
# Percentage of rows above 20. The cast to NumPy bool comes first because,
# per the original author's note, a bool[pyarrow] Series has no .mul operator.
city_mpg.gt(20).astype('bool').mul(100).mean()

24.965973167412017

In [189]:
# Downcast to a narrower pyarrow integer type.
city_mpg.astype('int16[pyarrow]')
# int8 tops out at 127, while this column holds values such as 140 (see the
# city2 > 50 output later on), so the narrower cast is not expected to succeed.
# city_mpg.astype('int8[pyarrow]')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int16[pyarrow]

# Chapter 11 - Manipulation Methods

In [190]:
# The manufacturer-name column; it is stored as string[pyarrow].
make = df.make
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: string[pyarrow]

In [191]:
# Frequency of each make, most common first.
make.value_counts()

make
Chevrolet                           4003
Ford                                3371
Dodge                               2583
GMC                                 2494
Toyota                              2071
                                    ... 
Grumman Allied Industries              1
Environmental Rsch and Devp Corp       1
General Motors                         1
Goldacre                               1
Isis Imports Ltd                       1
Name: count, Length: 136, dtype: int64[pyarrow]

In [192]:
# value_counts() is sorted by frequency, so the first five index labels are
# the five most common makes.
top5 = make.value_counts().index[:5]
top5

Index(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'], dtype='string[pyarrow]', name='make')

In [193]:
def generalize_top5(val, keep=None, other='Other'):
    """Pass a label through if it is one of the retained labels, else collapse it.

    Parameters
    ----------
    val
        A single label, e.g. one element handed over by ``Series.apply``.
    keep : container, optional
        Labels to return unchanged; anything that supports ``in`` works.
        When omitted, the notebook-level ``top5`` is used, which preserves
        the original one-argument behaviour.
    other : default 'Other'
        Replacement returned for labels that are not in ``keep``.

    Returns
    -------
    ``val`` itself when it is found in ``keep``, otherwise ``other``.
    """
    if keep is None:
        # Resolved at call time, so the function follows whatever ``top5``
        # currently refers to in the notebook namespace.
        keep = top5
    return val if val in keep else other

In [194]:
# Calls the Python function once per element; note that the result comes back
# with dtype object rather than string[pyarrow].
make.apply(generalize_top5)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [195]:
# Vectorised equivalent: keep values where the condition is True and
# substitute 'Other' everywhere else. The string[pyarrow] dtype is preserved.
make.where(make.isin(top5), 'Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: string[pyarrow]

In [196]:
# Without a replacement value, entries failing the condition become <NA>.
make.where(make.isin(top5))

0         <NA>
1         <NA>
2        Dodge
3        Dodge
4         <NA>
         ...  
41139     <NA>
41140     <NA>
41141     <NA>
41142     <NA>
41143     <NA>
Name: make, Length: 41144, dtype: string[pyarrow]

In [197]:
# .mask is the mirror image of .where: it replaces entries where the condition
# is True, hence the ~ negation to get the same result as above.
make.mask(~make.isin(top5), 'Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: string[pyarrow]

In [198]:
%%timeit
# Timing of the element-by-element .apply approach.
make.apply(generalize_top5)

22.3 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [199]:
%%timeit
# Timing of the vectorised .where approach, for comparison with .apply.
make.where(make.isin(top5), 'Other')

850 µs ± 6.91 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [200]:
cyl = df.cylinders
# True wherever the cylinders value is missing.
missing = cyl.isna()
missing

0        False
1        False
2        False
3        False
4        False
         ...  
41139    False
41140    False
41141    False
41142    False
41143    False
Name: cylinders, Length: 41144, dtype: bool

In [201]:
# Two spellings of the same boolean selection. Only the last expression in a
# cell is displayed, so the output shown comes from the second line.
make.loc[missing]
make[missing]

7138     Nissan
7139     Toyota
8143     Toyota
8144       Ford
8146       Ford
          ...  
34563     Tesla
34564     Tesla
34565     Tesla
34566     Tesla
34567     Tesla
Name: make, Length: 206, dtype: string[pyarrow]

In [202]:
# A small series with one missing reading; the None is shown as <NA>.
temp = pd.Series([32, 40, None, 42, 39, 32], dtype='float[pyarrow]')
temp

0    32.0
1    40.0
2    <NA>
3    42.0
4    39.0
5    32.0
dtype: float[pyarrow]

In [203]:
# Fill the gap from its neighbours: 41.0 sits midway between 40.0 and 42.0.
temp.interpolate()

0    32.0
1    40.0
2    41.0
3    42.0
4    39.0
5    32.0
dtype: float[pyarrow]

# Chapter 12 - Indexing Operations

In [204]:
# Converts the whole Series into an {index label: value} dict. With 41,144
# entries the printed output is very long.
make.to_dict()

{0: 'Alfa Romeo',
 1: 'Ferrari',
 2: 'Dodge',
 3: 'Dodge',
 4: 'Subaru',
 5: 'Subaru',
 6: 'Subaru',
 7: 'Toyota',
 8: 'Toyota',
 9: 'Toyota',
 10: 'Toyota',
 11: 'Volkswagen',
 12: 'Volkswagen',
 13: 'Volkswagen',
 14: 'Dodge',
 15: 'Volkswagen',
 16: 'Volvo',
 17: 'Volvo',
 18: 'Audi',
 19: 'Audi',
 20: 'BMW',
 21: 'BMW',
 22: 'BMW',
 23: 'Buick',
 24: 'Buick',
 25: 'Dodge',
 26: 'Buick',
 27: 'Buick',
 28: 'Buick',
 29: 'Buick',
 30: 'Buick',
 31: 'Cadillac',
 32: 'Cadillac',
 33: 'Cadillac',
 34: 'Cadillac',
 35: 'Chevrolet',
 36: 'Dodge',
 37: 'Chevrolet',
 38: 'Chevrolet',
 39: 'Chevrolet',
 40: 'Chevrolet',
 41: 'Chrysler',
 42: 'CX Automotive',
 43: 'CX Automotive',
 44: 'Nissan',
 45: 'Nissan',
 46: 'Nissan',
 47: 'Dodge',
 48: 'Dodge',
 49: 'Dodge',
 50: 'Dodge',
 51: 'Dodge',
 52: 'Dodge',
 53: 'Dodge',
 54: 'Dodge',
 55: 'Dodge',
 56: 'Ford',
 57: 'Ford',
 58: 'Dodge',
 59: 'Ford',
 60: 'Ford',
 61: 'Ford',
 62: 'Ford',
 63: 'Ford',
 64: 'Hyundai',
 65: 'Hyundai',
 66: 'Hyu

In [205]:
# For reference: city_mpg still carries the default integer index.
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64[pyarrow]

In [206]:
# Passing a Series (or the equivalent dict, commented out) to .rename maps each
# old index label to a new one; here each row number becomes that row's make.
# city2 = city_mpg.rename(make.to_dict())
city2 = city_mpg.rename(make)
city2

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: city08, Length: 41144, dtype: int64[pyarrow]

In [207]:
# The relabelled index has dtype object and contains repeated labels.
city2.index

Index(['Alfa Romeo', 'Ferrari', 'Dodge', 'Dodge', 'Subaru', 'Subaru', 'Subaru',
       'Toyota', 'Toyota', 'Toyota',
       ...
       'Saab', 'Saturn', 'Saturn', 'Saturn', 'Saturn', 'Subaru', 'Subaru',
       'Subaru', 'Subaru', 'Subaru'],
      dtype='object', length=41144)

In [208]:
# drop=True discards the make labels and restores a default integer index.
# Without it (commented line) the old index is kept as a column instead.
# city2.reset_index()
city2.reset_index(drop=True)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64[pyarrow]

In [209]:
# A label that occurs many times returns every matching row as a Series.
city2.loc['Subaru']

Subaru    17
Subaru    21
Subaru    22
Subaru    19
Subaru    20
          ..
Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, Length: 885, dtype: int64[pyarrow]

In [210]:
# 'Fisker' occurs only once, so the scalar form (commented out) would hand
# back a bare value; wrapping the label in a list always yields a Series.
# city2.loc['Fisker']
city2.loc[['Fisker']]

Fisker    20
Name: city08, dtype: int64[pyarrow]

In [211]:
# A list of labels selects all rows for each label, in the order listed.
city2.loc[['Ferrari', 'Lamborghini']]

Ferrari         9
Ferrari        12
Ferrari        11
Ferrari        10
Ferrari        11
               ..
Lamborghini     6
Lamborghini     8
Lamborghini     8
Lamborghini     8
Lamborghini     8
Name: city08, Length: 357, dtype: int64[pyarrow]

In [212]:
# Label slices over repeated labels need a sorted index, hence sort_index()
# first. Unlike positional slices, the end label is included in the result.
# city2.loc['Ferrari':'Lamborghini'] # need to sort first
city2.sort_index().loc['Ferrari':'Lamborghini']

Ferrari        10
Ferrari        13
Ferrari        13
Ferrari         9
Ferrari        10
               ..
Lamborghini    12
Lamborghini     9
Lamborghini     8
Lamborghini    13
Lamborghini     8
Name: city08, Length: 11210, dtype: int64[pyarrow]

In [213]:
# Slice bounds need not be existing labels: this takes every label that
# sorts between 'F' and 'J'.
city2.sort_index().loc['F':'J']

Federal Coach    15
Federal Coach    13
Federal Coach    13
Federal Coach    14
Federal Coach    13
                 ..
Isuzu            15
Isuzu            15
Isuzu            15
Isuzu            27
Isuzu            18
Name: city08, Length: 9040, dtype: int64[pyarrow]

In [214]:
# An Index object works as a selector just like a list of labels.
idx = pd.Index(['Dodge'])
city2.loc[idx]

Dodge    23
Dodge    10
Dodge    12
Dodge    11
Dodge    11
         ..
Dodge    18
Dodge    17
Dodge    14
Dodge    14
Dodge    11
Name: city08, Length: 2583, dtype: int64[pyarrow]

In [215]:
# Boolean Series flagging entries above 50. This rebinds the name `mask`
# used in the Chapter 6 cells.
mask = city2 > 50
mask

Alfa Romeo    False
Ferrari       False
Dodge         False
Dodge         False
Subaru        False
              ...  
Subaru        False
Subaru        False
Subaru        False
Subaru        False
Subaru        False
Name: city08, Length: 41144, dtype: bool[pyarrow]

In [216]:
# .loc accepts the boolean Series directly.
city2.loc[mask]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64[pyarrow]

In [217]:
# Item prices keyed by product name.
cost = pd.Series({
    'Gum': 1.00,
    'Cookie': 2.25,
    'Melon': 3.99,
    'Roll': 0.99,
    'Carrots': 2.79,
})
inflation = 1.10
# The callable passed to .loc receives the intermediate (already inflated)
# Series, so the filter can be written without naming that intermediate.
cost.mul(inflation).loc[lambda s_: s_ > 3]

Melon      4.389
Carrots    3.069
dtype: float64

In [218]:
# city2 again, for reference ahead of the positional (.iloc) examples.
city2

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: city08, Length: 41144, dtype: int64[pyarrow]

In [219]:
# .iloc selects by position: the first entry.
city2.iloc[0]

19

In [220]:
# After sorting by label, position 0 belongs to whichever label sorts first.
city2.sort_index().iloc[0]

13

In [221]:
# After sorting by value (ascending by default), position 0 is the smallest value.
city2.sort_values().iloc[0]

6

In [222]:
# Negative positions count from the end.
city2.iloc[-1]

16

In [223]:
# Positional slices are half-open: this returns positions 0 through 4.
city2.iloc[0:5]

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
Name: city08, dtype: int64[pyarrow]

In [224]:
mask =  city2 > 50
# .iloc does not take a boolean Series directly; convert it to a NumPy array
# or a list first. Both conversions select the same rows, and only the last
# expression in the cell is displayed.
# city2.iloc[mask] # this will fail
city2.iloc[mask.to_numpy()]
city2.iloc[list(mask)]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64[pyarrow]

In [225]:
# Draw six random rows; fixing random_state makes the draw repeatable.
city2.sample(6, random_state=42)

Volvo         16
Mitsubishi    19
Buick         27
Jeep          15
Land Rover    13
Saab          17
Name: city08, dtype: int64[pyarrow]

In [226]:
# city2.filter(items=['Ford', 'Subaru']) # fails because index is duplicated

In [227]:
# like= keeps the rows whose index label contains the given substring.
city2.filter(like='rd')

Ford    18
Ford    16
Ford    17
Ford    17
Ford    15
        ..
Ford    26
Ford    19
Ford    21
Ford    18
Ford    19
Name: city08, Length: 3371, dtype: int64[pyarrow]

In [228]:
# regex= keeps the rows whose index label matches the pattern.
city2.filter(regex='(Ford)|(Subaru)')

Subaru    17
Subaru    21
Subaru    22
Ford      18
Ford      16
          ..
Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, Length: 4256, dtype: int64[pyarrow]

In [229]:
# reindex conforms the Series to the given labels: repeats are allowed, and a
# label that does not exist (2_000_000) comes back as <NA>.
city_mpg.reindex([0, 0, 10, 20, 2_000_000])

0            19
0            19
10           23
20           14
2000000    <NA>
Name: city08, dtype: int64[pyarrow]

In [230]:
# Two small Series whose labels only partly overlap ('b' and 'c' are shared).
s1 = pd.Series({'a': 10, 'b': 20, 'c': 30})
s2 = pd.Series({'b': 15, 'c': 25, 'd': 35})

s2

b    15
c    25
d    35
dtype: int64

In [231]:
# Align s2 to s1's labels: 'a' is absent from s2 so it becomes NaN (which
# turns the result into float64), and 'd' is dropped.
s2.reindex(s1.index)

a     NaN
b    15.0
c    25.0
dtype: float64

In [232]:
import pyarrow as pa

# Build the pyarrow string dtype explicitly through pd.ArrowDtype.
string_pa = pd.ArrowDtype(pa.string())
# Age brackets written as 'low-high' text.
age = pd.Series(['0-10', '11-15', '11-15', '61-65', '46-50'], dtype=string_pa)
age


0     0-10
1    11-15
2    11-15
3    61-65
4    46-50
dtype: string[pyarrow]

In [233]:
# Split each 'low-high' bracket on the dash into two columns (expand=True),
# then cast the string pieces to small pyarrow integers.
age_split = (age
 .str.split('-', expand=True)
 .astype('int8[pyarrow]'))

age_split

    0   1
0   0  10
1  11  15
2  11  15
3  61  65
4  46  50


In [234]:
# Row-wise mean of the two bounds, i.e. the midpoint of each age bracket.
age_split.mean(axis=1)

0     5.0
1    13.0
2    13.0
3    63.0
4    48.0
dtype: double[pyarrow]