# Chapter 4: Series Introduction

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seven readings labelled by weekday name; the Series itself is named
# 'temperatures'.
temp = pd.Series(
    data=[75, 72, 64, 54, 68, 72, 78],
    index=[
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    ],
    name='temperatures',
)

In [3]:
temp

Unnamed: 0,temperatures
Monday,75
Tuesday,72
Wednesday,64
Thursday,54
Friday,68
Saturday,72
Sunday,78


In [4]:
temp.mean()

69.0

- Series is used to model one-dimensional data

In [5]:
mask = temp > temp.mean()

In [7]:
temp[mask]

Unnamed: 0,temperatures
Monday,75
Tuesday,72
Saturday,72
Sunday,78


In [8]:
c = pd.Series(['Green', 'Yellow', 'Red', 'Blue', 'Purple'], dtype = 'category')

In [9]:
c

Unnamed: 0,0
0,Green
1,Yellow
2,Red
3,Blue
4,Purple


In [10]:
c.cat.ordered

False

In [None]:
series = {
    'index': [0, 1, 2, 3],
    'data': [145, 142, 38, 13],
    'name': "songs"
}

In [None]:
series

{'index': [0, 1, 2, 3], 'data': [145, 142, 38, 13], 'name': 'songs'}

In [None]:
def get(series, idx):
    """Return the entry of ``series['data']`` paired with label ``idx``.

    ``series`` is expected to be a mapping with parallel ``'index'`` and
    ``'data'`` sequences, like the dict-based series built in this notebook.
    The position of ``idx`` inside ``series['index']`` selects the value
    returned from ``series['data']``. When ``'index'`` is a list, a label
    that is not present makes ``list.index`` raise ``ValueError``.
    """
    position = series['index'].index(idx)
    values = series['data']
    return values[position]

In [None]:
get(series, 1)

142

In [None]:
series = {
    'index':[1,2,3,4],
    'data':[ 12, 24, 36, 48],
    'name':"poems"
}

In [None]:
series

{'index': [1, 2, 3, 4], 'data': [12, 24, 36, 48], 'name': 'poems'}

In [None]:
def get(series, idx):
    """Fetch the data value whose index label equals ``idx``.

    Looks up the position of ``idx`` in ``series['index']`` and returns the
    element at that same position in ``series['data']``.
    """
    labels = series['index']
    slot = labels.index(idx)
    return series['data'][slot]

In [None]:
# Look up the value stored under label 2 in the dict-based series.
get(series, 2)

24

## 4.1 The Index Abstraction

- ``Index`` is the core feature of pandas' data structures
- Many of the operations performed on a ``Series`` operate directly on the index

In [None]:
songs = {
    'index': ["Paul", "John", "George", "Ringo"],
    'data': [145, 142, 38, 13],
    'name': 'counts'
}

In [None]:
songs['index'].index('John')

1

In [None]:
get(songs, "John")

142

## 4.2 The Pandas Series

- Series is one-dimensional though it looks like it is two-dimensional
- The generic name for an index is an ``axis``
- To get the best speed and to leverage vectorized operations, the values should all be of the same type, though this is not required

In [None]:
songs2 = pd.Series([145, 142, 38, 13],
                    name="counts")


In [None]:
songs2

0    145
1    142
2     38
3     13
Name: counts, dtype: int64

In [None]:
songs2.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
# Index can also be string based
songs3 = pd.Series([145, 142, 38, 13],
                  name="counts",
                  index=["Paul", "John", "George", "Ringo"])

In [None]:
songs3

Paul      145
John      142
George     38
Ringo      13
Name: counts, dtype: int64

In [None]:
songs3.index

Index(['Paul', 'John', 'George', 'Ringo'], dtype='object')

In [None]:
songs4 = pd.Series([35, 70, 105, 140], name = 'cnts')
songs4

Unnamed: 0,cnts
0,35
1,70
2,105
3,140


In [None]:
songs4.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
songs5 = pd.Series([35,70,105,140], name = 'cnt', index=['a', 'b', 'c', 'd'])
songs5

Unnamed: 0,cnt
a,35
b,70
c,105
d,140


In [None]:
songs5.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [None]:
songs5.dtype

dtype('int64')

## 4.3 The NaN value

- When pandas determines that a series holds numeric values but cannot find a number to represent an entry, it will use ``NaN``
- The dtype of such a series is float64 rather than int64, because float64 can represent ``NaN`` and int64 cannot
- ``None``, ``NaN``, ``nan``, ``<NA>`` and null all refer to empty or missing data found in a pandas Series or DataFrame

In [None]:
nan_series = pd.Series([2, np.nan],
                    index=["Ono", "Clapton"])

In [None]:
nan_series

Ono        2.0
Clapton    NaN
dtype: float64

In [None]:
# count method disregards NaN
nan_series.count()

1

In [None]:
# inspect number of entries (including missing values)
nan_series.size

2

In [None]:
nan_s2 = pd.Series([2, np.nan], index = ['a', 'not_a_number'])
nan_s2

Unnamed: 0,0
a,2.0
not_a_number,


In [None]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [None]:
nan_s2.index, nan_s2.size

(Index(['a', 'not_a_number'], dtype='object'), 2)

In [None]:
nan_s2.is_unique

True

In [None]:
nan_s2.astype('Int64')

Unnamed: 0,0
a,2.0
not_a_number,


## 4.4 Optional Integer Support for NaN

- int64 type does not support missing data
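- We can pass ``dtype='Int64'`` (note the capital ``I``) to use pandas' nullable integer type, which stores missing entries as ``<NA>``; passing the plain ``'int64'`` with missing data raises a ``ValueError``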

In [None]:
nan_series4 = pd.Series([2, np.nan],
                        index=['Ono', 'Clapton'],
                        dtype='int64')

ValueError: cannot convert float NaN to integer

In [None]:
nan_series2 = pd.Series([2, None],
                        index=['Ono', 'Clapton'],
                        dtype='Int64')

In [None]:
nan_series2

Ono           2
Clapton    <NA>
dtype: Int64

In [None]:
non_s3 = pd.Series([2, None], index =['a', 'not_a_number'], dtype = 'Int64')
non_s3

Unnamed: 0,0
a,2.0
not_a_number,


In [None]:
non_s4 = pd.Series([2, None], index =['a', 'not_a_number'])
non_s4

Unnamed: 0,0
a,2.0
not_a_number,


In [None]:
non_s4.count()

1

## 4.5 Similar to NumPy

- A Series behaves similarly to a NumPy array

In [None]:
songs3 = pd.Series([145, 142, 38, 13], name = 'cnts')
songs3

Unnamed: 0,cnts
0,145
1,142
2,38
3,13


In [None]:
# Both respond to index operations
numpy_ser = np.array([145, 142, 38, 13])
print(numpy_ser[1])
print(songs3[1])

142
142


In [None]:
numpy_s = np.array([145, 142, 38, 13])
numpy_s

array([145, 142,  38,  13])

In [None]:
print(numpy_s, numpy_s.mean())
print(songs3, songs3.mean())

[145 142  38  13] 84.5
0    145
1    142
2     38
3     13
Name: cnts, dtype: int64 84.5


In [None]:
# Both have common methods
print(songs3.mean())
print(numpy_ser.mean())

84.5
84.5


In [None]:
songs3

Unnamed: 0,cnts
0,145
1,142
2,38
3,13


In [None]:
# Both have a notion of boolean array
mask = songs3 > songs3.median()
print(mask)

0     True
1     True
2    False
3    False
Name: cnts, dtype: bool


In [None]:
songs3.sum()/4

84.5

In [None]:
songs3.median()

90.0

In [None]:
mask = songs3 < songs3.median()
print(mask)

0    False
1    False
2     True
3     True
Name: cnts, dtype: bool


In [None]:
songs3[mask]

Unnamed: 0,cnts
2,38
3,13


In [None]:
# We can filter using this mask
songs3[mask]

2    38
3    13
Name: cnts, dtype: int64

In [None]:
songs3[songs3 > songs3.median()]

0    145
1    142
Name: cnts, dtype: int64

## 4.6 Categorical Data

When we load data, we can indicate that the data is categorical. Benefits of categorical values:
- Use less memory than strings
- Improve performance
- Can have an ordering
- Can perform operations on categories
- Enforce membership on values

### Ordering

In [None]:
s = pd.Series(["m", "l", "xs", "s", "xl"], dtype='category')
s

0     m
1     l
2    xs
3     s
4    xl
dtype: category
Categories (5, object): ['l', 'm', 's', 'xl', 'xs']

In [None]:
s = pd.Series(['s', 'm', 'l', 'xl', 'xxl'], dtype = 'category')
print(s)

0      s
1      m
2      l
3     xl
4    xxl
dtype: category
Categories (5, object): ['l', 'm', 's', 'xl', 'xxl']


In [None]:
s.cat.ordered

False

In [None]:
# ordering
s.cat.ordered

False

In [None]:
s2 = pd.Series(['l', 'm', 's', 'xl', 'xxl'], dtype = 'category')
s2.cat.ordered

False

In [None]:
# convert non-categorical series to an ordered category
s2 = pd.Series(["m", "l", "xs", "s", "xl"])
s2


Unnamed: 0,0
0,m
1,l
2,xs
3,s
4,xl


In [None]:
# Define an ordered categorical dtype and cast the plain string Series to it.
# With ordered=True the categories rank in the order they are listed here,
# i.e. s < xl < m < l.
# NOTE(review): that ranking looks unintended for clothing sizes, and 'xs'
# (present in s2) is not among the categories, so it shows up as a missing
# value after the cast -- confirm whether both are deliberate.
size_type = pd.api.types.CategoricalDtype(
    ordered=True,
    categories=['s', 'xl', 'm', 'l'],
)
s3 = s2.astype(size_type)

In [None]:
s3

Unnamed: 0,0
0,m
1,l
2,
3,s
4,xl


In [None]:
s3.cat.ordered

True