In [None]:
import pandas as pd
from random import sample

# Pandas Series
A one-dimensional, labeled array of data

In [None]:
# create a Series, with a name (which will be used as Column name in a DataFrame)
# note that the data is a mix of types
pd.Series(['John Doe', 23, 180], name='User')

http://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.Series.html

In [None]:
# assign to a variable (and not show output)
s = pd.Series(['John Doe', 23, 180], name='User')

In [None]:
# define the index in creating the Series
# index needs to be of the same length as the data
s = pd.Series(['John Doe', 23, 180], name='User', index=['a', 'b', 'c'])

In [None]:
s

## Series Attributes

In [None]:
s.values # list the values in the data

In [None]:
s.index # index type and range

In [None]:
s.dtype # type of data

In [None]:
s.ndim # nb of dimensions

In [None]:
s.shape # shape of the data: for a Series, a 1-tuple (n,) holding the number of elements

In [None]:
s.size # total nb of elements, including nulls

In [None]:
s.is_unique # are the values in the series unique?

## Accessing data
Data is accessed like in a standard Python list (array)

In [None]:
s

In [None]:
s.index[1]

In [None]:
s.values[2]

In [None]:
# also, for multiple values at once:
s.index[[0, 2]]

In [None]:
s.values[[0, 2]] # pass an array of positional indices to look up

In [None]:
s.values[1:3] # pass an array slice

## Methods

In [None]:
s = pd.Series([6, 7, 7, 8, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 2, 3, 3, 6, 4, 7, 6, 5])

In [None]:
s.head() # first elements (default to 5)

In [None]:
s.head(3) # first 3 elements

In [None]:
s.tail() # last 5 elements

In [None]:
s.tail(3) # last 3 elements

In [None]:
s.sort_values() # sort the values

In [None]:
s # note this is not done in place

In [None]:
# need to reassign the output to s
s = s.sort_values()
s

In [None]:
s.sort_index() # sort the index values

In [None]:
s # not done in place

In [None]:
# We can also use the parameter `inplace=True`
s.sort_index(inplace=True)

In [None]:
s

In [None]:
s.count() # nb of non-null elements (None / NaN are excluded, unlike s.size)

In [None]:
s.sum() # sum of all elements

In [None]:
s.product() # product of all elements

In [None]:
s.min() # minimum: equivalent to np.min(s)

In [None]:
s.max() # maximum: equivalent to np.max(s)

In [None]:
s.mean() # mean (average) of all elements

In [None]:
s.std() # sample standard deviation (pandas defaults to ddof=1): equivalent to np.std(s, ddof=1), NOT to NumPy's default ddof=0

In [None]:
s.median() # median value, i.e. the value in the middle of the sorted array

In [None]:
s.mode() # most frequent value...

In [None]:
# ...or values if multiple values have the same highest frequency:
s = pd.Series([1,1,1, 2,2,2, 3,3,3, 4,5,6])
s.mode()

In [None]:
s.value_counts() # the frequency distribution of all values

In [None]:
s.describe() # summary of the series, with stats and percentiles

In [None]:
s.idxmin() # index of the min value

In [None]:
s.idxmax() # index of the max value

In [None]:
s.apply(lambda x: x + 1) # apply a function to each element of the Series

In [None]:
s.map({1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six', 7: 'seven', 8: 'eight'}) # map the values to another, 
# could be a Series or function

In [None]:
s.map(lambda x: str(x) + "%").head()

In [None]:
s.nunique() # number of unique values 

In [None]:
# does not count NaNs
s.map({1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six', 7: 'seven', 8: 'eight'}).nunique()

## Conversion to Python classes

In [None]:
s.to_dict() # convert to a python dictionary

In [None]:
# equivalent to 
dict(s)

In [None]:
s.to_json() # convert to a JSON string (JavaScript Object Notation)

In [None]:
list(s) # convert to a python list

## Data type (dtype) conversions

In [None]:
s = pd.Series(['0', '1', 2, None])

In [None]:
s

In [None]:
s = s.astype('int8') # will raise an error because we have None in the data

In [None]:
s = s.astype('int8', errors='ignore') # ignore errors BUT DOES NOT CONVERT

In [None]:
s[0] # is still a string type

In [None]:
s = s.astype('string') # cast all values to the pandas string dtype (pandas >= 1.1 also converts numbers; pandas 1.0 raised on non-string values)

In [None]:
s

In [None]:
s[3]

## Replacing bad values
Often the data is not clean: it contains nulls (None / np.nan) or values that need conversion.

To convert to a certain type, data needs to be cleaned up

In [None]:
s = pd.Series(['0', '1', 2, None])

In [None]:
s = s.fillna(0) # replace None or NaNs with the given value

In [None]:
s

In [None]:
s.astype('int8')

## Remove bad values
We can remove the bad values entirely

In [None]:
s = pd.Series(['0', '1', 2, None, None, None])
s

In [None]:
s.dropna()

## Backfilling bad values
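Sometimes it is necessary to fill the gaps with neighbouring known values, either forward (from the last valid value) or backward (from the next valid value)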

In [None]:
s = pd.Series([1,2,3, None, None, None, 4,5,6])
s.values

In [None]:
# forward fill with last valid value
# (`fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the equivalent call)
s.ffill()

In [None]:
# backfill with next valid value
# (`fillna(method='bfill')` is deprecated in recent pandas; `bfill()` is the equivalent call)
s.bfill()

http://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.Series.fillna.html

## Deal with duplicates

In [None]:
s = pd.Series([6, 7, 7, 8, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 2, 3, 3, 6, 4, 7, 6, 5])
s

In [None]:
s.nunique() # number unique values (as above)

In [None]:
s = s.drop_duplicates() # drop_duplicates removes duplicate values (keeping the first occurrence and its index)
s

In [None]:
s = s.sort_values()
s.reset_index()

In [None]:
# old index is added as an extra column by default, to avoid this, use `drop=True`
s.reset_index(drop=True)

## Aggregate Series

In [None]:
s1 = pd.Series([1,2,3])
s2 = pd.Series([4,5,6])
pd.concat([s1, s2]) # note that the index is also concatenated

In [None]:
pd.concat([s1, s2], ignore_index=True) # regenerate a new index

## Value Lookup
Gotcha to be aware of

In [None]:
s = pd.Series([1,2,3], index=['a', 'b', 'c'])

In [None]:
3 in s # this checks if 3 is in the series INDEX

In [None]:
3 in s.index # equivalent to `3 in s`

In [None]:
3 in s.values # this checks if 3 is in the series VALUES