In [2]:
import numpy as np
import pandas as pd

# Pandas - Series

In [8]:
# The Series is one of the core data structures in pandas.
# Built from a list of strings, the result has dtype object.
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(data=animals)

0    Tiger
1     Bear
2    Moose
dtype: object

In [9]:
# A list of whole numbers gives an integer-typed Series
numbers = [1, 2, 3]
pd.Series(data=numbers)

0    1
1    2
2    3
dtype: int64

In [14]:
# Series values live in a typed numpy array, so pandas converts types
# automatically: among strings a missing entry stays None (dtype object),
# among numbers it becomes NaN, which promotes the integers to float64.
print(pd.Series(['Tiger', None]), pd.Series([1, None]), sep='\n')

0    Tiger
1     None
dtype: object
0    1.0
1    NaN
dtype: float64


In [16]:
# NaN is not None, and NaN never compares equal to anything, itself included.
# (numpy is imported once, in the import cell at the top of the notebook.)
print(np.nan == None)    # False; the == against None is deliberate here
print(np.nan == np.nan)  # False
# Equality therefore cannot detect missing values; use the dedicated test.
print(np.isnan(np.nan))  # True

False
False
True


In [18]:
# A Series can also be built from a dictionary;
# the dictionary keys then become the labels of the Series.
sports = dict(Archery='Bhutan', Golf='Scotland', Sumo='Japan')
sports

{'Archery': 'Bhutan', 'Golf': 'Scotland', 'Sumo': 'Japan'}

In [34]:
s = pd.Series(data=sports)
# one print call, one item per line: the Series, its labels, its values
print(s, s.keys(), s.values, sep='\n')

Archery      Bhutan
Golf       Scotland
Sumo          Japan
dtype: object
Index(['Archery', 'Golf', 'Sumo'], dtype='object')
['Bhutan' 'Scotland' 'Japan']


In [23]:
# .values gives the stored data as an array, without the labels
s.values

array(['Bhutan', 'Scotland', 'Japan'], dtype=object)

In [35]:
# .index gives the labels of the series
s.index

Index(['Archery', 'Golf', 'Sumo'], dtype='object')

In [38]:
# The labels can also be supplied explicitly, as a list passed to `index`
s = pd.Series(data=['Tiger', 'Bear'], index=['India', 'America'])
s

India      Tiger
America     Bear
dtype: object

# Querying a series

In [5]:
# Sport -> country of origin; the keys become the labels of the series
sports = {
    'Archery': 'Bhutan',
    'Golf': 'Scotland',
    'Sumo': 'Japan',
    'Taekwondo': 'South Korea',
}
s = pd.Series(data=sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [12]:
# A series can be queried by index position or by index label
print(s.iloc[3])        # by position; counting starts at 0
print(s.loc['Sumo'])    # by label
# note: iloc and loc are attributes, not methods (hence the square brackets)

South Korea
Japan


In [14]:
# Plain [] indexing guesses whether the key is a label or a position.
# Not recommended: on a label-indexed series the integer-as-position
# fallback (s[3]) is deprecated in pandas 2.x, so positions go through iloc.
print(s.iloc[3])    # by position, stated explicitly
print(s['Golf'])    # [] with a label still works

South Korea
Scotland


In [3]:
# A small series of floats to sum in the next cells
s = pd.Series(data=[100.0, 120.0, 101.0, 3.0])
s

0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [20]:
# sum using a Python loop: works, but slow
total = 0
for item in s:
    total += item    # one Python-level addition per element
print(total)

324.0


In [4]:
# pandas and numpy support vectorized computation:
# a single call replaces the explicit Python loop
# (numpy is already imported, so it is not imported again here)
np.sum(s)

324.0

In [25]:
# let's time both approaches on a larger series
# A locally seeded generator yields the same 10000 integers in [0, 1000)
# on every run, without touching numpy's global random state.
s = pd.Series(np.random.RandomState(42).randint(0, 1000, 10000))
print(len(s))
s.head()    # return first 5 elements

10000


0    677
1     79
2    897
3    442
4    340
dtype: int32

Magic functions start with a % sign.
We are going to use a cell magic, which starts with %% and must be the first line of the cell.
It applies to all the code in the current Jupyter cell.

In [27]:
%%timeit -n 100
# baseline: accumulate the sum one element at a time in a Python loop
summary = 0
for item in s:
    summary += item

100 loops, best of 3: 714 µs per loop


In [41]:
%%timeit -n 100
# vectorized: a single numpy call over the whole series
np.sum(s)

100 loops, best of 3: 83.5 µs per loop


Broadcasting: apply a single operation to every value of a series at once

In [6]:
%%timeit -n 10
# slow way: visit every element and write it back one at a time
s = pd.Series(np.random.randint(0,1000,10000))
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (label, value) pairs
for label, value in s.items():
    s.loc[label] = value + 2

10 loops, best of 3: 529 ms per loop


In [9]:
%%timeit -n 10
# fast way: one broadcast operation adds 2 to every element
s = pd.Series(np.random.randint(low=0, high=1000, size=10000))
s += 2

10 loops, best of 3: 317 µs per loop


In [10]:
# Assigning through loc to a label that does not exist yet adds a new item
s = pd.Series(data=[1, 2, 3])
s.loc['Animal'] = 'Bears'    # mixed types are allowed; dtype becomes object
s

0             1
1             2
2             3
Animal    Bears
dtype: object

In [12]:
# example with non-unique index values
# this is where a pandas index differs from a relational database key
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
# create a new series with multiple entries for cricket
cricket_loving_countries = pd.Series(
    ['Australia', 'Barbados', 'Pakistan', 'England'],
    index=['Cricket'] * 4,    # the same label four times
)
# Series.append() was removed in pandas 2.0; pd.concat() is its replacement.
# It does not change either input series;
# instead, it returns a new series holding the values of both.
# pandas will try to infer the best data type to use
all_countries = pd.concat([original_sports, cricket_loving_countries])
all_countries

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object