# Exploring series data
A series is a 1-dimensional array of a sequence of data.  It can consist of any data type.  

In [17]:
import pandas as pd

In [18]:
#tell pandas what we want the values to be
desserts = ["ice cream","cookies","cake","shake","smoothie","baclava","Turkish Delight"]

In [19]:
#call a series constructor method
pd.Series(desserts)

0          ice cream
1            cookies
2               cake
3              shake
4           smoothie
5            baclava
6    Turkish Delight
dtype: object

In [20]:
ages = [45,44,43,41,40,38,38,35,32,28]
newAge = pd.Series(ages) 

#note that "dtype" refers to the data type of the values, not the index or overall object.  "Object" is a string data type.

In [21]:
cool = [False, True, False, True, True, False, False, True, False, False]
pd.Series(cool)

0    False
1     True
2    False
3     True
4     True
5    False
6    False
7     True
8    False
9    False
dtype: bool

In [22]:
# You can turn a dictionary into a series, too
definitions = {"onomotopeia": "writing out a sound as it sounds.",
              "metaphor" : "comparing two objects to each other without using 'like' or 'as'",
               "simile" : "comparing two objects to each other using 'like' or 'as'"
              }
pd.Series(definitions)

onomotopeia                    writing out a sound as it sounds.
metaphor       comparing two objects to each other without us...
simile         comparing two objects to each other using 'lik...
dtype: object

### Attributes

attributes are values or properties of an object that are readable.

In [23]:
about_me = ["Smart","Handsome","Awesome","Haughty","Slow","Procrastinator"]
attributes = pd.Series(about_me)
attributes

0             Smart
1          Handsome
2           Awesome
3           Haughty
4              Slow
5    Procrastinator
dtype: object

type the name of the object, then type ".", then hit the "tab" key to see what you can do with that object

In [24]:
attributes.values

array(['Smart', 'Handsome', 'Awesome', 'Haughty', 'Slow',
       'Procrastinator'], dtype=object)

In [25]:
attributes.index #summary info about the object

RangeIndex(start=0, stop=6, step=1)

In [26]:
attributes.dtype # see what kind of data this is 

dtype('O')

In [27]:
#ndim to get the number dimensions (always 1 for a series.  Will be different for a dataframe)
attributes.ndim

1

In [28]:
#see the 'shape' of the data
attributes.shape

(6,)

In [29]:
prices.shape #note that lists don't have a 'shape'.  It must be a pandas object, such as a series or dataframe.

NameError: name 'prices' is not defined

In [30]:
newAge.shape

(10,)

In [31]:
attributes.size #check the total vlues in the list.  This will count null values

6

In [32]:
newAge.size

10

In [33]:
attributes.name = "Stuff" #create a name attribute to refer to the series

In [34]:
attributes.name

'Stuff'

### Series methods

In [35]:
prices = [3.78, 4.25, 8.45, 1.75, 2.89, 10.34, 24.17]
series = pd.Series(prices)
prices

[3.78, 4.25, 8.45, 1.75, 2.89, 10.34, 24.17]

In [36]:
#add up everything in the series
series.sum()

55.629999999999995

In [37]:
series.product()

171581.52340816983

In [38]:
series.mean()

7.947142857142857

In [39]:
series.mean()

7.947142857142857

### Parameters and Arguments


In [40]:
fruits = ["apple","orange","banana", "jaca", "jabuticaba","blueberry","qumqwat"]
weekdays = ["Sunday","Monday", "Tuesday","Wednesday","Thursday","Friday","Saturday"]
pd.Series(fruits, weekdays) #hold down Shift + Tab to see parameters
pd.Series(data = fruits, index = weekdays)
pd.Series()

Series([], dtype: float64)

### The inplace Parameter
this will overwrite the original data with whatever change you're making to the data

In [41]:
series.sort_values(ascending = False, inplace = True)
series

6    24.17
5    10.34
2     8.45
1     4.25
0     3.78
4     2.89
3     1.75
dtype: float64

## Pandas plays nicely with built-in python methods.

In [42]:
len(prices)

7

In [43]:
dir(prices) #return all available methods (even hidden ones)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [44]:
#sort the list
sorted(attributes)

['Awesome', 'Handsome', 'Haughty', 'Procrastinator', 'Slow', 'Smart']

In [45]:
#convert to a python list
list(attributes)

['Smart', 'Handsome', 'Awesome', 'Haughty', 'Slow', 'Procrastinator']

In [46]:
#convert a list to a dictionary
dict(attributes)

{0: 'Smart',
 1: 'Handsome',
 2: 'Awesome',
 3: 'Haughty',
 4: 'Slow',
 5: 'Procrastinator'}

In [47]:
# max
max(prices)

24.17

In [48]:
# get min value
min(prices)

1.75

# the .read_csv() method

In [49]:
pd.read_csv("../data/pandas/pokemon.csv", usecols = ['Pokemon'], squeeze = True) #hold SHIFT + TAB to see all the available parameters
# usecols: takes a list of just the columns you want
# squeeze: converts to a panda series

0       Bulbasaur
1         Ivysaur
2        Venusaur
3      Charmander
4      Charmeleon
5       Charizard
6        Squirtle
7       Wartortle
8       Blastoise
9        Caterpie
10        Metapod
11     Butterfree
12         Weedle
13         Kakuna
14       Beedrill
15         Pidgey
16      Pidgeotto
17        Pidgeot
18        Rattata
19       Raticate
20        Spearow
21         Fearow
22          Ekans
23          Arbok
24        Pikachu
25         Raichu
26      Sandshrew
27      Sandslash
28        Nidoran
29       Nidorina
          ...    
691     Clauncher
692     Clawitzer
693    Helioptile
694     Heliolisk
695        Tyrunt
696     Tyrantrum
697        Amaura
698       Aurorus
699       Sylveon
700      Hawlucha
701       Dedenne
702       Carbink
703         Goomy
704       Sliggoo
705        Goodra
706        Klefki
707      Phantump
708     Trevenant
709     Pumpkaboo
710     Gourgeist
711      Bergmite
712       Avalugg
713        Noibat
714       Noivern
715       

In [50]:
googleData = pd.read_csv('../data/pandas/google_stock_price.csv', squeeze = True)

# .head() and .tail() methods

In [51]:
#get data
def getData():
    pokemon = pd.read_csv("../data/pandas/pokemon.csv",usecols=['Pokemon'],squeeze=True)
    google = pd.read_csv("../data/pandas/google_stock_price.csv",squeeze=True)
    return pokemon, google

pokemon, google = getData()


In [52]:
#check to see if data is in this scope
pokemon.head(5) #gets first 5 rows.  Returns a COPY of new series

0     Bulbasaur
1       Ivysaur
2      Venusaur
3    Charmander
4    Charmeleon
Name: Pokemon, dtype: object

In [53]:
#check the end of this dataset
google.tail(5)

3007    772.88
3008    771.07
3009    773.18
3010    771.61
3011    782.22
Name: Stock Price, dtype: float64

# .sort_values() method

In [54]:
#sort value in ascending order
google.sort_values().head(10)

11    49.95
9     50.07
0     50.12
10    50.70
12    50.74
7     50.95
14    51.10
13    51.10
8     51.13
3     52.38
Name: Stock Price, dtype: float64

In [55]:
#get values in reverse order
google.sort_values(ascending=False).head(10)

3011    782.22
2859    776.60
3009    773.18
3007    772.88
3010    771.61
3008    771.07
2860    771.00
3006    768.79
2840    767.04
2843    766.81
Name: Stock Price, dtype: float64

# .sort_index() method
you can sort the series by the index rather than the value.  This way you can always get the original order back.

In [57]:
google.sort_index(ascending=False, inplace = True)
google

3011    782.22
3010    771.61
3009    773.18
3008    771.07
3007    772.88
3006    768.79
3005    745.91
3004    741.77
3003    738.42
3002    739.77
3001    742.74
3000    738.63
2999    741.19
2998    736.96
2997    733.78
2996    719.85
2995    720.95
2994    716.98
2993    720.64
2992    715.09
2991    705.63
2990    695.36
2989    697.77
2988    694.49
2987    699.21
2986    692.10
2985    684.11
2984    680.04
2983    668.26
2982    675.22
         ...  
29       64.74
28       65.47
27       63.37
26       59.07
25       59.86
24       60.35
23       59.13
22       58.86
21       59.62
20       58.69
19       56.93
18       55.94
17       55.69
16       53.70
15       52.61
14       51.10
13       51.10
12       50.74
11       49.95
10       50.70
9        50.07
8        51.13
7        50.95
6        53.02
5        53.90
4        52.95
3        52.38
2        54.65
1        54.10
0        50.12
Name: Stock Price, Length: 3012, dtype: float64

# using Python's "in" keyword

In [58]:
#you can use "in" to see if a value in a list, using python's built-in "in" method
11 in [1,2,3,4,5,6,7,8,9,10]

False

In [59]:
#but it won't work on this, like you might expect:
pokemon.head(5)
"Bulbasaur" in pokemon

#why not?

False

In [60]:
#b/c pandas is searching the index of a series.  So, I could check it against the index or values.  For example
"Bulbasaur" in pokemon.values

True

# Extract values from a series based on its index position or label

In [61]:
#just use the name of the series and treat the value at its position in the list
google[100]

96.67

In [62]:
#you can get several values using a list in side of the list
google[[100,34,76]]

100    96.67
34     69.36
76     85.63
Name: Stock Price, dtype: float64

In [63]:
#you can also get a range
google[12:20]

2999    741.19
2998    736.96
2997    733.78
2996    719.85
2995    720.95
2994    716.98
2993    720.64
2992    715.09
Name: Stock Price, dtype: float64

In [64]:
#or get values from the end
google[-10:]

9    50.07
8    51.13
7    50.95
6    53.02
5    53.90
4    52.95
3    52.38
2    54.65
1    54.10
0    50.12
Name: Stock Price, dtype: float64

In [65]:
#get everything up to a certain position
google[:15]

3011    782.22
3010    771.61
3009    773.18
3008    771.07
3007    772.88
3006    768.79
3005    745.91
3004    741.77
3003    738.42
3002    739.77
3001    742.74
3000    738.63
2999    741.19
2998    736.96
2997    733.78
Name: Stock Price, dtype: float64

### getting data from a label

In [66]:
#import column data as a series 
poke = pd.read_csv("../data/pandas/pokemon.csv", index_col="Pokemon", squeeze=True)
poke

Pokemon
Bulbasaur        Grass
Ivysaur          Grass
Venusaur         Grass
Charmander        Fire
Charmeleon        Fire
Charizard         Fire
Squirtle         Water
Wartortle        Water
Blastoise        Water
Caterpie           Bug
Metapod            Bug
Butterfree         Bug
Weedle             Bug
Kakuna             Bug
Beedrill           Bug
Pidgey          Normal
Pidgeotto       Normal
Pidgeot         Normal
Rattata         Normal
Raticate        Normal
Spearow         Normal
Fearow          Normal
Ekans           Poison
Arbok           Poison
Pikachu       Electric
Raichu        Electric
Sandshrew       Ground
Sandslash       Ground
Nidoran         Poison
Nidorina        Poison
                ...   
Clauncher        Water
Clawitzer        Water
Helioptile    Electric
Heliolisk     Electric
Tyrunt            Rock
Tyrantrum         Rock
Amaura            Rock
Aurorus           Rock
Sylveon          Fairy
Hawlucha      Fighting
Dedenne       Electric
Carbink           Rock
Goo

In [67]:
#even though there's no explicit index, there is one implicitly.
poke[0]

'Grass'

In [68]:
poke[-10:]

Pokemon
Bergmite         Ice
Avalugg          Ice
Noibat        Flying
Noivern       Flying
Xerneas        Fairy
Yveltal         Dark
Zygarde       Dragon
Diancie         Rock
Hoopa        Psychic
Volcanion       Fire
Name: Type, dtype: object

In [69]:
#this syntax also works for regular syntax labels.  e.g., 
poke['Avalugg']

'Ice'

In [70]:
#or for multiple extracts
poke[['Xerneas','Diancie',"Volcanion"]]

Pokemon
Xerneas      Fairy
Diancie       Rock
Volcanion     Fire
Name: Type, dtype: object

In [71]:
# you can even get a range
poke["Bergmite":"Volcanion"]

Pokemon
Bergmite         Ice
Avalugg          Ice
Noibat        Flying
Noivern       Flying
Xerneas        Fairy
Yveltal         Dark
Zygarde       Dragon
Diancie         Rock
Hoopa        Psychic
Volcanion       Fire
Name: Type, dtype: object

# Using .get() method on a series

In [72]:
# .get works best when the index is sorted.  So take care of that first
poke.sort_index(inplace = True)

In [73]:
poke.get(key="Yveltal")

'Dark'

In [74]:
#why use this instead of another method?  because you can get a default value of "None" or even set the default value
poke.get(key="Peter")

# Using math mathods on series objects

In [75]:
#get the count of a series.  Count() is similar to len(), but will not count missing values.
google.count()

3012

In [76]:
#add all the values in a series
google.sum()

1006942.0

In [77]:
#get the mean of a series
google.mean()

334.31009296148756

In [78]:
#another way to get the mean
google.sum()/google.count()

334.3100929614874

In [79]:
# get the standard deviation
google.std()

173.187204771131

In [80]:
#get max and min
google.min()

49.95

In [81]:
google.max()

782.22

In [82]:
#get the median
google.median()

283.315

In [83]:
# get the mode (most occurring #)
google.mode()

0    291.21
dtype: float64

In [84]:
#describe gives us a big statistical summary all at once
google.describe()

count    3012.000000
mean      334.310093
std       173.187205
min        49.950000
25%       218.045000
50%       283.315000
75%       443.000000
max       782.220000
Name: Stock Price, dtype: float64

# .idmax() and .idmin() methods

In [85]:
#returns an integer of the index of the max or min values
google.idxmax()

3011

In [86]:
google.idxmin()

11

In [87]:
#use this to get the max value
google[google.idxmax()]

782.22

# the .value_counts() method (like pivot tables)

In [88]:
# to get a count of how many times a specific value occurs in a series
poke.value_counts()

Water       105
Normal       93
Grass        66
Bug          63
Fire         47
Psychic      47
Rock         41
Electric     36
Ground       30
Dark         28
Poison       28
Fighting     25
Dragon       24
Ice          23
Ghost        23
Steel        22
Fairy        17
Flying        3
Name: Type, dtype: int64

In [89]:
#the returned value is a new series.  So, you can use other series methods on it.  To sum all the counts, e.g., 
poke.value_counts().sum()

721

In [90]:
poke.value_counts().mean()

40.05555555555556

In [91]:
# put the lower counts first
poke.value_counts(ascending=True)

Flying        3
Fairy        17
Steel        22
Ghost        23
Ice          23
Dragon       24
Fighting     25
Poison       28
Dark         28
Ground       30
Electric     36
Rock         41
Psychic      47
Fire         47
Bug          63
Grass        66
Normal       93
Water       105
Name: Type, dtype: int64

# the .apply() method on a series
applies a function on every single value in a series

In [92]:
#let's start by creating a custom python function
import math
math.log(2,3)

0.6309297535714574

In [93]:
#let's apply a logarithmic transformation to our data
##first, a reminer of logs.
math.log(8,2)

3.0

In [94]:
#this will give the same thing, but python documentation says it's usually more accurate
math.log2(8)

3.0

In [95]:
#define a function to apply to every row in the series
def classify_performance(num):
    if num < 300:
        return "meh"
    elif num >= 300 and num < 650:
        return "Satsifactory"
    else:
        "Wow!"

In [96]:
#apply the custom-defined function to a series
google.apply(classify_performance)

3011    None
3010    None
3009    None
3008    None
3007    None
3006    None
3005    None
3004    None
3003    None
3002    None
3001    None
3000    None
2999    None
2998    None
2997    None
2996    None
2995    None
2994    None
2993    None
2992    None
2991    None
2990    None
2989    None
2988    None
2987    None
2986    None
2985    None
2984    None
2983    None
2982    None
        ... 
29       meh
28       meh
27       meh
26       meh
25       meh
24       meh
23       meh
22       meh
21       meh
20       meh
19       meh
18       meh
17       meh
16       meh
15       meh
14       meh
13       meh
12       meh
11       meh
10       meh
9        meh
8        meh
7        meh
6        meh
5        meh
4        meh
3        meh
2        meh
1        meh
0        meh
Name: Stock Price, Length: 3012, dtype: object

In [99]:
#define a function to apply a logarithmic translation of the google data (just for fun)
def transform_log2(num):
    num = math.log2(num)
    return num

In [100]:
google.apply(transform_log2)

3011    9.611431
3010    9.591728
3009    9.594661
3008    9.590718
3007    9.594101
3006    9.586446
3005    9.542858
3004    9.534828
3003    9.528298
3002    9.530933
3001    9.536713
3000    9.528708
2999    9.533700
2998    9.525443
2997    9.519204
2996    9.491553
2995    9.493755
2994    9.485789
2993    9.493135
2992    9.481981
2991    9.462768
2990    9.441616
2989    9.446608
2988    9.439810
2987    9.449582
2986    9.434837
2985    9.418085
2984    9.409476
2983    9.384266
2982    9.399214
          ...   
29      6.016585
28      6.032762
27      5.985728
26      5.884354
25      5.903520
24      5.915282
23      5.885818
22      5.879216
21      5.897724
20      5.875043
19      5.831117
18      5.805808
17      5.799346
16      5.746850
15      5.717265
14      5.675251
13      5.675251
12      5.665052
11      5.642413
10      5.663914
9       5.645875
8       5.676098
7       5.671010
6       5.728465
5       5.752213
4       5.726559
3       5.710944
2       5.7721

In [102]:
#use apply with an anonymous function.  Note that you use must use the "lambda" keyword followed by a temporary variable name for the value you'll be modifying
google.apply(lambda log_value : math.log2(log_value))

3011    9.611431
3010    9.591728
3009    9.594661
3008    9.590718
3007    9.594101
3006    9.586446
3005    9.542858
3004    9.534828
3003    9.528298
3002    9.530933
3001    9.536713
3000    9.528708
2999    9.533700
2998    9.525443
2997    9.519204
2996    9.491553
2995    9.493755
2994    9.485789
2993    9.493135
2992    9.481981
2991    9.462768
2990    9.441616
2989    9.446608
2988    9.439810
2987    9.449582
2986    9.434837
2985    9.418085
2984    9.409476
2983    9.384266
2982    9.399214
          ...   
29      6.016585
28      6.032762
27      5.985728
26      5.884354
25      5.903520
24      5.915282
23      5.885818
22      5.879216
21      5.897724
20      5.875043
19      5.831117
18      5.805808
17      5.799346
16      5.746850
15      5.717265
14      5.675251
13      5.675251
12      5.665052
11      5.642413
10      5.663914
9       5.645875
8       5.676098
7       5.671010
6       5.728465
5       5.752213
4       5.726559
3       5.710944
2       5.7721

# The .map() method
Use the map method to "map" the values from one list onto another object

In [106]:
pokemon_names = pd.read_csv("../data/pandas/pokemon.csv", usecols=["Pokemon"], squeeze=True)
pokemon_names.head(6)

0     Bulbasaur
1       Ivysaur
2      Venusaur
3    Charmander
4    Charmeleon
5     Charizard
Name: Pokemon, dtype: object

In [107]:
pokemon_types = pd.read_csv("../data/pandas/pokemon.csv", index_col="Pokemon", squeeze = True)
pokemon_types.head(6)

Pokemon
Bulbasaur     Grass
Ivysaur       Grass
Venusaur      Grass
Charmander     Fire
Charmeleon     Fire
Charizard      Fire
Name: Type, dtype: object

In [109]:
#appply .map() to match values between these series.  This will take values from first series and match them to index values from the second series.  Then, it returns the value from the second series
pokemon_names.map(pokemon_types)

0         Grass
1         Grass
2         Grass
3          Fire
4          Fire
5          Fire
6         Water
7         Water
8         Water
9           Bug
10          Bug
11          Bug
12          Bug
13          Bug
14          Bug
15       Normal
16       Normal
17       Normal
18       Normal
19       Normal
20       Normal
21       Normal
22       Poison
23       Poison
24     Electric
25     Electric
26       Ground
27       Ground
28       Poison
29       Poison
         ...   
691       Water
692       Water
693    Electric
694    Electric
695        Rock
696        Rock
697        Rock
698        Rock
699       Fairy
700    Fighting
701    Electric
702        Rock
703      Dragon
704      Dragon
705      Dragon
706       Steel
707       Ghost
708       Ghost
709       Ghost
710       Ghost
711         Ice
712         Ice
713      Flying
714      Flying
715       Fairy
716        Dark
717      Dragon
718        Rock
719     Psychic
720        Fire
Name: Pokemon, Length: 7

In [110]:
#use .map() with dictionaries
pokemon_names = pd.read_csv("../data/pandas/pokemon.csv", usecols=["Pokemon"], squeeze=True)
pokemon_types = pd.read_csv("../data/pandas/pokemon.csv", index_col="Pokemon", squeeze = True).to_dict() #converts a list to a dict

In [113]:
pokemon_names.map(pokemon_types)

0         Grass
1         Grass
2         Grass
3          Fire
4          Fire
5          Fire
6         Water
7         Water
8         Water
9           Bug
10          Bug
11          Bug
12          Bug
13          Bug
14          Bug
15       Normal
16       Normal
17       Normal
18       Normal
19       Normal
20       Normal
21       Normal
22       Poison
23       Poison
24     Electric
25     Electric
26       Ground
27       Ground
28       Poison
29       Poison
         ...   
691       Water
692       Water
693    Electric
694    Electric
695        Rock
696        Rock
697        Rock
698        Rock
699       Fairy
700    Fighting
701    Electric
702        Rock
703      Dragon
704      Dragon
705      Dragon
706       Steel
707       Ghost
708       Ghost
709       Ghost
710       Ghost
711         Ice
712         Ice
713      Flying
714      Flying
715       Fairy
716        Dark
717      Dragon
718        Rock
719     Psychic
720        Fire
Name: Pokemon, Length: 7