In [1]:
#Series is a one-dimensional labeled array capable of holding data of any type (integer, string, float, python objects, etc.).
#The axis labels are collectively called index.


#The series is one of the core data structures in pandas. You can think of it as a cross between a list and a dictionary.
#The items are all stored in an order and there's labels with which you can retrieve them.
#An easy way to visualize this is two columns of data. The first is the special index, a lot like keys in a dictionary.
#While the second is your actual data.
#It's important to note that the data column has a label of its own and can be retrieved using the .name attribute.
#This is different than with dictionaries and is useful when it comes to merging multiple columns of data. 
#And we'll talk about that later on in the course.

# Let's import pandas to get started

#Pandas deals with the following three data structures −

#Series - 1D labeled homogeneous array, size-immutable.
#DataFrame - General 2D labeled, size-mutable tabular structure with potentially heterogeneously typed columns.
#Panel - General 3D labeled, size-mutable array (deprecated in pandas 0.20 and removed in 0.25; use a DataFrame with a MultiIndex instead).

In [2]:
import pandas as pd

# As you might expect, you can create a series by passing in a list of values. 
# When you do this, Pandas automatically assigns an index starting with zero and
# sets the name of the series to None. Let's work on an example of this.

# One of the easiest ways to create a series is to use an array-like object, like 
# a list. 

# Here I'll make a list of four students, Alice, Jack, Molly, and Tuba, all as strings


students = [
    "Alice",
    "Jack",
    "Molly",
    "Tuba",
]

# Passing the list to the pandas Series constructor builds the series
pd.Series(data=students)

0    Alice
1     Jack
2    Molly
3     Tuba
dtype: object

In [4]:
# Create an Empty Series

# Pass the dtype explicitly: calling pd.Series() with no data and no dtype
# raises a DeprecationWarning on pandas 1.x and produces dtype object instead
# of float64 on pandas 2.x.
s = pd.Series(dtype="float64")
print(s)

Series([], dtype: float64)


  s = pd.Series()


In [5]:
# The result is a Series object which is nicely rendered to the screen. We see here that 
# pandas has automatically identified the type of data in this Series as "object" and
# set the dtype parameter as appropriate. We see that the values are indexed with integers,
# starting at zero

In [6]:
# Start from a short list of integers

numbers = [
    1,
    2,
    3,
]

# Hand the list to the Series constructor

pd.Series(data=numbers)

# On this machine the resulting Series reports a dtype of int64

0    1
1    2
2    3
dtype: int64

In [7]:
# There's some other typing details that exist for performance that are important to know. 
# The most important is how Numpy and thus pandas handle missing data. 

# In Python, we have the None type to indicate a lack of data. But what do we do if we want 
# to have a typed list like we do in the series object?

# Underneath, pandas does some type conversion. If we create a list of strings and we have 
# one element, a None type, pandas inserts it as a None and uses the type object for the 
# underlying array. 

In [8]:
students = [
    "Alice",
    "Jack",
    None,
]
# Build a Series from the list whose last entry is None
pd.Series(data=students)

0    Alice
1     Jack
2     None
dtype: object

In [9]:
# However, if we create a list of numbers, integers or floats, and put in the None type,
# pandas automatically converts this to a special floating point value designated as NaN, 
# which stands for "Not a Number".

# So lets create a list with a None value in it

In [10]:
numbers = [
    1,
    2,
    None,
]
# Build a Series from the numeric list whose last entry is None
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [11]:
# You'll notice a couple of things. First, NaN is a different value. Second, pandas
# set the dtype of this series to floating point numbers instead of object or ints. That's
# maybe a bit of a surprise - why not just leave this as an integer? Underneath, pandas
# represents NaN as a floating point number, and because integers can be typecast to
# floats, pandas went and converted our integers to floats. So when you're wondering why the
# list of integers you put into a Series is now floats, it's probably because there is some
# missing data.

In [12]:
# For those who might not have done scientific computing in Python before, it is important
# to stress that None and NaN might be being used by the data scientist in the same way, to
# denote missing data, but that underneath these are not represented by pandas in the same
# way.

# NaN is *NOT* equivalent to None and when we try the equality test, the result is False.

# Let's bring in numpy which allows us to generate a NaN value

In [13]:
import numpy as np
# Compare NaN with None using ==; they are different values, so the
# comparison evaluates to False.
np.nan == None

False

In [14]:
# It turns out that you actually can't do an equality test of NaN to itself. When you do,
# the answer is always False.
np.nan == np.nan

False

In [15]:
# Instead, you need to use special functions to test for the presence of NaN ("not a
# number"), such as NumPy's isnan() function.

np.isnan(np.nan)

True

In [16]:
# So keep in mind when you see NaN, its meaning is similar to None, but it's a 
# numeric value and treated differently for efficiency reasons.

# A series can be created directly from dictionary data. If you do this, the index is 
# automatically assigned to the keys of the dictionary that you provided and not just 
# incrementing integers.

In [17]:
# Map each student name to a course; the dictionary keys become the index labels
students_scores = dict(Alice="Physics", Jack="Chemistry", Molly="English")
s = pd.Series(data=students_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [18]:
# We see that, since it was string data, pandas set the data type of the series to "object".
# We see that the index, the first column, is also made up of strings.


# Once the series has been created, we can get the index object using the index attribute.

s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [19]:
# The object dtype is not reserved for strings; it can hold arbitrary Python
# objects. Here each element of the series is a (first name, last name) tuple.


students = [
    ("Alice", "Brown"),
    ("Jack", "White"),
    ("Molly", "Green"),
]
pd.Series(data=students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [22]:
# Two lists followed by a tuple: the elements do not have to share a type
students = [
    ["Alice", "Brown"],
    ["Jack", "White"],
    ("Molly", "Green"),
]
pd.Series(data=students)

0    [Alice, Brown]
1     [Jack, White]
2    (Molly, Green)
dtype: object

In [25]:
# The index can also be supplied separately from the data by handing an
# explicit list of labels to the Series constructor.

s = pd.Series(
    data=["Physics", "Chemistry", "English"],
    index=["Alice", "Jack", "Molly"],
)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [27]:
# So what happens if the list of values in your index object is not aligned with the keys 
# in your dictionary for creating the series? Well, pandas overrides the automatic creation 
# to favor only and all of the index values that you provided. So it will ignore from your 
# dictionary all keys which are not in your index, and pandas will add None or NaN type values 
# for any index value you provide which is not in your dictionary key list.

# Here's an example. I'll pass in a dictionary of three items, in this case students and
# their courses
students_scores = dict(Alice="Physics", Jack="Chemistry", Molly="English")
# The index requested below also has three labels, but it leaves out Jack and
# names Sam, who has no entry in the dictionary
s = pd.Series(data=students_scores, index=["Alice", "Molly", "Sam"])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [28]:
# Same dictionary, but this time the requested index lists Jack as well as Sam
s = pd.Series(
    data=students_scores,
    index=["Alice", "Molly", "Jack", "Sam"],
)
s

Alice      Physics
Molly      English
Jack     Chemistry
Sam            NaN
dtype: object

In [31]:
# Create a Series from ndarray

# If data is an ndarray, then index passed must be of the same length. If no index is passed,
# then by default index will be range(n) where n is array length, i.e., [0, 1, 2, ..., len(array) - 1].

import pandas as pd
import numpy as np

# Wrap an ndarray of single-character strings; no index is passed here
data = np.array(["a", "b", "c", "d"])
s = pd.Series(data=data)
s

0    a
1    b
2    c
3    d
dtype: object

In [32]:
# Create a Series from Scalar

# If data is a scalar value, an index must be provided. The value will be repeated to match the length of index

import pandas as pd
import numpy as np

# The scalar 5 is repeated once for every label in the index
s = pd.Series(data=5, index=[0, 1, 2, 3])
s

0    5
1    5
2    5
3    5
dtype: int64

In [39]:
# Accessing Data from Series with Position

# Retrieve the first element. As we already know, the counting starts from zero for the array,
# which means the first element is stored at zeroth position and so on.

import pandas as pd
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

# Retrieve the first element by position. Use .iloc for this: with a
# non-integer index, s[0] relies on the deprecated fallback that treats an
# integer key as a position.
print(s.iloc[0])
print(s)
# Label-based lookup through the index
print(s['b'])

1
a    1
b    2
c    3
d    4
e    5
dtype: int64
2


In [42]:
# Retrieve the first three elements in the Series. If only a stop value follows the colon (s[:3]),
# all items before that position are extracted; if only a start value precedes it (s[-3:]), all items
# from that position onwards are extracted. If both are given (start:stop), the items between the two
# positions are returned, not including the stop position.


import pandas as pd
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

# Retrieve the first three elements; .iloc makes it explicit that the slice
# is positional rather than label-based.
print(s.iloc[:3])

# Retrieve the last three elements, again by position
print(s.iloc[-3:])

a    1
b    2
c    3
dtype: int64
c    3
d    4
e    5
dtype: int64
