# Introduction to Pandas

## Keywords

Series, querying a series, vectorization (parallelization), broadcasting

In [65]:
%reset -f
import pandas as pd
import numpy as np

In [66]:
# A plain Python list that mixes strings with a missing value (None)
students = ["Ruslan", "Masinjila", None]

In [67]:
# Strings are stored with dtype object, and the None entry is kept as None
pd.Series(data=students)

0       Ruslan
1    Masinjila
2         None
dtype: object

In [68]:
# Integers with one missing entry: in the resulting Series the None becomes NaN
# and the integers are stored as float64.
numbers = [1, 2, 3, 4, 5, None]

pd.Series(data=numbers)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64

In [69]:
# In a numeric Series, a missing value (None) is stored as NaN, which means "not a number".
# NaN is itself a floating-point value, so the integers above were converted to floats.
# If integer data unexpectedly shows up as floats, it is likely because some of the data is missing (NaN).
# NaN is not equivalent to None



In [70]:
# NaN and None are distinct values: comparing them for equality gives False
np.nan == None

False

In [71]:
# NaN does not compare equal to anything, not even to itself, so this is False too
np.nan == np.nan


False

In [72]:
# Equality cannot detect NaN; use np.isnan() to test for it instead
np.isnan(np.nan)

True

In [73]:
# Series from a dictionary: the keys become the index labels and the
# values become the data.
# NOTE(review): despite its name, student_scores maps each student to a
# subject, not to a score.
student_scores = {
    "Alice": "Physics",
    "Bob": "Chemistry",
    "Ruslan": "Mathematics",
}

s = pd.Series(data=student_scores)
s

Alice         Physics
Bob         Chemistry
Ruslan    Mathematics
dtype: object

In [74]:
# The index attribute holds the labels (here, the keys of the source dictionary)
s.index

Index(['Alice', 'Bob', 'Ruslan'], dtype='object')

In [75]:
# Series with an explicit index: the labels are supplied separately from the
# data and are paired with the values by position.
indices = ["Ruslan", "Bob", "Alice"]
# NOTE(review): ' Chemisty' carries a leading space and looks like a typo for 'Chemistry'
values = ["Mathematics", " Chemisty", "Physics"]

pd.Series(data=values, index=indices)

Ruslan    Mathematics
Bob          Chemisty
Alice         Physics
dtype: object

In [76]:
# Passing an index together with a dictionary selects which keys to keep.
# A label that is missing from the dictionary ('Jamila') gets NaN, and
# dictionary keys that are not listed ('Alice', 'Bob') are left out.
pd.Series(data=student_scores, index=["Ruslan", "Jamila"])

Ruslan    Mathematics
Jamila            NaN
dtype: object

# Querying a series

In [77]:
# A Series can be queried either by integer position or by index label.

# Use the iloc attribute to query by integer position (positions start at 0)
# Use the loc attribute to query by index label



In [78]:
# Students mapped to the class each one takes; the student names become
# the index labels of the Series.
students_classes = {
    "Alice": "Physics",
    "Jack": "Chemistry",
    "Moly": "English",
    "Sam": "History",
}

s = pd.Series(data=students_classes)
s

Alice      Physics
Jack     Chemistry
Moly       English
Sam        History
dtype: object

In [79]:
# Fourth row by integer position (positions start at 0)
print(s.iloc[3])

# Same row by position through the scalar accessor .iat.
# Plain s[3] is avoided here: on a Series with string labels it only worked
# through a positional fallback that pandas has deprecated (integer keys are
# treated as labels), so positional access should always be explicit.
print(s.iat[3])

# Fourth row by index label
print(s.loc['Sam'])

# With string labels, plain [] indexing is label-based, just like .loc
print(s['Sam'])

History
History
History
History


In [80]:
# When the index itself is made of integers, plain [] indexing is ambiguous,
# so be explicit: .iloc for a position (codes_subjects.iloc[0]) and .loc for
# a label (codes_subjects.loc[100]).
subjects = ["Physics", "Chemistry", "Maths"]
codes = [100, 101, 102]

codes_subjects = pd.Series(data=subjects, index=codes)

codes_subjects

100      Physics
101    Chemistry
102        Maths
dtype: object

In [81]:
# NumPy reductions accept a Series directly: sum all the grades in one call
# instead of looping over the values.
grades = pd.Series(data=[90, 80, 70, 60])

np.sum(grades)

300

In [82]:
# Seed NumPy's global generator so that Restart & Run All draws the same
# 10,000 integers in [0, 1000) every time.
SEED = 42
np.random.seed(SEED)
numbers = np.random.randint(0, 1000, 10000)


In [83]:
%%timeit -n 100

# Mean of the array computed with a vectorised sum; -n 100 runs 100 loops per timing run
np.sum(numbers)/len(numbers)

14.1 µs ± 5.09 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [84]:
# Broadcasting: a scalar operand is applied to every element of the array.
print(numbers)

# In-place addition, so the array itself is modified
# (re-running this cell adds 2 again).
numbers += 2

print(numbers)

[ 13 335 127 ... 348  23 844]
[ 15 337 129 ... 350  25 846]


In [87]:
# Assigning through .loc with a label that does not exist yet appends a new
# entry instead of raising. Here a string label is added next to the integer
# labels 0, 1 and 2.
s = pd.Series(data=[10, 20, 30])
s  # not displayed: only the last expression of a cell is echoed
s.loc["History"] = 40
s

0          10
1          20
2          30
History    40
dtype: int64

In [88]:
# Rebuild the students -> class Series. `s` was reassigned to a numeric
# Series in the .loc example, so it is recreated before being used again.
students_classes = {
    "Alice": "Physics",
    "Jack": "Chemistry",
    "Moly": "English",
    "Sam": "History",
}

s = pd.Series(data=students_classes)
s

Alice      Physics
Jack     Chemistry
Moly       English
Sam        History
dtype: object

In [90]:
# Index labels do not have to be unique: all three classes share the label 'Kelly'
kelly_classes = pd.Series(
    data=["Philosophy", "Arts", "Maths"],
    index=["Kelly"] * 3,
)
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object

In [92]:
# Series.append was removed in pandas 2.0; pd.concat is the supported way to
# stack Series end to end. It returns a new Series and leaves both inputs
# unchanged, and the repeated 'Kelly' labels are preserved.
all_students_classes = pd.concat([s, kelly_classes])
all_students_classes

Alice       Physics
Jack      Chemistry
Moly        English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object

In [94]:
# 'Kelly' labels several rows, so .loc returns all of them as a Series
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object