# The Pandas Library

![cute panda](./panda.jpg)

The Pandas library builds on NumPy's arrays.
Pandas lets you store multi-dimensional data with heterogeneous types or missing values in `DataFrame` objects.

In [1]:
import numpy as np
import pandas as pd

## Pandas Datastructures

### Series
A Series is a one-dimensional labeled array: a NumPy array of values paired with an index of labels.

In [2]:
# Build a Series from a plain list of grade points.
# No index is supplied, so pandas labels the entries 0..5 automatically.
grade_points = [4.0, 3.7, 4.0, 4.0, 3.3, 3.7]
grades = pd.Series(data=grade_points)
print(grades)

0    4.0
1    3.7
2    4.0
3    4.0
4    3.3
5    3.7
dtype: float64


In [3]:
# Series attributes: a Series pairs an array of values with an index of labels.

print(grades.values)  # the underlying data as a NumPy array (not a Python list)
print(grades.index)  # the labels attached to those values

[4.  3.7 4.  4.  3.3 3.7]
RangeIndex(start=0, stop=6, step=1)


In [4]:
# Indexing Series
# A single key gives back one value; a slice gives back a smaller Series.

print(grades[0])  # single key -> scalar
print(grades[1:3])  # slice -> Series holding entries 1 and 2 (the stop, 3, is excluded)

4.0
1    3.7
2    4.0
dtype: float64


In [5]:
# Explicit indices: label each grade with its course instead of 0..n-1.
course_labels = ['calc', 'phys', 'chem', 'stats', 'CSO', 'DSA']
grades = pd.Series([4.0, 3.7, 4.0, 4.0, 3.3, 3.7], index=course_labels)

print(grades.values)  # the data itself is unaffected by the custom labels
print(grades.index)  # now an Index made of the course labels
print(grades['calc'])  # look up one value by its label
print(grades['phys':'CSO'])  # label slices work too, and they include the end label

[4.  3.7 4.  4.  3.3 3.7]
Index(['calc', 'phys', 'chem', 'stats', 'CSO', 'DSA'], dtype='object')
4.0
phys     3.7
chem     4.0
stats    4.0
CSO      3.3
dtype: float64


A Series can actually be used like a dictionary: each index label acts as a key that maps to a value!

### DataFrame
A DataFrame is a two-dimensional labeled table of data: a set of columns (each one a Series) that share the same row index.

In [6]:
# Creating a DataFrame from plain lists:
# one list per column, plus one list of row labels.
classes = ['calc', 'phys', 'chem', 'stats', 'CSO', 'DSA']
grades = [4.0, 3.7, 4.0, 4.0, 3.3, 3.7]
codes = ["MATH 201", "PHYS 121", "CHEM 101", "MATH 141", "CS 301", "CS 302"]

# The dict keys become the column names; `classes` supplies the row labels.
column_data = {'grades': grades, 'codes': codes}
courses = pd.DataFrame(column_data, index=classes)

In [7]:
# Creating a DataFrame from Series: each Series becomes one column,
# and both Series carry the same course labels as their index.
course_labels = ['calc', 'phys', 'chem', 'stats', 'CSO', 'DSA']

grades = pd.Series([4.0, 3.7, 4.0, 4.0, 3.3, 3.7], index=course_labels)
codes = pd.Series(
    ["MATH 201", "PHYS 121", "CHEM 101", "MATH 141", "CS 301", "CS 302"],
    index=course_labels,
)

# The dict keys become the column names.
classes = pd.DataFrame({'grades': grades, 'codes': codes})

In [8]:
# DataFrame attributes
print(classes)  # the whole table
print(classes.values)  # the cell data as a 2D array, one row per course
print(classes.index)  # the row labels
print(classes.columns)  # the column labels

       grades     codes
calc      4.0  MATH 201
phys      3.7  PHYS 121
chem      4.0  CHEM 101
stats     4.0  MATH 141
CSO       3.3    CS 301
DSA       3.7    CS 302
[[4.0 'MATH 201']
 [3.7 'PHYS 121']
 [4.0 'CHEM 101']
 [4.0 'MATH 141']
 [3.3 'CS 301']
 [3.7 'CS 302']]
Index(['calc', 'phys', 'chem', 'stats', 'CSO', 'DSA'], dtype='object')
Index(['grades', 'codes'], dtype='object')


In [9]:
# Indexing a DataFrame
# Selecting a column by its name gives back that column as a Series.
print(classes['grades'])

calc     4.0
phys     3.7
chem     4.0
stats    4.0
CSO      3.3
DSA      3.7
Name: grades, dtype: float64


### Index
An Index is an immutable array that defines the labels (indices) of a Series or DataFrame.

In [10]:
# Creating Indexes
fib_values = [0, 1, 2, 3, 5, 8, 13]
prime_values = [2, 3, 5, 7, 11]
fib = pd.Index(fib_values)
primes = pd.Index(prime_values)

print(fib)
print(fib.size)  # number of entries in the Index
print(fib[:3])  # slicing an Index gives back another Index

Int64Index([0, 1, 2, 3, 5, 8, 13], dtype='int64')
7
Int64Index([0, 1, 2], dtype='int64')


In [11]:
# Indexes support set operations.
# Use the named methods rather than the &, | and ^ operators: the operator
# forms were deprecated as set operations and, from pandas 2.0 on, perform
# element-wise logical operations instead (which fails for indexes of
# different lengths such as these two).
print(fib.intersection(primes))  # values present in both indexes
print(fib.union(primes))  # values present in either index
print(fib.symmetric_difference(primes))  # values present in exactly one index

Int64Index([2, 3, 5], dtype='int64')
Int64Index([0, 1, 2, 3, 5, 7, 8, 11, 13], dtype='int64')
Int64Index([0, 1, 7, 8, 11, 13], dtype='int64')


## Pandas Operations

In [12]:
# DataFrame and Series objects support ufuncs and arithmetic operators:
# the operation is applied to every element and the index is kept.
grades = pd.Series([4.0, 3.7, 4.0, 4.0, 3.3, 3.7])
grades = grades * 1.5  # scale every grade by 1.5
print(grades)

0    6.00
1    5.55
2    6.00
3    6.00
4    4.95
5    5.55
dtype: float64


## Missing Data Values

In [13]:
# Both None and np.nan mark a missing entry; in this float Series
# pandas stores either one as NaN.
raw_grades = [4.0, 4.0, 4.0, None, np.nan]
grades = pd.Series(raw_grades)
print(grades)  # the None entry is shown as NaN

0    4.0
1    4.0
2    4.0
3    NaN
4    NaN
dtype: float64


In [16]:
# Detecting null values
print(grades.isnull())  # boolean mask: True wherever the value is missing

# Removing or replacing null values (each call returns a new Series; `grades` is unchanged)
print(grades.dropna())  # the entries that are not missing
print(grades.fillna(4.0))  # every missing entry replaced by 4.0

0    False
1    False
2    False
3     True
4     True
dtype: bool
0    4.0
1    4.0
2    4.0
dtype: float64
0    4.0
1    4.0
2    4.0
3    4.0
4    4.0
dtype: float64
