# Introduction to Pandas

In [18]:
import numpy as np
import pandas as pd

from utils import render

### Meet series

A Series is what you get when a Python dictionary and a NumPy array get together and have a child.

In [2]:
# Build a Series out of a plain dictionary.

test_balance_data = dict(
    pasan=20.00,
    treasure=20.18,
    ashley=1.05,
    craig=42.42,
)

# Any dict-like object can be handed to the Series constructor:
# the keys become the labels (index) and the values become the data.
balances = pd.Series(data=test_balance_data)

balances

ashley       1.05
craig       42.42
pasan       20.00
treasure    20.18
dtype: float64

In [3]:
# Build a Series from a plain iterable.
# With no labels supplied, pandas numbers the entries 0, 1, 2, ...

unlabeled_balances = pd.Series(
    data=[20.00, 20.18, 1.05, 42.42],
)
unlabeled_balances

0    20.00
1    20.18
2     1.05
3    42.42
dtype: float64

In [4]:
# Labels can be passed explicitly through the index argument;
# it must be an iterable with exactly as many entries as the data.

labeled_balances = pd.Series(
    data=[20.00, 20.18, 1.05, 42.42],
    index=['pasan', 'treasure', 'ashley', 'craig'],
)
labeled_balances

pasan       20.00
treasure    20.18
ashley       1.05
craig       42.42
dtype: float64

In [5]:
# An ndarray is just another iterable, so it can seed a Series as well.

ndbalances = np.array([20.00, 20.18, 1.05, 42.42])
pd.Series(data=ndbalances)

0    20.00
1    20.18
2     1.05
3    42.42
dtype: float64

In [6]:
# creating from a scalar and an index:
# the single value is repeated once for every label supplied in the index
pd.Series(20.00, index=["guil",'jay','james','ben','nick'])

# in other words, each key is assigned the same scalar value for the entire Series

guil     20.0
jay      20.0
james    20.0
ben      20.0
nick     20.0
dtype: float64

### Accessing a series

In [9]:
# rebuild the balances series from the test_balance_data dictionary
# so the access examples below start from known values
balances = pd.Series(test_balance_data)
balances

ashley       1.05
craig       42.42
pasan       20.00
treasure    20.18
dtype: float64

In [8]:
# a series is ordered and indexable

# get the first user's balance
# positional access goes through .iloc: a bare integer inside [] on a
# label-indexed Series is deprecated, because the integer is going to be
# interpreted as a label rather than a position
balances.iloc[0]

1.05

In [10]:
# the scalar pulled out of the series is a NumPy type, not a plain Python float
# (.iloc is used because integer keys inside [] are deprecated as positions)
type(balances.iloc[0])

numpy.float64

In [11]:
# the last balance
# negative positions count from the end; use .iloc because integer keys
# inside [] are deprecated as positions on a label-indexed Series
balances.iloc[-1]

20.18

In [12]:
# accessing by label
balances['pasan']

20.0

In [19]:
# A Series can be walked like a dictionary: items() yields (label, value) pairs.

for label, value in balances.items():
    message = "The label {} has a value of {}".format(label, value)
    render(message)

The label ashley has a value of 1.05

The label craig has a value of 42.42

The label pasan has a value of 20.0

The label treasure has a value of 20.18

In [21]:
# looking up a label that does not exist raises a KeyError;
# catch only that exception so any unrelated error is not silently hidden
try:
    balances['kermit']
except KeyError:
    render("Accessing a non-existent key raises a 'KeyError'")

Accessing a non-existent key raises a 'KeyError'

In [22]:
# use the 'in' operator to test whether a label exists before looking it up
# (balances.get(label) is another safe option: it returns None if the label is not present)

if 'kermit' not in balances:
    render("Use 'in' to test the existence of a label")

Use 'in' to test the existence of a label

In [23]:
# accessing by property
# NOTE: attribute access only works when the label is a valid Python identifier
# and does not collide with an existing Series attribute or method
balances.ashley

1.05

In [24]:
# accessing more explicitly with loc and iloc

# a series exposes a property named loc which can be used to explicitly lookup by label based indices only
balances.loc['pasan']

20.0

In [25]:
# and to use the positional index explicitly, you can use the property iloc:
balances.iloc[0]

1.05

In [26]:
# slicing by positional index
# includes values from 0
# up until and not including 3
balances.iloc[0:3]

ashley     1.05
craig     42.42
pasan     20.00
dtype: float64

In [30]:
# slicing by label
# when using labels, the slice is inclusive. The last item is included

balances.loc['ashley':'pasan']

ashley     1.05
craig     42.42
pasan     20.00
dtype: float64

### Vectorization and Broadcasting Review

Vectorization and broadcasting are what make NumPy so fast. Pandas' data structures have similar superpowers.

In [31]:
# Vectorization lets us operate on whole series at once instead of looping.
# Set up two series that share the same labels: current balances and deposits.

test_balance_data = dict(
    pasan=20.00,
    treasure=20.18,
    ashley=1.05,
    craig=42.42,
)

test_deposit_data = dict(
    pasan=20,
    treasure=10,
    ashley=100,
    craig=55,
)

balances = pd.Series(data=test_balance_data)
deposits = pd.Series(data=test_deposit_data)

In [32]:
# Vectorization
# while it is indeed possible to loop through each item and apply it to another...

# items() yields (label, value) pairs; the older iteritems() spelling was
# removed from pandas, so items() is the call that works across versions
for label, value in deposits.items():
    balances[label] += value
balances

ashley      101.05
craig        97.42
pasan        40.00
treasure     30.18
dtype: float64

In [33]:
# it's important to remember to lean on vectorization and skip the loops altogether
# vectorization is faster and easier to read and write

# undo the change using inplace subtraction
# (entries are matched up by label, so each user's own deposit is removed)
balances -= deposits

# this is the same as the loop above using inplace addition
balances += deposits
balances

ashley      101.05
craig        97.42
pasan        40.00
treasure     30.18
dtype: float64

In [34]:
# Broadcasting
# broadcasting a scalar

# 5 is broadcasted and added to each and every value
# this returns a new series
balances + 5

ashley      106.05
craig       102.42
pasan        45.00
treasure     35.18
dtype: float64

In [35]:
# Broadcasting a series
# entries are lined up by label
# when a label exists on only one side, np.nan is put in its place

# cashbox is giving out free coupons that users can scan into the app
# to get $1 added to their accounts

coupons = pd.Series(
    data=1,
    index=['craig', 'ashley', 'james'],
)
coupons

craig     1
ashley    1
james     1
dtype: int64

In [36]:
# now add coupons to people who cashed them in

balances + coupons

ashley      102.05
craig        98.42
james          NaN
pasan          NaN
treasure       NaN
dtype: float64

In [37]:
# Using the fill_value parameter
# a label missing from either series is treated as fill_value (0 here) instead of
# producing NaN, so every balance is kept and 'james' gets just the coupon
balances.add(coupons, fill_value=0)

ashley      102.05
craig        98.42
james         1.00
pasan        40.00
treasure     30.18
dtype: float64

### Meet DataFrames

A DataFrame is basically just a 2D collection of Series.

In [38]:
# A list of rows (each row itself a list) becomes a DataFrame;
# with no labels given, both rows and columns are numbered from 0.
test_users_list = [
    ['Craig', 'Dennis', 42.42],
    ['Treasure', 'Porth', 25.00],
]

pd.DataFrame(data=test_users_list)

Unnamed: 0,0,1,2
0,Craig,Dennis,42.42
1,Treasure,Porth,25.0


In [43]:
# Name the columns (and label the rows) instead of relying on 0, 1, 2, ...

pd.DataFrame(
    data=test_users_list,
    index=['craigsdennis', 'treasure'],
    columns=['first_name', 'last_name', 'balance'],
)

Unnamed: 0,first_name,last_name,balance
craigsdennis,Craig,Dennis,42.42
treasure,Treasure,Porth,25.0


In [46]:
# Build a DataFrame from a dictionary of columns.

# Just like a Series, when no index is given one is generated as a 0-based range.

test_user_data = dict(
    first_name=['Craig', 'Treasure'],
    last_name=['Dennis', 'Porth'],
    balance=[42.42, 25.00],
)

pd.DataFrame(data=test_user_data)

Unnamed: 0,balance,first_name,last_name
0,42.42,Craig,Dennis
1,25.0,Treasure,Porth


In [47]:
# supply index using index keyword

pd.DataFrame(test_user_data, index=['craigsdennis', 'treasure'])

Unnamed: 0,balance,first_name,last_name
craigsdennis,42.42,Craig,Dennis
treasure,25.0,Treasure,Porth


In [48]:
# DataFrame.from_dict offers extra options on top of the plain constructor.

# orient='index' tells pandas that the outer keys are row labels
# and each inner dictionary holds that row's column values.

by_username = dict(
    craigsdennis=dict(
        first_name='Craig',
        last_name='Dennis',
        balance=42.42,
    ),
    treasure=dict(
        first_name='Treasure',
        last_name='Porth',
        balance=25.00,
    ),
)

pd.DataFrame.from_dict(data=by_username, orient='index')

Unnamed: 0,first_name,last_name,balance
craigsdennis,Craig,Dennis,42.42
treasure,Treasure,Porth,25.0


### Accessing a DataFrame

In [50]:
# Sample data for the access examples: one list per column,
# plus the usernames that will label the rows.
test_user_data = dict(
    first_name=['Craig', 'Treasure', 'Ashley', 'Guil'],
    last_name=['Dennis', 'Porth', 'Boucher', 'Hernandez'],
    balance=[42.42, 25.00, 2.02, 87.00],
)

test_user_names = ['craigsdennis', 'treasure', 'lindsay2000', 'guil']

users = pd.DataFrame(data=test_user_data, index=test_user_names)

users

Unnamed: 0,balance,first_name,last_name
craigsdennis,42.42,Craig,Dennis
treasure,25.0,Treasure,Porth
lindsay2000,2.02,Ashley,Boucher
guil,87.0,Guil,Hernandez


In [51]:
# by column name
balances = users['balance']
balances

craigsdennis    42.42
treasure        25.00
lindsay2000      2.02
guil            87.00
Name: balance, dtype: float64

In [52]:
balances.name

'balance'

In [53]:
# by label
# retrieve a row from df by using loc property and supply the label

users.loc['guil']

balance              87
first_name         Guil
last_name     Hernandez
Name: guil, dtype: object

In [54]:
# by position

users.iloc[1]

balance             25
first_name    Treasure
last_name        Porth
Name: treasure, dtype: object

In [55]:
# retrieve a specific value
# by chaining

# CAREFUL: this first retrieves the column Series and then uses the label
users['first_name']['craigsdennis']

'Craig'

In [56]:
# CAREFUL: this retrieves the row 'series' and then does a lookup for first_name

users.loc['craigsdennis']['first_name']

'Craig'

In [57]:
# using dataframe.loc

users.loc['craigsdennis', 'first_name']

'Craig'

In [58]:
# using DataFrame.at

users.at['craigsdennis', 'first_name']

'Craig'

In [59]:
# retrieve a specific dataframe through slicing
# all rows and the following ordered columns as a list
users.loc[:, ['balance', 'last_name']]

Unnamed: 0,balance,last_name
craigsdennis,42.42,Dennis
treasure,25.0,Porth
lindsay2000,2.02,Boucher
guil,87.0,Hernandez


In [60]:
# when using a slice with loc the results are inclusive
users.loc['treasure':'lindsay2000', :]

Unnamed: 0,balance,first_name,last_name
treasure,25.0,Treasure,Porth
lindsay2000,2.02,Ashley,Boucher


In [61]:
# when using a slice with iloc the results are exclusive
users.iloc[1:2, 1:]

Unnamed: 0,first_name,last_name
treasure,Treasure,Porth
