# Pandas - Vectorized Operations & Time Series
## Introducing Pandas String Operations

In [9]:
import numpy as np
x = np.array([2, 3, 5, 7, 11, 13])
x * 2

array([ 4,  6, 10, 14, 22, 26])

In [10]:
# Numpy doesn't offer fast vectorization of strings
data = ['peter', 'Paul', 'MARY', 'gUIDO']
[s.capitalize() for s in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [11]:
# Will break if there are missing values
data = ['peter', 'Paul',None, 'MARY', 'gUIDO']
[s.capitalize() for s in data]

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [12]:
# Pandas to the rescue
import pandas as pd
names = pd.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [13]:
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

## Tables of Pandas String Methods

In [14]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])

#### Methods similar to Python string methods

||||
|----------|--------------|--------------|--------------|
| len()    | lower()      | translate()  | islower()    |
| ljust()  | upper()      | startswith() | isupper()    |
| rjust()  | find()       | endswith()   | isnumeric()  |
| center() | rfind()      | isalnum()    | isdecimal()  |
| zfill()  | index()      | isalpha()    | split()      |
| strip()  | rindex()     | isdigit()    | rsplit()     |
| rstrip() | capitalize() | isspace()    | partition()  |
| lstrip() | swapcase()   | istitle()    | rpartition() |

In [15]:
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [16]:
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [17]:
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [18]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

#### Methods using regular expressions

| Method     | Description                                                           |
|------------|-----------------------------------------------------------------------|
| match()    | Call re.match() on each element, returning a Boolean.                 |
| extract()  | Call re.match() on each element, returning matched groups as strings. |
| findall()  | Call re.findall() on each element.                                    |
| replace()  | Replace occurrences of pattern with some other string.                |
| contains() | Call re.search() on each element, returning a Boolean.                |
| count()    | Count occurrences of pattern.                                         |
| split()    | Equivalent to str.split(), but accepts regexps.                       |
| rsplit()   | Equivalent to str.rsplit(), but accepts regexps.                      |

In [19]:
monte.str.extract('([A-Za-z]+)')

  """Entry point for launching an IPython kernel.


0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [20]:
# find all names that start and end with a consonant
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

#### Miscellaneous methods
| Method          | Description                                                       |
|-----------------|-------------------------------------------------------------------|
| get()           | Index each element                                                |
| slice()         | Slice each element                                                |
| slice_replace() | Replace slice in each element with passed value                   |
| cat()           | Concatenate strings                                               |
| repeat()        | Repeat values                                                     |
| normalize()     | Return Unicode form of string                                     |
| pad()           | Add whitespace to left, right, or both sides of strings.          |
| wrap()          | Split long strings into lines with length less than a given width |
| join()          | Join strings in each element of the Series with passed separator  |
| get_dummies()   | Extract dummy variables as a DataFrame                            |

#### Vectorized item access and slicing

In [21]:
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [22]:
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

#### Indicator variables

In [23]:
full_monte = pd.DataFrame({'name': monte,
                           'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']})
full_monte

Unnamed: 0,info,name
0,B|C|D,Graham Chapman
1,B|D,John Cleese
2,A|C,Terry Gilliam
3,B|D,Eric Idle
4,B|C,Terry Jones
5,B|C|D,Michael Palin


In [24]:
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


## Example: Recipe Database

In [31]:
import json
try:
    recipes = pd.read_json('data/recipeitems.json')
except ValueError as e:
    print "ValueError:", e

ValueError: Trailing data


In [32]:
with open('data/recipeitems.json') as f:
   line = f.readline()
pd.read_json(line).shape

(2, 12)

In [34]:
# read the entire file into a Python array
with open('data/recipeitems.json', 'r') as f:
    # Extract each line
    data = (line.strip() for line in f)
    # Reformat so each line is the element of a list
    data_json = "[{0}]".format(','.join(data))
# read the result as a JSON
recipes = pd.read_json(data_json)

In [35]:
recipes.shape

(173278, 17)

In [36]:
recipes.iloc[0]

_id                              {u'$oid': u'5160756b96cc62079cc2db15'}
cookTime                                                          PT30M
creator                                                             NaN
dateModified                                                        NaN
datePublished                                                2013-03-11
description           Late Saturday afternoon, after Marlboro Man ha...
image                 http://static.thepioneerwoman.com/cooking/file...
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
name                                    Drop Biscuits and Sausage Gravy
prepTime                                                          PT10M
recipeCategory                                                      NaN
recipeInstructions                                                  NaN
recipeYield                                                          12
source                                                  thepione

In [37]:
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [38]:
# What recipe has the longest ingredient list?
recipes.name[np.argmax(recipes.ingredients.str.len())]

u'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [39]:
# How many recipes for breakfast food?
recipes.description.str.contains('[Bb]reakfast').sum()

3524

In [40]:
# How many list cinnamon as an ingredient?
recipes.ingredients.str.contains('[Cc]innamon').sum()

10526

In [41]:
# Did any misspell the word 'cinnamon'?
recipes.ingredients.str.contains('[Cc]inamon').sum()

11

#### A simple recipe recommender
Given a list of ingredients, find a recipe that uses all those ingredients.

In [42]:
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
              'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']

In [43]:
# build boolean dataframe consisting of T or F whether this ingredient appears in list
import re
spice_df = pd.DataFrame(
           dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE)) for spice in spice_list))
spice_df.head()

Unnamed: 0,cumin,oregano,paprika,parsley,pepper,rosemary,sage,salt,tarragon,thyme
0,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [45]:
# find a recipe that uses parsley paprika and tarragon
selection = spice_df.query('parsley & paprika & tarragon')
len(selection)

10

In [46]:
recipes.name[selection.index]

2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 Southern fried chicken in buttermilk
163175            Fried Chicken Sliders with Pickles + Slaw
165243                        Bar Tartine Cauliflower Salad
Name: name, dtype: object

## Working with Time Series
### Dates and Times in Python
#### Native Python dates and times: datetime and dateutil