# Built-in data structures, functions, and files

In [93]:
# Setup
import re # reguar expressions
from pathlib import Path

import numpy as np
import pandas as pd
import requests

### Tuples

In [3]:
# A tuple is an immutable sequence; each of the three spellings below builds the same one
my_tuple = 4, 5, 6                     # bare comma-separated values
another_tuple = (4, 5, 6)              # parenthesised literal
yet_another_tuple = tuple([4, 5, 6])   # conversion from another sequence

# A chained == is evaluated pairwise, so this is True only when all three agree
tuples_match = my_tuple == another_tuple == yet_another_tuple
print(f'Are all of these tuples equivalent?\n{tuples_match}')

Are all of these tuples equivalent?
True


In [7]:
# As noted, the tuple data structure itself is immutable, although the contents may be mutable
my_tuple = (4, 5, 6)
try:
    # Item assignment on a tuple instance raises TypeError
    # (assigning into the built-in `tuple` type itself would also raise, but for an unrelated reason)
    my_tuple[0] = 1
except TypeError:
    print("Sorry, you can't do that.  Tuples are immutable!")

Sorry, you can't do that.  Tuples are immutable!


In [9]:
# Tuple unpacking-- put several names on the left-hand side and Python assigns the tuple's elements to them in order
my_tuple = (4, 5, 6)
a, b, c = my_tuple

# Pair each variable name with the value it received
labelled_values = (('a', a), ('b', b), ('c', c))
for label, number in labelled_values:
    print(f'Value of {label} is {number}.')

Value of a is 4
Value of b is 5
Value of c is 6


In [10]:
# Unpacking also works while iterating over a sequence of tuples, just as it does with lists
my_tuples = (
    (1, 2, 3),
    (4, 5, 6),
    (7, 8, 9),
)

for row in my_tuples:
    a, b, c = row  # each inner tuple is split into three names
    print(f'a = {a}, b = {b}, c = {c}')

a = 1, b = 2, c = 3
a = 4, b = 5, c = 6
a = 7, b = 8, c = 9


In [11]:
# A starred name on the left-hand side collects every element not claimed by the other names (as a list)
values = (1, 2, 3, 4, 5)
a, b, *remaining = values

messages = (
    f'Value of a: {a}\n',
    f'Value of b: {b}\n',
    f'All remaining values: {remaining}',
)
for message in messages:
    print(message)

Value of a: 1

Value of b: 2

All remaining values: [3, 4, 5]


### Lists

In [12]:
# Lists are mutable and can be used to materialize the values of a lazy iterable
# Note: range() returns a lazy sequence object rather than a true generator, but list() materializes either one the same way
generator = range(11)
list(generator)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [2]:
# Sorting with the sort() method is done in place (it returns None rather than a new list)
# You can use other keys-- like the length of the elements
# With no key, strings compare by code point, so the capitalized 'He' sorts ahead of the lowercase words
b = ['saw', 'small', 'He', 'foxes', 'six']
b.sort()
print(f'Sorted b, no keys:\n{b}')

Sorted b, no keys:
['He', 'foxes', 'saw', 'six', 'small']


In [3]:
# Sort by length-- sort() is stable, so words of equal length keep their previous relative order
b.sort(key=len)
print(f'Sorted b, key = len:\n{b}')

Sorted b, key = len:
['He', 'saw', 'six', 'foxes', 'small']


In [8]:
# Sort by length-- reversed
# Note: reverse() flips the entire list, ties included; sort(key=len, reverse=True) would
# instead keep equal-length words in their existing relative order
b.sort(key=len)
b.reverse()
print(f'Sorted b, key = len, in reverse order:\n{b}')

Sorted b, key = len, in reverse order:
['small', 'foxes', 'six', 'saw', 'He']


In [15]:
# Slicing-- a step of -1 walks a list backwards, which undoes the in-place reversal below
my_list = list(range(11))
my_list.reverse()            # in place: my_list now runs from 10 down to 0
rev_list = my_list.copy()    # shallow copy, independent of my_list

restored = rev_list[::-1]
print(f'Reversed list of numbers:\n{rev_list}\n')
print(f'Original list of numbers:\n{restored}')

Reversed list of numbers:
[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

Original list of numbers:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


### Built-in sequence functions

In [16]:
# One useful function is enumerate(), which pairs each element of an iterable with its index
# Each item it yields is an (index, element) tuple, unpacked here directly in the for statement
some_list = ['foo', 'bar', 'baz']
for index, element in enumerate(some_list):
    print(f'Index: {index}, element: {element}')

Index: 0, element: foo
Index: 1, element: bar
Index: 2, element: baz


In [17]:
# enumerate() yields (index, element) pairs, which is exactly the shape dict() accepts for building a mapping
some_dict = dict(enumerate(some_list))

print(some_dict)

{0: 'foo', 1: 'bar', 2: 'baz'}


In [26]:
# The zip() function iterates over two (or more) sequences in lockstep, yielding one tuple per position
seq1 = ['foo', 'bar', 'baz']
seq2 = ['one', 'two', 'three']

# Wrapping zip() in enumerate() adds a running index to each pair
print('Using zip with enumerate:')
for i, (left, right) in enumerate(zip(seq1, seq2)):
    print(f'{i}: {left}, {right}')

# Without enumerate() only the paired elements are produced
print('\nUsing zip without enumerate:')
for left, right in zip(seq1, seq2):
    print(f'{left}, {right}')
    


Using zip with enumerate:
0: foo, one
1: bar, two
2: baz, three

Using zip without enumerate:
foo, one
bar, two
baz, three


In [29]:
# reversed() is another helpful built-in, but it returns a lazy iterator rather than a new list
# Thus, we need to materialize the contents with list() or another function
# Displaying the bare object only shows its repr, not the elements
reversed(seq1)

<list_reverseiterator at 0x7fdfd14ff430>

In [30]:
# Materialize the reverse iterator into a list to see the elements
list(reversed(seq1))

['baz', 'bar', 'foo']

### Dictionaries

In [31]:
# Dictionaries go by different names elsewhere-- hash maps, associative arrays
d1 = {
    'a': 'some value',
    'b': [1, 2, 3, 4],
}

# Iterating a dict yields its keys; index back into it for the matching value
for key in d1:
    print(f'Key: {key}, value: {d1[key]}')

Key: a, value: some value
Key: b, value: [1, 2, 3, 4]


In [33]:
# Merge dictionaries using update()-- note, this modifies in place
# Keys that already exist (here 'b') are overwritten; new keys (here 'c') are added
d1.update({'b': 'foo', 
          'c': 12})

print(d1)

{'a': 'some value', 'b': 'foo', 'c': 12}


In [4]:
# There are two ways to build a dictionary from a sequence of keys and a sequence of values
# This cell uses the explicit loop approach; dict() offers a one-line alternative
key_list = ['fruit', 'vegetable', 'poultry', 'dairy', 'drink']
value_list = ['banana', 'cucumber', 'chicken', 'milk', 'wine']

food_dict = {}
for pair in zip(key_list, value_list):
    category, item = pair
    food_dict[category] = item

for category in food_dict:
    print(f'{category.title()}: {food_dict[category]}')


Fruit: banana
Vegetable: cucumber
Poultry: chicken
Dairy: milk
Drink: wine


In [6]:
# Use the dict() approach-- dict() accepts an iterable of (key, value) pairs, which is what zip() produces
food_dict2 = dict(zip(key_list, value_list))
for key, val in food_dict2.items():
    print(f'{key.title()}: {val}')

Fruit: banana
Vegetable: cucumber
Poultry: chicken
Dairy: milk
Drink: wine


In [2]:
# Create a dictionary with letters as keys and lists as values
words = ['apple', 'bat', 'bar', 'atom', 'book', 'cat', 'cook']
by_letter = {}

for word in words:
    # setdefault() returns the list already stored for this first letter,
    # or stores (and returns) a new empty list when the letter is unseen
    by_letter.setdefault(word[0], []).append(word)

for key, val in by_letter.items():
    print(f'{key}: {val}')

a: ['apple', 'atom']
b: ['bat', 'bar', 'book']
c: ['cat', 'cook']


In [4]:
# The rule of thumb with dictionary keys-- they need to be hashable
# Hence, why dictionaries are also called hash tables
# Use the hash() function to check if an object is hashable
# Built-in immutable objects (str, int, tuples of hashables, ...) are hashable; mutable containers such as lists are not
# Hashes are fixed-size integers that Python uses for looking up dictionary keys
# Note: by default str hashes are salted per interpreter session, so the numbers printed here change from run to run

print('Key: hash')
for key in by_letter.keys():
    print(f'{key}: {hash(key)}')

Key: hash
a: 2982953944752013288
b: 3877403595932308983
c: 4665640972348144721


In [6]:
# Lists are mutable, so hash() rejects them with a TypeError
list_of_lists = [[1, 2], [3, 4], [5, 6]]
for candidate in list_of_lists:
    try:
        digest = hash(candidate)
    except TypeError:
        print(f'Sorry, {candidate} is not hashable!')
    else:
        print(f'{candidate}: {digest}')

Sorry, [1, 2] is not hashable!
Sorry, [3, 4] is not hashable!
Sorry, [5, 6] is not hashable!


In [7]:
# Alternatively, use tuples in place of lists-- a tuple of hashable items is itself hashable
list_of_tuples = [(1, 2), (3, 4), (5, 6)]
for candidate in list_of_tuples:
    try:
        digest = hash(candidate)
    except TypeError:
        print(f'Sorry, {candidate} is not hashable!')
    else:
        print(f'{candidate}: {digest}')

(1, 2): -3550055125485641917
(3, 4): 1079245023883434373
(5, 6): -7007623702649218251


### Set


In [9]:
# A set keeps only unique values, so the repeated 2s collapse into one
# Build one either with the set() function or with a {} literal
my_set = set([2, 2, 2, 2, 1, 3, 4])
my_other_set = {2, 2, 2, 2, 1, 3, 4}

sets_match = my_set == my_other_set
print(f'Are both sets identical?\n{sets_match}')

Are both sets identical?
True


In [15]:
# Set operations (set1 plays the role of "A", set2 the role of "B" in the labels below)
set1 = {1, 2, 3, 4, 5}
set2 = {3, 4, 5, 6, 7, 8}

# Union-- all distinct elements in both sets
in_either = set1.union(set2)
print(f'Union between sets:\n{in_either}\n')

# Intersection-- overlap between both sets
in_both = set1.intersection(set2)
print(f'Intersection bewteen sets:\n{in_both}\n')

# Difference-- elements in A that are not in B
only_in_a = set1.difference(set2)
print(f'Difference between sets A and B:\n{only_in_a}\n')

# Symmetric difference-- all elements in A or B but not in both
in_exactly_one = set1.symmetric_difference(set2)
print(f'Symmetric difference A and B:\n{in_exactly_one}\n')

# Is subset-- all elements of A contained in B
a_within_b = set1.issubset(set2)
print(f'Is A a subset of B?\n{a_within_b}\n')

# Is disjoint-- A and B have no elements in common
nothing_shared = set1.isdisjoint(set2)
print(f'Do A and B have no elements in common?\n{nothing_shared}')

Union between sets:
{1, 2, 3, 4, 5, 6, 7, 8}

Intersection bewteen sets:
{3, 4, 5}

Difference between sets A and B:
{1, 2}

Symmetric difference A and B:
{1, 2, 6, 7, 8}

Is A a subset of B?
False

Do A and B have no elements in common?
False


### List, set, and dict comprehensions

In [17]:
# List comprehensions condense a for-loop and an optional if filter into a one-line expression
# Here: keep only strings longer than two characters and upper-case them
strings = ['a', 'as', 'bat', 'car', 'dove', 'python']
[string.upper() for string in strings if len(string) > 2]

['BAT', 'CAR', 'DOVE', 'PYTHON']

In [18]:
# Set comprehensions are very similar, just with {} instead of []
# Because the result is a set, the three copies of 'cheese' collapse into a single element
strings = ['cheese', 'cheese', 'cheese', 'lettuce', 'tomato']
{string.upper() for string in strings}

{'CHEESE', 'LETTUCE', 'TOMATO'}

In [23]:
# Dictionary comprehension-- map every string to the number of times it occurs in the list
# Repeated strings simply overwrite their own entry with the same count
string_dict = {string: strings.count(string) for string in strings}

print(string_dict)

{'cheese': 3, 'lettuce': 1, 'tomato': 1}


In [27]:
# Create a dictionary comprehension-- lookup map based on index
# Note: when a value appears more than once, later indexes overwrite earlier ones,
# so 'cheese' ends up mapped to its last position (2)
lookup_map = {val: idx for idx, val in enumerate(strings)}
print(lookup_map)

{'cheese': 2, 'lettuce': 3, 'tomato': 4}


In [29]:
# We can also use mapping functions, very similar to R
# These will "map" functions to each element in a sequence
# For example, we could find the length of each string in a list
# map() returns a lazy iterator, so wrap it in list() (or set()) to materialize the results
list(map(len, strings))

[6, 6, 6, 7, 6]

In [32]:
# Processing a nested list with explicit loops-- collect every name containing at least two 'e' characters
all_data = [
    ['john', 'emily', 'michael', 'mary', 'steven'],
    ['maria', 'juan', 'javier', 'natalia', 'pilar'],
]

names_of_interest = []
for names in all_data:
    for name in names:
        if name.count('e') < 2:
            continue  # fewer than two e's: skip this name
        names_of_interest.append(name)

print(names_of_interest)

['steven']


In [33]:
# We can condense this with list comprehensions
# The for clauses are written in the same order as the equivalent nested loops (outer first, then inner)
lots_of_es = [name for names in all_data for name in names if name.count('e') >= 2]
lots_of_es == names_of_interest

True

In [35]:
# This style can also be used to flatten a list of tuples into a single flat list
some_tuples = [(1, 2, 3), 
              (4, 5, 6), 
              (7, 8, 9)]

# Outer clause walks the tuples, inner clause walks the values inside each tuple
flattened = [val for tup in some_tuples for val in tup]
print(flattened)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [36]:
# The same flat list can be built with an explicit loop over the tuples
flattened2 = []
for tup in some_tuples:
    flattened2.extend(tup)  # append every value of this tuple in order

lists_match = flattened == flattened2
print(f'Are the two flatted lists the same?\n{lists_match}')

Are the two flatted lists the same?
True


## Functions


In [37]:
# Keyword arguments (used for setting defaults, for example) MUST go after positional arguments
# Define simple function
def my_function(x, y, z=1.5):
    """Combine z with the sum of x and y.

    Parameters:
        x, y: values that are added together.
        z: factor applied to the sum; defaults to 1.5.

    Returns:
        z * (x + y) when z is greater than 1, otherwise z / (x + y).
    """
    total = x + y
    if z > 1:
        return z * total
    return z / total

In [38]:
# Test function-- z falls back to its default of 1.5, so the result is 1.5 * (10 + 10)
my_function(10, 10)

30.0

In [39]:
# Pass z explicitly as a keyword argument to override the default
my_function(10, 10, z=1.2)

24.0

### Namespaces, scope, and local functions

In [40]:
# Namespaces refer to mappings between variable names and data
# In the context of functions, the namespace determines a variable's scope
# Similar to R, a local namespace is created when the function is called
# However, the local namespace is destroyed after the function returns a value (or None)
# Consider two examples
def func():
    """Build the list [0, 1, 2, 3, 4] under the local name `a` and discard it.

    Nothing is returned (implicitly None), so the list does not outlive the call.
    """
    a = []
    for i in range(5):
        a.append(i)

# Earlier cells bound a global `a` of their own; drop it so that the lookup below
# can only succeed if func() leaked its local variable
globals().pop('a', None)

# Call the function, then try to read a from the global namespace
func()
try:
    a
except NameError as error:
    # Catch and display the error so the rest of the notebook keeps running
    print(f'NameError: {error}')

NameError: name 'a' is not defined

In [48]:
# Inside a plain function, 'a' would live in the local namespace only
# In other words, it would not be accessible from the global namespace
# Declaring the name with the 'global' keyword makes assignments target the global namespace instead
a = None

def bind_a_variable():
    """Rebind the global name `a` to the list [0, 1, 2, 3, 4]; returns None."""
    global a
    a = [i for i in range(5)]

# Call the function
bind_a_variable()
print(a)

[0, 1, 2, 3, 4]


### Returning multiple values


In [49]:
# A Python function can hand back several values at once
# Under the hood the comma-separated values are packed into a single tuple
def populate_tuple():
    """Return the three values 5, 6 and 7 packed into one tuple."""
    first, second, third = 5, 6, 7
    return first, second, third

my_tuple = populate_tuple()
print(my_tuple)

(5, 6, 7)


### Functions are objects


In [52]:
# Like everything else in Python, functions are objects
# If we have a bunch of strings that we need to reformat, we can write a simple function
# We'll be modular with our functions here and create two
# The first will remove all punctuation
# The second will do the rest of the cleaning-- trimming whitespace, making everything title case, etc
def remove_punctuation(value):
    """Return `value` with every '!', '?' and '#' character removed."""
    return re.sub(r'[!?#]', '', value)

def clean_strings(strings):
    """Return a new list with each string de-punctuated, trimmed and title-cased.

    Punctuation is removed before trimming, so whitespace that sat next to a
    removed character (e.g. 'Georgia !') is stripped as well. Runs of whitespace
    inside a string are left untouched.
    """
    return [remove_punctuation(string).strip().title() for string in strings]


In [53]:
# Test-- messy state names with stray whitespace, punctuation, and inconsistent casing
states = ['  Alabama', 'Georgia!', 'Georgia', 'georgia', 
         'south   carolina##', 'West virginia?']

clean_strings(states)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'South   Carolina',
 'West Virginia']

In [60]:
# Functions are objects, so remove_punctuation itself can be handed to the built-in map()
# list() materializes the lazily-produced results
list(map(remove_punctuation, states))

['  Alabama',
 'Georgia',
 'Georgia',
 'georgia',
 'south   carolina',
 'West virginia']

### Lambda functions


In [68]:
# Lambda functions appear very similar to the apply family of functions in R
# They consist of a single expression and the result is the return value
double_x = lambda x: x * 2

# Call the function
double_x(10)

# Note-- they are called anonymous functions because they are never declared with the 'def' keyword
# As a consequence, their __name__ attribute is just the generic '<lambda>', even when assigned to a variable

20

In [69]:
# Lambdas shine when another function expects a function as an argument
# For example, sorting a list of strings by a custom key
# The key here is the number of distinct letters in each string (a set of the string's characters)
strings = ['foo', 'card', 'bar', 'aaaa', 'abab']
strings.sort(key=lambda word: len(set(word)))
strings

['aaaa', 'foo', 'abab', 'bar', 'card']

### Currying

In [70]:
# Currying refers to deriving new functions from existing ones
# I'll come back to this later-- for now, just know the functools module can help write these

### Generators

The main idea behind generator functions is to create objects that can be used in for loops, like lists, but that don't store their full contents in memory.  This can come in handy when processing large files that would otherwise crash the program if their full contents were read into memory.

### Errors and exception handling


In [71]:
# Handling exceptions is important so programs don't stop-- print a helpful error message instead
def try_float(x):
    """Convert x to a float, or print a message and return None when its value is invalid.

    Only ValueError is handled; any other exception raised by float() propagates.
    """
    try:
        converted = float(x)
    except ValueError:
        print(f'Invalid: cannot cast {x} as a float.')
        return None
    return converted

In [72]:
# A numeric input converts cleanly
try_float(5)

5.0

In [73]:
# A non-numeric string triggers the ValueError branch, which prints the message
try_float('something else')

Invalid: cannot cast something else as a float.


In [89]:
# Several exception types can share one handler when they are listed in a tuple
def try_float(x):
    """Convert x to a float, or print a message and return None when conversion fails.

    Both TypeError (unsupported type) and ValueError (unparseable value) are handled.
    """
    try:
        converted = float(x)
    except (TypeError, ValueError):
        print(f'Invalid: cannot cast object "{x}" of type {type(x)} as a float.')
    else:
        return converted

In [90]:
# Valid input behaves exactly as before
try_float(5)

5.0

In [91]:
# An unparseable string raises ValueError inside float(), which is handled
try_float('something else')

Invalid: cannot cast object "something else" of type <class 'str'> as a float.


In [92]:
# A tuple raises TypeError inside float(), which the rewritten handler also covers
try_float(('something', 'else'))

Invalid: cannot cast object "('something', 'else')" of type <class 'tuple'> as a float.


## Files and the operating system

To work with files, we can open them explicitly in read mode (the default) using the `open()` function-- just remember to close the file handle to return resources to the operating system.  If we use the `with open()` idiom, the file handle will automatically be closed at the end of the indented block.  When writing to files, we can also use the `x` mode to abort if the file name already exists-- by default, with `w` mode, any file with the same name will be overwritten.  

In [113]:
# Download example data from the GitHub repository
url = 'https://raw.githubusercontent.com/wesm/pydata-book/2nd-edition/examples/segismundo.txt'
file = requests.get(url, timeout=30)  # a timeout keeps the cell from hanging indefinitely
file.raise_for_status()               # fail loudly rather than saving an HTTP error page

# Make sure the output directory exists before writing into it
output_path = Path('data') / 'segismundo.txt'
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the downloaded text to file
# An explicit encoding keeps the write and the read below consistent on every platform
with open(output_path, 'w', encoding='utf-8') as handle:
    handle.write(file.text)

# Get a list of lines from the input file with readlines()
with open(output_path, 'r', encoding='utf-8') as handle:
    lines = handle.readlines()

In [114]:
# Check that the file handle is closed
# The name `handle` outlives the `with` block, but the file object it refers to was closed on exit
print(f'Is the file handle closed?\n{handle.closed}')

Is the file handle closed?
True
