In [1]:
%autosave 0

Autosave disabled


# Equivalent vs. identical

In the contexts of Python,

- If two variables refer to the same object, they are identical
    - `a is b` would be `True`
- If two objects have the same values, they are equivalent

The "`is`" (***not*** the same as the comparison operator `==`) operator checks if two objects are identical.

## Equivalent mutable objects are not necessarily identical

- Identical-ness implies equivalent-ness
    - If `a is b` is true, `a == b` is always true
- Equivalent-ness does not imply identical-ness
    - If `a == b` is true, we can not be certain `a is b`

In [2]:
# These two lists are equivalent (having same elements), but not identical

a = [1, 2, 3]
b = [1, 2, 3]

a is b

False

In [3]:
a == b

True

In [4]:
# Since they are not identical, changing b does not affect a

b.append(99)
a, b

([1, 2, 3], [1, 2, 3, 99])

## identical mutable objects  -- be careful of this behavior of Python!

In [5]:
a = [1, 2, 3]
b = a
a is b

True

In [6]:
a == b

True

In [7]:
# changing b also changes a
b.append(99)
a, b

([1, 2, 3, 99], [1, 2, 3, 99])

## immutable objects

### numbers

In [8]:
a = 3
b = 3
a is b

True

In [9]:
a == b

True

### strings

In [10]:
a = 'banana'
b = 'banana'
a is b

True

In [11]:
a == b

True

### tuples

In [12]:
a = (3,5,7)
b = (3,5,7)
a == b

True

In [13]:
a is b

False

### examples

In [14]:
a = 'apple'
b = a
a = 'orange'
b

'apple'

In [15]:
a = (3,5,7)
b = a
a = (1,2,3)
b

(3, 5, 7)

# List - revisit

One of the most important Python object types

- list is mutable
- Most (not all) list methods modifies the list and returns `None`

In [16]:
# What's wrong with this statement

a = [1,2,3,4]
a = a.append(5)
# a = ?

In [17]:
# What's wrong with this function?

def myfunc1(a):
    a = a.append(-99)
    return a

## Read the documentation

In [18]:
a = []
help(a.append)

Help on built-in function append:

append(...) method of builtins.list instance
    L.append(object) -> None -- append object to end



## Deleting elements in a list

1. del
2. pop(index)
3. remove(element)

In [19]:
a = ['a', 'b', 'c', 'd']
a.pop(2)
a

['a', 'b', 'd']

In [20]:
help(a.pop)

Help on built-in function pop:

pop(...) method of builtins.list instance
    L.pop([index]) -> item -- remove and return item at index (default last).
    Raises IndexError if list is empty or index is out of range.



In [21]:
# add a few elements
a.append('x')
a

['a', 'b', 'd', 'x']

In [22]:
del a[1:3]
a

['a', 'x']

In [23]:
a.remove('a')
a

['x']

In [24]:
help(a.remove)

Help on built-in function remove:

remove(...) method of builtins.list instance
    L.remove(value) -> None -- remove first occurrence of value.
    Raises ValueError if the value is not present.



## Utility functions for lists

- len(list)
- max(list)
- min(list)
- sum(list)   # for numbers

In [25]:
a = [1,2,3,4,5,6,7,8,9]
sum(a), max(a), min(a), len(a)

(45, 9, 1, 9)

In [26]:
avg = sum(a) / len(a)
print(avg)

5.0


## Some examples

t = [1,2,3]

To append elements, use

- t.append(99)
- t = t + [99]

In [27]:
t = [1,2,3]
t = t + [99]   # or t.append(99)
t

[1, 2, 3, 99]

These are wrong:

```
t.append([99])     # valid syntax but probably want you want
t = t.append(99)   # destroys "t"!
t + [99]           # does not append
t = t + 99         # wrong: 99 is not a list
```

# String - revisit

- Strings are immutable
- Most (not all) string methods returns a new string.

## strings and lists

Some functions that convert strings to or from lists

In [28]:
# join elements of a list into a string
a = ['Today', 'Is', 'A', 'Good', 'Day']
x = ' '.join(a)
x

'Today Is A Good Day'

In [29]:
a = ['Today', 'Is', 'A', 'Good', 'Day']
y = '---'.join(a)
y

'Today---Is---A---Good---Day'

In [30]:
# split a string into a list

a = 'Today is a good day'
a = a.split()                    # by default, ' ' is the delimiter
a

['Today', 'is', 'a', 'good', 'day']

In [31]:
b = 'Today-is-a-good-day'
b = b.split('-')                 # specify '-' to be the delimiter
b

['Today', 'is', 'a', 'good', 'day']

# sqlite technique: batch insert/update

Combine data in Python containers and update/insert in one `cursor.executemany()` (instead of `cursor.execute()`)

In [32]:
import sqlite3

conn = sqlite3.connect('mydb.sqlite')
cur = conn.cursor()

cur.execute('DROP TABLE IF EXISTS mytable')
cur.execute('CREATE TABLE mytable (name TEXT, address TEXT, phone TEXT)')

# pack data in a container

data = [
        ('John', '101 Westwood, Los Angeles', '123-456'),
        ('Jane', '95 Hollywood, Los Angeles', '246-000'),
        ('Jim', '88 Pico Blvd, Los Angeles', '333-333')
    ]

# execute at once

cur.executemany('INSERT INTO mytable (name, address, phone) \
                VALUES (?,?,?)', data)

conn.commit()
conn.close()

In [33]:
# check db
! sqlite3 mydb.sqlite "select * from mytable"

John|101 Westwood, Los Angeles|123-456
Jane|95 Hollywood, Los Angeles|246-000
Jim|88 Pico Blvd, Los Angeles|333-333


In [34]:
! rm -f mydb.sqlite   # clean up

# sqlite -- accessing by column names


In [35]:
import sqlite3

conn = sqlite3.connect('data/mydb.sqlite')

conn.row_factory = sqlite3.Row        # enable access by column names

cur = conn.execute('SELECT * from mytable')
for row in cur:
    print('{:<10s} {:>10s}'.format(row['name'], row['phone']))

conn.close()

John          123-456
Jane          246-000
Jim           333-333


## Capture the query results in a list -- `fetchall()`

In [36]:
import sqlite3

conn = sqlite3.connect('data/mydb.sqlite')
cur = conn.execute('SELECT * from mytable')

x = cur.fetchall()

conn.close()
print(x)

[('John', '101 Westwood, Los Angeles', '123-456'), ('Jane', '95 Hollywood, Los Angeles', '246-000'), ('Jim', '88 Pico Blvd, Los Angeles', '333-333')]


## A word about cursor buffer

The cursor buffer is emptied after access

In [37]:
import sqlite3

conn = sqlite3.connect('data/mydb.sqlite')
cur = conn.execute('SELECT * from mytable')
x = cur.fetchall()

print("after fetchall, x =", x)

# At this point, the cursor's buffer has been emptied (by the "fetch")
# fetch again

x2 = cur.fetchall()
print('fetch again, x2 =',x2)

conn.close()

after fetchall, x = [('John', '101 Westwood, Los Angeles', '123-456'), ('Jane', '95 Hollywood, Los Angeles', '246-000'), ('Jim', '88 Pico Blvd, Los Angeles', '333-333')]
fetch again, x2 = []


# Try... except...

Capture expected/unexpected errors and prevent crash

In [38]:
! rm -f db1.sqlite
# example of inserting duplicate items

import sqlite3
conn = sqlite3.connect('db1.sqlite')
cur = conn.cursor()
cur = conn.execute('DROP TABLE IF EXISTS mytable')
cur.execute('CREATE TABLE mytable (name TEXT UNIQUE)')    # note the "UNIQUE" constraint

for x in ['John', 'Mary', 'Jane'] + ['Jack', 'Lily']:
#for x in ['John', 'Mary', 'Jane'] + ['John'] + ['Jack', 'Lily']:   # duplicate item causes error!

    cur.execute('INSERT INTO mytable (name) VALUES (?)', (x,))
    
conn.commit()

! sqlite3 db1.sqlite "select * from mytable"

John
Mary
Jane
Jack
Lily


In [39]:
!rm -f db1.sqlite

conn = sqlite3.connect('db1.sqlite')
conn = sqlite3.connect('db1.sqlite')
cur = conn.cursor()
cur = conn.execute('DROP TABLE IF EXISTS mytable')
cur.execute('CREATE TABLE mytable (name TEXT UNIQUE)') 

for x in ['John', 'Mary', 'Jane'] + ['John'] + ['Jack', 'Lily']:   # contains duplicate item

    try:      # protect the possible error by try-except
        cur.execute('INSERT INTO mytable (name) VALUES (?)', (x,))
    except:
        print("There is an insert error... ignore x = ", x)
        continue
    
conn.commit()
conn.close()

! sqlite3 db1.sqlite "select * from mytable"

There is an insert error... ignore x =  John
John
Mary
Jane
Jack
Lily


- When the "try" action fails, the control falls into "except".
- Try-except has many other uses

## Since you cannot guarantee the run time environment, it is a good idea to protect the code from possible crash by using try/except.

Another common example - trying to open a file that does not exist:

In [40]:
# What if the file does not exist

print('program starts...')

#f = open('myfile.txt', 'r')
#x = f.reads()
#f.close()

print('program continues...')

program starts...
program continues...


In [41]:
# code protected by try-except:

print('program starts...')

try:
    f = open('myfile.txt', 'r')
    x = f.reads()
    f.close()
except:
    print('file open error')
    pass

print('program continues...')

program starts...
file open error
program continues...


# XML parsing example



XML example

```xml
<person>
  <name>John</name>
  <phone type="intl">
    +1 310 123 4567
  </phone>
  <email hide="yes"/>
</person>

```

For more information about XML, see, e.g. https://en.wikipedia.org/wiki/XML and other references

In [42]:
import xml.etree.ElementTree as ET

with open('xml.xml', 'r') as f:
    data = f.read()
    
tree = ET.fromstring(data)    # read in XML as a tree structure

print('Name = ', tree.find('name').text )           # get a value
print('Attr = ', tree.find('email').get('hide') )   # get an attribute

Name =  John
Attr =  yes


## Accessing multiple nodes in XML

In [43]:
with open('xml2.xml', 'r') as f:
    data = f.read()

import xml.etree.ElementTree as ET

myxml = ET.fromstring(data)
ulist = myxml.findall('users/user')   # find all user under "users"

for x in ulist:
    print('Name = ', x.find('name').text  )
    print('id   = ', x.find('id').text  )

Name =  Marie
id   =  023
Name =  Bret
id   =  023


# JSON 

Another popular format for data exchange.

```json
{
    "glossary": {
        "title": "example glossary",
        "GlossDiv": {
            "title": "S",
            "GlossList": {
                "GlossEntry": {
                    "ID": "SGML",
                    "SortAs": "SGML",
                    "GlossTerm": "Standard Generalized Markup Language",
                    "Acronym": "SGML",
                    "Abbrev": "ISO 8879:1986",
                    "GlossDef": {
                        "para": "A meta-markup language, used to create markup languages such as DocBook.",
                        "GlossSeeAlso": ["GML", "XML"]
                    },
                    "GlossSee": "markup"
                }
            }
        }
    }
}
```

For more info, see,  http://json.org

In [44]:
import json

with open('json.json', 'r') as f:
    data = f.read()

info = json.loads(data) # load a string in JSON format into a Python object
print(info)

[{'x': '6', 'name': 'John', 'id': '015'}, {'x': '2', 'name': 'Marie', 'id': '023'}]


In [45]:
for item in info:
    print(item['name'])

John
Marie


## Google API example

In [46]:
import json
import urllib.request

u = urllib.request.urlopen('https://maps.googleapis.com/maps/api/geocode/json?' \
        + 'address=10920+Wilshire+Blvd,+Los+Angeles,+CA')
X = u.read().decode()
data = json.loads(X)

In [47]:
dist = data['routes'][0]['legs'][0]['distance']['text']
hrs  = data['routes'][0]['legs'][0]['duration']['text']
dist, hrs

KeyError: 'routes'

In [None]:
for x in data['routes'][0]['legs'][0]['steps']:
    print(x['start_location'], x['distance']['text'])

https://www.google.com/maps/dir/Las+Vegas,+Nevada/Los+Angeles,+CA/@35.0426638,-117.8116339,8z/data=!3m1!4b1!4m14!4m13!1m5!1m1!1s0x80beb782a4f57dd1:0x3accd5e6d5b379a3!2m2!1d-115.1398296!2d36.1699412!1m5!1m1!1s0x80c2c75ddc27da13:0xe22fdf6f254608f4!2m2!1d-118.2436849!2d34.0522342!3e0

# File and path

The `os` module has a lot of useful tools.

```python
import os
```

- Linux and Mac use the UNIX-style path, e.g. `/Users/your_name/Desktop`
- Windows use a different path, e.g. `C:\\Users\\your_name\\Desktop`

Python program can be made cross-platform (same code runs on both Windows and UNIX), if you use the correct function calls to construct path names. 

Using the `/` (on UNIX) or `\\` (on Windows) in path names makes your Python code non-cross-platform.


In [None]:
# the current dirctory

import os
cwd = os.getcwd()
print("current directory = ", cwd)

In [None]:
# list files

os.listdir(cwd)

## join path and file name

In [None]:
## home directory

homedir = os.path.expanduser('~')    # works on both UNIX and Windows

In [None]:
root = os.path.expanduser('~')
dir = 'Desktop'
os.path.join(root, dir)    # on Windows, the path will automatically use `\\`

In [None]:
# check if a file is a file
homedir =  os.path.expanduser('~')   
f = os.path.join(homedir, 'git', 'pyclass-2017')   # this is a directory
os.path.isfile(f) 

In [None]:
f = os.path.join(homedir, 'git', 'pyclass-2017', 'info.tex')  # this is a file
os.path.isfile(f)  

In [None]:
# [o] if a file, [x] if not

cwd = os.path.join(homedir, 'git', 'pyclass-2017')
count_f = 0
count_d = 0
for x in os.listdir(cwd):
    if os.path.isfile(x):
        print('{:<40} {:.>20}'.format(x, '[o]'))
        count_f += 1              # same as count = count + 1
    else:
        print('{:<40} {:.>20}'.format(x, '[x]'))
        count_d += 1
print("Total: {} files, {} non-files".format(count_f, count_d))

# Traverse (walk) a directory

`os.walk()` returns the (dirpath, dirnames, filenames) tuples of all (sub)directories, top-to-bottom by default.

- dirpath : the path to the directory
- dirnames: a list of subdirectories in dirpath
- filenames: a list of non-directory files in dirpath

In [None]:
import os
for (dirname, dirs, files) in os.walk('/tmp'):
    print(dirname, dirs, files)

In [None]:
# create full path from dir and filename using os.path.join

import os

cwd = os.getcwd()
fname = os.listdir(cwd)[0]
print("cwd = {}\nfname = {}\nfull path = {}".format(cwd, fname, os.path.join(cwd, fname)  ))

## file size

In [None]:
import os
cwd = os.getcwd()
f = os.path.join(cwd, 'info.pdf')
os.path.getsize(f)   # size in bytes

In [None]:
# check the non-directory file size of all subdirectories, skipping the .git directory

import os
from os.path import join, getsize

cwd = os.getcwd()
for root, dirs, files in os.walk(cwd):
    
    total_size = 0
    for name in files:
        fullpath = os.path.join(root, name)
        total_size += os.path.getsize(fullpath)

    print("{:<55}{:>3} non-dir files {:>10} bytes".format(root, len(files), total_size))

    if '.git' in dirs:       # skip the .git directory
        dirs.remove('.git')

## file time stamp

- ctime: creation time
- mtime: modification time
- atime: access time

In [None]:
import os
fn = 'json.json'
os.path.getatime(fn), os.path.getctime(fn), os.path.getmtime(fn)

In [None]:
import datetime
datetime.datetime.fromtimestamp(1501106386.011095)

In [None]:
datetime.datetime.fromtimestamp(1501106386.011095).strftime('%Y-%m-%d %H:%M')

In [None]:
os.stat(fn)

In [None]:
os.stat(fn).st_atime

## Changing directory

In [None]:
import os
cwd = os.getcwd()
print('current directory = {}'.format(cwd))
print(os.listdir(cwd)[0:5])

In [None]:
# change the current working directory

os.chdir('/tmp')
cwd = os.getcwd()
print('current directory = {}'.format(cwd))
print(os.listdir(cwd[0:5]))

## Home directory

In [None]:
os.path.expanduser('~')

In [None]:
# full path

os.chdir(os.path.join(os.path.expanduser('~'), 'git/pyclass-2017'))
os.getcwd()

# Exercise

Write a Python program to check the file sizes in your home directory and all its subdirectories. Report the size (in bytes) and the full path/name of the biggest file. To verify, manually check the sizes of your files. Does the answer from your program make sense?

The functions you may need are:

- `os.path.expanduser('~')`
- `os.path.getsize()`
- `os.path.isfile()`
- `os.walk`
- some `if` conditions
- `for`-loop

This is the output I got on my computer:
```text
biggest file = /home/schuang/VirtualBox VMs/windows-10/windows-10.vdi
size = 17950572544 bytes
```