# Week 9 Presentation Companion

### Working with Text & Binary
___

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

### Number Representations

In [4]:
# Parse the same value from string literals written in four different bases.
int("226") # decimal (base 10) is the default
int("11100010", 2) # binary (base 2)
int("342", 8) # octal (base 8): each octal digit stands for 3 binary digits
int("E2", 16) # hexadecimal (base 16): each hex digit stands for 4 binary digits


226

226

226

226

In [5]:
# Positional expansion of binary 11100010: each digit is weighted by a power of 2.

1*2**7 + 1*2**6 + 1*2**5 + 0*2**4 + 0*2**3 + 0*2**2 + 1*2**1 + 0*2**0

# Positional expansion of hexadecimal E2: E stands for 14, and each digit is weighted by a power of 16.

14*16**1 + 2*16**0

226

226

### Character code points in ASCII.

* Each unique character code requires $\le$ 7 bits.
* The ASCII character set comprises 128 unique characters.

In [8]:
"T"                # outputs glyph
ord("T")           # outputs decimal representation of code point
bin(ord("T"))      # outputs binary representation, including the '0b' base prefix
bin(ord("T"))[2:]  # slicing off the 2-character prefix leaves only the binary digits
oct(ord("T"))[2:]  # outputs octal representation of code point
hex(ord("T"))[2:]  # outputs hexadecimal representation of code point

chr(84)            # outputs glyph, provided decimal representation of code point
"\u0054"           # outputs glyph, provided hexadecimal representation of code point (lower case \u takes 4 hex digits, i.e. 16 bits)

'T'

84

'0b1010100'

'1010100'

'124'

'54'

'T'

'T'

### Character code points in Unicode.

* Each unique character code point is a number 0-10FFFF hexadecimal (0-1,114,111 decimal).
* The Unicode character set now comprises 109,384 unique characters with room left for up to 1,114,112 unique characters.

In [5]:
"𓀀"                # outputs glyph
ord("𓀀")           # outputs decimal representation of code point
bin(ord("𓀀"))[2:]  # outputs binary representation of code point
oct(ord("𓀀"))[2:]  # outputs octal representation of code point
hex(ord("𓀀"))[2:]  # outputs hexadecimal representation of code point

chr(77824)         # outputs glyph, provided decimal representation of code point
"\U00013000"       # outputs glyph, provided hexadecimal representation of code point (upper case \U takes 8 hex digits, i.e. 32 bits)

'𓀀'

77824

'10011000000000000'

'230000'

'13000'

'𓀀'

'𓀀'

In [6]:
"\U000106A9" # the code point is valid and accepted, but the system used here has no glyph to draw for it

'𐚩'

### Character names in Unicode.

In [7]:
import unicodedata

# Look up the Unicode name of a character, given either as a glyph or as an escape sequence.
unicodedata.name("T")
unicodedata.name("\u0054")

unicodedata.name("𓀀")
unicodedata.name("\U00013000")

'LATIN CAPITAL LETTER T'

'LATIN CAPITAL LETTER T'

'EGYPTIAN HIEROGLYPH A001'

'EGYPTIAN HIEROGLYPH A001'

In [9]:
# Reverse direction: obtain the character from its Unicode name,
# either with unicodedata.lookup() or with a \N{...} escape inside a string literal.
unicodedata.lookup("LATIN CAPITAL LETTER T")
"\N{LATIN CAPITAL LETTER T}"

unicodedata.lookup("EGYPTIAN HIEROGLYPH A001")
"\N{EGYPTIAN HIEROGLYPH A001}"

'T'

'T'

'𓀀'

'𓀀'

### Python strings.

* A Python string (type str) is a sequence of Unicode code points.

In [11]:
# The literal glyphs and the \u escapes denote the same string.
"Tut"
"\u0054\u0075\u0074"

type("Tut")
type("\u0054\u0075\u0074")

# len() counts code points, so both spellings have the same length.
len("Tut")
len("\u0054\u0075\u0074")

'Tut'

'Tut'

str

str

3

3

In [13]:
# Same comparison with characters outside the 16-bit range, written with \U escapes.
"𓀀𓃒𓁶𓅂"
"\U00013000\U000130D2\U00013076\U00013142"

type("𓀀𓃒𓁶𓅂")
type("\U00013000\U000130D2\U00013076\U00013142")

# Each hieroglyph is a single code point, so it counts as one character.
len("𓀀𓃒𓁶𓅂")
len("\U00013000\U000130D2\U00013076\U00013142")

'𓀀𓃒𓁶𓅂'

'𓀀𓃒𓁶𓅂'

str

str

4

4

In [14]:
# \u and \U escapes can be mixed freely within one string literal.
"Tut𓀀𓃒𓁶𓅂"
"\u0054\u0075\u0074\U00013000\U000130D2\U00013076\U00013142"

type("Tut𓀀𓃒𓁶𓅂")
type("\u0054\u0075\u0074\U00013000\U000130D2\U00013076\U00013142")

# The length is the number of code points, regardless of how each one was written.
len("Tut𓀀𓃒𓁶𓅂")
len("\u0054\u0075\u0074\U00013000\U000130D2\U00013076\U00013142")

'Tut𓀀𓃒𓁶𓅂'

'Tut𓀀𓃒𓁶𓅂'

str

str

7

7

### Encoding/Decoding Schemes (aka Codecs).

* Strings are encoded so that they can be stored in computer memory and secondary storage.
* Use a specific encoding scheme consistently.

In [22]:
scheme = "utf-8"

s = "Tut"
e = s.encode(scheme)
# Pair each character with the 8-bit patterns of the bytes that encode it on its own.
z = [(ch, [format(byte, "08b") for byte in ch.encode(scheme)]) for ch in s]
# 8-bit patterns of every byte of the encoded string, in order.
b = [format(byte, "08b") for byte in e]

s
e
z
b

len(s) # number of characters
len(b) # number of bytes (groups of 8 bits) used to store the characters

'Tut'

b'Tut'

[('T', ['01010100']), ('u', ['01110101']), ('t', ['01110100'])]

['01010100', '01110101', '01110100']

3

3

In [23]:
scheme = "utf-8"

s = "𓀀𓃒𓁶𓅂"
e = s.encode(scheme)
# Pair each character with the 8-bit patterns of the bytes that encode it on its own.
z = [(ch, [format(byte, "08b") for byte in ch.encode(scheme)]) for ch in s]
# 8-bit patterns of every byte of the encoded string, in order.
b = [format(byte, "08b") for byte in e]

s
e
z
b

len(s) # number of characters
len(b) # number of bytes (groups of 8 bits) used to store the characters

'𓀀𓃒𓁶𓅂'

b'\xf0\x93\x80\x80\xf0\x93\x83\x92\xf0\x93\x81\xb6\xf0\x93\x85\x82'

[('𓀀', ['11110000', '10010011', '10000000', '10000000']),
 ('𓃒', ['11110000', '10010011', '10000011', '10010010']),
 ('𓁶', ['11110000', '10010011', '10000001', '10110110']),
 ('𓅂', ['11110000', '10010011', '10000101', '10000010'])]

['11110000',
 '10010011',
 '10000000',
 '10000000',
 '11110000',
 '10010011',
 '10000011',
 '10010010',
 '11110000',
 '10010011',
 '10000001',
 '10110110',
 '11110000',
 '10010011',
 '10000101',
 '10000010']

4

16

In [24]:
scheme = "utf-8"

s = "Tut𓀀𓃒𓁶𓅂"
e = s.encode(scheme)
# Pair each character with the 8-bit patterns of the bytes that encode it on its own.
z = [(ch, [format(byte, "08b") for byte in ch.encode(scheme)]) for ch in s]
# 8-bit patterns of every byte of the encoded string, in order.
b = [format(byte, "08b") for byte in e]

s
e
z
b

len(s) # number of characters
len(b) # number of bytes (groups of 8 bits) used to store the characters

'Tut𓀀𓃒𓁶𓅂'

b'Tut\xf0\x93\x80\x80\xf0\x93\x83\x92\xf0\x93\x81\xb6\xf0\x93\x85\x82'

[('T', ['01010100']),
 ('u', ['01110101']),
 ('t', ['01110100']),
 ('𓀀', ['11110000', '10010011', '10000000', '10000000']),
 ('𓃒', ['11110000', '10010011', '10000011', '10010010']),
 ('𓁶', ['11110000', '10010011', '10000001', '10110110']),
 ('𓅂', ['11110000', '10010011', '10000101', '10000010'])]

['01010100',
 '01110101',
 '01110100',
 '11110000',
 '10010011',
 '10000000',
 '10000000',
 '11110000',
 '10010011',
 '10000011',
 '10010010',
 '11110000',
 '10010011',
 '10000001',
 '10110110',
 '11110000',
 '10010011',
 '10000101',
 '10000010']

7

19

In [25]:
scheme = "ascii"

s = "Tut𓀀𓃒𓁶𓅂"
# "ignore" silently drops every character the codec cannot represent.
e = s.encode(scheme, "ignore")
# Pair each character with the 8-bit patterns of the bytes that encode it on its own.
z = [(ch, [format(byte, "08b") for byte in ch.encode(scheme, "ignore")]) for ch in s]
# 8-bit patterns of every byte of the encoded string, in order.
b = [format(byte, "08b") for byte in e]

s
e
z
b

len(s) # number of characters
len(b) # number of bytes (groups of 8 bits) used to store the characters

'Tut𓀀𓃒𓁶𓅂'

b'Tut'

[('T', ['01010100']),
 ('u', ['01110101']),
 ('t', ['01110100']),
 ('𓀀', []),
 ('𓃒', []),
 ('𓁶', []),
 ('𓅂', [])]

['01010100', '01110101', '01110100']

7

3

In [26]:
scheme = "utf-16"

s = "Tut𓀀𓃒𓁶𓅂"
e = s.encode(scheme)
# Pair each character with the 8-bit patterns of the bytes that encode it on its own.
# Each separate encode() call starts with a 2-byte byte-order mark, so it shows up in every entry.
z = [(ch, [format(byte, "08b") for byte in ch.encode(scheme)]) for ch in s]
# 8-bit patterns of every byte of the encoded string, in order (one byte-order mark at the start).
b = [format(byte, "08b") for byte in e]

s
e
z
b

len(s) # number of characters
len(b) # number of bytes (groups of 8 bits) used to store the characters

'Tut𓀀𓃒𓁶𓅂'

b'\xff\xfeT\x00u\x00t\x00\x0c\xd8\x00\xdc\x0c\xd8\xd2\xdc\x0c\xd8v\xdc\x0c\xd8B\xdd'

[('T', ['11111111', '11111110', '01010100', '00000000']),
 ('u', ['11111111', '11111110', '01110101', '00000000']),
 ('t', ['11111111', '11111110', '01110100', '00000000']),
 ('𓀀',
  ['11111111', '11111110', '00001100', '11011000', '00000000', '11011100']),
 ('𓃒',
  ['11111111', '11111110', '00001100', '11011000', '11010010', '11011100']),
 ('𓁶',
  ['11111111', '11111110', '00001100', '11011000', '01110110', '11011100']),
 ('𓅂',
  ['11111111', '11111110', '00001100', '11011000', '01000010', '11011101'])]

['11111111',
 '11111110',
 '01010100',
 '00000000',
 '01110101',
 '00000000',
 '01110100',
 '00000000',
 '00001100',
 '11011000',
 '00000000',
 '11011100',
 '00001100',
 '11011000',
 '11010010',
 '11011100',
 '00001100',
 '11011000',
 '01110110',
 '11011100',
 '00001100',
 '11011000',
 '01000010',
 '11011101']

7

24

### Encoding/Decoding gotchas.

In [27]:
s = "Tut𓀀𓃒𓁶𓅂"
e = s.encode("utf-8")
# 8-bit patterns of every byte of the encoded string, in order.
b = [format(byte, "08b") for byte in e]

s
e
b

# Decoding with the same codec that was used for encoding gives back the original string.
e.decode("utf-8")

'Tut𓀀𓃒𓁶𓅂'

b'Tut\xf0\x93\x80\x80\xf0\x93\x83\x92\xf0\x93\x81\xb6\xf0\x93\x85\x82'

['01010100',
 '01110101',
 '01110100',
 '11110000',
 '10010011',
 '10000000',
 '10000000',
 '11110000',
 '10010011',
 '10000011',
 '10010010',
 '11110000',
 '10010011',
 '10000001',
 '10110110',
 '11110000',
 '10010011',
 '10000101',
 '10000010']

'Tut𓀀𓃒𓁶𓅂'

In [28]:
s = "Tut𓀀𓃒𓁶𓅂"
e = s.encode("utf-16")
# 8-bit patterns of every byte of the encoded string, in order.
b = [format(byte, "08b") for byte in e]

s
e
b

'Tut𓀀𓃒𓁶𓅂'

b'\xff\xfeT\x00u\x00t\x00\x0c\xd8\x00\xdc\x0c\xd8\xd2\xdc\x0c\xd8v\xdc\x0c\xd8B\xdd'

['11111111',
 '11111110',
 '01010100',
 '00000000',
 '01110101',
 '00000000',
 '01110100',
 '00000000',
 '00001100',
 '11011000',
 '00000000',
 '11011100',
 '00001100',
 '11011000',
 '11010010',
 '11011100',
 '00001100',
 '11011000',
 '01110110',
 '11011100',
 '00001100',
 '11011000',
 '01000010',
 '11011101']

In [29]:
# Tries to decode UTF-16 bytes as UTF-8, which fails.
# The error is caught and printed so the notebook can still run from top to bottom.
try:
    e.decode("utf-8")
except UnicodeDecodeError as err:
    print("{}: {}".format(type(err).__name__, err))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

### Regular Expressions (RegEx)

In [31]:
import re

In [32]:
s = """\
The long and winding road that leads to your door
Will never disappear, I've seen that road before.
It always leads me here, lead me to your door.
The wild and windy night that the rain washed away
Has left a pool of tears, crying for the day.
Why leave me standing here, let me know the way.

Many times I've been alone, and many times I've cried.
Any way, you'll never know the many ways I've tried.
And still they lead me back to the long winding road.
You left me standing here a long long time ago.
Don't leave me waiting here, lead me to your door.

Yeah, yeah, yeah, yeah.
"""

In [34]:
# Search for the first occurrence of "door"

x = re.search("door", s) # match object for the first occurrence
x
x.group() # the matched text
x.start() # index of the first matched character
x.end() # index one past the last matched character

s[x.start():x.end()] # slicing s with start/end recovers the matched text

<_sre.SRE_Match object; span=(45, 49), match='door'>

'door'

45

49

'door'

In [54]:
# Search for every occurrence of "door"

x = re.findall("door", s) # list of all non-overlapping matches, in order of appearance
x

['door', 'door', 'door']

In [53]:
# Search for all words that start with wind

# Raw string so the regex escapes reach the re module unchanged ("\S" in a normal string
# literal is an invalid escape sequence). \b anchors the match at the start of a word,
# so "wind" in the middle of a word is not picked up. \S* then takes the following
# run of non-whitespace characters.
x = re.findall(r"\bwind\S*", s)
x

['winding', 'windy', 'winding']

In [55]:
# Substitute "data" for every "door"

x = re.sub("door", "data", s) # returns a new string with all occurrences replaced
print(x)

The long and winding road that leads to your data
Will never disappear, I've seen that road before.
It always leads me here, lead me to your data.
The wild and windy night that the rain washed away
Has left a pool of tears, crying for the day.
Why leave me standing here, let me know the way.

Many times I've been alone, and many times I've cried.
Any way, you'll never know the many ways I've tried.
And still they lead me back to the long winding road.
You left me standing here a long long time ago.
Don't leave me waiting here, lead me to your data.

Yeah, yeah, yeah, yeah.



In [57]:
# Remove last verse

x = re.split("\n\n", s) # split on blank lines: one list item per verse
x

print(x[0], x[1], sep="\n\n") # print only the first two verses, separated by a blank line

["The long and winding road that leads to your door\nWill never disappear, I've seen that road before.\nIt always leads me here, lead me to your door.\nThe wild and windy night that the rain washed away\nHas left a pool of tears, crying for the day.\nWhy leave me standing here, let me know the way.",
 "Many times I've been alone, and many times I've cried.\nAny way, you'll never know the many ways I've tried.\nAnd still they lead me back to the long winding road.\nYou left me standing here a long long time ago.\nDon't leave me waiting here, lead me to your door.",
 'Yeah, yeah, yeah, yeah.\n']

The long and winding road that leads to your door
Will never disappear, I've seen that road before.
It always leads me here, lead me to your door.
The wild and windy night that the rain washed away
Has left a pool of tears, crying for the day.
Why leave me standing here, let me know the way.

Many times I've been alone, and many times I've cried.
Any way, you'll never know the many ways I've tried.
And still they lead me back to the long winding road.
You left me standing here a long long time ago.
Don't leave me waiting here, lead me to your door.


In [59]:
# Remove last verse another way

# Without flags, . does not match a newline and $ matches only at the end of the string
# (or just before a final newline), so the pattern can only match the last blank line
# together with the single line that follows it.
x = re.sub("\n\n.*$", "", s)
print(x)

The long and winding road that leads to your door
Will never disappear, I've seen that road before.
It always leads me here, lead me to your door.
The wild and windy night that the rain washed away
Has left a pool of tears, crying for the day.
Why leave me standing here, let me know the way.

Many times I've been alone, and many times I've cried.
Any way, you'll never know the many ways I've tried.
And still they lead me back to the long winding road.
You left me standing here a long long time ago.
Don't leave me waiting here, lead me to your door.



In [60]:
# Move reprise to front

x = re.split("\n\n", s) # one list item per verse
print(x[2][0:-1], x[0], x[1], sep="\n\n") # [0:-1] drops the last character of the reprise (its trailing newline)

Yeah, yeah, yeah, yeah.

The long and winding road that leads to your door
Will never disappear, I've seen that road before.
It always leads me here, lead me to your door.
The wild and windy night that the rain washed away
Has left a pool of tears, crying for the day.
Why leave me standing here, let me know the way.

Many times I've been alone, and many times I've cried.
Any way, you'll never know the many ways I've tried.
And still they lead me back to the long winding road.
You left me standing here a long long time ago.
Don't leave me waiting here, lead me to your door.


In [61]:
# Move reprise to front another way

# Raw strings keep \g<2> and \g<1> intact for the re module (in a normal string literal
# "\g" is an invalid escape sequence); re itself interprets \n as a newline.
# re.DOTALL lets . match newlines too, so group 1 greedily takes everything up to the
# last blank line and group 2 takes what follows it.
x = re.sub(r"^(.*)\n\n(.*)$", r"\g<2>\n\g<1>", s, flags=re.DOTALL)
print(x)

Yeah, yeah, yeah, yeah.

The long and winding road that leads to your door
Will never disappear, I've seen that road before.
It always leads me here, lead me to your door.
The wild and windy night that the rain washed away
Has left a pool of tears, crying for the day.
Why leave me standing here, let me know the way.

Many times I've been alone, and many times I've cried.
Any way, you'll never know the many ways I've tried.
And still they lead me back to the long winding road.
You left me standing here a long long time ago.
Don't leave me waiting here, lead me to your door.


#### Compiled patterns. 

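Compiling a pattern once with `re.compile` lets you reuse the resulting pattern object, which saves work when the same pattern is applied many times.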

In [65]:
# Compile the pattern once, then call search() on the compiled pattern object.
p = re.compile("door")
x = p.search(s)

x
x.group() # the matched text
x.start() # index of the first matched character
x.end() # index one past the last matched character

s[x.start():x.end()] # slicing s with start/end recovers the matched text

<_sre.SRE_Match object; span=(45, 49), match='door'>

'door'

45

49

'door'

In [66]:
# Raw strings keep \g<2> and \g<1> intact for the re module (in a normal string literal
# "\g" is an invalid escape sequence); re itself interprets \n as a newline.
# re.DOTALL lets . match newlines too, so group 1 greedily takes everything up to the
# last blank line and group 2 takes what follows it.
p = re.compile(r"^(.*)\n\n(.*)$", flags=re.DOTALL)
x = p.sub(r"\g<2>\n\g<1>", s)
print(x)

Yeah, yeah, yeah, yeah.

The long and winding road that leads to your door
Will never disappear, I've seen that road before.
It always leads me here, lead me to your door.
The wild and windy night that the rain washed away
Has left a pool of tears, crying for the day.
Why leave me standing here, let me know the way.

Many times I've been alone, and many times I've cried.
Any way, you'll never know the many ways I've tried.
And still they lead me back to the long winding road.
You left me standing here a long long time ago.
Don't leave me waiting here, lead me to your door.


### Text Output

#### Print without formatting.

In [67]:
# print() joins its arguments with sep; sep="" removes the default space between them.
print("Answer #", 1, " to life, the universe, and everything is ", 42, ".", sep="")

Answer #1 to life, the universe, and everything is 42.


In [68]:
# Build a single string by concatenation; the numbers are converted with str() first.
print("Answer #" + str(1) + " to life, the universe, and everything is " + str(42) + ".")

Answer #1 to life, the universe, and everything is 42.


#### String using old style formatting.

In [69]:
# %d formats each value from the tuple as a decimal integer.
"Answer #%d to life, the universe, and everything is %d." % (1, 42)

'Answer #1 to life, the universe, and everything is 42.'

In [70]:
# %f formats the value as a float with six decimal places.
"Answer #%d to life, the universe, and everything is %f." % (1, 42)

'Answer #1 to life, the universe, and everything is 42.000000.'

In [71]:
# %0.2f limits the float to two decimal places.
"Answer #%d to life, the universe, and everything is %0.2f." % (1, 42)

'Answer #1 to life, the universe, and everything is 42.00.'

In [72]:
# %0.2E uses scientific notation with two decimal places and an upper-case E.
"Answer #%d to life, the universe, and everything is %0.2E." % (1, 42)

'Answer #1 to life, the universe, and everything is 4.20E+01.'

#### String using new style formatting.

In [73]:
# Each {} is filled with the next positional argument of format().
"Answer #{} to life, the universe, and everything is {}.".format(1, 42)

'Answer #1 to life, the universe, and everything is 42.'

In [74]:
# {:f} formats the value as a float with six decimal places.
"Answer #{} to life, the universe, and everything is {:f}.".format(1, 42)

'Answer #1 to life, the universe, and everything is 42.000000.'

In [75]:
# {:0.2f} limits the float to two decimal places.
"Answer #{} to life, the universe, and everything is {:0.2f}.".format(1, 42)

'Answer #1 to life, the universe, and everything is 42.00.'

In [76]:
# {:0.2E} uses scientific notation with two decimal places and an upper-case E.
"Answer #{} to life, the universe, and everything is {:0.2E}.".format(1, 42)

'Answer #1 to life, the universe, and everything is 4.20E+01.'

In [77]:
a = 42
# Named fields are filled from keyword arguments, so the argument order does not matter.
"Answer #{attempt} to life, the universe, and everything is {answer}.".format(answer=a, attempt=1)

'Answer #1 to life, the universe, and everything is 42.'

### Files

In [79]:
# Read text file - open/close form

import json

# Name the encoding explicitly instead of relying on the platform default
# (JSON files are normally UTF-8).
g = open("twitter.json", "rt", encoding="utf-8")
try:
    tweet = json.load(g)
finally:
    g.close()  # release the file handle even if parsing fails

tweet["created_at"]

'Sun Apr 03 23:48:36 +0000 2011'

In [80]:
# Read text file - with form

import json

# The with statement closes the file automatically, even if json.load raises.
# Name the encoding explicitly instead of relying on the platform default
# (JSON files are normally UTF-8).
with open("twitter.json", "r", encoding="utf-8") as f:
    tweet = json.load(f)

tweet["created_at"]

'Sun Apr 03 23:48:36 +0000 2011'

In [85]:
# Read and write binary file

# "rb"/"wb" open the files in binary mode: bytes are read and written as-is, with no
# decoding or encoding. The with statement closes each file even if the read or
# write raises.
with open("logo.jpg", "rb") as f:
    data = f.read()

len(data)
data[0:100] # inspect first 100 bytes of file content

with open("logo2.jpg", "wb") as g:
    r = g.write(data) # number of bytes written

21642

b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff\xdb\x00C\x00\x03\x02\x02\x03\x02\x02\x03\x03\x03\x03\x04\x03\x03\x04\x05\x08\x05\x05\x04\x04\x05\n\x07\x07\x06\x08\x0c\n\x0c\x0c\x0b\n\x0b\x0b\r\x0e\x12\x10\r\x0e\x11\x0e\x0b\x0b\x10\x16\x10\x11\x13\x14\x15\x15\x15\x0c\x0f\x17\x18\x16\x14\x18\x12\x14\x15\x14\xff\xdb\x00C\x01\x03\x04\x04\x05\x04\x05'

![](logo2.jpg)

In [1]:
# Read and write binary file

# The with statement closes each file even if the read or write raises.
with open("logo.jpg", "rb") as f:
    data = f.read()

# Purposely corrupt the data to see the effect: bytes 10000 up to (not including) 15000 are left out.
datax = data[0:10000] + data[15000:]

with open("logo3.jpg", "wb") as g:
    r = g.write(datax) # number of bytes written

![](logo3.jpg)

# Numpy

Study and practice using numpy arrays.