In [None]:
# This line loads into IPython the libraries needed to generate 
# graphics in-line (figures are drawn inside the notebook).
# NOTE(review): later cells use names such as `numpy` without importing
# them, so they presumably rely on this magic having put them in scope.
%pylab inline

# Using python packages
Python code is organized in units called "packages". The central repository of public packages
is [pypi](https://pypi.python.org/)

Using a python library requires two steps:

1. Installing the packages (and their dependencies)
2. Importing the packages into the program

## installing packages
We use two package managers: pip and conda. Here we demonstrate the use of pip; conda is similar.

In [None]:
#get help on the pip command
!pip -h

In [None]:
# List the installed packages
!pip list

In [None]:
# Show information regarding a specific package
!pip show -v boto

In [None]:
# Install a package
!pip install boto

In [None]:
# upgrade the package
!pip install --upgrade boto

**Note:** You would usually run the installation in a shell, rather than in iPython. This is for various reasons, one of which is that you often need root permission to install a package, which means that you need to use the command `sudo` in front of pip. i.e.

  `sudo pip install --upgrade boto`

Doing so requires entering a password, which is hard to do inside a notebook, and is not recommended because it leaves
a trace of the password in an open file.

## Importing packages

We review the different ways to load a package. 

Note that we have already loaded pylab using the magic `%pylab inline`
so that for this notebook, these operations are unnecessary.

In [None]:
import pylab

To get the documentation about a package, put `"?"` after it

In [None]:
# a trailing "?" is IPython syntax: it displays the documentation of the object
pylab?

A package is a collection of objects (variables, classes, methods).

To get a list of the objects defined in pylab, type `tab` after the `"."`

In [None]:
# incomplete on purpose: place the cursor after the "." and press <tab> to list
# the names defined in pylab (executing the cell as it stands is a syntax error)
pylab.

To get information about a particular method, put a question mark `"?"` after the full name of the method

In [None]:
# the same "?" syntax works on a single function: use its full dotted name
pylab.plot?

In [None]:
# to import pylab but use a shorter name
import pylab as pl

In [None]:
 # call plot through the short alias created by `import pylab as pl`
 pl.plot([1,2,1,4])

In [None]:
# you can also import specific objects directly into the current namespace;
# the imported name is then used without the package prefix
from pylab import plot
plot([1,3,1,3])

In [None]:
# you can even import all objects into the current namespace, 
# but beware of name collisions: an imported name replaces any
# existing name in the notebook that is spelled the same way!
from pylab import *

## Auto Reload
Suppose you imported a package and then, at a later point, the package changed (for example, you used pip to install a new version of the package). Rerunning the `import` command in your notebook will not reload the package. By default, the only way to load the updated module into the notebook is to restart the kernel, thereby losing all variables that have been computed so far.

If you want your packages to get reloaded automatically before each cell is executed, use the following ipython extension:

In [None]:
# load the IPython extension that provides the %autoreload magic
%load_ext autoreload
# mode 2: imported modules are reloaded automatically before each cell runs
%autoreload 2

# Data Input and Output

In [None]:
# Work on a copy of survey.pl: a later cell overwrites parts of survey.tmp
!cp survey.pl survey.tmp
# To read or write to a file we need a file handle.
# The handle stays open here; the next code cell reads from it and closes it.
handle=open('survey.tmp','r+')

In [None]:
# 'r+' means that the file is opened for both reading and writing. 
# For more information we check the documentation for "open"
# (a trailing "?" displays the documentation of the object before it).
open?

In [None]:
# Scan the open file from its current position to the end and echo every
# line that contains the string 'class', prefixed by a running match index.

i=0        # number of matching lines seen so far
for line in handle:
    if 'class' not in line:
        continue
    print (i,line,end = "")
    i+=1
handle.close()

In [None]:
# Read the file line by line, print the lines that contain the string
# 'class', and record the position in the file at which each such line starts.
#
# readline() is used (rather than a `for line in handle` loop) because
# tell() cannot be called while a text file is being iterated over.
table=[]   # (match index, start position) of every detected line
i=0        # counts the detected lines
with open('survey.tmp','r+') as handle:   # closed even if an error occurs
    line='start'              # any non-empty value, so that the loop is entered
    while line != '':         # readline() returns '' only at end of file
        loc=handle.tell()     # position of the line that is about to be read
        line=handle.readline()
        if 'class' in line:
            print (i,line,end = "")
            table.append((i,loc))
            i=i+1
print(table)

In [None]:
# We can jump from place to place in the file: visit every recorded line
# start, print the line, and overwrite part of it in place.
# The with-block closes the file even if a seek or a write fails part-way.
with open('survey.tmp','r+') as handle:
    for (i,loc) in table:
        handle.seek(loc)
        line=handle.readline()
        print (i,line,end = "")

        # alter the line and write it back in.
        # Characters 7-9 are replaced by '***', so a line of at least 10
        # characters keeps its length and the text after it is not disturbed.
        line=line[:7]+'***'+line[10:]
        handle.seek(loc)
        handle.write(line)

In [None]:
# show the file after the in-place edit: the detected lines now contain '***'
!cat survey.tmp

### Random access vs. sequential access
One can access file contents randomly, but there is a large penalty in terms of latency and latency variability.
We will look into this later today.

# File formats
The files we dealt with so far are plain text files. This is the format that is most portable, because you can always open
a text file in a text editor or look at some lines using `head` or `tail`

However, there are some important down sides to using text files:

1. **Space inefficiency:** the number 12345678901234567890 requires 20 bytes of disk space to store in ascii, but only  8 bytes to store in binary format. Moreover, structured data such as digitized sound can be compressed by large factors with negligible error.
1. **Self description vs. external description:** When we store data using ascii, we need to write two programs - an encoding program and a decoding program. This is a lot of additional work. By using **self-describing** data structures we can avoid this work altogether.

In [None]:
# Number of bytes needed to store the number in binary:
# (base-2 logarithm = number of bits) divided by 8 bits per byte
from math import log
log(12345678901234567890, 2) / 8

## Binary file formats
Binary files are written using the command `write` and read using the command `read`. Binary files lack the concept of **lines**, they only have positions. 

In [None]:
# Write the 100 byte values 100..199 to a file opened in binary mode.
buffer_=bytearray(range(100,200))
with open('binaryFile','wb') as file_:
    file_.write(buffer_)

In [None]:
# printing the content of a binary file as text does not make much sense:
# the byte values 100..199 are shown as whatever characters they encode
!head binaryFile

In [None]:
file_=open('binaryFile','rb')
buffer2=bytearray(file_.read())
print(type(buffer2))
S=','.join([str(i) for i in buffer2])
print(S)
print('length as csv',len(S))
file_.close()
print('length of binary file=100 byte')
!ls -l binaryFile

## Self describing files
We will cover two types of self-describing files:

* The first are `pickle` files. This format is popular within the python environment and can store any python data structure. The encoded file is typically readable, but that is not the main feature.  
* The second are `json`. This format came out of `javascript` and is used extensively over the web as a way to communicate complex data structures. It is a format that emphasizes readability and interoperability. It is now the de-facto standard for internet communication between programs.

In [None]:
# Let's define a complex data structure: a dict that mixes strings, a tuple,
# a nested dict and a large numpy array.
import numpy   # explicit import: do not depend on %pylab having put numpy in scope
A={'first name':'Yoav',
   'last name':'Freund',
   'speaks':('English','Hebrew'),
   'Research':{'Probability':7,
               'bio-informatics':3,
               'Data Science':2.1
               },
   'longList':numpy.zeros(10000)
   }
A

In [None]:
# Lets store this data in a Pickle file
import pickle
pickle.dump(A,open('pickle.pkl','wb'))
!ls -l pickle*

### There are currently several different protocols which can be used for pickling.

* Protocol version 0 is the original ASCII protocol and is backwards compatible with earlier versions of Python.
* Protocol version 1 is the old binary format which is also compatible with earlier versions of Python.
* Protocol version 2 was introduced in Python 2.3. It provides much more efficient pickling of new-style classes.
* Protocol version 3 was introduced in Python 3.0 and cannot be read by Python 2.
* Protocol version 4 was introduced in Python 3.4.

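The optional *protocol* argument tells the pickler to use the given
protocol; supported protocols are 0, 1, 2, 3 and 4.  The default
protocol is 3, a backward-incompatible protocol designed for Python 3.
(These numbers depend on the Python version: newer versions support
additional protocols and use a higher default. Check
`pickle.HIGHEST_PROTOCOL` and `pickle.DEFAULT_PROTOCOL` on your system.)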

Specifying a negative protocol version selects the highest protocol
version supported.  The higher the protocol used, the more recent the
version of Python needed to read the pickle produced.

In [None]:
# display the documentation of pickle.dump
pickle.dump?

In [None]:
pickle.dump(A,open('pickle0.pkl','wb'),0)
pickle.dump(A,open('pickle1.pkl','wb'),1)
pickle.dump(A,open('pickle2.pkl','wb'),2)
pickle.dump(A,open('pickle3.pkl','wb'),3)
!ls -l pickle*

In [None]:
# only protocol 0 uses ascii, 1,2 generate smaller files
!head -100 pickle.pkl

## Pickling and compressing
One useful combination is to use gzip to compress the pickled file. This way you get both the self description and the space reduction benefits. The main thing you lose is the readability of the stored file.

In [None]:
import gzip
# Pickle the object with protocols 0, 1 and 2 into gzip-compressed files
# (picklez0.pkl ... picklez2.pkl). The with-block closes each gzip stream,
# which is what writes out the remaining compressed data.
for protocol in range(3):
    with gzip.open('picklez%d.pkl'%protocol,'wb') as gz_file:
        pickle.dump(A,gz_file,protocol)

In [None]:
# loading the file back; the with-block closes the gzip stream afterwards.
# (Only unpickle files you trust: loading a pickle can execute arbitrary code.)
with gzip.open('picklez2.pkl','rb') as gz_file:
    B=pickle.load(gz_file)
B

In [None]:
ls -l pickl*

## JSON
Using JSON is very similar to pickle, the results are more readable, but json cannot encode arbitrary python data types.

In [None]:
import json
A['longList']='shortList'  # json cannot deal with numpy arrays
# Store the same object as JSON and as a pickle so the two files can be
# compared; each with-block closes its file before later cells read it.
with open('json.jsn','w') as json_file:
    json.dump(A,json_file)
with open('json.pkl','wb') as pickle_file:
    pickle.dump(A,pickle_file)

In [None]:
# the JSON file is plain text, so it can be inspected directly
!cat json.jsn

In [None]:
# json is almost identical to the string generated by "print"
# (compare this output with the content of json.jsn)
print(A)

In [None]:
# Read the JSON file back. It was written in text mode, so it is opened in
# text mode here as well, and the with-block closes it afterwards.
with open('json.jsn','r') as json_file:
    B=json.load(json_file)
B

In [None]:
# compare the sizes of the JSON file and of the pickle files
!ls -l json* pick*

## Formatted printing
Sometimes we want to print variables in a nicely formatted way. For this we use the expression `"format-string" % (tuple of variables)`.

In [None]:
# A: the pairs (n, n squared) for n = 0..9
# B: the parity of each n, spelled out as a word
A=[(n,n**2) for n in range(10)]
B=[('even','odd')[n%2] for n in range(10)]
print(A,B)

To print this nicely, one line per number, we can use the following 
### Old string formatting

In [None]:
# one output line per number: the number, its parity and its square
for i in range(10):
    print("%1d is %4s, and its square is %d"%(A[i][0],B[i],A[i][1]))
#the %4s element pads the string to a width of 4 (the length of 'even'), which guarantees that the columns will be aligned

In [None]:
# same output as the loop version, built as one string and printed once
template="%1d is %4s, and its square is %d"
print ('\n'.join(template%(A[k][0],B[k],A[k][1]) for k in range(10)))

### using format( ) method

In [None]:
# {0} and {1} are replaced by the positional arguments of format(), by index
print("Sammy the {0} has a pet {1}!".format("shark", "pilot fish"))
# the indices may appear in any order in the string
print("Sammy the {1} has a pet {0}!".format("shark", "pilot fish"))
# positional fields can be mixed with keyword fields such as {pr}
print("Sammy the {0} {1} a {pr}.".format("shark", "made", pr = "pull request"))
# a format spec follows the colon: .3f prints a float with 3 decimals
print("Sammy ate {0:.3f} percent of a pizza!".format(75.765367))

In [None]:
# a number after the colon is the minimum field width:
# 4 characters for the integer, 16 characters for the string
print("Sammy has {0:4} red {1:16}!".format(5, "balloons"))

In [None]:
# By default strings are left-justified within the field, and numbers are right-justified.
# You can modify this by placing an alignment code just after the colon:
# < left-aligns the text in the field, ^ centers it, and > right-aligns it.
# Here the number is left-aligned (width 4) and the string is centered (width 16).
print("Sammy has {0:<4} red {1:^16}!".format(5, "balloons"))

In [None]:
# the table from the "old string formatting" section, written with format():
# >4s right-aligns the parity word in a field of width 4
template="{0:1d} is {1:>4s}, and its square is {2:1d}"
print ('\n'.join(template.format(A[k][0],B[k],A[k][1]) for k in range(10)))

### Fancier Output Formatting

In [None]:
# f-strings embed the expressions directly inside the string literal;
# the format spec after the colon is written the same way as for format()
print ('\n'.join([f"{A[i][0]:1d} is {B[i]:>4s}, and its square is {A[i][1]:1d}" for i in range(10)]))

For more information about formatting, see this page: https://docs.python.org/3/tutorial/inputoutput.html