# Jupyter Notebook

### Cells

Cells can be markdown or code

In [2]:
# Code cell: print a greeting (the call form of print runs on Python 2 and 3)
print('hello world')

hello world


In [3]:
def print_hello():
    '''
    Print the word hello to standard output.

    Takes no arguments and returns None. The call form of print is used
    so the cell runs unchanged on both Python 2 and Python 3.
    '''
    print('hello')


In [4]:
print_hello()

hello


And this is markdown

### Some nice things


In [5]:
print_hello?

In [6]:
print_hello??

#### Tab Completion

prin<TAB>

#### System Shell

Starting a line with ! tells Jupyter to run the rest of that line in the system shell

In [7]:
!pwd

/Users/mike/Desktop/DataExplorationExamples


#### Magic Commands

Start with % 

* %magic
* %time: times a single run of a statement
* %timeit: runs a statement many times and reports the best timing
* %run: runs python script

In [8]:
%magic

In [9]:
%time x = [i**2 for i in range(200)]

CPU times: user 74 µs, sys: 33 µs, total: 107 µs
Wall time: 87 µs


In [10]:
%timeit x = [i**2 for i in range(200)]

10000 loops, best of 3: 32.1 µs per loop


#### Extensions

There are several extensions that you can add on 

# Numpy

Basic data structure in numpy is the ndarray.

Standard import is to use

import numpy as np

### ndarray

ndarray is a multidimensional container for homogeneous data. You can create an ndarray with the array function, which turns sequence-like objects into ndarrays.

#### array function

In [11]:
import numpy as np

np.array([1,2,3,4])

array([1, 2, 3, 4])

In [12]:
np.array([[1,2],[3,4]])

array([[1, 2],
       [3, 4]])

#### Shape and dtype

In [13]:
# Inspect the element type and the dimensions of a 2x2 array
ex = np.array([[1,2],[3,4]])
print(ex.dtype)
print(ex.shape)

int64
(2, 2)


#### initialize arrays

In [17]:
# zeros and ones fill the array with a constant; empty leaves whatever
# values happen to be in memory, so its printed contents are arbitrary
print(np.zeros(4))
print(np.ones((2,5)))
print(np.empty(5))

[ 0.  0.  0.  0.]
[[ 1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.]]
[  0.00000000e+000   0.00000000e+000   2.12354999e-314   2.78136356e-309
   0.00000000e+000]


#### Homogeneous

In [18]:
tmp1 = [1, 'h', 3.0, True]
tmp2 = np.array(tmp1)

In [19]:
# Element types in the plain Python list
for i in tmp1:
    print('{} type: {}'.format(i, type(i)))

# Blank separator line; print('') prints an empty line on Python 2 and 3 alike
print('')

# Element types after the list has been converted to an ndarray
for i in tmp2:
    print('{} type: {}'.format(i, type(i)))

1 type: <type 'int'>
h type: <type 'str'>
3.0 type: <type 'float'>
True type: <type 'bool'>

1 type: <type 'numpy.string_'>
h type: <type 'numpy.string_'>
3.0 type: <type 'numpy.string_'>
True type: <type 'numpy.string_'>


### Some differences with lists

#### Elementwise operations

In [20]:
lst = [1,2]
arr = np.array(lst)

In [21]:
# list * int repeats the list
print(lst * 3)

# ndarray * int multiplies every element
print(arr * 2)

[1, 2, 1, 2, 1, 2]
[2 4]


In [22]:
# list + list concatenates
print(lst + lst)

# ndarray + ndarray adds element by element
print(arr + arr)

[1, 2, 1, 2]
[2 4]


In [23]:
try:
    print(lst * lst)
except TypeError:
    # list * list is not defined, so Python raises TypeError and nothing is
    # printed; any other exception is no longer silently swallowed
    pass

# ndarray * ndarray multiplies element by element
print(arr * arr)

[1 4]


In [48]:
# Total of all elements, then the sums along axis 0 and along axis 1
ex = np.array([[1,2],[3,4]])
print(ex)
print(ex.sum())
print(ex.sum(axis=0))
print(ex.sum(axis=1))

[[1 2]
 [3 4]]
10
[4 6]
[3 7]


#### Copies vs Views
#### Deep vs Shallow copying

In [28]:
# list(...) keeps lst a real, mutable list on Python 3 as well, where
# range() returns a lazy read-only sequence instead of a list
lst = list(range(10))
arr = np.array(lst)

In [29]:
# Show the list next to the array built from it
print(lst)
print(arr)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0 1 2 3 4 5 6 7 8 9]


In [30]:
# Slicing a list builds a new list; slicing an ndarray gives a view onto
# arr's data (despite its name, arr_copy is not an independent copy).
lst_copy = lst[2:5]
arr_copy = arr[2:5]

In [31]:
# Both slices hold the same three values at this point
print(lst_copy)
print(arr_copy)

[2, 3, 4]
[2 3 4]


In [32]:
# Writing to the list slice leaves the original list untouched
print(lst)
print(lst_copy)
lst_copy[0] = 90
print(lst)
print(lst_copy)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[2, 3, 4]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[90, 3, 4]


In [33]:
# Writing to the array slice also changes arr, because the slice is a view
print(arr)
print(arr_copy)
arr_copy[0] = 90
print(arr)
print(arr_copy)

[0 1 2 3 4 5 6 7 8 9]
[2 3 4]
[ 0  1 90  3  4  5  6  7  8  9]
[90  3  4]


In [107]:
# .copy() detaches the slice from arr, so the write below only affects arr_copy
arr = np.arange(10)
arr_copy = arr[2:5].copy()
print(arr)
print(arr_copy)
arr_copy[0] = 90
print(arr)
print(arr_copy)


[0 1 2 3 4 5 6 7 8 9]
[2 3 4]
[0 1 2 3 4 5 6 7 8 9]
[90  3  4]


#### Speed

In [36]:
lst = range(20000)

In [37]:
%timeit tmp = [i**2 for i in lst]

100 loops, best of 3: 2.93 ms per loop


In [38]:
arr = np.array(lst)

In [39]:
%timeit tmp = arr**2

The slowest run took 23.22 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 13.1 µs per loop


### Linear Algebra

#### Dot Product

In [41]:
W = np.array([[1,2],[3,4]])
x = np.array([1,2])

In [42]:
W.dot(x)

array([ 5, 11])

In [43]:
np.dot(W,x)

array([ 5, 11])

# Pandas

standard import 

import pandas as pd

* Series
* DataFrame

In [49]:
import pandas as pd

### Series

Object containing ndarray and index

In [50]:
srs = pd.Series(range(10))

In [51]:
type(srs)

pandas.core.series.Series

In [52]:
srs.index

RangeIndex(start=0, stop=10, step=1)

In [53]:
srs.dtype

dtype('int64')

In [54]:
srs.values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [55]:
type(srs.values)

numpy.ndarray

### DataFrame

Create using DataFrame function

Can use on dictionary or ndarray

In [58]:
d = {'c1':[1,2,3], 'c2':[3,4,5]}

In [59]:
pd.DataFrame(d)

Unnamed: 0,c1,c2
0,1,3
1,2,4
2,3,5


In [60]:
pd.DataFrame(np.array([[1,2,3],[3,4,5]]))

Unnamed: 0,0,1,2
0,1,2,3
1,3,4,5


In [61]:
pd.DataFrame(np.array([[1,2,3],[3,4,5]]), columns = ['c1','c2','c3'])

Unnamed: 0,c1,c2,c3
0,1,2,3
1,3,4,5


In [62]:
pd.DataFrame(np.array([[1,2,3],[3,4,5]]), index = ['r1','r2'])

Unnamed: 0,0,1,2
r1,1,2,3
r2,3,4,5


#### Read/Write

In [104]:
# Write using to_csv
pd.DataFrame(d).to_csv('data/df.csv')

In [63]:
# Read using read_csv
df = pd.read_csv('data/df.csv')

In [64]:
df

Unnamed: 0.1,Unnamed: 0,c1,c2
0,0,1,3
1,1,2,4
2,2,3,5


In [65]:
# Get a column using dictionary syntax
df['Unnamed: 0']

0    0
1    1
2    2
Name: Unnamed: 0, dtype: int64

In [66]:
# Delete using del
del df['Unnamed: 0']

In [67]:
df

Unnamed: 0,c1,c2
0,1,3
1,2,4
2,3,5


In [68]:
df.columns

Index([u'c1', u'c2'], dtype='object')

In [69]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [70]:
df.index = ['r1', 'r2', 'r3']

In [71]:
df

Unnamed: 0,c1,c2
r1,1,3
r2,2,4
r3,3,5


#### Indexing
* .loc: Label based (selects by index label, e.g. 'r1')
* .iloc: Position based (selects by integer position, e.g. 0)

In [72]:
df.loc['r1']

c1    1
c2    3
Name: r1, dtype: int64

In [73]:
df.iloc[0]

c1    1
c2    3
Name: r1, dtype: int64

#### Boolean

In [74]:
df > 3

Unnamed: 0,c1,c2
r1,False,False
r2,False,True
r3,False,True


In [75]:
df[df > 3]

Unnamed: 0,c1,c2
r1,,
r2,,4.0
r3,,5.0


In [76]:
df[[True, False ,True]]

Unnamed: 0,c1,c2
r1,1,3
r3,3,5


#### Missing values

In [79]:
df2 = df[df > 3]

In [80]:
df2

Unnamed: 0,c1,c2
r1,,
r2,,4.0
r3,,5.0


In [81]:
df2.fillna(-1)

Unnamed: 0,c1,c2
r1,-1.0,-1.0
r2,-1.0,4.0
r3,-1.0,5.0


In [82]:
df2

Unnamed: 0,c1,c2
r1,,
r2,,4.0
r3,,5.0


In [87]:
df2.fillna(df2.mean())

Unnamed: 0,c1,c2
r1,,4.5
r2,,4.0
r3,,5.0


# IMPORTANT
# Get in the habit of looking at documentation

### Dataset

In [88]:
iris = pd.read_csv('data/iris.csv')

In [89]:
iris.head()

Unnamed: 0,0,1,2,3,targets
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [90]:
iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'targets']

In [91]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,targets
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


#### Some other nice things

In [92]:
# Describe
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,targets
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,1.0
std,0.828066,0.433594,1.76442,0.763161,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


* merge: Like a SQL join
* groupby: Split rows into groups, then apply an aggregate function to each group

In [98]:
iris_means = iris.groupby(['targets']).mean()
iris_means

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.006,3.418,1.464,0.244
1,5.936,2.77,4.26,1.326
2,6.588,2.974,5.552,2.026


In [100]:
merged = pd.merge(iris, iris_means, how = 'inner', left_on=['targets'], right_index=True)

In [101]:
merged.head()

Unnamed: 0,sepal_length_x,sepal_width_x,petal_length_x,petal_width_x,targets,sepal_length_y,sepal_width_y,petal_length_y,petal_width_y
0,5.1,3.5,1.4,0.2,0,5.006,3.418,1.464,0.244
1,4.9,3.0,1.4,0.2,0,5.006,3.418,1.464,0.244
2,4.7,3.2,1.3,0.2,0,5.006,3.418,1.464,0.244
3,4.6,3.1,1.5,0.2,0,5.006,3.418,1.464,0.244
4,5.0,3.6,1.4,0.2,0,5.006,3.418,1.464,0.244


In [104]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,targets
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [105]:
# DataFrame.sort is deprecated; sort_values is its replacement and returns
# the same rows ordered by sepal_length without emitting a warning
iris.head().sort_values(['sepal_length'])


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,targets
3,4.6,3.1,1.5,0.2,0
2,4.7,3.2,1.3,0.2,0
1,4.9,3.0,1.4,0.2,0
4,5.0,3.6,1.4,0.2,0
0,5.1,3.5,1.4,0.2,0


# My Workflow

* Jupyter Notebook for scratch paper and exploratory analysis
* Take modularized and reusable code out 

In [148]:
def some__preprocessing_function(x):
    '''
    Zero-center x by subtracting its mean.

    x must provide a .mean() method and support subtraction of that
    mean (as a numpy array does). Returns the shifted values; x itself
    is not modified.
    '''
    mean_value = x.mean()
    centered = x - mean_value
    return centered

In [152]:
x = np.array([1,2,3,4])

In [153]:
some__preprocessing_function(x)

array([-1.5, -0.5,  0.5,  1.5])

Move stable functions out of the notebook into modules (.py files) and import them

In [154]:
import data_utils

In [155]:
data_utils.some__preprocessing_function(x)

array([-1.5, -0.5,  0.5,  1.5])