# 0217 Class Notes

## Generators

In [6]:
# Sample CSV-like text: three newline-separated rows of comma-separated integers.
s = '1,4,2\n5\n8,7,9'

In [7]:
# Eager version: print the first comma-separated field of every line as an int.
for line in s.split('\n'):
    print(int(line.split(',')[0]))

1
5
8


In [13]:

# Generators: pausable functions. Execution suspends at each `yield` and resumes on the next `next()` call.
# All generators are iterators, but not all iterators are generators.

def parse_rows(s):
    """Lazily split CSV-like text into rows of string fields.

    Every newline-separated line of ``s`` is split on commas and yielded as a
    list of strings, one row per ``next()`` call. Two trace lines are printed
    for each row (the raw line, then its first field converted to ``int``) so
    that the moment the generator actually runs is visible.
    """
    for raw_line in s.split('\n'):
        print('in parse_rows', raw_line)
        fields = raw_line.split(',')
        print(int(fields[0]))
        yield fields

def extract_first(rows):
    """Lazily yield the leading element of each row in ``rows``.

    A trace line is printed for every row as it is consumed, which makes the
    interleaving with an upstream generator visible.
    """
    for current in rows:
        print('in extract_first', current)
        head = current[0]
        yield head

In [23]:
g= parse_rows(s) # generator object: supports next(g); none of parse_rows's body has run yet

In [20]:
next(g)

in parse_rows 1,4,2
1


['1', '4', '2']

In [21]:
next(g)

in parse_rows 5
5


['5']

In [24]:
# A for loop calls next() on the generator until it is exhausted.
# NOTE: all three rows appear below because g was re-created in In [23],
# after the two next(g) calls shown above (In [20] and In [21]) were run.
for row in g:
    print(row)

in parse_rows 1,4,2
1
['1', '4', '2']
in parse_rows 5
5
['5']
in parse_rows 8,7,9
8
['8', '7', '9']


In [25]:
def parse_rows(s):
    """Generate one list of comma-separated string fields per line of ``s``.

    Repeats the definition from the earlier cell so this cell runs on its
    own. Prints the raw line and the integer value of its first field before
    yielding each row.
    """
    for raw_line in s.split('\n'):
        print('in parse_rows', raw_line)
        fields = raw_line.split(',')
        print(int(fields[0]))
        yield fields

def extract_first(rows):
    """Generate the first element of every row pulled from ``rows``.

    Repeats the definition from the earlier cell so this cell runs on its
    own. Prints each row as it is received.
    """
    for current in rows:
        print('in extract_first', current)
        head = current[0]
        yield head

# Chain the two generators. Nothing executes yet (this cell prints no trace
# output); work only happens once a value is requested from first_elements.
rows = parse_rows(s)
first_elements = extract_first(rows)

In [26]:
# Generators are executed lazily: nothing is printed until a value is actually requested.
next(first_elements)

in parse_rows 1,4,2
1
in extract_first ['1', '4', '2']


'1'

In [27]:
next(first_elements)

in parse_rows 5
5
in extract_first ['5']


'5'

In [28]:
# Many built-in functions (sum, min, max, ...) accept a generator as their argument.
rows = parse_rows(s)
first_elements = extract_first(rows)
numbers = (int(first) for first in first_elements)

In [29]:
sum(numbers)

in parse_rows 1,4,2
1
in extract_first ['1', '4', '2']
in parse_rows 5
5
in extract_first ['5']
in parse_rows 8,7,9
8
in extract_first ['8', '7', '9']


14

In [30]:
min(numbers) # fails: sum() above already consumed the generator, so there is nothing left to iterate over

ValueError: min() arg is an empty sequence

In [None]:
# Generators don't do any work until a value is requested.
# A generator function pauses at each `yield` and resumes from there on the next request!

In [34]:
import sys

In [41]:
nums_g = (i for i in range(10000))

In [42]:
sys.getsizeof(nums_g)

112

In [43]:
rows=parse_rows(s)

In [44]:
type(rows)

generator

In [45]:
sys.getsizeof(rows)

112

In [46]:
nums=list(range(10000))

In [47]:
sys.getsizeof(nums)

80056

In [48]:
rows[1] # generators cannot be indexed (and cannot be rewound): values are only produced by iterating forward

TypeError: 'generator' object is not subscriptable

In [49]:
len(rows)

TypeError: object of type 'generator' has no len()

In [50]:
# len() is unavailable, so count the items by consuming the generator instead.
sum(1 for row in rows)

in parse_rows 1,4,2
1
in parse_rows 5
5
in parse_rows 8,7,9
8


3

In [None]:
# Generators can be infinite: values are produced one at a time, on demand.

## Overview of summary statistics that we will run

### Exploring Data Set

* Look at rows of data
    1. Take a look at the first few rows
    2. Look at the last rows
    3. Take a random sample of rows

* What types of data do I have?
    1. Look at the header
    2. Look at the values in a column
        - if numeric: continuous (float) or whole (int)?
        - if categorical: discrete values? ordinal (ranked in some way)? yes/no?
 
### If you have numerical data
- range: `max` and `min`
- where is your data "centered"?
    - mean
    - median
    - mode <--- also works for categorical data
    

In [51]:
def mean(data):
    """Return the arithmetic mean of ``data``.

    Parameters
    ----------
    data : sequence of numbers
        Must support both iteration and ``len()`` (for example a list or a
        tuple). A generator will not work because it has no length.

    Returns
    -------
    The sum of the elements divided by the number of elements.

    Raises
    ------
    ZeroDivisionError
        If ``data`` is empty.
    """
    # Written as a def rather than a lambda bound to a name, so the function
    # has a real __name__ in tracebacks and can carry a docstring.
    return sum(data) / len(data)

In [52]:
# mode: count occurrences with a dict or with collections.Counter
import random

In [53]:
# Simulate 100 rolls of a six-sided die.
# `_` as the loop variable name signals that its value is intentionally unused.
rolls = [random.randint(1, 6) for _ in range(100)]

In [58]:
# Naive tally, which fails on purpose to show the missing-key problem.
d= {}
for res in rolls:
    d[res] = d[res]+1
    # Raises KeyError: d[res] does not exist the first time a value is seen.
    # Each key has to be initialized (e.g. to zero) before it can be incremented.

KeyError: 5

In [59]:
# Tally how often each roll value occurs, handling the missing key with try/except.
d= {}
for res in rolls:
    try:
        d[res]=d[res]+1
    except KeyError:
        # first occurrence of this value: start its count at 1
        d[res] =1

In [60]:
d

{5: 18, 6: 14, 4: 20, 1: 21, 2: 14, 3: 13}

In [None]:
# An alternative to try/except: dict.get with a default value

In [61]:
# Same tally without try/except: d.get(res, 0) gives 0 when res is not yet a key.
d= {}
for res in rolls:
    d[res] =d.get(res,0)+1

In [62]:
d

{5: 18, 6: 14, 4: 20, 1: 21, 2: 14, 3: 13}

### USEFUL Library

In [64]:
from collections import Counter 

In [65]:
c = Counter()

In [66]:
c['whatever']=5

In [68]:
c

Counter({'whatever': 5})

In [69]:
c = Counter(rolls)

In [70]:
c

Counter({5: 18, 6: 14, 4: 20, 1: 21, 2: 14, 3: 13})

In [71]:
c.most_common(2)

[(1, 21), (4, 20)]

In [None]:
# variance: the average of the squared differences between each data point and the mean
# standard deviation: the square root of the variance (so it is in the same units as the data)

### Useful library

In [74]:
import numpy as np

In [75]:
np.std

<function numpy.std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>)>

In [None]:
# What are the associated Python types: int, float, str, ...
# numpy
# pandas

## Numpy

In [2]:
import numpy as np

In [3]:
np.arange(9)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [5]:
arr= np.array([[1,2],[3,4],[5,6]])

In [6]:
arr.shape

(3, 2)

In [7]:
arr2= np.arange(9)

In [8]:
arr2

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [10]:
# Assigning to .shape changes the shape of arr2 itself (9 elements -> 3x3).
arr2.shape=(3,3)

In [11]:
arr2

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [13]:
arr2.shape =(9,)

In [14]:
arr2

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [15]:
arr2.reshape(3,3)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [16]:
arr2 # unchanged: reshape() returned a reshaped array and left arr2 as it was

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [17]:
arr2.dtype

dtype('int64')

In [21]:
# astype returns a new array with the requested dtype; it is bound to arr3 here.
arr3 =arr2.astype('float64')

In [22]:
arr3

array([0., 1., 2., 3., 4., 5., 6., 7., 8.])

In [23]:
arr3.reshape(3,3)

array([[0., 1., 2.],
       [3., 4., 5.],
       [6., 7., 8.]])

In [20]:
arr2

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [24]:
animals = np.array(['ant','cat','bat'])

In [26]:
animals.dtype # '<U3': little-endian Unicode strings of at most 3 characters (sized to the longest element)

dtype('<U3')

In [27]:
animals = np.array(['ant','cat','batttt'])

In [28]:
animals.dtype

dtype('<U6')

In [29]:
arr= np.array([[1,1],[2,2],[3,4],[5,6]])

In [None]:
# axis: can be confusing, but axis 0 corresponds to the first entry of the shape tuple

In [None]:
# shape (2,3) => axis 0 has length 2, axis 1 has length 3

In [None]:
# tabular data is usually represented as a 2-D array

In [None]:
# Vectorized operations 

In [34]:
arr

array([[1, 1],
       [2, 2],
       [3, 4],
       [5, 6]])

In [31]:
arr *3

array([[ 3,  3],
       [ 6,  6],
       [ 9, 12],
       [15, 18]])

In [32]:
arr >4

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

In [33]:
arr + arr

array([[ 2,  2],
       [ 4,  4],
       [ 6,  8],
       [10, 12]])