## Introduction to Data Science

### Introduction to Numpy, Matplotlib and Pandas

In [1]:
#import pylab
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import pathlib


%matplotlib inline
#%matplotlib notebook

In [2]:
datapath1 = pathlib.Path("../datasets/CSVs/")
datapath2 = pathlib.Path("../datasets/names/")

## [Numpy Basics](https://www.datacamp.com/community/tutorials/python-numpy-tutorial)

What Is A Python Numpy Array?  

NumPy arrays are a bit like Python lists, but still very much different at the same time. For those of you who are new to the topic, let’s clarify what it exactly is and what it’s good for.   

As the name kind of gives away, a NumPy array is a central data structure of the numpy library. The library’s name is actually short for “Numeric Python” or “Numerical Python”.  

In other words, NumPy is the core Python library for scientific computing. It contains a collection of tools and techniques for solving, on a computer, mathematical models of problems in science and engineering. One of these tools is a high-performance multidimensional array object, a powerful data structure for efficient computation on arrays and matrices. To work with these arrays, NumPy provides a large number of high-level mathematical functions that operate on them.  

Then, what is an array?  

When you look at the printed form of a couple of arrays, you can see each one as a grid that contains values of the same type. The array holds and represents any regular data in a structured way.  

However, you should know that, on a structural level, an array is basically nothing but pointers. It’s a combination of a memory address, a data type, a shape and strides:  

The data pointer indicates the memory address of the first byte in the array,  
The data type or dtype pointer describes the kind of elements that are contained within the array,  
The shape indicates the shape of the array, and
The strides are the number of bytes that should be skipped in memory to go to the next element.   

If your strides are (10,1), you need to proceed one byte to get to the next column and 10 bytes to locate the next row.  

### Creating Arrays:

In [3]:
my_numbers = [1,2,3,4]
simple_array = np.array(my_numbers)
print(simple_array)

[1 2 3 4]


In [4]:
simple_array + 34

array([35, 36, 37, 38])

In [5]:
simple_array.shape

(4,)

In [6]:
simple_array.dtype

dtype('int64')

In [7]:
simple_array.data

<memory at 0x7f122dc1e288>

In [8]:
simple_array.strides

(8,)

In [9]:
my_other_numbers = [[1,2,3],[4,5,6],[7,8,9]]
other_simple_array = np.array(my_other_numbers)
other_simple_array

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [10]:
#a = np.arange(20)
a = np.arange(1,5,0.2)
print(a)

[1.  1.2 1.4 1.6 1.8 2.  2.2 2.4 2.6 2.8 3.  3.2 3.4 3.6 3.8 4.  4.2 4.4
 4.6 4.8]


In [11]:
b = np.linspace(1,10,30)
#b = np.linspace(1,2*np.pi,50)
print(b)

[ 1.          1.31034483  1.62068966  1.93103448  2.24137931  2.55172414
  2.86206897  3.17241379  3.48275862  3.79310345  4.10344828  4.4137931
  4.72413793  5.03448276  5.34482759  5.65517241  5.96551724  6.27586207
  6.5862069   6.89655172  7.20689655  7.51724138  7.82758621  8.13793103
  8.44827586  8.75862069  9.06896552  9.37931034  9.68965517 10.        ]


In [12]:
b2 = np.logspace(1,100,30)
print(b2)

[1.00000000e+001 2.59294380e+004 6.72335754e+007 1.74332882e+011
 4.52035366e+014 1.17210230e+018 3.03919538e+021 7.88046282e+024
 2.04335972e+028 5.29831691e+031 1.37382380e+035 3.56224789e+038
 9.23670857e+041 2.39502662e+045 6.21016942e+048 1.61026203e+052
 4.17531894e+055 1.08263673e+059 2.80721620e+062 7.27895384e+065
 1.88739182e+069 4.89390092e+072 1.26896100e+076 3.29034456e+079
 8.53167852e+082 2.21221629e+086 5.73615251e+089 1.48735211e+093
 3.85662042e+096 1.00000000e+100]


In [13]:
a1 = np.zeros((3,4))
print(a1)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [14]:
a2 = np.ones((2,2))
print(a2)

[[1. 1.]
 [1. 1.]]


In [15]:
a3 = np.empty((2,3))
print(a3)

[[1.66294117e-316 0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000]]


In [16]:
a4 = np.identity(3)
print(a4)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [17]:
a5 = np.eye(2)        
print(a5)

[[1. 0.]
 [0. 1.]]


In [18]:
a6 = np.full((2,2), 7)
print(a6)

[[7 7]
 [7 7]]


In [19]:
a7 = np.random.random((2,2))  
print(a7)

[[0.9279452  0.32252334]
 [0.47987193 0.41123597]]


#### Modifying Dimensions:

In [20]:
c = np.arange(100)
print(c)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]


In [21]:
d = c.reshape(4,25)
print(d)

[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  24]
 [25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
  49]
 [50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
  74]
 [75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
  99]]


In [22]:
d = np.arange(100).reshape(4,25)
print(d)

[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  24]
 [25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
  49]
 [50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
  74]
 [75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
  99]]


In [23]:
print(c.shape)
print(d.shape)
print(np.ndim(d))
print(d.dtype.name)

(100,)
(4, 25)
2
int64


In [24]:
d2 = np.arange(100).reshape(2,10,5)
d2

array([[[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44],
        [45, 46, 47, 48, 49]],

       [[50, 51, 52, 53, 54],
        [55, 56, 57, 58, 59],
        [60, 61, 62, 63, 64],
        [65, 66, 67, 68, 69],
        [70, 71, 72, 73, 74],
        [75, 76, 77, 78, 79],
        [80, 81, 82, 83, 84],
        [85, 86, 87, 88, 89],
        [90, 91, 92, 93, 94],
        [95, 96, 97, 98, 99]]])

In [25]:
x = np.arange(12).reshape((3,4))
print(x)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [26]:
print(x.shape)

(3, 4)


In [27]:
# Resize `x` to ((6,4))
y = np.resize(x, (6,4))
print(x)
print()
print(y)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


### Slicing multidimensional arrays

In [28]:
d = np.arange(40).reshape(4,10)
print(d)

[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]
 [20 21 22 23 24 25 26 27 28 29]
 [30 31 32 33 34 35 36 37 38 39]]


In [29]:
d[:,:2]

array([[ 0,  1],
       [10, 11],
       [20, 21],
       [30, 31]])

In [30]:
d.shape

(4, 10)

In [31]:
d[d%2==0]

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
       34, 36, 38])

In [32]:
# Negation of the condition: the parentheses are required. Without them
# `~d%2==0` parses as `((~d) % 2) == 0` (bitwise NOT of the values, not of the
# mask), which only happens to pick the odd numbers for integer input.
d[~(d % 2 == 0)]

array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33,
       35, 37, 39])

In [33]:
# Create a new array from which we will select elements
a = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
print(a)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [34]:
# Create an array of indices
b = np.array([0, 2, 0, 1])
print(b)

[0 2 0 1]


In [35]:
# Select one element from each row of a using the indices in b
print(a[np.arange(4), b])

[ 1  6  7 11]


In [36]:
# Mutate one element from each row of a using the indices in b
a[np.arange(4), b] += 10
print(a)

[[11  2  3]
 [ 4  5 16]
 [17  8  9]
 [10 21 12]]


In [37]:
# Boolean array indexing
bool_idx = (d > 5)
print(bool_idx)

[[False False False False False False  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True]]


In [38]:
print(d[bool_idx])

[ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 30 31 32 33 34 35 36 37 38 39]


### datatypes

In [39]:
x = np.array([1, 2])   # Let numpy choose the datatype
print(x.dtype)         # Prints "int64"

int64


In [40]:
x = np.array([1.0, 2.0])   # Let numpy choose the datatype
print(x.dtype)             # Prints "float64"

float64


In [41]:
x = np.array([1, 2], dtype=np.float64)   # Force a particular datatype
print(x.dtype)                         # Prints "float64" (the forced dtype, even though the inputs are ints)

float64


### Inline and vectorized operations:

In [42]:
a

array([[11,  2,  3],
       [ 4,  5, 16],
       [17,  8,  9],
       [10, 21, 12]])

In [43]:
a * 2

array([[22,  4,  6],
       [ 8, 10, 32],
       [34, 16, 18],
       [20, 42, 24]])

In [44]:
# the original array stays the same
a

array([[11,  2,  3],
       [ 4,  5, 16],
       [17,  8,  9],
       [10, 21, 12]])

In [45]:
a.cumsum()

array([ 11,  13,  16,  20,  25,  41,  58,  66,  75,  85, 106, 118])

In [46]:
a = np.arange(16).reshape(4,4)
np.vstack([a,np.arange(4).reshape(1,4)])

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [ 0,  1,  2,  3]])

In [47]:
np.hstack([a,np.arange(4).reshape(4,1)])

array([[ 0,  1,  2,  3,  0],
       [ 4,  5,  6,  7,  1],
       [ 8,  9, 10, 11,  2],
       [12, 13, 14, 15,  3]])

### Array Math

In [48]:
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

# Elementwise sum; both produce the array
# [[ 6.0  8.0]
#  [10.0 12.0]]
print(x + y)
print(np.add(x, y))

[[ 6.  8.]
 [10. 12.]]
[[ 6.  8.]
 [10. 12.]]


In [49]:
# Elementwise difference; both produce the array
# [[-4.0 -4.0]
#  [-4.0 -4.0]]
print(x - y)
print(np.subtract(x, y))

[[-4. -4.]
 [-4. -4.]]
[[-4. -4.]
 [-4. -4.]]


In [50]:
# Elementwise product; both produce the array
# [[ 5.0 12.0]
#  [21.0 32.0]]
print(x * y)
print(np.multiply(x, y))

[[ 5. 12.]
 [21. 32.]]
[[ 5. 12.]
 [21. 32.]]


In [51]:
# Elementwise division; both produce the array
# [[ 0.2         0.33333333]
#  [ 0.42857143  0.5       ]]
print(x / y)
print(np.divide(x, y))

[[0.2        0.33333333]
 [0.42857143 0.5       ]]
[[0.2        0.33333333]
 [0.42857143 0.5       ]]


In [52]:
# Elementwise square root; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
print(np.sqrt(x))

[[1.         1.41421356]
 [1.73205081 2.        ]]


In [53]:
x = np.array([[1,2],[3,4]])
y = np.array([[5,6],[7,8]])
v = np.array([9,10])
w = np.array([11, 12])

In [54]:
print(x)
print()
print(y)
print()
print(v)
print()
print(w)

[[1 2]
 [3 4]]

[[5 6]
 [7 8]]

[ 9 10]

[11 12]


In [55]:
# Inner product of vectors; both produce 219
print(v.dot(w))
print(np.dot(v, w))

219
219


In [56]:
# Matrix / vector product; both produce the rank 1 array [29 67]
print(x.dot(v))
print(np.dot(x, v))

[29 67]
[29 67]


In [57]:
# Matrix / matrix product; both produce the rank 2 array
print(x.dot(y))
print(np.dot(x, y))

[[19 22]
 [43 50]]
[[19 22]
 [43 50]]


In [58]:
x = np.array([[1,2],[3,4]])

print(np.sum(x))  # Compute sum of all elements; prints "10"
print(np.sum(x, axis=0))  # Compute sum of each column; prints "[4 6]"
print(np.sum(x, axis=1))  # Compute sum of each row; prints "[3 7]"

10
[4 6]
[3 7]


In [59]:
x = np.array([[1,2], [3,4]])

In [60]:
print(x)
print()
print(x.T)

[[1 2]
 [3 4]]

[[1 3]
 [2 4]]


In [61]:
# Note that taking the transpose of a rank 1 array does nothing:
v = np.array([1,2,3])

In [62]:
print(v)
print()
print(v.T)

[1 2 3]

[1 2 3]


In [63]:
# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = np.empty_like(x)   # Create an empty matrix with the same shape as x

In [64]:
print(x)
print()
print(v)
print()
print(y)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]

[1 0 1]

[[31239136        0        0]
 [       0        0        0]
 [       0        0        0]
 [       0        0        0]]


In [65]:
for i in range(4):
    y[i, :] = x[i, :] + v
print(y)

[[ 2  2  4]
 [ 5  5  7]
 [ 8  8 10]
 [11 11 13]]


This works; however when the matrix x is very large, computing an explicit loop in Python could be slow. Note that adding the vector v to each row of the matrix x is equivalent to forming a matrix vv by stacking multiple copies of v vertically, then performing elementwise summation of x and vv. We could implement this approach like this:

In [66]:
# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
vv = np.tile(v, (4, 1))   # Stack 4 copies of v on top of each other
print(vv)

[[1 0 1]
 [1 0 1]
 [1 0 1]
 [1 0 1]]


In [67]:
y = x + vv  # Add x and vv elementwise
print(y)

[[ 2  2  4]
 [ 5  5  7]
 [ 8  8 10]
 [11 11 13]]


Numpy broadcasting allows us to perform this computation without actually creating multiple copies of v. Consider this version, using broadcasting:

In [68]:
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = x + v  # Add v to each row of x using broadcasting
print(y)

[[ 2  2  4]
 [ 5  5  7]
 [ 8  8 10]
 [11 11 13]]


The line y = x + v works even though x has shape (4, 3) and v has shape (3,) due to broadcasting; this line works as if v actually had shape (4, 3), where each row was a copy of v, and the sum was performed elementwise.

In [69]:
# Initialize `x` and `y`
x = np.ones((3,4))
y = np.random.random((5,1,4))

# Add `x` and `y`
print(x + y)

[[[1.23248504 1.95918432 1.84895205 1.81771837]
  [1.23248504 1.95918432 1.84895205 1.81771837]
  [1.23248504 1.95918432 1.84895205 1.81771837]]

 [[1.70456914 1.43477429 1.00795803 1.07195029]
  [1.70456914 1.43477429 1.00795803 1.07195029]
  [1.70456914 1.43477429 1.00795803 1.07195029]]

 [[1.07219036 1.59947285 1.58043802 1.4336534 ]
  [1.07219036 1.59947285 1.58043802 1.4336534 ]
  [1.07219036 1.59947285 1.58043802 1.4336534 ]]

 [[1.14610074 1.50729495 1.03512513 1.43532481]
  [1.14610074 1.50729495 1.03512513 1.43532481]
  [1.14610074 1.50729495 1.03512513 1.43532481]]

 [[1.34176032 1.18601683 1.56663408 1.49674229]
  [1.34176032 1.18601683 1.56663408 1.49674229]
  [1.34176032 1.18601683 1.56663408 1.49674229]]]


You see that, even though x and y seem to have somewhat different dimensions, the two can be added together.  
That is because they are compatible in all dimensions:

    Array x has dimensions 3 X 4,
    Array y has dimensions 5 X 1 X 4

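Two dimensions are compatible when they are equal or when one of them is 1; shapes are compared starting from the trailing dimension, and a missing leading dimension counts as 1. Here 4 matches 4, 3 is paired with 1, and the missing leading dimension of x is paired with 5, so you see that these two arrays are indeed good candidates for broadcasting!  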

What you will notice is that in the dimension where y has size 1 and x has a size greater than 1 (that is, 3), y behaves as if it were copied along that dimension. Likewise, x has no leading dimension, so it behaves as if it were copied 5 times along the first dimension of y.  

Note that the shape of the resulting array will again be the maximum size along each dimension of x and y: the dimension of the result will be (5,3,4)  

In short, if you want to make use of broadcasting, you will rely a lot on the shape and dimensions of the arrays with which you’re working.  

# Pandas

### Pandas Data Structures: Series

In [70]:
obj = pd.Series([4, 7, -5, 3, 5])
obj

0    4
1    7
2   -5
3    3
4    5
dtype: int64

In [71]:
obj.values

array([ 4,  7, -5,  3,  5])

In [72]:
obj.index

RangeIndex(start=0, stop=5, step=1)

In [73]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan', 'Fernie']
obj

Bob       4
Steve     7
Jeff     -5
Ryan      3
Fernie    5
dtype: int64

In [74]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [75]:
obj2['c']

3

In [76]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

In [77]:
obj2[obj2 < 0]

a   -5
dtype: int64

In [78]:
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [79]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [80]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [81]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [82]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [83]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [84]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [85]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

### Pandas Data Structures: Dataframe

In [86]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],'year': [2000, 2001, 2002, 2001, 2002],'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [87]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [88]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [89]:
frame2['nova'] = 13
frame2

Unnamed: 0,year,state,pop,debt,nova
one,2000,Ohio,1.5,,13
two,2001,Ohio,1.7,,13
three,2002,Ohio,3.6,,13
four,2001,Nevada,2.4,,13
five,2002,Nevada,2.9,,13


In [90]:
frame2.nova = 23
frame2

Unnamed: 0,year,state,pop,debt,nova
one,2000,Ohio,1.5,,23
two,2001,Ohio,1.7,,23
three,2002,Ohio,3.6,,23
four,2001,Nevada,2.4,,23
five,2002,Nevada,2.9,,23


In [91]:
frame2.columns

Index(['year', 'state', 'pop', 'debt', 'nova'], dtype='object')

In [92]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [93]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [94]:
#frame2.loc['three']
frame2.loc['three','state']

'Ohio'

In [95]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt,nova
one,2000,Ohio,1.5,16.5,23
two,2001,Ohio,1.7,16.5,23
three,2002,Ohio,3.6,16.5,23
four,2001,Nevada,2.4,16.5,23
five,2002,Nevada,2.9,16.5,23


In [96]:
frame2['debt'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt,nova
one,2000,Ohio,1.5,0.0,23
two,2001,Ohio,1.7,1.0,23
three,2002,Ohio,3.6,2.0,23
four,2001,Nevada,2.4,3.0,23
five,2002,Nevada,2.9,4.0,23


In [97]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt,nova
one,2000,Ohio,1.5,,23
two,2001,Ohio,1.7,-1.2,23
three,2002,Ohio,3.6,,23
four,2001,Nevada,2.4,-1.5,23
five,2002,Nevada,2.9,-1.7,23


In [98]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,nova,eastern
one,2000,Ohio,1.5,,23,True
two,2001,Ohio,1.7,-1.2,23,True
three,2002,Ohio,3.6,,23,True
four,2001,Nevada,2.4,-1.5,23,False
five,2002,Nevada,2.9,-1.7,23,False


In [99]:
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt', 'nova'], dtype='object')

In [100]:
transpose = frame2.pivot(index= 'year', columns='state', values='pop') 
transpose

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [101]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [102]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [103]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [104]:
pdata = {'Ohio': frame3['Ohio'][:-1],'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [105]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [106]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame4 = pd.DataFrame(pop)
frame4

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [107]:
frame4.loc[2000,'Nevada'] = 2
frame4

Unnamed: 0,Nevada,Ohio
2000,2.0,1.5
2001,2.4,1.7
2002,2.9,3.6


In [108]:
frame5 = pd.concat([frame4, frame4])
frame5

Unnamed: 0,Nevada,Ohio
2000,2.0,1.5
2001,2.4,1.7
2002,2.9,3.6
2000,2.0,1.5
2001,2.4,1.7
2002,2.9,3.6


In [109]:
frame5.drop_duplicates(['Nevada'])

Unnamed: 0,Nevada,Ohio
2000,2.0,1.5
2001,2.4,1.7
2002,2.9,3.6


In [110]:
dates = pd.date_range("20160101", periods=10)
data = np.random.random((10,3))
column_names = ['Column1', 'Column2', 'Column3']
df = pd.DataFrame(data, index=dates, columns=column_names)
df.head(10)

Unnamed: 0,Column1,Column2,Column3
2016-01-01,0.253652,0.171362,0.194025
2016-01-02,0.192441,0.665739,0.074615
2016-01-03,0.25586,0.422352,0.705201
2016-01-04,0.853399,0.034667,0.890584
2016-01-05,0.197432,0.53059,0.816618
2016-01-06,0.192501,0.123488,0.543227
2016-01-07,0.836152,0.74048,0.65462
2016-01-08,0.901779,0.427372,0.014093
2016-01-09,0.586928,0.915442,0.468451
2016-01-10,0.687208,0.303729,0.761824


In [111]:
df[1:3]

Unnamed: 0,Column1,Column2,Column3
2016-01-02,0.192441,0.665739,0.074615
2016-01-03,0.25586,0.422352,0.705201


In [112]:
df['20160104':'20160107']

Unnamed: 0,Column1,Column2,Column3
2016-01-04,0.853399,0.034667,0.890584
2016-01-05,0.197432,0.53059,0.816618
2016-01-06,0.192501,0.123488,0.543227
2016-01-07,0.836152,0.74048,0.65462


In [113]:
df.loc['20160101':'20160102',['Column1','Column3']]

Unnamed: 0,Column1,Column3
2016-01-01,0.253652,0.194025
2016-01-02,0.192441,0.074615


In [114]:
df.iloc[3:5, 0:2]

Unnamed: 0,Column1,Column2
2016-01-04,0.853399,0.034667
2016-01-05,0.197432,0.53059


In [115]:
df.describe()

Unnamed: 0,Column1,Column2,Column3
count,10.0,10.0,10.0
mean,0.495735,0.433522,0.512326
std,0.306,0.284948,0.316121
min,0.192441,0.034667,0.014093
25%,0.211487,0.204454,0.262631
50%,0.421394,0.424862,0.598923
75%,0.798916,0.631952,0.747668
max,0.901779,0.915442,0.890584


In [116]:
df.sort_index(axis=0, ascending=False,) # inplace=True)

Unnamed: 0,Column1,Column2,Column3
2016-01-10,0.687208,0.303729,0.761824
2016-01-09,0.586928,0.915442,0.468451
2016-01-08,0.901779,0.427372,0.014093
2016-01-07,0.836152,0.74048,0.65462
2016-01-06,0.192501,0.123488,0.543227
2016-01-05,0.197432,0.53059,0.816618
2016-01-04,0.853399,0.034667,0.890584
2016-01-03,0.25586,0.422352,0.705201
2016-01-02,0.192441,0.665739,0.074615
2016-01-01,0.253652,0.171362,0.194025


In [117]:
df.sort_values(by='Column2')

Unnamed: 0,Column1,Column2,Column3
2016-01-04,0.853399,0.034667,0.890584
2016-01-06,0.192501,0.123488,0.543227
2016-01-01,0.253652,0.171362,0.194025
2016-01-10,0.687208,0.303729,0.761824
2016-01-03,0.25586,0.422352,0.705201
2016-01-08,0.901779,0.427372,0.014093
2016-01-05,0.197432,0.53059,0.816618
2016-01-02,0.192441,0.665739,0.074615
2016-01-07,0.836152,0.74048,0.65462
2016-01-09,0.586928,0.915442,0.468451


In [118]:
dates1 = pd.date_range("20160101", periods=6)
data1 = np.random.random((6,2))
column_names1 = ['ColumnA', 'ColumnB']

dates2 = pd.date_range("20160101", periods=7)
data2 = np.random.random((7,2))
column_names2 = ['ColumnC', 'ColumnD']

df1 = pd.DataFrame(data1, index=dates1, columns=column_names1)
df2 = pd.DataFrame(data2, index=dates2, columns=column_names2)

In [119]:
df1.head()

Unnamed: 0,ColumnA,ColumnB
2016-01-01,0.779924,0.718354
2016-01-02,0.283393,0.919907
2016-01-03,0.355569,0.192782
2016-01-04,0.35518,0.954687
2016-01-05,0.95173,0.594466


In [120]:
df2.head()

Unnamed: 0,ColumnC,ColumnD
2016-01-01,0.120544,0.973978
2016-01-02,0.439753,0.237376
2016-01-03,0.751467,0.185081
2016-01-04,0.554778,0.605112
2016-01-05,0.436507,0.904972


In [121]:
df1.join(df2)

Unnamed: 0,ColumnA,ColumnB,ColumnC,ColumnD
2016-01-01,0.779924,0.718354,0.120544,0.973978
2016-01-02,0.283393,0.919907,0.439753,0.237376
2016-01-03,0.355569,0.192782,0.751467,0.185081
2016-01-04,0.35518,0.954687,0.554778,0.605112
2016-01-05,0.95173,0.594466,0.436507,0.904972
2016-01-06,0.555353,0.378033,0.387914,0.187444


In [122]:
df3 = df1.join(df2)

# add a column to df to group on
df3['ProfitLoss'] = pd.Series(['Profit', 
                               'Loss', 
                               'Profit', 
                               'Profit', 
                               'Profit', 
                               'Loss', 
                               'Profit', 
                               'Profit', 
                               'Profit', 
                               'Loss'], index=dates)

In [123]:
df3.head()

Unnamed: 0,ColumnA,ColumnB,ColumnC,ColumnD,ProfitLoss
2016-01-01,0.779924,0.718354,0.120544,0.973978,Profit
2016-01-02,0.283393,0.919907,0.439753,0.237376,Loss
2016-01-03,0.355569,0.192782,0.751467,0.185081,Profit
2016-01-04,0.35518,0.954687,0.554778,0.605112,Profit
2016-01-05,0.95173,0.594466,0.436507,0.904972,Profit


In [124]:
df3.groupby('ProfitLoss').mean()

Unnamed: 0_level_0,ColumnA,ColumnB,ColumnC,ColumnD
ProfitLoss,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Loss,0.419373,0.64897,0.413834,0.21241
Profit,0.610601,0.615072,0.465824,0.667286


## An example: Querying Quandl for BTC series

In [125]:
#### https://hackernoon.com/fundamental-python-data-science-libraries-a-cheatsheet-part-2-4-fcf5fab9cdf1
import quandl

# Set up the Quandl connection.
# An API key is optional here; uncomment the next two lines to supply one.

#api_key = 'GETYOURAPIKEY'
#quandl.ApiConfig.api_key = api_key
quandl_code = "BITSTAMP/USD"

# Fetch the series for the given date range from the API.
# NOTE(review): returns="numpy" presumably yields a NumPy record array whose
# fields match the column list below -- confirm against the Quandl docs.
bitcoin_data = quandl.get(quandl_code, start_date="2017-01-01", end_date="2018-01-17", returns="numpy")

# Wrap the raw data in a DataFrame, keeping only/ordering by these columns.
df = pd.DataFrame(data=bitcoin_data, columns=['Date', 'High', 'Low', 'Last', 'Bid', 'Ask', 'Volume', 'VWAP'])

# Make the 'Date' column the index (modifies df in place).
df.set_index('Date', inplace=True) 

# Mean of 'Last' over a 30-row window, then shifted down by one row so that
# each row is compared with the average of the 30 rows *before* it and its own
# price is excluded. The first 30 rows are therefore NaN.
# (30 rows == 30 days only if the series has one row per day.)
df['RollingMean'] = df['Last'].rolling(window=30).mean().shift(1)

# Flag rows where the last price is below that trailing average.
# Comparisons against NaN are False, so the warm-up rows are never flagged.
df['Buy'] = df['Last'] < df['RollingMean']

# Keep just the columns needed to inspect the signal.
trading_info = df.loc[:,['Last', 'RollingMean', 'Buy']]

trading_info.tail(10) # let's look at the last 10 rows

Unnamed: 0_level_0,Last,RollingMean,Buy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-08,16173.98,15693.421333,False
2018-01-09,15000.0,15704.147667,True
2018-01-10,14397.3,15716.680333,True
2018-01-11,14900.0,15706.590333,True
2018-01-12,13220.0,15655.209333,True
2018-01-13,13829.29,15539.209333,True
2018-01-14,14189.66,15458.548,True
2018-01-15,13648.0,15384.76,True
2018-01-16,13581.66,15258.109,True
2018-01-17,11378.66,15070.668667,True


## An example: Baby names in the USA

In [126]:
names1880 = pd.read_csv(datapath2 / 'yob1880.txt', names=['name', 'sex', 'births'])

FileNotFoundError: File b'../datasets/names/yob1880.txt' does not exist

In [None]:
names1880[0:20]

In [None]:
#names1880.head()
names1880.tail()

In [None]:
names1880.groupby('sex').births.sum()

In [None]:
# Read one births file per year (1880 through 2011), tag every row with the
# year it came from, and stack all of them into a single DataFrame.
columns = ['name', 'sex', 'births']
years = range(1880, 2012)
pieces = []
for year in years:
    path = datapath2.joinpath('yob{}.txt'.format(year))
    # The files carry no header row, so the column names are supplied here.
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)
# ignore_index=True replaces the per-file row numbers with one continuous index.
names = pd.concat(pieces, ignore_index=True)

In [None]:
names
names[:10]
#names.groupby('sex').births.sum()

In [None]:
# Total births per year, one column per sex. The aggregation is named by
# string: passing the builtin `sum` is deprecated in recent pandas.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')

In [None]:
total_births

In [None]:
total_births.tail()

In [None]:
total_births.plot(title='Total births by sex and year')

In [None]:
def add_prop(group):
    """Return a copy of ``group`` with a ``percent`` column added.

    ``percent`` is each row's share of the group's total births, i.e.
    ``births / births.sum()``, so the values within one group sum to 1.
    Despite the column name the values are fractions; they are not
    multiplied by 100.

    The frame passed in is left unmodified: ``assign`` builds a new
    DataFrame, which avoids mutating the object handed over by
    ``groupby.apply``.
    """
    # Work in float so integer birth counts produce a fractional share.
    births = group.births.astype(float)
    return group.assign(percent=births / births.sum())
# group_keys=False keeps the original row index. Otherwise recent pandas
# prepends 'year'/'sex' index levels, which clash with the columns of the same
# name and make later groupby(['year', 'sex']) calls ambiguous.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)

In [None]:
#names
names[:10]

In [None]:
names[names.name.str.startswith('Renat')]

In [None]:
np.allclose(names.groupby(['year', 'sex']).percent.sum(), 1)

In [None]:
def get_top1000(group):
    """Return the 1000 rows of ``group`` with the most births.

    Rows are ordered from highest to lowest ``births``; a group with fewer
    than 1000 rows is returned in full.
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked[:1000]
# group_keys=False stops apply() from adding 'year'/'sex' index levels on top
# of the existing columns of the same name; with both present, later
# pivot_table(index='year') and groupby(['year', 'sex']) calls are ambiguous.
grouped = names.groupby(['year', 'sex'], group_keys=False)
top1000 = grouped.apply(get_top1000)

In [None]:
#top1000
pd.options.display.float_format = '{:,.3f}'.format
top1000[:15]

In [None]:
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

In [None]:
Walter_names = boys[boys.name=='Walter']
Walter_names[:10]

In [None]:
# Births per year for every name in the top-1000 subset, one column per name.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas.
total_births_top1000 = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')

In [None]:
subset = total_births_top1000[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
title="Number of births per year")

In [None]:
# Share of all births covered by the top-1000 names, per year and sex.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas.
table = top1000.pivot_table('percent', index='year', columns='sex', aggfunc='sum')
table.plot(title='Sum of table1000.percent by year and sex', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

In [None]:
df = boys[boys.year == 2010]
prop_cumsum = df.sort_values(by='percent', ascending=False).percent.cumsum()

In [None]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``percent`` values are needed to reach ``q``.

    The rows are ordered by ``percent`` from largest to smallest and summed
    cumulatively; the result is the number of leading rows whose running
    total first reaches ``q``.
    """
    shares = group.sort_values(by='percent', ascending=False).percent
    running_total = shares.cumsum().values
    # searchsorted returns a zero-based insertion position; +1 makes it a count.
    position = running_total.searchsorted(q)
    return position + 1

#prop_cumsum.values.searchsorted(0.5)

In [None]:
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head()

In [None]:
diversity.plot(title="Number of popular names in top 50%")

In [None]:
# extract last letter from name column
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]


last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
# Births by last letter (rows) and by sex/year (columns). The aggregation is
# named by string: passing the builtin `sum` is deprecated in recent pandas.
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')
# Keep only three representative years from the 'year' column level.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

In [None]:
subtable.sum()

In [None]:
letter_prop = subtable / subtable.sum().astype(float)

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',legend=False)

In [None]:
# Proportion of births per last letter within each (sex, year) column.
letter_prop = table / table.sum().astype(float)
# Rows for the letters d, n and y among boys, transposed so that each letter
# becomes a column indexed by year. `.loc` replaces the `.ix` indexer, which
# was removed in pandas 1.0.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()

In [None]:
dny_ts.plot()

In [None]:
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like

In [None]:
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()

In [None]:
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
table = table.div(table.sum(1), axis=0)
table.tail()

In [None]:
table.plot(style={'M': 'k-', 'F': 'k--'})