# Numpy with 2D Arrays

In [139]:
import numpy as np

In [140]:
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

In [141]:
# Each block below is guarded by `if True:` so that it runs; change True to False to skip a block

# Accessing elements
if True:
    print (ridership[1, 3])  # guess - 2328 (correct!)
    print (ridership[1:3, 3:5])  # guess - explanation below
    print (ridership[1, :])  # guess - [1478, 3877, 3674, 2328, 2539] (correct!)


2328
[[2328 2539]
 [6461 2691]]
[1478 3877 3674 2328 2539]


## But why did the second command produce such a matrix?

Well, firstly, let's have a look at the matrix, and this time, let's mark the rows.
```py
[
    [   0,    0,    2,    5,    0],  # row 0
    [1478, 3877, 3674, 2328, 2539],  # row 1
    [1613, 4088, 3991, 6461, 2691],  # row 2
    [1560, 3392, 3826, 4787, 2613],  # row 3
    [1608, 4802, 3932, 4477, 2705],  # row 4
    [1576, 3933, 3909, 4979, 2685],  # row 5
    [  95,  229,  255,  496,  201],  # row 6
    [   2,    0,    1,   27,    0],  # row 7
    [1438, 3785, 3589, 4174, 2215],  # row 8
    [1342, 4043, 4009, 4665, 3033]  # row 9
]
```

In order to answer this, break the command by the parameters within the square brackets.
- Parameter 1 - `[1:3]`
- Parameter 2 - `[3:5]`

These are essentially two slices applied one after the other. The first slices the rows from 1 to 3-1 = 2, and the second slices the columns from 3 to 5-1 = 4 (the end of a slice is always excluded).

### Parameter 1 - [1:3]
`a:b` means you start at row `a` and slice your way down to row `b-1` (row `b` itself is excluded). So in this case, the slice of the original array starts at row 1 and goes on till row 3 - 1 = 2, resulting in the following array:
```py
#       0     1     2     3     4   <= columns
[
    [1478, 3877, 3674, 2328, 2539],  # row 1
    [1613, 4088, 3991, 6461, 2691]  # row 2
]
```
Note that this time, the columns are also marked, because they will be needed in the second slicing of parameter 2.

### Parameter 2 - [3:5]
This time, instead of slicing rows, slice the columns (that's what the second parameter is for). So we begin at column 3 and slice up to column 5-1 = 4, resulting in the following final matrix.
```py
[
    [2328, 2539],
    [6461, 2691]
]
```

In [142]:
# Vectorized operations on rows or columns
if True:
    print (ridership[0, :] + ridership[1, :])  # guess - [0, 0, 2, 5, 0] + [1478, 3877, 3674, 2328, 2539] = [1478, 3877, 3676, 2333, 2539]
    print (ridership[:, 0] + ridership[:, 1])  # guess - column 0 + column 1


[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]


In [143]:
# Vectorized operations on entire arrays
if True:
    a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
    print (a + b)  # guess - normal matrix addition

[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]


In [144]:
ridership

array([[   0,    0,    2,    5,    0],
       [1478, 3877, 3674, 2328, 2539],
       [1613, 4088, 3991, 6461, 2691],
       [1560, 3392, 3826, 4787, 2613],
       [1608, 4802, 3932, 4477, 2705],
       [1576, 3933, 3909, 4979, 2685],
       [  95,  229,  255,  496,  201],
       [   2,    0,    1,   27,    0],
       [1438, 3785, 3589, 4174, 2215],
       [1342, 4043, 4009, 4665, 3033]])

In [145]:
def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station against ridership overall.

    ridership is a 2D NumPy array indexed as [day, station]. The station
    with the most riders on the first day (row 0) is located with argmax.

    Returns a tuple (overall_mean, mean_for_max):
      - overall_mean: mean of every entry in the array, for comparison
      - mean_for_max: mean riders per day for that first-day busiest station
    '''
    first_day = ridership[0, :]
    busiest_station = np.argmax(first_day)  # column position of the maximum
    station_column = ridership[:, busiest_station]

    return (ridership.mean(), station_column.mean())

In [146]:
mean_riders_for_max_station(ridership)

(2342.6, 3239.9)

## Numpy axes

Most numpy methods take an `axis` parameter.

For a two-dimensional array, `axis` can take a value of 0 or 1. With `axis=0` the operation runs down each column (collapsing the rows and giving one result per column); with `axis=1` it runs along each row (giving one result per row).

|`axis` | Operation|
|-------|----------|
|1 | along each row (one result per row)|
|0 | down each column (one result per column)|


In [147]:
# This block is guarded by `if True:` so that it runs; change True to False to skip it

# NumPy axis argument
if True:
    a = np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])
    
    print (a.sum())  # guess: 45
    print (a.sum(axis=0))  # guess: [1+4+7, 2+5+8, 3+6+9] = [12, 15, 18]
    print (a.sum(axis=1))  # guess: [1+2+3, 4+5+6, 7+8+9] = [6, 15, 24]

45
[12 15 18]
[ 6 15 24]


In [148]:
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def min_and_max_riders_per_day(ridership):
    '''
    Find the highest and lowest per-station mean ridership.

    ridership is indexed as [day, station]; taking the mean along axis 0
    yields one mean-riders-per-day value for each station.

    Returns a tuple (max_daily_ridership, min_daily_ridership). Note the
    order: despite the function's name, the maximum comes first.
    '''
    per_station_mean = ridership.mean(axis=0)

    return (per_station_mean.max(), per_station_mean.min())

In [149]:
min_and_max_riders_per_day(ridership)

(3239.9, 1071.2)

# Pandas DataFrames

In [150]:
import pandas as pd

In [151]:
# DataFrame creation

# You can create a DataFrame out of a dictionary mapping column names to values
df_1 = pd.DataFrame({
    'A': [0, 1, 2],
    'B': [3, 4, 5]
})
print(df_1)


   A  B
0  0  3
1  1  4
2  2  5


In [152]:
# You can also use a list of lists or a 2D NumPy array
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]])
print(df_2)

   0  1  2
0  0  1  2
1  3  4  5


In [153]:
# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
    data=[
        [   0,    0,    2,    5,    0],
        [1478, 3877, 3674, 2328, 2539],
        [1613, 4088, 3991, 6461, 2691],
        [1560, 3392, 3826, 4787, 2613],
        [1608, 4802, 3932, 4477, 2705],
        [1576, 3933, 3909, 4979, 2685],
        [  95,  229,  255,  496,  201],
        [   2,    0,    1,   27,    0],
        [1438, 3785, 3589, 4174, 2215],
        [1342, 4043, 4009, 4665, 3033]
    ],
    # indices will be the rows
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
ridership_df

Unnamed: 0,R003,R004,R005,R006,R007
05-01-11,0,0,2,5,0
05-02-11,1478,3877,3674,2328,2539
05-03-11,1613,4088,3991,6461,2691
05-04-11,1560,3392,3826,4787,2613
05-05-11,1608,4802,3932,4477,2705
05-06-11,1576,3933,3909,4979,2685
05-07-11,95,229,255,496,201
05-08-11,2,0,1,27,0
05-09-11,1438,3785,3589,4174,2215
05-10-11,1342,4043,4009,4665,3033


In [154]:
# Accessing elements

# Remember -
# the index is the key (label) used to access an element,
# and the position is the integer offset at which the element is located

print(ridership_df.iloc[0]) # gets the 0th row (position)

R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64


In [155]:
print(ridership_df.loc['05-05-11']) # gets the row with the given row name (index)

R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64


In [156]:
ridership_df['R003'] # gets the column with the column name

05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64

In [157]:
ridership_df.iloc[1, 3] # gets the element at row 1 and column 3 (position)

2328

Therefore, remember this:
- Both `loc` and `iloc` are used to access elements in a Pandas DataFrame
- `loc` takes the _index_ parameter, which is the key or the row name
- `iloc` takes the _position_ parameter, which is the integer or slice giving the location in the array.

In [158]:
# Accessing multiple rows
ridership_df.iloc[1:4] # gets rows 1, 2, and 3

Unnamed: 0,R003,R004,R005,R006,R007
05-02-11,1478,3877,3674,2328,2539
05-03-11,1613,4088,3991,6461,2691
05-04-11,1560,3392,3826,4787,2613


In [159]:
# Accessing multiple columns
ridership_df[['R003', 'R005']]

Unnamed: 0,R003,R005
05-01-11,0,2
05-02-11,1478,3674
05-03-11,1613,3991
05-04-11,1560,3826
05-05-11,1608,3932
05-06-11,1576,3909
05-07-11,95,255
05-08-11,2,1
05-09-11,1438,3589
05-10-11,1342,4009


## Pandas Axes

In [160]:
df = pd.DataFrame(df_1)
df

Unnamed: 0,A,B
0,0,3
1,1,4
2,2,5


In [161]:
df.sum()  # prints the sum per column (axis = 0 by default)

A     3
B    12
dtype: int64

In [162]:
df.sum(axis=1)  # prints the sum per row

0    3
1    5
2    7
dtype: int64

In [163]:
df.values  # the DataFrame's data as a 2D NumPy array; used below to sum all the numbers at once

array([[0, 3],
       [1, 4],
       [2, 5]], dtype=int64)

In [164]:
df.values.sum()

15

In [165]:
ridership_df

Unnamed: 0,R003,R004,R005,R006,R007
05-01-11,0,0,2,5,0
05-02-11,1478,3877,3674,2328,2539
05-03-11,1613,4088,3991,6461,2691
05-04-11,1560,3392,3826,4787,2613
05-05-11,1608,4802,3932,4477,2705
05-06-11,1576,3933,3909,4979,2685
05-07-11,95,229,255,496,201
05-08-11,2,0,1,27,0
05-09-11,1438,3785,3589,4174,2215
05-10-11,1342,4043,4009,4665,3033


In [166]:
def mean_riders_for_max_station_pd(ridership):
    '''
    DataFrame counterpart of mean_riders_for_max_station.

    ridership is a Pandas DataFrame with one row per day and one column per
    station. The station with the most riders on the first day (row at
    position 0) is located with idxmax.

    Returns a tuple (overall_mean, mean_for_max):
      - overall_mean: mean of every value in the frame, for comparison
      - mean_for_max: mean riders per day for that first-day busiest station
    '''
    first_day = ridership.iloc[0]
    busiest_station = first_day.idxmax()  # a column label, not a position
    station_mean = ridership[busiest_station].mean()

    # .values exposes the whole frame as one 2D array so a single mean()
    # covers every cell rather than producing one mean per column.
    return (ridership.values.mean(), station_mean)

In [167]:
mean_riders_for_max_station_pd(ridership_df)

(2342.6, 3239.9)

## CSVs with Pandas DataFrames

Pandas DataFrames are particularly suitable for CSV files. DataFrames are 2D structures, just like CSV sheets, and they can have a different name for each column, also like CSVs.

In [168]:
subway_df = pd.read_csv('./nyc_subway_weather.csv')

In [172]:
def correlation(x, y):
    '''
    Compute the correlation (Pearson's r) between x and y.

    Each input is either a NumPy array or a Pandas Series.

    correlation = average of (x in standard units) times (y in standard units)

    This is evaluated as the population covariance divided by the product
    of the two population standard deviations. ddof=0 is passed to std()
    explicitly so that a Pandas Series also uses the population form.
    '''
    x_dev = x - x.mean()
    y_dev = y - y.mean()

    # Element-wise product followed by sum(): the arrays/Series are
    # vectorized, so no explicit loop over (xi, yi) pairs is needed.
    covariance = (x_dev * y_dev).sum() / len(x)
    spread = x.std(ddof=0) * y.std(ddof=0)

    return covariance / spread

In [173]:
# Pull out the columns to be compared. Each is a Series taken from subway_df.
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

# A stray `entries.std(ddof=0)` used to sit here; its value was neither
# displayed nor used, so it has been removed.
print(correlation(entries, rain))


0.03564851577224406


In [174]:
print (correlation(entries, temp))
print (correlation(rain, temp))

print (correlation(entries, cum_entries))

-0.026693348321570783
-0.22903432340843471
0.5858954707662075
