* 2D arrays are more memory efficient compared to an array of arrays, and accessing elements are different ( a[1,3] vs. a[1][3] )

In [1]:
import numpy as np

# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

print( ridership[1, 3])
print( ridership[1:3, 3:5])
print( ridership[1, :]) # 2nd row

2328
[[2328 2539]
 [6461 2691]]
[1478 3877 3674 2328 2539]


In [2]:
print (ridership[0, :] + ridership[1, :]) # vector math
print( ridership[:, 0] + ridership[:, 1])

[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]


In [3]:
# Vectorized operations on entire arrays
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
print( a + b)

[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]


In [37]:
def mean_riders_for_max_station(ridership):
    # Find station with the maximum riders from all days + return the mean riders per day for that station.
    # Return the mean ridership overall for comparison.  
    overall_mean = ridership.mean()
    
    max_col = np.unravel_index(ridership.argmax(), ridership.shape)[1]
    mean_for_max_all = ridership[:,max_col].mean()
    
    # Find station with the maximum riders from FIRST day + return the mean riders per day for that station. Also
    # Return the mean ridership overall for comparison.
    overall_mean_first_day = ridership[0,:].mean()

    mean_for_max_first = ridership[0,np.argmax(ridership[0,:])].mean()
    
    return (overall_mean, mean_for_max_all,overall_mean_first_day,mean_for_max_first)

mean_riders_for_max_station(ridership)

(2342.5999999999999, 3239.9000000000001, 1.3999999999999999, 5.0)

So the station with the most passengers on the first day had a higher overall overage over the 10 days compared to all stations.

In [40]:
def min_and_max_riders_per_day(ridership):
    # For each subway station, calculate the mean ridership per day. 
    mean_rider_per_day = ridership.mean(axis = 0)
    
    #Out of all subway stations, return the maximum and minimum mean values.
    min_daily_ridership = mean_rider_per_day.min()
    max_daily_ridership = mean_rider_per_day.max()
    
    return (max_daily_ridership, min_daily_ridership)

min_and_max_riders_per_day(ridership)

(3239.9000000000001, 1071.2)

Can see the max mean and min mean are both where one would expect them to be compared to the overall mean from above

<h1> DataFrames vs. Arrays </h1>

When values are of different types in 2D NumPy Arrays, then each is converted to a string. To keep original data types, use DataFrames

In [42]:
import pandas as pd

# Create a DataFrame out of a dictionary, mapping column names to values ( --> {'col': [value1, value2], 'col2':....})
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df_1)

   A  B
0  0  3
1  1  4
2  2  5


In [43]:
# Can also use a list of lists or a 2D NumPy array
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print( df_2)

   A  B  C
0  0  1  2
1  3  4  5


In [46]:
# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007'])

# Accessing elements
print( ridership_df.iloc[0]) # 1st row
print( ridership_df.loc['05-05-11']) # row w/ that title
print( ridership_df['R003']) # that column
print( ridership_df.iloc[1, 3]) # specific cell

R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64
R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64
05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64
2328


In [49]:
# Accessing multiple rows
print( ridership_df.iloc[1:4])
    
# Accessing multiple columns (notice double brackets)
print( ridership_df[['R003', 'R005']])

          R003  R004  R005  R006  R007
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613
          R003  R005
05-01-11     0     2
05-02-11  1478  3674
05-03-11  1613  3991
05-04-11  1560  3826
05-05-11  1608  3932
05-06-11  1576  3909
05-07-11    95   255
05-08-11     2     1
05-09-11  1438  3589
05-10-11  1342  4009


In [55]:
# Pandas axis
df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print( df.sum()) # sums rows + gives an index
print( df.sum(axis=1)) # sums cols + gives an index
print( df.values.sum()) # sum of all numeric values

A     3
B    12
dtype: int64
0    3
1    5
2    7
dtype: int64
15


In [76]:
def mean_riders_for_max_station(ridership):
    # Find station with max riders on 1st day + return mean riders per day for that station. 
    mean_for_max = ridership[[ridership.iloc[0].argmax()]].mean()
    
    # return the mean ridership overall for comparsion.
    overall_mean = ridership.values.mean()
    
    return (overall_mean, mean_for_max)

mean_riders_for_max_station(ridership_df)

(2342.5999999999999, R006    3239.9
 dtype: float64)

<h1>NYC</h1>

In [78]:
nyc = pd.read_csv('nyc_subway_weather.csv')
nyc.head()

Unnamed: 0,UNIT,DATEn,TIMEn,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,datetime,hour,day_week,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
0,R003,05-01-11,00:00:00,4388333,2911002,0.0,0.0,2011-05-01 00:00:00,0,6,...,30.22,0,55.9,3.5,0.0,30.258,55.98,7.86,40.700348,-73.887177
1,R003,05-01-11,04:00:00,4388333,2911002,0.0,0.0,2011-05-01 04:00:00,4,6,...,30.25,0,52.0,3.5,0.0,30.258,55.98,7.86,40.700348,-73.887177
2,R003,05-01-11,12:00:00,4388333,2911002,0.0,0.0,2011-05-01 12:00:00,12,6,...,30.28,0,62.1,6.9,0.0,30.258,55.98,7.86,40.700348,-73.887177
3,R003,05-01-11,16:00:00,4388333,2911002,0.0,0.0,2011-05-01 16:00:00,16,6,...,30.26,0,57.9,15.0,0.0,30.258,55.98,7.86,40.700348,-73.887177
4,R003,05-01-11,20:00:00,4388333,2911002,0.0,0.0,2011-05-01 20:00:00,20,6,...,30.28,0,52.0,10.4,0.0,30.258,55.98,7.86,40.700348,-73.887177


In [79]:
nyc.describe()

Unnamed: 0,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,hour,day_week,weekday,latitude,longitude,fog,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
count,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,...,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0
mean,28124860.0,19869930.0,1886.589955,1361.487866,10.046754,2.905719,0.714436,40.724647,-73.940364,0.009824,...,29.971096,0.224741,63.10378,6.927872,0.004618,29.971096,63.10378,6.927872,40.728555,-73.938693
std,30436070.0,20289860.0,2952.385585,2183.845409,6.938928,2.079231,0.451688,0.07165,0.059713,0.098631,...,0.137942,0.417417,8.455597,4.510178,0.016344,0.131158,6.939011,3.179832,0.06542,0.059582
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.576152,-74.073622,0.0,...,29.55,0.0,46.9,0.0,0.0,29.59,49.4,0.0,40.600204,-74.01487
25%,10397620.0,7613712.0,274.0,237.0,4.0,1.0,0.0,40.677107,-73.987342,0.0,...,29.89,0.0,57.0,4.6,0.0,29.913333,58.283333,4.816667,40.688591,-73.98513
50%,18183890.0,13316090.0,905.0,664.0,12.0,3.0,1.0,40.717241,-73.953459,0.0,...,29.96,0.0,61.0,6.9,0.0,29.958,60.95,6.166667,40.72057,-73.94915
75%,32630490.0,23937710.0,2255.0,1537.0,16.0,5.0,1.0,40.759123,-73.907733,0.0,...,30.06,0.0,69.1,9.2,0.0,30.06,67.466667,8.85,40.755226,-73.912033
max,235774600.0,149378200.0,32814.0,34828.0,20.0,6.0,1.0,40.889185,-73.755383,1.0,...,30.32,1.0,86.0,23.0,0.1575,30.293333,79.8,17.083333,40.862064,-73.694176


In [89]:
import pandas as pd

def correlation(x, y):
    # standardize each value using ddof = 0 to NOT use Bessel's correction for samples
    x_std = (x - x.mean())/x.std(ddof = 0)
    y_std = (y - y.mean())/y.std(ddof = 0)
    
    # Multiply and take average 
    #    - correlation = average of (x in standard units) times (y in standard units)
    r = (x_std*y_std).mean()
    
    return r

entries = nyc['ENTRIESn_hourly']
cum_entries = nyc['ENTRIESn']
rain = nyc['meanprecipi']
temp = nyc['meantempi']

print( correlation(entries, rain))
print (correlation(entries, temp))
print( correlation(rain, temp))
print (correlation(entries, cum_entries))

0.03564851577223041
-0.026693348321569912
-0.22903432340833663
0.5858954707662182


So all have weak correlations, and **entries** and **cumulative entries** have a moderate positive relationship

In [91]:
np.corrcoef(entries,cum_entries)

array([[ 1.        ,  0.58589547],
       [ 0.58589547,  1.        ]])