In [1]:
# setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Subway ridership for 5 stations on 10 different days.
# Layout: one row per day, one column per station (10 rows x 5 columns).
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station against the overall average.

    `ridership` is a 2D NumPy array with one row per day and one column
    per station. The station with the most riders in the first row is
    located with argmax(), which picks the first column on ties.

    Returns the tuple (overall_mean, mean_for_max): the mean of every
    entry in the array, followed by the mean riders per day for the
    selected station.
    '''
    busiest_station = ridership[0, :].argmax()
    station_column = ridership[:, busiest_station]

    return (ridership.mean(), station_column.mean(axis=0))

In [3]:
mean_riders_for_max_station(ridership)

(2342.5999999999999, 3239.9000000000001)

In [4]:
# Subway ridership for 5 stations on 10 different days.
# Layout: one row per day, one column per station (10 rows x 5 columns).
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

def min_and_max_riders_per_day(ridership):
    '''
    Report the highest and lowest per-station mean daily ridership.

    The mean ridership per day is computed for each station (each column
    of `ridership`, averaging down the rows). The largest and smallest of
    those per-station means are then returned.

    Returns the tuple (max_daily_ridership, min_daily_ridership) -- note
    that the maximum comes first.
    '''
    station_means = ridership.mean(axis=0)

    return (station_means.max(), station_means.min())

In [5]:
min_and_max_riders_per_day(ridership)

(3239.9000000000001, 1071.2)

In [6]:
# Subway ridership for 5 stations on 10 different days.
# Layout: one row per day (index = date string), one column per station
# (column label = station code).
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
    
def mean_riders_for_max_station(ridership):
    '''
    Find the station with the maximum riders on the first day, then return
    the mean riders per day for that station. Also return the mean
    ridership overall for comparison.

    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.

    Parameters
    ----------
    ridership : pandas.DataFrame
        One row per day, one column per station. Any other input the
        DataFrame constructor accepts (e.g. a 2D NumPy array) is wrapped
        in a DataFrame first, so calls that pass the raw array still work.

    Returns
    -------
    tuple
        (overall_mean, mean_for_max)
    '''
    # Use the argument rather than the module-level ridership_df, so the
    # result depends on the data that was actually passed in.
    if not isinstance(ridership, pd.DataFrame):
        ridership = pd.DataFrame(ridership)

    # idxmax() yields the column *label* of the first-day maximum.
    # Series.argmax() yields an integer position in current pandas, which
    # is not a valid key for the label-based column lookup below.
    max_first_day = ridership.iloc[0].idxmax()
    mean_for_max = ridership[max_first_day].mean()
    overall_mean = ridership.values.mean()

    return (overall_mean, mean_for_max)

In [7]:
# Pass the DataFrame built in this exercise, not the NumPy array left over
# from the earlier cells.
mean_riders_for_max_station(ridership_df)

(2342.5999999999999, 3239.9)

In [8]:
# Read the CSV at `filename` (a path relative to the current working
# directory) into subway_df; the correlation calls further down use it.
filename = './nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)

def correlation(x, y):
    '''
    Compute the correlation (Pearson's r) between the two input
    variables. Each input is either a NumPy array or a Pandas Series.

    correlation = average of (x in standard units) times (y in standard units)

    Standard deviations are taken with ddof=0 (population form); this has
    to be passed explicitly because the Pandas std() default differs.
    '''
    def to_standard_units(values):
        # z-score: distance from the mean, measured in standard deviations
        return (values - values.mean()) / values.std(ddof=0)

    # Average of the element-wise products of the two z-scores
    # (not the product of the two averages).
    paired_products = to_standard_units(x) * to_standard_units(y)

    return paired_products.mean()

# Columns of subway_df to compare pairwise with correlation().
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

# Presumably hourly entries vs. precipitation / temperature, judging by the
# column names -- TODO confirm against the dataset's documentation.
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))

# Hourly entries against the 'ENTRIESn' column.
print(correlation(entries, cum_entries))

0.03564851577223041
-0.026693348321569912
-0.22903432340833663
0.5858954707662182


In [39]:
# Cumulative entries and exits for one station for a few hours.
# One row per reading; both columns are running totals, so each value is
# at least as large as the one above it.
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})
    
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Turn cumulative entry/exit counters into per-row (hourly) counts.

    Takes a DataFrame with cumulative entries and exits (entries in the
    first column, exits in the second) and returns a DataFrame of the same
    shape holding, for every row, the increase over the previous row.

    The first row has no predecessor, so it is NaN in every column; as a
    consequence integer input columns come back as floats.
    '''
    # diff() is the built-in "row minus previous row" operation; it replaces
    # the hand-written `df - df.shift(1)`.
    return entries_and_exits.diff()

In [40]:
get_hourly_entries_and_exits(entries_and_exits)

Unnamed: 0,ENTRIESn,EXITSn
0,,
1,23.0,8.0
2,18.0,18.0
3,71.0,54.0
4,170.0,44.0
5,214.0,42.0
6,87.0,11.0
7,10.0,3.0
8,36.0,89.0
9,153.0,333.0


In [49]:
# Exam scores fixture: one row per student (index = student name),
# one column per exam.
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
    
def convert_grades(grades):
    '''
    Convert the given DataFrame of numerical grades to letter grades.
    Returns a new DataFrame with the converted grades; the input is not
    modified.

    The conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F

    Scores are compared against the lower bound of each band, so
    fractional scores fall into the band below the next bound
    (e.g. 89.5 -> B).
    '''
    # Lower bound of each band, highest first; the first bound a score
    # reaches decides its letter.
    cutoffs = ((90, "A"), (80, "B"), (70, "C"), (60, "D"))

    def to_letter(score):
        for lower_bound, letter in cutoffs:
            if score >= lower_bound:
                return letter
        # Below 60, or a value for which every comparison is False (NaN).
        return "F"

    # Map column by column with Series.map: DataFrame.applymap is deprecated
    # since pandas 2.1, whereas Series.map works across pandas versions.
    return grades.apply(lambda column: column.map(to_letter))

In [50]:
convert_grades(grades_df)

Unnamed: 0,exam1,exam2
Andre,F,F
Barry,B,D
Chris,C,F
Dan,C,F
Emilio,B,D
Fred,C,F
Greta,A,C
Humbert,D,F
Ivan,A,C
James,B,D


In [51]:
# Boundary-value fixture for convert_grades: column 0 holds 90/80/70/60/50,
# column 1 holds each of those values minus one.
grades = pd.DataFrame(
    {0: [90, 80, 70, 60, 50], 1: [89, 79, 69, 59, 49]},
    index=[0, 1, 2, 3, 4]
)

In [52]:
grades

Unnamed: 0,0,1
0,90,89
1,80,79
2,70,69
3,60,59
4,50,49


In [53]:
convert_grades(grades)

Unnamed: 0,0,1
0,A,B
1,B,C
2,C,D
3,D,F
4,F,F


In [55]:
# Exam scores fixture: one row per student (index = student name),
# one column per exam.
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
    
def standardize(df):
    '''
    Standardize each column of the given DataFrame using apply().

    Standardizing a variable means replacing each value with the number
    of standard deviations it lies above (positive) or below (negative)
    the column mean. The standard deviation is computed with ddof=0.
    '''
    return df.apply(
        lambda column: column.sub(column.mean()).div(column.std(ddof=0))
    )

In [57]:
grades_df

Unnamed: 0,exam1,exam2
Andre,43,24
Barry,81,63
Chris,78,56
Dan,75,56
Emilio,89,67
Fred,70,51
Greta,91,79
Humbert,65,46
Ivan,98,72
James,87,60


In [59]:
print(
    grades_df["exam1"].mean(),
    grades_df["exam1"].std(ddof=0))

77.7 14.986994361779146


In [56]:
standardize(grades_df)

Unnamed: 0,exam1,exam2
Andre,-2.315341,-2.304599
Barry,0.220191,0.3864
Chris,0.020017,-0.0966
Dan,-0.180156,-0.0966
Emilio,0.753987,0.6624
Fred,-0.513779,-0.4416
Greta,0.887436,1.4904
Humbert,-0.847401,-0.7866
Ivan,1.354508,1.0074
James,0.620538,0.1794


In [112]:
# Small numeric fixture for second_largest: three columns of five
# distinct values each, in no particular order.
df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})
    
def second_largest(df):
    '''
    Return the second-largest value of each column of the input DataFrame.

    The result is a Series indexed by column name. "Second-largest" means
    the value in second position after sorting the column in descending
    order, so a column whose maximum occurs twice yields that maximum.
    '''
    def runner_up(values):
        descending = values.sort_values(ascending=False)
        return descending.iloc[1]

    return df.apply(runner_up, axis=0)

In [113]:
second_largest(df)

a     4
b    40
c    20
dtype: int64

In [136]:
# Exam scores fixture: one row per student (index = student name),
# one column per exam.
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

def standardize(df):
    '''
    Standardize each column of the given DataFrame with vectorized
    operations (no apply()).

    Each value is converted to the number of standard deviations it lies
    above or below its column mean; the standard deviation uses ddof=0.
    The per-column mean and standard deviation are Series indexed by
    column name, so the arithmetic lines them up with the matching columns.
    '''
    column_means = df.mean()
    column_stds = df.std(ddof=0)

    return (df - column_means) / column_stds

def standardize_rows(df):
    '''
    Optional exercise: standardize each row of the given DataFrame
    without using apply().

    Every value becomes the number of standard deviations (ddof=0) it lies
    above or below the mean of its own row. The row statistics are Series
    indexed like the rows, so they are applied along the index axis
    instead of the default column alignment.
    '''
    row_means = df.mean(axis=1)
    row_stds = df.std(axis=1, ddof=0)

    return df.sub(row_means, axis=0).div(row_stds, axis=0)

In [137]:
standardize(grades_df)

Unnamed: 0,exam1,exam2
Andre,-2.315341,-2.304599
Barry,0.220191,0.3864
Chris,0.020017,-0.0966
Dan,-0.180156,-0.0966
Emilio,0.753987,0.6624
Fred,-0.513779,-0.4416
Greta,0.887436,1.4904
Humbert,-0.847401,-0.7866
Ivan,1.354508,1.0074
James,0.620538,0.1794


In [138]:
standardize_rows(grades_df)

Unnamed: 0,exam1,exam2
Andre,1.0,-1.0
Barry,1.0,-1.0
Chris,1.0,-1.0
Dan,1.0,-1.0
Emilio,1.0,-1.0
Fred,1.0,-1.0
Greta,1.0,-1.0
Humbert,1.0,-1.0
Ivan,1.0,-1.0
James,1.0,-1.0


In [None]:
# Small example frame: 'even' and 'above_three' are boolean columns derived
# element-wise from 'value'.
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame({
    'value': values,
    'even': values % 2 == 0,
    'above_three': values > 3 
}, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
    
# NOTE(review): this rebinds subway_df from an absolute path, whereas the
# earlier load cell reads './nyc_subway_weather.csv' -- confirm which
# location is valid in the environment this notebook runs in.
filename = '/datasets/ud170/subway/nyc_subway_weather.csv'
subway_df = pd.read_csv(filename)

### Write code here to group the subway data by a variable of your choice, then
### either print out the mean ridership within each group or create a plot.