# Lesson 3: NumPy and Pandas for 2D Data


# 2D NumPy Arrays

In [2]:
import numpy as np

# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

In [5]:
# Accessing elements
print(ridership[1,3])
print(ridership[1:3,3:5])
print(ridership[1, :])

2328
[[2328 2539]
 [6461 2691]]
[1478 3877 3674 2328 2539]


In [6]:
# Vectorized operations on rows or columns

print(ridership[0, :] + ridership[1, :])
print(ridership[:, 0] + ridership[:, 1])

[1478 3877 3676 2333 2539]
[   0 5355 5701 4952 6410 5509  324    2 5223 5385]


In [12]:
# Vectorized operations on entire arrays

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
print(a)
print('\n')
print(b)
print('\n')
print(a+b)

[[1 2 3]
 [4 5 6]
 [7 8 9]]


[[1 1 1]
 [2 2 2]
 [3 3 3]]


[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]


In [13]:
def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station with the network as a whole.

    `ridership` is a 2D NumPy array indexed as ridership[day, station].
    The station (column) holding the largest value in the first row is
    located with argmax(); that station's mean over all days is reported
    next to the mean of every entry in the array for comparison.

    Returns the tuple (overall_mean, mean_for_max).
    '''
    first_day = ridership[0, :]
    busiest_station = first_day.argmax()
    station_history = ridership[:, busiest_station]

    return (ridership.mean(), station_history.mean())

mean_riders_for_max_station(ridership)

(2342.6, 3239.9)

In [24]:
# break into steps
import numpy as np
# Subway ridership for 5 stations on 10 different days
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])
#  find the station with maximum riders on the first day
max_station = ridership[0, :].argmax()
print('station number', max_station, 'has the most riders on the first day','\n')

# return the mean riders per day for that station
mean_for_max = ridership[:, max_station].mean()
print(mean_for_max,' riders per day for station', max_station,'in average.','\n')

# return the mean ridership overall
overall_mean = ridership.mean()
print(overall_mean,' riders per day in average.')

station number 3 has the most riders on the first day 

3239.9  riders per day for station 3 in average. 

2342.6  riders per day in average.


# NumPy Axis

In [26]:
# NumPy axis argument: axis names the dimension that sum() collapses

a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

# no axis: every element is added into one scalar
print(a.sum(), '\n')
# axis=0 collapses the rows, leaving one total per column
print(a.sum(axis=0), '\n')
# axis=1 collapses the columns, leaving one total per row
print(a.sum(axis=1))

45 

[12 15 18] 

[ 6 15 24]


In [27]:
def min_and_max_riders_per_day(ridership):
    '''
    Report the highest and lowest per-station daily averages.

    Averaging along axis 0 produces one mean-ridership-per-day figure for
    each subway station (column). The largest and the smallest of those
    figures are returned as (max_daily_ridership, min_daily_ridership).
    '''
    station_means = ridership.mean(axis=0)
    return (station_means.max(), station_means.min())

min_and_max_riders_per_day(ridership)

(3239.9, 1071.2)

# Accessing Elements of a DataFrame

In [33]:
# DataFrame creation

# pandas is imported here so this cell runs on a fresh kernel: it is the
# first cell to use `pd`, and the notebook's own pandas import only appears
# in a later cell.
import pandas as pd

# You can create a DataFrame out of a dictionary mapping column names to values
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df_1, '\n')

# You can also use a list of lists or a 2D NumPy array
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print(df_2)

   A  B
0  0  3
1  1  4
2  2  5 

   A  B  C
0  0  1  2
1  3  4  5


In [34]:
import pandas as pd

In [35]:
# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)
ridership_df

Unnamed: 0,R003,R004,R005,R006,R007
05-01-11,0,0,2,5,0
05-02-11,1478,3877,3674,2328,2539
05-03-11,1613,4088,3991,6461,2691
05-04-11,1560,3392,3826,4787,2613
05-05-11,1608,4802,3932,4477,2705
05-06-11,1576,3933,3909,4979,2685
05-07-11,95,229,255,496,201
05-08-11,2,0,1,27,0
05-09-11,1438,3785,3589,4174,2215
05-10-11,1342,4043,4009,4665,3033


In [36]:
# Accessing elements

# iloc selects by integer position: here, the first row
print(ridership_df.iloc[0], '\n')
# loc selects by index label; the label text must match exactly
print(ridership_df.loc['05-05-11'], '\n')
# print(ridership_df.loc['2011-05-05'], '\n')  # returns an error: no row carries this label
# plain [] selects a column by its name
print(ridership_df['R003'], '\n')
# print(ridership_df['R002'], '\n')  # returns an error: there is no 'R002' column
# iloc with two positions: row 1, column 3
print(ridership_df.iloc[1, 3])

R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64 

R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64 

05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64 

2328


In [38]:
# Accessing multiple rows
print(ridership_df.iloc[1:4])
ridership_df.iloc[1:4]

          R003  R004  R005  R006  R007
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613


Unnamed: 0,R003,R004,R005,R006,R007
05-02-11,1478,3877,3674,2328,2539
05-03-11,1613,4088,3991,6461,2691
05-04-11,1560,3392,3826,4787,2613


In [40]:
# Accessing multiple columns
print(ridership_df[['R003', 'R005']])
ridership_df[['R003', 'R005']]

          R003  R005
05-01-11     0     2
05-02-11  1478  3674
05-03-11  1613  3991
05-04-11  1560  3826
05-05-11  1608  3932
05-06-11  1576  3909
05-07-11    95   255
05-08-11     2     1
05-09-11  1438  3589
05-10-11  1342  4009


Unnamed: 0,R003,R005
05-01-11,0,2
05-02-11,1478,3674
05-03-11,1613,3991
05-04-11,1560,3826
05-05-11,1608,3932
05-06-11,1576,3909
05-07-11,95,255
05-08-11,2,1
05-09-11,1438,3589
05-10-11,1342,4009


In [42]:
# Pandas axis

df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print('df = ', '\n', df, '\n')
print('df.sum() = ', '\n', df.sum(), '\n')
print('df.sum(axis=0) = ', '\n', df.sum(axis=0), '\n')
print('df.sum(axis=1) = ', '\n', df.sum(axis=1), '\n')
print('df.values.sum() = ', '\n', df.values.sum())

df =  
    A  B
0  0  3
1  1  4
2  2  5 

df.sum() =  
 A     3
B    12
dtype: int64 

df.sum(axis=0) =  
 A     3
B    12
dtype: int64 

df.sum(axis=1) =  
 0    3
1    5
2    7
dtype: int64 

df.values.sum() =  
 15


In [75]:
import pandas as pd
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)


In [76]:
ridership_df.iloc[0].argmax() # find the station which has max riders

3

In [79]:
ridership_df['R006'].mean() # average riders for station 3

3239.9

In [80]:
ridership_df.values.mean() # .value

2342.6

In [83]:
import pandas as pd
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

def mean_riders_for_max_station(ridership):
    '''
    Find the station with the maximum riders on the first day, then return
    the mean riders per day for that station. Also return the mean
    ridership overall for comparison.

    This is the same as the earlier NumPy exercise, but the input is a
    Pandas DataFrame (one row per day, one column per station).

    Returns the tuple (overall_mean, mean_for_max). If several stations tie
    for the first-day maximum, the left-most one is used.
    '''
    # Locate the busiest first-day station from the data rather than
    # hard-coding a column label, so any DataFrame can be passed in.
    # Going through .values gives a plain NumPy argmax, i.e. a column position.
    first_day = ridership.iloc[0].values
    busiest_position = first_day.argmax()

    overall_mean = ridership.values.mean()
    mean_for_max = ridership.iloc[:, busiest_position].mean()

    return (overall_mean, mean_for_max)
mean_riders_for_max_station(ridership_df)

(2342.6, 3239.9)

# Calculating Correlation

In [84]:
def correlation(x, y):
    '''
    Return the correlation (Pearson's r) between two variables.

    Each input is either a NumPy array or a Pandas Series.

    correlation = average of (x in standard units) times (y in standard units)

    The standard deviations are taken with ddof=0 (the Pandas std()
    default is ddof=1, so the argument has to be passed explicitly).
    '''
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    # Evaluated left to right: ((x_centered / sd_x) * y_centered) / sd_y
    scaled_product = x_centered / x.std(ddof=0) * y_centered / y.std(ddof=0)
    return scaled_product.mean()

# test code:
x = pd.Series([1, 2, 3, 4])
y = pd.Series([10, 11, 12, 13])
print(correlation(x, y))

0.9999999999999999


In [98]:
import pandas as pd
# NOTE(review): this relative path depends on the directory the notebook is
# launched from and on one particular local folder layout; point it at
# wherever nyc_subway_weather.csv actually lives before running this cell.
subway_df = pd.read_csv('../../../../Jupyter/Udacity/IntroDataAnalysis/lesson3/nyc_subway_weather.csv')

def correlation(x, y):
    '''
    Return the mean of the element-wise product of x and y after each has
    been converted to standard units (centered on its mean and divided by
    its ddof=0 standard deviation).
    '''
    def to_standard_units(values):
        return (values - values.mean()) / values.std(ddof=0)

    return (to_standard_units(x) * to_standard_units(y)).mean()

entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))
print(correlation(entries, cum_entries))

0.03564851577223041
-0.026693348321569912
-0.22903432340833663
0.5858954707662182


# Data Frame Vectorised Operations

In [104]:
# Adding DataFrames with the column names

df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
df1 + df2

Unnamed: 0,a,b,c
0,11,44,77
1,22,55,88
2,33,66,99


In [105]:
# Adding DataFrames with overlapping column names 

df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
df1 + df2

Unnamed: 0,a,b,c,d
0,,74,47,
1,,85,58,
2,,96,69,


In [106]:
# Adding DataFrames with overlapping row indexes

df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                   index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                   index=['row4', 'row3', 'row2'])
df1 + df2

Unnamed: 0,a,b,c
row1,,,
row2,32.0,65.0,98.0
row3,23.0,56.0,89.0
row4,,,


In [107]:
print(df1)
print(df1.diff())
df1.shift(1)

      a  b  c
row1  1  4  7
row2  2  5  8
row3  3  6  9
        a    b    c
row1  NaN  NaN  NaN
row2  1.0  1.0  1.0
row3  1.0  1.0  1.0


Unnamed: 0,a,b,c
row1,,,
row2,1.0,4.0,7.0
row3,2.0,5.0,8.0


In [108]:
# Cumulative entries and exits for one station for a few hours
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})
entries_and_exits

Unnamed: 0,ENTRIESn,EXITSn
0,3144312,1088151
1,3144335,1088159
2,3144353,1088177
3,3144424,1088231
4,3144594,1088275
5,3144808,1088317
6,3144895,1088328
7,3144905,1088331
8,3144941,1088420
9,3145094,1088753


In [109]:
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Turn cumulative counter readings into per-interval counts.

    Takes a DataFrame of cumulative entries and exits (entries in the first
    column, exits in the second) and returns a DataFrame of the same layout
    in which every row holds the change since the previous row. The first
    row has no predecessor, so its values are NaN.
    '''
    hourly = entries_and_exits.diff(periods=1)
    return hourly
get_hourly_entries_and_exits(entries_and_exits)

Unnamed: 0,ENTRIESn,EXITSn
0,,
1,23.0,8.0
2,18.0,18.0
3,71.0,54.0
4,170.0,44.0
5,214.0,42.0
6,87.0,11.0
7,10.0,3.0
8,36.0,89.0
9,153.0,333.0


# DataFrame applymap()

In [112]:
# DataFrame applymap(): call a function on every individual element
# NOTE(review): pandas 2.1+ deprecates applymap() in favour of
# DataFrame.map(); switch once the minimum supported pandas allows it.
df = pd.DataFrame({
    'a': [1,2,3],
    'b': [10,20,30],
    'c': [5,10,15]
})
def add_one(x):
    '''Return x incremented by one.'''
    return x+1
print(df, '\n')
# The result is a new DataFrame of the same shape with add_one applied to each cell
print(df.applymap(add_one))

   a   b   c
0  1  10   5
1  2  20  10
2  3  30  15 

   a   b   c
0  2  11   6
1  3  21  11
2  4  31  16


In [123]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
print(grades_df)

         exam1  exam2
Andre       43     24
Barry       81     63
Chris       78     56
Dan         75     56
Emilio      89     67
Fred        70     51
Greta       91     79
Humbert     65     46
Ivan        98     72
James       87     60


In [121]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)

def convert_grade(grade):
    '''
    Convert one numerical grade to its letter grade.

    Thresholds are checked from the top down, so each branch only needs a
    lower bound: >=90 'A', >=80 'B', >=70 'C', >=60 'D', anything else 'F'.
    '''
    if grade >= 90:
        return 'A'
    elif grade >= 80:
        return 'B'
    elif grade >= 70:
        return 'C'
    elif grade >= 60:
        return 'D'
    else:
        return 'F'


def convert_grades(grades):
    '''
    Convert the given DataFrame of numerical grades to letter grades.
    Returns a new DataFrame with the converted grades; the input is left
    unchanged.

    The conversion rule is:
        90-100 -> A
        80-89  -> B
        70-79  -> C
        60-69  -> D
        0-59   -> F
    '''
    # DataFrame.applymap() is deprecated since pandas 2.1 in favour of
    # DataFrame.map(). Older pandas only offers applymap(), so use map()
    # when the class provides it and fall back otherwise. The check is made
    # on the type so that a column that happens to be named 'map' cannot
    # shadow the method.
    if hasattr(type(grades), 'map'):
        elementwise = grades.map
    else:
        elementwise = grades.applymap
    return elementwise(convert_grade)

convert_grades(grades_df)


Unnamed: 0,exam1,exam2
Andre,F,F
Barry,B,D
Chris,C,F
Dan,C,F
Emilio,B,D
Fred,C,F
Greta,A,C
Humbert,D,F
Ivan,A,C
James,B,D


# DataFrame apply()

In [126]:
# DataFrame apply()

def convert_grades_curve(exam_grades):
    '''
    Grade on a curve using pandas' built-in quantile binning, qcut().

    The quantile edges put the bottom 0%-10% of students in 'F', 10%-20%
    in 'D', 20%-50% in 'C', 50%-80% in 'B' and the top 80%-100% in 'A'.
    More about qcut():
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
    '''
    quantile_edges = [0, 0.1, 0.2, 0.5, 0.8, 1]
    letters = ['F', 'D', 'C', 'B', 'A']
    return pd.qcut(exam_grades, quantile_edges, labels=letters)

# qcut() operates on a list, array, or Series. This is the
# result of running the function on a single column of the
# DataFrame.
print(convert_grades_curve(grades_df['exam1']),'\n')

# qcut() does not work on DataFrames, but we can use apply()
# to call the function on each column separately
print(grades_df.apply(convert_grades_curve))

Andre      F
Barry      B
Chris      C
Dan        C
Emilio     B
Fred       C
Greta      A
Humbert    D
Ivan       A
James      B
Name: exam1, dtype: category
Categories (5, object): [F < D < C < B < A] 

        exam1 exam2
Andre       F     F
Barry       B     B
Chris       C     C
Dan         C     C
Emilio      B     B
Fred        C     C
Greta       A     A
Humbert     D     D
Ivan        A     A
James       B     B


In [129]:
def standardise(column):
    '''
    Express one column in standard units: the distance of each value from
    the column mean, measured in (ddof=0) standard deviations.
    '''
    centred = column - column.mean()
    spread = column.std(ddof=0)
    return centred / spread

def standardize(df):
    '''
    Standardize each column of the given DataFrame by running
    standardise() on every column through apply(). Each value becomes the
    number of standard deviations it lies above or below its column mean.
    '''
    return df.apply(standardise)

standardize(grades_df)

Unnamed: 0,exam1,exam2
Andre,-2.315341,-2.304599
Barry,0.220191,0.3864
Chris,0.020017,-0.0966
Dan,-0.180156,-0.0966
Emilio,0.753987,0.6624
Fred,-0.513779,-0.4416
Greta,0.887436,1.4904
Humbert,-0.847401,-0.7866
Ivan,1.354508,1.0074
James,0.620538,0.1794


# DataFrame apply() Use Case 2

In [136]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

# This block is already enabled (the guard below is True); set it to False to skip it

# DataFrame apply() - use case 2: the function receives a whole column and
# returns a single value, so apply() produces one value per column (a Series)
if True:   
    print(df.apply(np.mean))
    print(df.apply(np.max))


a     3.0
b    30.0
c    15.0
dtype: float64
a     5
b    50
c    25
dtype: int64


In [153]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

def second_largest_in_column(column):
    '''
    Return the value in second position once `column` is ordered from
    largest to smallest. Ties are not collapsed, so a duplicated maximum is
    returned as the second-largest value too.
    '''
    return column.sort_values(ascending=False).iloc[1]

second_column_a = second_largest_in_column(df['a'])
print('The second largest number in column a is:', second_column_a, '\n')

def second_largest_dataframe(df):
    '''
    Return, for every column of `df`, the value second_largest_in_column()
    finds in it. apply() hands the function one column at a time.
    '''
    runner_ups = df.apply(second_largest_in_column)
    return runner_ups
second_dataframe = second_largest_dataframe(df)
print('The second largest number in dataframe is:', '\n', second_dataframe)


The second largest number in column a is: 4 

The second largest number in dataframe is: 
 a     4
b    40
c    20
dtype: int64


# Adding a DataFrame to a Series

In [156]:
# Adding a Series to a square DataFrame

s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print(s)
print(df)
print(' ') # Create a blank line between outputs
df + s

0    1
1    2
2    3
3    4
dtype: int64
    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160
 


Unnamed: 0,0,1,2,3
0,11,52,93,134
1,21,62,103,144
2,31,72,113,154
3,41,82,123,164


In [159]:
# Adding a Series to a one-row DataFrame 

s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})

print(s)
print(' ')
print(df)
print(' ')
df + s

0    1
1    2
2    3
3    4
dtype: int64
 
    0   1   2   3
0  10  20  30  40
 


Unnamed: 0,0,1,2,3
0,11,22,33,44


In [160]:
# Adding a Series to a one-column DataFrame
# The Series index (0-3) is lined up with the DataFrame's column labels, not
# with its rows. df only has column 0, so s[0] is added to every row of that
# column and the unmatched labels 1-3 turn into all-NaN columns.

s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({0: [10, 20, 30, 40]})

print(s)
print(' ') 
print(df)
print(' ')
df + s

0    1
1    2
2    3
3    4
dtype: int64
 
    0
0  10
1  20
2  30
3  40
 


Unnamed: 0,0,1,2,3
0,11,,,
1,21,,,
2,31,,,
3,41,,,


In [162]:
# Adding when DataFrame column names match Series index

s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160]
})

print(s)
print(' ')
print(df)
print(' ')
df + s

a    1
b    2
c    3
d    4
dtype: int64
 
    a   b    c    d
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160
 


Unnamed: 0,a,b,c,d
0,11,52,93,134
1,21,62,103,144
2,31,72,113,154
3,41,82,123,164


In [163]:
# Adding when DataFrame column names don't match Series index
# df's columns are 'a'-'d' while s is indexed 0-3, so no label is shared:
# the result carries both sets of labels as columns and every cell is NaN.

s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    'a': [10, 20, 30, 40],
    'b': [50, 60, 70, 80],
    'c': [90, 100, 110, 120],
    'd': [130, 140, 150, 160]
})

print(s)
print(' ')
print(df)
print(' ')
df + s

0    1
1    2
2    3
3    4
dtype: int64
 
    a   b    c    d
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160
 


Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


# Standardising Each Column Again

In [164]:
# Adding using +

s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print(s)
print(' ')
print(df)
print(' ')
df + s

0    1
1    2
2    3
3    4
dtype: int64
 
    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160
 


Unnamed: 0,0,1,2,3
0,11,52,93,134
1,21,62,103,144
2,31,72,113,154
3,41,82,123,164


In [165]:
# Adding with axis='index'
# axis='index' matches the Series index against the DataFrame's row labels,
# so s[i] is added to every value in row i (row 0 gets +1, row 3 gets +4).

s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print(s)
print(' ')
print(df)
print(' ') # Create a blank line between outputs
df.add(s, axis='index')
# The functions sub(), mul(), and div() work similarly to add()

0    1
1    2
2    3
3    4
dtype: int64
 
    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160
 


Unnamed: 0,0,1,2,3
0,11,51,91,131
1,22,62,102,142
2,33,73,113,153
3,44,84,124,164


In [166]:
# Adding with axis='columns'
# axis='columns' matches the Series index against the DataFrame's column
# labels, so s[j] is added to every value in column j. This gives the same
# result as the plain `df + s` shown earlier.

s = pd.Series([1, 2, 3, 4])
df = pd.DataFrame({
    0: [10, 20, 30, 40],
    1: [50, 60, 70, 80],
    2: [90, 100, 110, 120],
    3: [130, 140, 150, 160]
})

print(s)
print(' ')
print(df)
print(' ') # Create a blank line between outputs
df.add(s, axis='columns')
# The functions sub(), mul(), and div() work similarly to add()

0    1
1    2
2    3
3    4
dtype: int64
 
    0   1    2    3
0  10  50   90  130
1  20  60  100  140
2  30  70  110  150
3  40  80  120  160
 


Unnamed: 0,0,1,2,3
0,11,52,93,134
1,21,62,103,144
2,31,72,113,154
3,41,82,123,164


In [167]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
grades_df

Unnamed: 0,exam1,exam2
Andre,43,24
Barry,81,63
Chris,78,56
Dan,75,56
Emilio,89,67
Fred,70,51
Greta,91,79
Humbert,65,46
Ivan,98,72
James,87,60


In [171]:
grades_df.mean()

exam1    77.7
exam2    57.4
dtype: float64

In [172]:
grades_df - grades_df.mean()

Unnamed: 0,exam1,exam2
Andre,-34.7,-33.4
Barry,3.3,5.6
Chris,0.3,-1.4
Dan,-2.7,-1.4
Emilio,11.3,9.6
Fred,-7.7,-6.4
Greta,13.3,21.6
Humbert,-12.7,-11.4
Ivan,20.3,14.6
James,9.3,2.6


In [173]:
(grades_df - grades_df.mean()) / grades_df.std()

Unnamed: 0,exam1,exam2
Andre,-2.196525,-2.186335
Barry,0.208891,0.366571
Chris,0.01899,-0.091643
Dan,-0.170911,-0.091643
Emilio,0.715295,0.628408
Fred,-0.487413,-0.418938
Greta,0.841896,1.413917
Humbert,-0.803916,-0.746234
Ivan,1.284999,0.955703
James,0.588694,0.170194


In [175]:
def standardize(df):
    '''
    Standardize each column of the given DataFrame with vectorized
    operations only (no apply()).

    Every value is replaced by the number of (ddof=0) standard deviations
    it lies above or below its column mean. df.mean() and df.std() return
    one value per column, and the subtraction and division line those
    values up with the matching column labels.
    '''
    column_means = df.mean()
    column_stds = df.std(ddof=0)
    centred = df - column_means
    return centred / column_stds
standardize(grades_df)

Unnamed: 0,exam1,exam2
Andre,-2.315341,-2.304599
Barry,0.220191,0.3864
Chris,0.020017,-0.0966
Dan,-0.180156,-0.0966
Emilio,0.753987,0.6624
Fred,-0.513779,-0.4416
Greta,0.887436,1.4904
Humbert,-0.847401,-0.7866
Ivan,1.354508,1.0074
James,0.620538,0.1794


In [176]:
grades_df = pd.DataFrame(
    data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
          'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
    index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
           'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
)
grades_df
def standardize_rows(df):
    '''
    Standardize each row of the given DataFrame without using apply().

    Every value is replaced by the number of (ddof=0) standard deviations
    it lies above or below the mean of its own row. Returns a new
    DataFrame with the same index and columns as `df`.
    '''
    # Work on the `df` argument rather than on a notebook-level DataFrame,
    # so the function standardizes whatever frame the caller passes in.
    row_means = df.mean(axis='columns')
    row_stds = df.std(ddof=0, axis='columns')
    # axis='index' lines the per-row Series up with df's row labels; the
    # default would try to match them against the column labels instead.
    return df.sub(row_means, axis='index').div(row_stds, axis='index')
standardize_rows(grades_df)

Unnamed: 0,exam1,exam2
Andre,1.0,-1.0
Barry,1.0,-1.0
Chris,1.0,-1.0
Dan,1.0,-1.0
Emilio,1.0,-1.0
Fred,1.0,-1.0
Greta,1.0,-1.0
Humbert,1.0,-1.0
Ivan,1.0,-1.0
James,1.0,-1.0
