In [2]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

In [8]:
np.random.seed(0)
s = Series(np.random.randint(-50, 50, 10),
           index=list('acegihfjdb'))
s

a    -6
c    -3
e    14
g    17
i    17
h   -41
f    33
j   -29
d   -14
b    37
dtype: int64

In [4]:
s.loc['e']

14

In [5]:
# fancy index
s.loc[['e','a']]

e    14
a    -6
dtype: int64

In [6]:
s.loc['e':'h']

e    14
g    17
i    17
h   -41
dtype: int64

In [7]:
# sort the index
(
    s
    .sort_index()
    .head()
)

a    -6
b    37
c    -3
d   -14
e    14
dtype: int64

In [10]:
np.random.seed(0)
s = Series(np.random.randint(-50, 50, 10),
           index=list('abcfhiajbl'))
s

a    -6
b    -3
c    14
f    17
h    17
i   -41
a    33
j   -29
b   -14
l    37
dtype: int64

In [11]:
(
    s
    .sort_index()
    ['a':'c']
)


a    -6
a    33
b    -3
b   -14
c    14
dtype: int64

In [13]:
# sort_values method to sort by value
s.sort_values()

i   -41
j   -29
b   -14
a    -6
b    -3
c    14
f    17
h    17
a    33
l    37
dtype: int64

In [24]:
s = Series([25, 60, 86, 12, 5, 49, 32],
           index=('Jack','Gary','Jenny','Jason','Greg','Charles','Matilda'))
s

Jack       25
Gary       60
Jenny      86
Jason      12
Greg        5
Charles    49
Matilda    32
dtype: int64

In [25]:
# mean age of first 3 people
(
    s
    .sort_index()
    .head(3)
    .mean()
)

38.0

In [35]:
# names of eldest and youngest
(
    s
    .sort_values()
    .head(1)
)

Greg    5
dtype: int64

'Greg'

In [43]:
(
    s
    .sort_values()
    .iloc[[0, -1]]
)

Greg      5
Jenny    86
dtype: int64

In [40]:
# Aggregate idxmin and idxmax methods
(
    s
    .agg(['idxmin', 'idxmax'])
)

idxmin     Greg
idxmax    Jenny
dtype: object

In [44]:
# sort in descending order

(
    s
    .sort_values(ascending=False)
)

Jenny      86
Gary       60
Charles    49
Matilda    32
Jack       25
Jason      12
Greg        5
dtype: int64

In [45]:
# sort in descending order

(
    s
    .sort_index(ascending=False)
)

Matilda    32
Jenny      86
Jason      12
Jack       25
Greg        5
Gary       60
Charles    49
dtype: int64

In [47]:
# change the way they are compared

np.random.seed(0)
s = Series(np.random.randint(-50, 50, 10),
           index=list('abcdefghij'))
s

a    -6
b    -3
c    14
d    17
e    17
f   -41
g    33
h   -29
i   -14
j    37
dtype: int64

In [48]:
s.sort_values()

f   -41
h   -29
i   -14
a    -6
b    -3
c    14
d    17
e    17
g    33
j    37
dtype: int64

In [49]:
# sort by absolute values - however makes negative positive
s.abs().sort_values()

b     3
a     6
c    14
i    14
d    17
e    17
h    29
g    33
j    37
f    41
dtype: int64

In [51]:
# sort by absolute value AND keep original values?

# pass kwarg of "key" to sort_index or sort_values

# value for "key" arg is a function
# function is invoked on each element, used for sorting purposes only
# we dont see its outputs

s.sort_values(key=abs)

b    -3
a    -6
c    14
i   -14
d    17
e    17
h   -29
g    33
j    37
f   -41
dtype: int64

In [57]:
# sort by the final digit

def get_final_digit(a_series):
    return a_series.astype(str).str.get(-1).astype(int)

s = Series([10, 15, 22, 28])

get_final_digit(s)

0    0
1    5
2    2
3    8
dtype: int64

In [58]:

np.random.seed(0)
s = Series(np.random.randint(-50, 50, 10),
           index=list('abcdefghij'))
s.sort_values(key=get_final_digit)

f   -41
b    -3
g    33
c    14
i   -14
a    -6
d    17
e    17
j    37
h   -29
dtype: int64

In [61]:
# create anonymous function via lambda

s.sort_values(key=lambda a_series: a_series.astype(str).str.get(-1).astype(int))

f   -41
b    -3
g    33
c    14
i   -14
a    -6
d    17
e    17
j    37
h   -29
dtype: int64

In [62]:
# Data Frames

filename = '../data/taxi.csv'
df = pd.read_csv(filename)
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.00,0.0,0.3,17.80
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.00,0.0,0.3,8.30
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.20,0.0,0.3,11.00
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.760330,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.00,0.0,0.3,10.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,1,2015-06-01 00:12:59,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.00,0.0,0.3,12.30
9995,1,2015-06-01 00:12:59,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.00,0.0,0.3,20.30
9996,2,2015-06-01 00:13:00,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.00,0.0,0.3,22.30
9997,2,2015-06-01 00:13:02,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,40.762852,2,6.5,0.5,0.5,0.00,0.0,0.3,7.80


In [65]:
# make a tpep_pickup_datetime column into the index for the df 
# via set_index method

df = df.set_index('tpep_pickup_datetime')
df

Unnamed: 0_level_0,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-06-02 11:19:29,2,2015-06-02 11:47:52,1,1.63,-73.954430,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.00,0.0,0.3,17.80
2015-06-02 11:19:30,2,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.00,0.0,0.3,8.30
2015-06-02 11:19:31,2,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.20,0.0,0.3,11.00
2015-06-02 11:19:31,2,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.760330,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
2015-06-02 11:19:32,1,2015-06-02 11:32:49,1,1.40,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.00,0.0,0.3,10.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-06-01 00:12:59,1,2015-06-01 00:24:18,1,2.70,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.00,0.0,0.3,12.30
2015-06-01 00:12:59,1,2015-06-01 00:28:16,1,4.50,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.00,0.0,0.3,20.30
2015-06-01 00:13:00,2,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.00,0.0,0.3,22.30
2015-06-01 00:13:02,2,2015-06-01 00:19:10,6,1.54,-73.978302,40.748531,1,N,-73.989166,40.762852,2,6.5,0.5,0.5,0.00,0.0,0.3,7.80


In [67]:
# retrive slice of values from df

(
    df
    .sort_index()
    .loc['2015-06-01 00:12:59':'2015-06-01 00:13:04']
)

Unnamed: 0_level_0,VendorID,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-06-01 00:12:59,2,2015-06-01 00:25:04,1,2.89,-74.003098,40.718269,1,N,-73.999634,40.687263,2,11.5,0.5,0.5,0.0,0.0,0.3,12.8
2015-06-01 00:12:59,1,2015-06-01 00:24:18,1,2.7,-73.947792,40.814972,1,N,-73.973358,40.783638,2,11.0,0.5,0.5,0.0,0.0,0.3,12.3
2015-06-01 00:12:59,1,2015-06-01 00:26:17,1,4.1,-73.994362,40.727089,1,N,-73.993248,40.73317,1,15.0,0.5,0.5,3.26,0.0,0.3,19.56
2015-06-01 00:12:59,1,2015-06-01 00:18:08,1,0.7,-73.985619,40.760563,1,N,-73.986572,40.766663,1,5.5,0.5,0.5,1.35,0.0,0.3,8.15
2015-06-01 00:12:59,2,2015-06-01 00:14:07,1,0.18,-74.005539,40.725544,1,N,-74.002983,40.725056,1,3.0,0.5,0.5,0.86,0.0,0.3,5.16
2015-06-01 00:12:59,1,2015-06-01 00:28:16,1,4.5,-74.004066,40.747818,1,N,-73.953758,40.779285,1,16.0,0.5,0.5,3.0,0.0,0.3,20.3
2015-06-01 00:13:00,2,2015-06-01 00:37:25,1,5.59,-73.994377,40.766102,1,N,-73.903206,40.750546,2,21.0,0.5,0.5,0.0,0.0,0.3,22.3
2015-06-01 00:13:00,2,2015-06-01 00:20:16,1,12.56,-73.948746,40.741535,2,N,-73.971687,40.7439,2,52.0,0.0,0.5,0.0,5.54,0.3,58.34
2015-06-01 00:13:00,2,2015-06-01 00:30:49,1,4.91,-73.985291,40.741871,1,N,-73.935081,40.767193,1,17.0,0.5,0.5,5.49,0.0,0.3,23.79
2015-06-01 00:13:00,2,2015-06-01 00:22:16,1,2.5,-73.985794,40.770344,1,N,-74.003677,40.740116,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8


In [69]:
df = pd.read_json("https://gist.githubusercontent.com/reuven/77edbb0292901f35019f17edb9794358/raw/2bf258763cdddd704f8ffd3ea9a3e81d25e2c6f6/cities.json")
df

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
0,New York,4.8%,40.712784,-74.005941,8405837,1,New York
1,Los Angeles,4.8%,34.052234,-118.243685,3884307,2,California
2,Chicago,-6.1%,41.878114,-87.629798,2718782,3,Illinois
3,Houston,11.0%,29.760427,-95.369803,2195914,4,Texas
4,Philadelphia,2.6%,39.952584,-75.165222,1553165,5,Pennsylvania
...,...,...,...,...,...,...,...
995,Weslaco,28.8%,26.159519,-97.990837,37093,996,Texas
996,Keizer,14.4%,44.990119,-123.026208,37064,997,Oregon
997,Spanish Fork,78.1%,40.114955,-111.654923,36956,998,Utah
998,Beloit,2.9%,42.508348,-89.031776,36888,999,Wisconsin


In [79]:
# mena population of first 20 cities alpabetially
(
    df
    .set_index('city')
    .sort_index()
    .head(20)
    ['population']
    .mean()
)

125541.7

In [85]:
# mean popluation for all cities alphabetically, Iowa to Nebraska
(
    df
    .set_index('state')
    .sort_index()
    ['Iowa':'Nebraska']
    ['population']
    .mean()
)

102408.45508982036

In [125]:
# mean latitude for the 50 largest vs 50 smallest
largestLatitude = (
    df
    .set_index('population')
    .sort_index()
    .head(50)
    ['latitude']
    .mean()
)

smallestLatitude = (
    df
    .set_index('population')
    .sort_index()
    .tail(50)
    ['latitude']
    .mean()
)

print(f"Mean latitude of top 50 largest cities is : {largestLatitude}")
print(f"Mean latitude of top 50 smallest cities is : {smallestLatitude}")

Mean latitude of top 50 largest cities is : 37.303587242
Mean latitude of top 50 smallest cities is : 36.838639806
