# Agenda: Real-world

1. Recap and Q&A
2. More about CSV
3. Sorting with data
4. Grouping
5. Pivot tables
6. Joining
7. Cleaning


In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [4]:
df = DataFrame([[10, 20, 30, 40],
                [50, 60, 70, 80]],
               index=list('ab'),
               columns=list('wxyz'))

In [5]:
df

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80


In [9]:
# retrieve an entire row, use .loc

df.loc['b']     # this retrieves a row

w    50
x    60
y    70
z    80
Name: b, dtype: int64

In [10]:
df['x']    # this retrieves a column

a    20
b    60
Name: x, dtype: int64

In [11]:
df['y']   # so does this

a    30
b    70
Name: y, dtype: int64

In [12]:
# row selectors and column selectors with .loc

df.loc['b', 'y']  # row b and column y

70

In [13]:
# I want all of the elements from row b, columns x and y

df.loc['b', ['x', 'y']]

x    60
y    70
Name: b, dtype: int64

In [14]:
df

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80


In [15]:
df['w'] > 30

a    False
b     True
Name: w, dtype: bool

In [17]:
# wherever w is greater than 30,
# we want to see columns x and z

df.loc[
    df['w'] > 30,   # row selector -- a boolean series, which will act as a mask index
    ['x', 'z']       
    ]

Unnamed: 0,x,z
b,60,80


In [19]:
df = pd.read_csv('taxi.csv')
df.head()   # shows us the first five rows

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [21]:
df = pd.read_csv('taxi.csv', usecols=['passenger_count', 'trip_distance', 'total_amount'])
df.head()   # shows us the first five rows

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.63,17.8
1,1,0.46,8.3
2,1,0.87,11.0
3,1,2.13,17.16
4,1,1.4,10.3


In [23]:
# show me how much, on average, people paid and how far they traveled when the number of passengers was
# greater than 2

df.loc[
    df['passenger_count'] > 2,  # row selector 
    ['trip_distance', 'total_amount']  # column selector
].mean()

trip_distance     3.278653
total_amount     17.679939
dtype: float64

In [26]:
df = pd.read_csv('taxi.csv', header=None,
                usecols=[3, 4])  # the first row is *not* considered to be the headers (column names)

In [27]:
df.head()

Unnamed: 0,3,4
0,passenger_count,trip_distance
1,1,1.63
2,1,.46
3,1,.87
4,1,2.13


In [29]:
df = pd.read_csv('taxi.csv', header=None,
                usecols=[3, 4],
                names=['count', 'distance'])  # the first row is *not* considered to be the headers (column names)

In [30]:
df

Unnamed: 0,count,distance
0,passenger_count,trip_distance
1,1,1.63
2,1,.46
3,1,.87
4,1,2.13
...,...,...
9995,1,2.70
9996,1,4.50
9997,1,5.59
9998,6,1.54


In [31]:
# get all values in df other than those in row 0
df = df[1:]

In [32]:
df

Unnamed: 0,count,distance
1,1,1.63
2,1,.46
3,1,.87
4,1,2.13
5,1,1.40
...,...,...
9995,1,2.70
9996,1,4.50
9997,1,5.59
9998,6,1.54


In [33]:
# what happens if I now ask for the mean values of these two columns?
df.mean()

  df.mean()


count    inf
dtype: float64

In [34]:
df.dtypes  # what dtypes do we have in these two columns

count       object
distance    object
dtype: object

In [36]:
df['count'] = df.loc['count'].astype(np.int64)
df['distance'] = df.loc['distance'].astype(np.float64)


KeyError: 'count'

In [37]:
df.head()

Unnamed: 0,count,distance
1,1,1.63
2,1,0.46
3,1,0.87
4,1,2.13
5,1,1.4


In [38]:
df.dtypes

count         int64
distance    float64
dtype: object

In [39]:
df = pd.read_csv('AAPL.csv')

In [40]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-04-09,169.880005,173.089996,169.850006,170.050003,170.050003,29017700
1,2018-04-10,173.0,174.0,171.529999,173.25,173.25,28408600
2,2018-04-11,172.229996,173.919998,171.699997,172.440002,172.440002,22431600
3,2018-04-12,173.410004,175.0,173.039993,174.139999,174.139999,22889300
4,2018-04-13,174.779999,175.839996,173.850006,174.729996,174.729996,25124300


In [41]:
df.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [43]:
# set the index of the data frame to be the Date column from the input CSV
df = pd.read_csv('AAPL.csv', index_col='Date')

In [44]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-04-09,169.880005,173.089996,169.850006,170.050003,170.050003,29017700
2018-04-10,173.0,174.0,171.529999,173.25,173.25,28408600
2018-04-11,172.229996,173.919998,171.699997,172.440002,172.440002,22431600
2018-04-12,173.410004,175.0,173.039993,174.139999,174.139999,22889300
2018-04-13,174.779999,175.839996,173.850006,174.729996,174.729996,25124300
2018-04-16,175.029999,176.190002,174.830002,175.820007,175.820007,21578400
2018-04-17,176.490005,178.940002,176.410004,178.240005,178.240005,26605400
2018-04-18,177.809998,178.820007,176.880005,177.839996,177.839996,20754500
2018-04-19,173.759995,175.389999,172.660004,172.800003,172.800003,34808800
2018-04-20,170.600006,171.220001,165.429993,165.720001,165.720001,65491100


In [45]:
df.loc['2018-05-07']

Open         1.851800e+02
High         1.876700e+02
Low          1.847500e+02
Close        1.851600e+02
Adj Close    1.851600e+02
Volume       4.245140e+07
Name: 2018-05-07, dtype: float64

In [46]:
# I can give read_csv  a URL!

pd.read_csv('https://gist.githubusercontent.com/reuven/361d2c2b12dab426f4ed4efb396c89e5/raw/744dc0e9b193b53e3f76712cdfa32fa443440594/AAPL.csv')

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-04-09,169.880005,173.089996,169.850006,170.050003,170.050003,29017700
1,2018-04-10,173.0,174.0,171.529999,173.25,173.25,28408600
2,2018-04-11,172.229996,173.919998,171.699997,172.440002,172.440002,22431600
3,2018-04-12,173.410004,175.0,173.039993,174.139999,174.139999,22889300
4,2018-04-13,174.779999,175.839996,173.850006,174.729996,174.729996,25124300
5,2018-04-16,175.029999,176.190002,174.830002,175.820007,175.820007,21578400
6,2018-04-17,176.490005,178.940002,176.410004,178.240005,178.240005,26605400
7,2018-04-18,177.809998,178.820007,176.880005,177.839996,177.839996,20754500
8,2018-04-19,173.759995,175.389999,172.660004,172.800003,172.800003,34808800
9,2018-04-20,170.600006,171.220001,165.429993,165.720001,165.720001,65491100


In [47]:
#  we can choose columns, using usecols

pd.read_csv('https://gist.githubusercontent.com/reuven/361d2c2b12dab426f4ed4efb396c89e5/raw/744dc0e9b193b53e3f76712cdfa32fa443440594/AAPL.csv',
           usecols=['Open', 'Close'])

Unnamed: 0,Open,Close
0,169.880005,170.050003
1,173.0,173.25
2,172.229996,172.440002
3,173.410004,174.139999
4,174.779999,174.729996
5,175.029999,175.820007
6,176.490005,178.240005
7,177.809998,177.839996
8,173.759995,172.800003
9,170.600006,165.720001


# Exercise: Cisco stock info (from 5 years ago)

- Gist: https://gist.github.com/reuven/bb116ba2034bb10bb7e4e2caa5d8a000
- URL of the gist's raw CSV, which you should use, is here: https://gist.githubusercontent.com/reuven/bb116ba2034bb10bb7e4e2caa5d8a000/raw/3660c4af808684dbf17af48b3d2f25b6a218535f/CSCO.csv

1. Use `read_csv` to retrieve the Cisco stock information from 5 years ago, and put it into a data frame.  We're only interested in the `Date`, `Open`, `Close`, and `Volume` columns.
2. Find the day on which the `Open` price was the highest.
3. Find the day on which the `Close` price was the lowest.
4. Find the `Date`, `Open`, and `Close` values on the day when the `Volume` was at its highest.

In [48]:
pd.read_csv('https://oreilly.com')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 8, saw 4


In [49]:
cisco_csv_url = 'https://gist.githubusercontent.com/reuven/bb116ba2034bb10bb7e4e2caa5d8a000/raw/3660c4af808684dbf17af48b3d2f25b6a218535f/CSCO.csv'

pd.read_csv(cisco_csv_url)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2017-12-22,38.52,38.740002,38.470001,38.549999,38.264591,11441600
1,2017-12-26,38.549999,38.68,38.360001,38.48,38.19511,8186100
2,2017-12-27,38.540001,38.650002,38.450001,38.560001,38.274517,10543000
3,2017-12-28,38.73,38.73,38.450001,38.59,38.304295,8807700
4,2017-12-29,38.41,38.619999,38.299999,38.299999,38.016441,12583600
5,2018-01-02,38.669998,38.950001,38.43,38.860001,38.572296,20135700
6,2018-01-03,38.720001,39.279999,38.529999,39.169998,38.879997,29536000
7,2018-01-04,39.049999,39.540001,38.93,38.990002,38.990002,20731400
8,2018-01-05,39.549999,39.880001,39.369999,39.529999,39.529999,24588200
9,2018-01-08,39.52,39.959999,39.349998,39.939999,39.939999,16582000


In [50]:
df.dtypes

Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

In [57]:
cisco_csv_url = 'https://gist.githubusercontent.com/reuven/bb116ba2034bb10bb7e4e2caa5d8a000/raw/3660c4af808684dbf17af48b3d2f25b6a218535f/CSCO.csv'

df = pd.read_csv(cisco_csv_url, 
            usecols=['Date', 'Open', 'Close', 'Volume'])
df

Unnamed: 0,Date,Open,Close,Volume
0,2017-12-22,38.52,38.549999,11441600
1,2017-12-26,38.549999,38.48,8186100
2,2017-12-27,38.540001,38.560001,10543000
3,2017-12-28,38.73,38.59,8807700
4,2017-12-29,38.41,38.299999,12583600
5,2018-01-02,38.669998,38.860001,20135700
6,2018-01-03,38.720001,39.169998,29536000
7,2018-01-04,39.049999,38.990002,20731400
8,2018-01-05,39.549999,39.529999,24588200
9,2018-01-08,39.52,39.939999,16582000


In [58]:
# Find the day on which the open price was the highest

df.loc[
    # row selector: All days on which 'Open' was the same as df['Open'].max()
    df['Open'] == df['Open'].max(),

    # column selector is 'Date'
    'Date'
]

17    2018-01-19
Name: Date, dtype: object

In [59]:
# Find the day on which the closing price was the lowest

df.loc[
    # row selector: All days on which 'Close' was the same as df['Close'].min()
    df['Close'] == df['Close'].min(),

    # column selector is 'Date'
    'Date'
]

4    2017-12-29
Name: Date, dtype: object

In [60]:
# Find the Date, Open, and Close values on the day when the Volume was at its highest.

df.loc[
    
    df['Volume'] == df['Volume'].max(),   # row selector
    ['Date', 'Open', 'Close']

    
]

Unnamed: 0,Date,Open,Close
14,2018-01-16,40.900002,40.540001


# `read_html`

The method `pd.read_html` goes to a Web page, and finds all of the HTML tables that are on that page. Each one is downloaded as a separate data frame, into a list of data frames.  You can then retrieve the table that you want from that list, and perform analyses on it.

In [61]:
all_dfs = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita')

In [62]:
len(all_dfs)

6

In [63]:
all_dfs[0]

Unnamed: 0,0,1,2
0,.mw-parser-output .legend{page-break-inside:av...,"$20,000 - $30,000 $10,000 - $20,000 $5,000 - $...","$1,000 - $2,500 $500 - $1,000 <$500 No data"


In [64]:
all_dfs[1]

Unnamed: 0_level_0,Country/Territory,UN Region,IMF[4][5],IMF[4][5],United Nations[6],United Nations[6],World Bank[7],World Bank[7]
Unnamed: 0_level_1,Country/Territory,UN Region,Estimate,Year,Estimate,Year,Estimate,Year
0,,,,,,,,
1,Liechtenstein *,Europe,—,—,180227,2020,169049,2019
2,Monaco *,Europe,—,—,173696,2020,173688,2020
3,Luxembourg *,Europe,135046,2022,117182,2020,135683,2021
4,Bermuda *,Americas,—,—,123945,2020,110870,2021
...,...,...,...,...,...,...,...,...
213,Central AfricanRepublic *,Africa,527,2022,481,2020,477,2020
214,Sierra Leone *,Africa,513,2022,475,2020,485,2020
215,Madagascar *,Africa,504,2022,470,2020,496,2020
216,South Sudan *,Africa,393,2022,1421,2020,1120,2015


In [65]:
df = all_dfs[1]

In [69]:
df.dtypes

Country/Territory  Country/Territory    object
UN Region          UN Region            object
IMF[4][5]          Estimate             object
                   Year                 object
United Nations[6]  Estimate             object
                   Year                 object
World Bank[7]      Estimate             object
                   Year                 object
dtype: object

In [70]:
# let's reassign the index on df

df.columns = ['country', 'un region', 'IMF estimate', 'IMF year', 'UN estimate', 'UN year', 'WB estimate', 'WB year']


In [71]:
df

Unnamed: 0,country,un region,IMF estimate,IMF year,UN estimate,UN year,WB estimate,WB year
0,,,,,,,,
1,Liechtenstein *,Europe,—,—,180227,2020,169049,2019
2,Monaco *,Europe,—,—,173696,2020,173688,2020
3,Luxembourg *,Europe,135046,2022,117182,2020,135683,2021
4,Bermuda *,Americas,—,—,123945,2020,110870,2021
...,...,...,...,...,...,...,...,...
213,Central AfricanRepublic *,Africa,527,2022,481,2020,477,2020
214,Sierra Leone *,Africa,513,2022,475,2020,485,2020
215,Madagascar *,Africa,504,2022,470,2020,496,2020
216,South Sudan *,Africa,393,2022,1421,2020,1120,2015


In [72]:
# row selector: slice, until (not including) row 5
# column selector: 'country'
df.loc[:5, 'country']

0                NaN
1    Liechtenstein *
2           Monaco *
3       Luxembourg *
4          Bermuda *
5          Ireland *
Name: country, dtype: object

# Exercise: Gasoline prices

1. Create a data frame of gasoline prices around the world, based on the URL https://tradingeconomics.com/country-list/gasoline-prices.
2. In which countries is gasoline (USD/liter) more expensive than the US?
3. In which countries is gasoline (USD/liter) more than 1.5x as expensive as in the US?

In [73]:
# all_dfs == list of data frames we got back from pd.read_html
# read_html needs to get a URL as its argument
# the URL can be either a string literal (with quotes) or a variable containing that string literal

all_dfs = pd.read_html('https://tradingeconomics.com/country-list/gasoline-prices')

In [74]:
len(all_dfs)

1

In [75]:
# get the data frame, and assign it to df
df = all_dfs[0]

In [76]:
df.head()

Unnamed: 0,Country,Last,Previous,Reference,Unit
0,Indonesia,0.51,0.52,Jun/22,USD/Liter
1,Saudi Arabia,0.62,0.62,Jul/22,USD/Liter
2,Russia,0.86,0.96,Jul/22,USD/Liter
3,Argentina,0.91,0.95,Jul/22,USD/Liter
4,China,1.03,1.06,Jul/22,USD/Liter


In [77]:
df.dtypes

Country       object
Last         float64
Previous     float64
Reference     object
Unit          object
dtype: object

In [79]:
df.loc[
    df['Country'] == 'United States',  # row selector -- looking for US row(s)
    'Last'
]

7    1.2
Name: Last, dtype: float64

In [86]:
# in which countries is gasoline more expensive than the United States?

df.loc[df['Last'] > df.loc[
            df['Country'] == 'United States', 
            'Last'
        ].mean(),
       'Country']

8              India
9             Turkey
10         Australia
11             Japan
12            Canada
13       South Korea
14      South Africa
15           Germany
16            France
17             Italy
18         Singapore
19             Spain
20    United Kingdom
21       Switzerland
22       Netherlands
Name: Country, dtype: object

In [87]:
df.loc[df['Last'] > (1.5 * df.loc[
            df['Country'] == 'United States', 
            'Last'
        ].mean()),
       'Country']

15           Germany
16            France
17             Italy
18         Singapore
19             Spain
20    United Kingdom
21       Switzerland
22       Netherlands
Name: Country, dtype: object

In [88]:
df.loc[df['Last'] > (2 * df.loc[
            df['Country'] == 'United States', 
            'Last'
        ].mean()),
       'Country']

Series([], Name: Country, dtype: object)

# Next up:

1. Sorting (by value, and by index)
2. Grouping

In [None]:
# why might you get an error from read_html?

# because read_html depends on the lxml package, which you can download and install from PyPI:

# pip install lxml   # note: this is not a Python command, but rather something you should write at the command line


all_dfs = pd.read_html('https://tradingeconomics.com/country-list/gasoline-prices')

In [89]:
df

Unnamed: 0,Country,Last,Previous,Reference,Unit
0,Indonesia,0.51,0.52,Jun/22,USD/Liter
1,Saudi Arabia,0.62,0.62,Jul/22,USD/Liter
2,Russia,0.86,0.96,Jul/22,USD/Liter
3,Argentina,0.91,0.95,Jul/22,USD/Liter
4,China,1.03,1.06,Jul/22,USD/Liter
5,Mexico,1.06,1.07,Jul/22,USD/Liter
6,Brazil,1.09,1.41,Jul/22,USD/Liter
7,United States,1.2,1.3,Jul/22,USD/Liter
8,India,1.21,1.23,Jul/22,USD/Liter
9,Turkey,1.25,1.49,Jul/22,USD/Liter


In [92]:
# this returns a series with one element, not a single floating-point value
us_price = df.loc[df['Country'] == 'United States', 'Last']


In [94]:
us_price

7    1.2
Name: Last, dtype: float64

In [95]:
# since you know that there's only 1 value, you can use min, max, mean, etc.
us_price.min()

1.2

In [97]:
df['Last'] > us_price.min()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
Name: Last, dtype: bool

In [100]:
# I want to sort gas prices by country name

df.sort_values(by='Country')   # give me back a new data frame, same as df, but sorted (ascending) by country

Unnamed: 0,Country,Last,Previous,Reference,Unit
3,Argentina,0.91,0.95,Jul/22,USD/Liter
10,Australia,1.26,1.46,Jul/22,USD/Liter
6,Brazil,1.09,1.41,Jul/22,USD/Liter
12,Canada,1.42,1.59,Jul/22,USD/Liter
4,China,1.03,1.06,Jul/22,USD/Liter
16,France,1.98,2.27,Jul/22,USD/Liter
15,Germany,1.82,1.99,Jul/22,USD/Liter
8,India,1.21,1.23,Jul/22,USD/Liter
0,Indonesia,0.51,0.52,Jun/22,USD/Liter
17,Italy,2.04,2.26,Jul/22,USD/Liter


# Where/why sort?

1. For more aesthetic presentations
2. Grab the top/bottom n values from our data set
3. Sometimes, you can sort the data in a bunch of different ways -- sort_values lets us choose which way to sort

To sort our data by a column, just name the column in the `by` keyword argument

Like most other methods that we can run on a data frame, `sort_values` does *not* modify anything, but rather returns a new data frame.  

In [101]:
# to keep the changes, assign back to df

df = df.sort_values(by='Country')

In [102]:

df.head()

Unnamed: 0,Country,Last,Previous,Reference,Unit
3,Argentina,0.91,0.95,Jul/22,USD/Liter
10,Australia,1.26,1.46,Jul/22,USD/Liter
6,Brazil,1.09,1.41,Jul/22,USD/Liter
12,Canada,1.42,1.59,Jul/22,USD/Liter
4,China,1.03,1.06,Jul/22,USD/Liter


# Sorting by the index

Often, I want to sort values by the index.  This is especially true when the index contains useful information (e.g., country names, usernames, prices).



In [103]:
df = df.set_index('Country')
df

Unnamed: 0_level_0,Last,Previous,Reference,Unit
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Argentina,0.91,0.95,Jul/22,USD/Liter
Australia,1.26,1.46,Jul/22,USD/Liter
Brazil,1.09,1.41,Jul/22,USD/Liter
Canada,1.42,1.59,Jul/22,USD/Liter
China,1.03,1.06,Jul/22,USD/Liter
France,1.98,2.27,Jul/22,USD/Liter
Germany,1.82,1.99,Jul/22,USD/Liter
India,1.21,1.23,Jul/22,USD/Liter
Indonesia,0.51,0.52,Jun/22,USD/Liter
Italy,2.04,2.26,Jul/22,USD/Liter


In [106]:
# now if I sort by something else, say 'Previous', the index will be unsorted

df = df.sort_values(by='Previous')  # we get a new data frame back, based on df
df

Unnamed: 0_level_0,Last,Previous,Reference,Unit
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Indonesia,0.51,0.52,Jun/22,USD/Liter
Saudi Arabia,0.62,0.62,Jul/22,USD/Liter
Argentina,0.91,0.95,Jul/22,USD/Liter
Russia,0.86,0.96,Jul/22,USD/Liter
China,1.03,1.06,Jul/22,USD/Liter
Mexico,1.06,1.07,Jul/22,USD/Liter
India,1.21,1.23,Jul/22,USD/Liter
Japan,1.28,1.29,Jul/22,USD/Liter
United States,1.2,1.3,Jul/22,USD/Liter
Brazil,1.09,1.41,Jul/22,USD/Liter


In [108]:
# what if the country names are now my index
# and I want to sort by country name again?

# I can use sort_index

df = df.sort_index()
df

Unnamed: 0_level_0,Last,Previous,Reference,Unit
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Argentina,0.91,0.95,Jul/22,USD/Liter
Australia,1.26,1.46,Jul/22,USD/Liter
Brazil,1.09,1.41,Jul/22,USD/Liter
Canada,1.42,1.59,Jul/22,USD/Liter
China,1.03,1.06,Jul/22,USD/Liter
France,1.98,2.27,Jul/22,USD/Liter
Germany,1.82,1.99,Jul/22,USD/Liter
India,1.21,1.23,Jul/22,USD/Liter
Indonesia,0.51,0.52,Jun/22,USD/Liter
Italy,2.04,2.26,Jul/22,USD/Liter


In [110]:
# let's sort by previous price again

df.sort_values('Last')

Unnamed: 0_level_0,Last,Previous,Reference,Unit
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Indonesia,0.51,0.52,Jun/22,USD/Liter
Saudi Arabia,0.62,0.62,Jul/22,USD/Liter
Russia,0.86,0.96,Jul/22,USD/Liter
Argentina,0.91,0.95,Jul/22,USD/Liter
China,1.03,1.06,Jul/22,USD/Liter
Mexico,1.06,1.07,Jul/22,USD/Liter
Brazil,1.09,1.41,Jul/22,USD/Liter
United States,1.2,1.3,Jul/22,USD/Liter
India,1.21,1.23,Jul/22,USD/Liter
Turkey,1.25,1.49,Jul/22,USD/Liter


In [111]:
df.sort_values('Reference')

Unnamed: 0_level_0,Last,Previous,Reference,Unit
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Argentina,0.91,0.95,Jul/22,USD/Liter
Turkey,1.25,1.49,Jul/22,USD/Liter
Spain,2.18,2.29,Jul/22,USD/Liter
South Korea,1.46,1.64,Jul/22,USD/Liter
South Africa,1.6,1.48,Jul/22,USD/Liter
Singapore,2.1,2.36,Jul/22,USD/Liter
Saudi Arabia,0.62,0.62,Jul/22,USD/Liter
Russia,0.86,0.96,Jul/22,USD/Liter
Netherlands,2.29,2.54,Jul/22,USD/Liter
United Kingdom,2.22,2.32,Jul/22,USD/Liter


In [113]:
# can I sort first by date (alphabetically, in this case, which is weird!)
# and then, within the dates, sort by Previous?

# use a list, instead of a string, to name a column

# in this case, we're telling Pandas:
# first sort the rows by Reference
# if the Reference value is the same for two rows, then we sort by Previous value

df.sort_values(by=['Reference', 'Previous'])  # Reference in ascending order, then (if needed) Previous

Unnamed: 0_level_0,Last,Previous,Reference,Unit
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Saudi Arabia,0.62,0.62,Jul/22,USD/Liter
Argentina,0.91,0.95,Jul/22,USD/Liter
Russia,0.86,0.96,Jul/22,USD/Liter
China,1.03,1.06,Jul/22,USD/Liter
Mexico,1.06,1.07,Jul/22,USD/Liter
India,1.21,1.23,Jul/22,USD/Liter
Japan,1.28,1.29,Jul/22,USD/Liter
United States,1.2,1.3,Jul/22,USD/Liter
Brazil,1.09,1.41,Jul/22,USD/Liter
Australia,1.26,1.46,Jul/22,USD/Liter


In [114]:
# can I sort first by date (alphabetically, in this case, which is weird!)
# and then, within the dates, sort by Previous, in descending order?

# use a list, instead of a string, to name a column

# in this case, we're telling Pandas:
# first sort the rows by Reference
# if the Reference value is the same for two rows, then we sort by Previous value DESCENDING

df.sort_values(by='Previous', ascending=False)  # this is how you sort in descending order

Unnamed: 0_level_0,Last,Previous,Reference,Unit
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Netherlands,2.29,2.54,Jul/22,USD/Liter
Singapore,2.1,2.36,Jul/22,USD/Liter
United Kingdom,2.22,2.32,Jul/22,USD/Liter
Spain,2.18,2.29,Jul/22,USD/Liter
France,1.98,2.27,Jul/22,USD/Liter
Italy,2.04,2.26,Jul/22,USD/Liter
Switzerland,2.24,2.14,May/22,USD/Liter
Germany,1.82,1.99,Jul/22,USD/Liter
South Korea,1.46,1.64,Jul/22,USD/Liter
Canada,1.42,1.59,Jul/22,USD/Liter


In [115]:
# reference in ascending order
# previous in descending order
df.sort_values(by=['Reference','Previous'], ascending=[True, False])  

Unnamed: 0_level_0,Last,Previous,Reference,Unit
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Netherlands,2.29,2.54,Jul/22,USD/Liter
Singapore,2.1,2.36,Jul/22,USD/Liter
United Kingdom,2.22,2.32,Jul/22,USD/Liter
Spain,2.18,2.29,Jul/22,USD/Liter
France,1.98,2.27,Jul/22,USD/Liter
Italy,2.04,2.26,Jul/22,USD/Liter
Germany,1.82,1.99,Jul/22,USD/Liter
South Korea,1.46,1.64,Jul/22,USD/Liter
Canada,1.42,1.59,Jul/22,USD/Liter
Turkey,1.25,1.49,Jul/22,USD/Liter


# Exercise: Sorting taxi data

1. Read the taxi data (`taxi.csv` or `nyc_taxi_2019-01.csv`) in, looking at columns `passenger_count`, `trip_distance`, and `total_amount`.
2. What were the 10 longest trips that people took?
3. What were the 10 cheapest trips that people took, where the amount was > 0?
4. Set `trip_distance` to be the index.  Now sort by the index, and find the 10 shortest trips.

In [116]:
!ls taxi.csv

taxi.csv


In [None]:
# use a raw string when you're trying to open a path with \ in it!  That
# doubles the backslashes.

df = pd.read_csv(r'E:\Orielly\Python for DA 5 weeks 2nd aug 930 pm\taxi.csv')

In [117]:
help(df.sort_values)

Help on method sort_values in module pandas.core.frame:

sort_values(by, axis: 'Axis' = 0, ascending=True, inplace: 'bool' = False, kind: 'str' = 'quicksort', na_position: 'str' = 'last', ignore_index: 'bool' = False, key: 'ValueKeyFunc' = None) method of pandas.core.frame.DataFrame instance
    Sort by the values along either axis.
    
    Parameters
    ----------
            by : str or list of str
                Name or list of names to sort by.
    
                - if `axis` is 0 or `'index'` then `by` may contain index
                  levels and/or column labels.
                - if `axis` is 1 or `'columns'` then `by` may contain column
                  levels and/or index labels.
    axis : {0 or 'index', 1 or 'columns'}, default 0
         Axis to be sorted.
    ascending : bool or list of bool, default True
         Sort ascending vs. descending. Specify list for multiple sort
         orders.  If this is a list of bools, must match the length of
         the by.
    

In [118]:
# short file -- taxi.csv (in the Jupyter directory)

df = pd.read_csv('taxi.csv', usecols=['trip_distance', 'total_amount', 'passenger_count'])
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.63,17.8
1,1,0.46,8.3
2,1,0.87,11.0
3,1,2.13,17.16
4,1,1.4,10.3


In [120]:
# one option: sort in ascending order, and grab the final 10 rows
df.sort_values('trip_distance').tail(10)['trip_distance']

4221    29.78
9231    31.50
4224    31.90
3323    32.10
4291    32.40
5470    34.84
809     35.51
4583    37.20
8513    60.30
4270    64.60
Name: trip_distance, dtype: float64

In [123]:
# another option: sort in DESCENDING order, and grab the first 10 rows
df.sort_values('trip_distance', ascending=False).head(10)

Unnamed: 0,passenger_count,trip_distance,total_amount
4270,1,64.6,79.96
8513,1,60.3,160.05
4583,1,37.2,210.14
809,1,35.51,135.13
5470,1,34.84,137.59
4291,1,32.4,63.36
3323,1,32.1,162.39
4224,1,31.9,252.35
9231,1,31.5,150.05
4221,2,29.78,75.84


In [147]:
# now, let's do the long taxi file, from January 2019

df = pd.read_csv('../data/nyc_taxi_2019-01.csv', usecols=['passenger_count', 'trip_distance', 'total_amount'])

In [130]:
df.shape

(7667792, 3)

In [131]:
# option 1: sort in ascending order and grab the final 10

df.sort_values('trip_distance').tail(10)['trip_distance']

4911293    132.80
1144878    142.88
4876401    143.63
2567394    144.20
4813319    160.52
4881766    201.27
4707513    211.36
6770897    214.01
4286612    700.70
6074021    831.80
Name: trip_distance, dtype: float64

In [132]:
df.sort_values('trip_distance', ascending=False).head(10)

Unnamed: 0,passenger_count,trip_distance,total_amount
6074021,1,831.8,11.76
4286612,1,700.7,9.0
6770897,5,214.01,761.8
4707513,2,211.36,56.56
4881766,1,201.27,152.46
4813319,4,160.52,143.56
2567394,2,144.2,18.8
4876401,1,143.63,456.56
1144878,3,142.88,327.38
4911293,1,132.8,238.7


In [154]:
# What were the 10 cheapest trips that people took, where the amount was > 0?


df.loc[df['total_amount'] > 0].sort_values('total_amount').head(10)

Unnamed: 0,passenger_count,trip_distance,total_amount
1636920,1,0.0,0.11
393524,1,1.1,0.3
3308021,1,0.0,0.3
5569448,1,46.5,0.3
3308022,1,0.0,0.3
7390163,1,0.6,0.3
4220243,1,10.9,0.3
3039165,2,1.1,0.3
4934141,1,0.0,0.3
5700266,1,13.4,0.3


In [151]:
df.loc[df['total_amount'] > 0].sort_values('total_amount')

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.50,9.95
1,1,2.60,16.30
2,3,0.00,5.80
3,5,0.00,7.55
4,5,0.00,55.55
...,...,...,...
7667783,2,4.15,15.80
7667784,1,1.34,9.30
7667785,1,1.45,14.16
7667786,2,4.28,21.96


In [139]:
# Set trip_distance to be the index. Now sort by the index, and find the 10 shortest trips.

df = df.set_index('trip_distance')
df.head()

Unnamed: 0_level_0,passenger_count,total_amount
trip_distance,Unnamed: 1_level_1,Unnamed: 2_level_1
1.5,1,9.95
2.6,1,16.3
0.0,3,5.8
0.0,5,7.55
0.0,5,55.55


In [143]:
df.sort_index().loc[0.00001:]

Unnamed: 0_level_0,passenger_count,total_amount
trip_distance,Unnamed: 1_level_1,Unnamed: 2_level_1
0.01,1,-3.80
0.01,2,3.30
0.01,2,4.80
0.01,1,4.30
0.01,1,52.80
...,...,...
201.27,1,152.46
211.36,2,56.56
214.01,5,761.80
700.70,1,9.00


In [149]:
df.memory_usage()

Index                   128
passenger_count    61342336
trip_distance      61342336
total_amount       61342336
dtype: int64

In [150]:
df.memory_usage().sum()   # 184 MB!

184027136

# Grouping



In [157]:
# I want to know the mean taxi distance when passenger_count is 1

df.loc[
    df['passenger_count'] == 1,
    'trip_distance'
].mean()

2.7790883193389573

In [158]:
# I also want to know the mean trip_distance when passenger_count is 2

df.loc[
    df['passenger_count'] == 2,
    'trip_distance'
].mean()

2.8805724948972533

In [159]:
# I also want to know the mean trip_distance when passenger_count is 3

df.loc[
    df['passenger_count'] == 3,
    'trip_distance'
].mean()

2.8406983328090596

# What are we doing here?

We're trying to:

- For every different value of `passenger_count`
- Apply the `mean` method
- To the `trip_distance` column

This is known as grouping, and we wil get a series back in which the index contains all of the different (unique) values of `passenger_count` and the values will be the mean of `trip_distance` for each of these values of `passenger_count`.

In [160]:
# for all unique values of passenger_count, calculate the mean of trip_distance
df.groupby('passenger_count')['trip_distance'].mean()

passenger_count
0    2.651561
1    2.779088
2    2.880572
3    2.840698
4    2.853084
5    2.865741
6    2.842335
7    2.561579
8    3.142759
9    1.486667
Name: trip_distance, dtype: float64

In [161]:
df.groupby('passenger_count')['total_amount'].mean()

passenger_count
0    18.663658
1    15.609601
2    15.831294
3    15.604015
4    15.650307
5    15.546940
6    15.437892
7    48.278421
8    64.105517
9    31.094444
Name: total_amount, dtype: float64

In [162]:
# don't group by a column with floating-point values, or with very open-ended, distinct values

# for each amount that the taxi passenger(s) paid,
# find out the mean number of passengers

df.groupby('total_amount')['passenger_count'].mean()

total_amount
-362.80       1.0
-322.30       1.0
-320.30       2.0
-300.30       1.0
-284.80       1.0
             ... 
 33023.53     0.0
 34674.65     0.0
 36090.30     0.0
 356214.78    1.0
 623261.66    1.0
Name: passenger_count, Length: 12191, dtype: float64

# Exercise: Average amount per payment type

1. Create a data frame from NYC taxi data in January, 2019.  We want `trip_distance`, `total_amount`, and `payment_type`.  (The value of `payment_type` is 1 for cash, and 2 for credit.)
2. What was the mean `trip_distance` for each type of payment?
3. What was the mean `total_amount` for each type of payment?