In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# NOTE(review): absolute, machine-specific path; point this at your own copy of the CSV.
filename = '/Users/reuven/Courses/Current/data/nyc_taxi_2019-01.csv'

In [3]:
# Load only the three columns used in the rest of this notebook.
df = pd.read_csv(
    filename,
    usecols=['trip_distance', 'passenger_count', 'total_amount'],
)

In [4]:
# (number of rows, number of columns) in the loaded frame
df.shape

(7667792, 3)

In [5]:
# preview the first five rows
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.5,9.95
1,1,2.6,16.3
2,3,0.0,5.8
3,5,0.0,7.55
4,5,0.0,55.55


In [7]:
# Short trips: select only the rows whose distance is under 5 miles.
df.loc[df['trip_distance'].lt(5)]

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.50,9.95
1,1,2.60,16.30
2,3,0.00,5.80
3,5,0.00,7.55
4,5,0.00,55.55
...,...,...,...
7667787,1,4.79,23.16
7667788,1,0.00,0.00
7667789,1,0.00,0.00
7667790,1,0.00,0.00


In [8]:
# Long trips: select only the rows whose distance is over 20 miles.
df.loc[df['trip_distance'].gt(20)]

Unnamed: 0,passenger_count,trip_distance,total_amount
112,1,22.59,68.30
132,2,20.01,58.56
141,5,21.42,70.27
319,1,31.57,83.80
983,5,20.11,60.56
...,...,...,...
7667169,1,20.43,69.96
7667221,1,20.41,62.80
7667416,1,22.03,69.80
7667451,1,21.60,65.06


In [9]:
# I want to say:

# short trips are < 5 miles
# medium trips are >= 5 and <= 20 miles
# long trips are > 20 miles

In [10]:
# The manual approach: label every trip 'medium' first, then overwrite
# the rows that fall below 5 miles or above 20 miles.
df['distance_category'] = 'medium'

df.loc[df['trip_distance'].gt(20), 'distance_category'] = 'long'
df.loc[df['trip_distance'].lt(5), 'distance_category'] = 'short'

In [11]:
# check that the new distance_category column is there
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount,distance_category
0,1,1.5,9.95,short
1,1,2.6,16.3,short
2,3,0.0,5.8,short
3,5,0.0,7.55,short
4,5,0.0,55.55,short


In [12]:
# number of trips in each category
df['distance_category'].value_counts()

short     6665636
medium     954649
long        47507
Name: distance_category, dtype: int64

In [13]:
# the same breakdown, expressed as a fraction of all trips
df['distance_category'].value_counts(normalize=True)

short     0.869303
medium    0.124501
long      0.006196
Name: distance_category, dtype: float64

In [14]:
# A better way to build the categories is pd.cut. Before trying it,
# discard the manually built distance_category column.
df = df.drop(columns='distance_category')

In [15]:
# confirm that distance_category is gone
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.5,9.95
1,1,2.6,16.3
2,3,0.0,5.8
3,5,0.0,7.55
4,5,0.0,55.55


In [17]:
# read the documentation for pd.cut
help(pd.cut)

Help on function cut in module pandas.core.reshape.tile:

cut(x, bins, right: 'bool' = True, labels=None, retbins: 'bool' = False, precision: 'int' = 3, include_lowest: 'bool' = False, duplicates: 'str' = 'raise', ordered: 'bool' = True)
    Bin values into discrete intervals.
    
    Use `cut` when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable. For example, `cut` could convert ages to groups of
    age ranges. Supports binning into an equal number of bins, or a
    pre-specified array of bins.
    
    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.
    
        * int : Defines the number of equal-width bins in the range of `x`. The
          range of `x` is extended by .1% on each side to include the minimum
          and maximum values 

In [19]:
# First attempt with pd.cut. Bins are right-closed by default (right=True,
# include_lowest=False), so these edges mean (0, 5], (5, 20], (20, 100000].
# A distance of exactly 0 lies outside the first bin and gets no label.
df['distance_category'] = pd.cut(
    x=df['trip_distance'],
    bins=[0, 5, 20, 100_000],
    labels=['short', 'medium', 'long'],
)

In [20]:
# the rows with a trip_distance of 0.0 have no category (missing value)
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount,distance_category
0,1,1.5,9.95,short
1,1,2.6,16.3,short
2,3,0.0,5.8,
3,5,0.0,7.55,
4,5,0.0,55.55,


In [21]:
# these counts add up to fewer than the total number of rows:
# trips that received no category are not counted
df['distance_category'].value_counts()

short     6620302
medium     945213
long        47507
Name: distance_category, dtype: int64

In [25]:
# Second attempt: use open-ended outer edges so every distance gets a label.
# pd.cut's bins are right-closed by default (right=True), which gives
# (-inf, 5] -> 'short', (5, 20] -> 'medium', (20, inf] -> 'long', so a
# 0-mile trip is now 'short'.
# Infinite edges stand in for min() - 1 / max() + 1: the labels are the same,
# but the bins no longer depend on the data's current extremes and the column
# is not scanned twice just to build them.
# Note: exactly 5.0 miles is 'short' here, whereas a `< 5` test would not
# count it as short.
df['distance_category'] = pd.cut(
    df['trip_distance'],
    bins=[-np.inf, 5, 20, np.inf],
    labels=['short', 'medium', 'long'],
)

In [26]:
# the rows with a trip_distance of 0.0 are now labeled 'short'
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount,distance_category
0,1,1.5,9.95,short
1,1,2.6,16.3,short
2,3,0.0,5.8,short
3,5,0.0,7.55,short
4,5,0.0,55.55,short


In [27]:
# pd.cut's help again; the signature shows right=True and include_lowest=False as defaults
help(pd.cut)

Help on function cut in module pandas.core.reshape.tile:

cut(x, bins, right: 'bool' = True, labels=None, retbins: 'bool' = False, precision: 'int' = 3, include_lowest: 'bool' = False, duplicates: 'str' = 'raise', ordered: 'bool' = True)
    Bin values into discrete intervals.
    
    Use `cut` when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable. For example, `cut` could convert ages to groups of
    age ranges. Supports binning into an equal number of bins, or a
    pre-specified array of bins.
    
    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.
    
        * int : Defines the number of equal-width bins in the range of `x`. The
          range of `x` is extended by .1% on each side to include the minimum
          and maximum values 

In [28]:
# the column is an ordered categorical: 'short' < 'medium' < 'long'
df['distance_category']

0          short
1          short
2          short
3          short
4          short
           ...  
7667787    short
7667788    short
7667789    short
7667790    short
7667791    short
Name: distance_category, Length: 7667792, dtype: category
Categories (3, object): ['short' < 'medium' < 'long']