In [1]:
# importing pandas
import pandas as pd

In [2]:
# Load the NYC taxi dataset from the CSV file into the `taxi` DataFrame.
taxi = pd.read_csv(filepath_or_buffer='2_taxi_nyc.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
taxi.head(n=5)

Unnamed: 0,pickup_dt,pickup_month,borough,pickups,hday,spd,vsb,temp,dewp,slp,pcp 01,pcp 06,pcp 24,sd
0,2015-01-01 01:00:00,Jan,Bronx,152,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0
1,2015-01-01 01:00:00,Jan,Brooklyn,1519,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0
2,2015-01-01 01:00:00,Jan,EWR,0,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0
3,2015-01-01 01:00:00,Jan,Manhattan,5258,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0
4,2015-01-01 01:00:00,Jan,Queens,405,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0


In [4]:
# Report the dataset dimensions: shape is (number of rows, number of columns).
rows = taxi.shape[0]
cols = taxi.shape[1]
print(f'{rows = }, {cols = }')

rows = 29101, cols = 14


In [5]:
# Inspect the dtype pandas inferred for every column.
# Note: pickup_dt comes back as `object`, i.e. it has not been parsed into a
# datetime dtype by the plain read_csv call above.
taxi.dtypes

pickup_dt        object
pickup_month     object
borough          object
pickups           int64
hday             object
spd             float64
vsb             float64
temp            float64
dewp            float64
slp             float64
pcp 01          float64
pcp 06          float64
pcp 24          float64
sd              float64
dtype: object

In [6]:
# Swap the space in the three precipitation column names for an underscore so
# the columns can be used with attribute access and in query strings.
taxi = taxi.rename(
    {
        'pcp 01': 'pcp_01',
        'pcp 06': 'pcp_06',
        'pcp 24': 'pcp_24',
    },
    axis='columns',
)
taxi.columns

Index(['pickup_dt', 'pickup_month', 'borough', 'pickups', 'hday', 'spd', 'vsb',
       'temp', 'dewp', 'slp', 'pcp_01', 'pcp_06', 'pcp_24', 'sd'],
      dtype='object')

In [7]:
# Count how many rows each borough contributes to the dataset.
taxi['borough'].value_counts()

borough
Bronx            4343
Brooklyn         4343
EWR              4343
Manhattan        4343
Queens           4343
Staten Island    4343
Name: count, dtype: int64

In [8]:
# Total number of pickups across the whole dataset.
taxi['pickups'].sum()

14265773

In [9]:
# Total pickups per borough, ordered from the busiest borough to the quietest.
(
    taxi
    .groupby('borough')['pickups']
    .sum()
    .sort_values(ascending=False)
)

borough
Manhattan        10367841
Brooklyn          2321035
Queens            1343528
Bronx              220047
Staten Island        6957
EWR                   105
Name: pickups, dtype: int64

In [10]:
# Keep the name of the borough with the fewest total pickups in min_pickups
# (idxmin returns the index label, i.e. the borough name, of the smallest sum).
min_pickups = taxi.groupby('borough')['pickups'].sum().idxmin()
min_pickups

'EWR'

In [11]:
# Average pickups per row for every borough, split by the hday flag (Y / N).
# NOTE(review): hday looks like a holiday flag rather than a weekend flag
# (2015-01-01, a Thursday, is marked Y) - confirm against the data dictionary.
(
    taxi
    .groupby(['borough', 'hday'])[['pickups']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,pickups
borough,hday,Unnamed: 2_level_1
Bronx,N,50.771073
Bronx,Y,48.065868
Brooklyn,N,534.727969
Brooklyn,Y,527.011976
EWR,N,0.023467
EWR,Y,0.041916
Manhattan,N,2401.302921
Manhattan,Y,2035.928144
Queens,N,308.899904
Queens,Y,320.730539


In [12]:
# Total pickups for every (borough, month) pair, sorted from the busiest
# combination to the quietest; as_index=False keeps the keys as regular columns.
pickups_by_mon_bor = (
    taxi
    .groupby(['borough', 'pickup_month'], as_index=False)['pickups']
    .sum()
    .sort_values('pickups', ascending=False)
)
pickups_by_mon_bor.head()

Unnamed: 0,borough,pickup_month,pickups
21,Manhattan,Jun,1995388
23,Manhattan,May,1888800
19,Manhattan,Feb,1718571
22,Manhattan,Mar,1661261
18,Manhattan,Apr,1648278


In [13]:
# Look at the raw temperature column before converting it.
taxi['temp']

0        30.0
1        30.0
2        30.0
3        30.0
4        30.0
         ... 
29096    75.0
29097    75.0
29098    75.0
29099    75.0
29100    75.0
Name: temp, Length: 29101, dtype: float64

In [14]:
# function to convert temperature from Fahrenheit to Celsius
def temp_to_celcius(x):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Only arithmetic operators are used, so ``x`` may be a single number or
    any object supporting element-wise arithmetic, such as a pandas Series.

    Returns the converted value(s) computed as ``(x - 32) * 5 / 9``.
    """
    degrees_above_freezing = x - 32
    return degrees_above_freezing * 5 / 9
    

# Apply the conversion to the whole temperature column at once (vectorised).
temp_to_celcius(taxi['temp'])

0        -1.111111
1        -1.111111
2        -1.111111
3        -1.111111
4        -1.111111
           ...    
29096    23.888889
29097    23.888889
29098    23.888889
29099    23.888889
29100    23.888889
Name: temp, Length: 29101, dtype: float64