# Data Cleaning

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

warnings.filterwarnings("ignore")

To begin, I will first load the data of each month into a separate data frame and later on merge them all into two separate frames (one for 2019 and one for 2020) for easier cleaning. 

Since March 8, 2020 is widely viewed as the start of the pandemic -- this is when Italy first went into lockdown-- I will filter March's dataframe to reflect the data starting from the 8th.

In [2]:
march = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\march_2020.csv')
march = march.loc[(pd.DatetimeIndex(march["tpep_pickup_datetime"]).month == 3) & 
              (pd.DatetimeIndex(march["tpep_pickup_datetime"]).day >=8)]

**Note:** As seen above, according to our data dictionary, **tpep_pickup_datetime** means the date and time when the [taxi] meter was engaged.

A quick check:

In [3]:
march.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1497226,2.0,2020-03-08 00:00:33,2020-03-08 00:11:14,2.0,1.06,1.0,N,148,4,1.0,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5
1497353,2.0,2020-03-08 00:02:10,2020-03-08 00:06:24,1.0,0.73,1.0,N,74,41,2.0,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0
1497609,2.0,2020-03-08 00:01:34,2020-03-08 00:04:34,1.0,0.48,1.0,N,236,262,1.0,4.0,0.5,0.5,1.0,0.0,0.3,8.8,2.5
1498278,2.0,2020-03-08 00:00:21,2020-03-08 00:25:46,2.0,18.62,2.0,N,132,42,1.0,52.0,0.0,0.5,11.78,6.12,0.3,70.7,0.0
1498328,2.0,2020-03-08 00:00:10,2020-03-08 00:06:23,2.0,1.3,1.0,N,239,163,1.0,7.0,0.5,0.5,2.16,0.0,0.3,12.96,2.5


In [4]:
april = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\april_2020.csv')
may = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\may_2020.csv')
june = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\june_2020.csv')
july = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\july_2020.csv')
august = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\august_2020.csv')
september = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\september_2020.csv')
october = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\october_2020.csv')
november = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\november_2020.csv')
december = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\december_2020.csv')

In [5]:
year_20= pd.concat([march, april, may, june, july, august, september, october, november, december])

A quick check of our full dataframe:

In [6]:
year_20.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1497226,2.0,2020-03-08 00:00:33,2020-03-08 00:11:14,2.0,1.06,1.0,N,148,4,1.0,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5
1497353,2.0,2020-03-08 00:02:10,2020-03-08 00:06:24,1.0,0.73,1.0,N,74,41,2.0,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0
1497609,2.0,2020-03-08 00:01:34,2020-03-08 00:04:34,1.0,0.48,1.0,N,236,262,1.0,4.0,0.5,0.5,1.0,0.0,0.3,8.8,2.5
1498278,2.0,2020-03-08 00:00:21,2020-03-08 00:25:46,2.0,18.62,2.0,N,132,42,1.0,52.0,0.0,0.5,11.78,6.12,0.3,70.7,0.0
1498328,2.0,2020-03-08 00:00:10,2020-03-08 00:06:23,2.0,1.3,1.0,N,239,163,1.0,7.0,0.5,0.5,2.16,0.0,0.3,12.96,2.5


In [7]:
len(year_20)

10421450

Now I will check for the percentages of null values, if any, and decide how to handle accordingly.

In [8]:
null_values = year_20.isna().sum()
null_percentage = null_values[null_values>0] / len(year_20)
null_percentage.to_frame('% Null')

Unnamed: 0,% Null
VendorID,0.06547
passenger_count,0.06547
RatecodeID,0.06547
store_and_fwd_flag,0.06547
payment_type,0.06547


Since the null values are very low (much less than half a percentage point), I've decided to remove all the null values to avoid the possibilty of being thrown an error (should I leave them), or having incorrect results if I were to replace them (with a mean value for example). I've done so because the **payment_type** column is vital to a question I want to answer, so I want to produce as accurate results as possible in my analysis.

I will drop all of the NaN/Null values and perform a quick check of the beginning and end of the dataframe:

In [9]:
year_20.dropna(inplace=True)
year_20.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1497226,2.0,2020-03-08 00:00:33,2020-03-08 00:11:14,2.0,1.06,1.0,N,148,4,1.0,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5
1497353,2.0,2020-03-08 00:02:10,2020-03-08 00:06:24,1.0,0.73,1.0,N,74,41,2.0,5.0,0.5,0.5,0.0,0.0,0.3,6.3,0.0
1497609,2.0,2020-03-08 00:01:34,2020-03-08 00:04:34,1.0,0.48,1.0,N,236,262,1.0,4.0,0.5,0.5,1.0,0.0,0.3,8.8,2.5
1498278,2.0,2020-03-08 00:00:21,2020-03-08 00:25:46,2.0,18.62,2.0,N,132,42,1.0,52.0,0.0,0.5,11.78,6.12,0.3,70.7,0.0
1498328,2.0,2020-03-08 00:00:10,2020-03-08 00:06:23,2.0,1.3,1.0,N,239,163,1.0,7.0,0.5,0.5,2.16,0.0,0.3,12.96,2.5


In [38]:
year_20.tail()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
1362436,2.0,2020-12-31 23:05:33,2020-12-31 23:31:36,1.0,11.3,1.0,N,107,242,2.0,33.0,0.5,0.5,0.0,0.0,0.3,36.8,2.5
1362437,2.0,2020-12-31 22:57:20,2020-12-31 23:05:33,1.0,2.18,1.0,N,236,161,1.0,9.0,0.5,0.5,2.56,0.0,0.3,15.36,2.5
1362438,2.0,2020-12-31 23:40:35,2020-12-31 23:48:43,1.0,2.52,1.0,N,236,24,1.0,9.5,0.5,0.5,4.0,0.0,0.3,17.3,2.5
1362439,2.0,2020-12-31 23:54:57,2020-12-31 23:57:39,1.0,0.59,1.0,N,238,239,1.0,4.5,0.5,0.5,2.08,0.0,0.3,10.38,2.5
1362440,2.0,2020-12-31 23:11:16,2020-12-31 23:24:08,1.0,6.06,1.0,N,75,169,2.0,18.5,0.5,0.5,0.0,0.0,0.3,19.8,0.0


A quick check to be sure all null values were dropped and to see how many rows were removed:

In [10]:
null_values = year_20.isna().sum()
null_percentage = null_values[null_values>0] / len(year_20)
null_percentage.to_frame('% Null')

Unnamed: 0,% Null


In [11]:
len(year_20)

9739158

In [12]:
10421450 - 9739158

682292

So, out of over 10 million entries, only 68,000 contained null values and were subsequently removed.

Now, I will repeat the above steps for the data from 2019, or pre-covid.

In [13]:
march = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\march_2019.csv')
#To achieve a balanced analysis, I will filter the 2019 March data to reflect the same time period as 2020's (starting
#from the 8th)
march = march.loc[(pd.DatetimeIndex(march["tpep_pickup_datetime"]).month == 3) & 
              (pd.DatetimeIndex(march["tpep_pickup_datetime"]).day >=8)]
april = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\april_2019.csv')
may = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\may_2019.csv')
june = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\june_2019.csv')
july = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\july_2019.csv')
august = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\august_2019.csv')
september = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\september_2019.csv')
october = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\october_2019.csv')
november = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\november_2019.csv')
december = pd.read_csv('C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\december_2019.csv')

In [14]:
year_19= pd.concat([march, april, may, june, july, august, september, october, november, december])

In [15]:
year_19.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
130887,2.0,2019-03-22 08:22:39,2019-03-22 08:32:50,1.0,1.45,1.0,N,170,107,1.0,8.5,0.0,0.5,2.95,0.0,0.3,14.75,2.5
160292,2.0,2019-03-21 15:31:46,2019-03-21 15:55:19,1.0,2.0,1.0,N,186,163,1.0,15.0,1.0,0.5,1.5,0.0,0.3,20.8,2.5
160293,2.0,2019-03-21 16:03:03,2019-03-21 16:15:59,1.0,1.36,1.0,N,161,237,1.0,9.5,1.0,0.5,2.76,0.0,0.3,16.56,2.5
160294,2.0,2019-03-21 16:18:06,2019-03-21 23:27:41,1.0,0.97,1.0,N,141,236,1.0,7.5,1.0,0.5,1.5,0.0,0.3,13.3,2.5
1612737,2.0,2019-03-08 00:15:18,2019-03-08 00:22:43,6.0,1.25,1.0,N,249,79,1.0,7.0,0.0,0.5,1.0,0.0,0.3,11.3,2.5


In [16]:
len(year_19)

67904737

*Note: As we can already see, there is a **significant** difference in the amount of taxi trips between 2019 and 2020!*

In [17]:
null_values = year_19.isna().sum()
null_percentage = null_values[null_values>0] / len(year_19)
null_percentage.to_frame('% Null')

Unnamed: 0,% Null
VendorID,0.003631573
passenger_count,0.003631573
RatecodeID,0.003631573
store_and_fwd_flag,0.003631573
payment_type,0.003631573
congestion_surcharge,4.417954e-08


In [18]:
year_19.dropna(inplace=True)
len(year_19)

67658133

In [20]:
print(67904737 - 67658133)
print((67904737 - 67658133) / 67904737 * 100)

246604
0.36316170402073716


After removing the null values, I've dropped 246,604 rows or 0.36% of the data.

# Adding the 'Day of the Week' Column

Another question I seek to answer is what is the most expensive day of the week to travel by taxi and in order to do so, I first have to extract the name of the day from the dates provided.

I begin by first converting the "tpep_pickup_datetime" (according to our data dictionary, this column contains the date and time the ride began) column from an object to a datetimelike type to apply the **dt** accessor.

In [33]:
year_20["tpep_pickup_datetime"] = pd.to_datetime(year_20["tpep_pickup_datetime"],format='%Y-%m-%d %H:%M:%S')

Now I'll check that my code was successful:

In [34]:
year_20.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9739158 entries, 1497226 to 1362440
Data columns (total 18 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               float64       
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  object        
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           float64       
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
dtypes: datetime64[ns

And now I will extract the days of the week from the dates and assign them to a new column.

In [40]:
year_20['day_of_week'] = year_20["tpep_pickup_datetime"].dt.day_name()

In [42]:
year_20['day_of_week'].value_counts()

Wednesday    1565440
Tuesday      1541946
Thursday     1523848
Friday       1467141
Monday       1410616
Saturday     1173838
Sunday       1056329
Name: day_of_week, dtype: int64

Finally, I will repeat the above steps for my prepandemic timeline, the year 2019.

In [43]:
year_19["tpep_pickup_datetime"] = pd.to_datetime(year_19["tpep_pickup_datetime"],format='%Y-%m-%d %H:%M:%S')
year_19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67658133 entries, 130887 to 6845298
Data columns (total 18 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               float64       
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  object        
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           float64       
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
dtypes: datetime64[ns

In [44]:
year_19['day_of_week'] = year_19["tpep_pickup_datetime"].dt.day_name()
year_19['day_of_week'].value_counts()

Friday       10431710
Thursday     10307017
Wednesday     9988358
Tuesday       9857206
Saturday      9737490
Monday        8893041
Sunday        8443311
Name: day_of_week, dtype: int64

# Defining the Zones

I'd also like to answer the question of between which zones the most expensive ride occured. The original datasets contain the 'PULocationID' and 'DOLocationID' columns, or pick up location ID and drop-off location ID repectively. I need to then access my taxi zone lookup table to understand what each ID means.

In [6]:
taxi_zones = pd.read_csv('C:\\Users\\15164\\Downloads\\taxi+_zone_lookup.csv')
taxi_zones.head(2)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone


I'll now filter the 'Zone' column and add to my tables on the location ID columns. I will then add two columns "Start_Zone" and "End_Zone" to my dataframe by renaming my added "Zone" columns.

In [7]:
taxi_zones = taxi_zones[['Zone']]
taxi_zones.head(2)

Unnamed: 0,Zone
0,Newark Airport
1,Jamaica Bay


In [49]:
year_20 = year_20.join(taxi_zones, on='PULocationID')
year_20 = year_20.rename(columns={'Zone': 'Start_Zone'})

In [53]:
year_20 = year_20.join(taxi_zones, on='DOLocationID')
year_20 = year_20.rename(columns={'Zone': 'End_Zone'})
year_20.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,day_of_week,Start_Zone,End_Zone
1497226,2.0,2020-03-08 00:00:33,2020-03-08 00:11:14,2.0,1.06,1.0,N,148,4,1.0,...,0.5,0.5,2.36,0.0,0.3,14.16,2.5,Sunday,Madison,Arden Heights
1497353,2.0,2020-03-08 00:02:10,2020-03-08 00:06:24,1.0,0.73,1.0,N,74,41,2.0,...,0.5,0.5,0.0,0.0,0.3,6.3,0.0,Sunday,East Harlem South,Central Harlem North
1497609,2.0,2020-03-08 00:01:34,2020-03-08 00:04:34,1.0,0.48,1.0,N,236,262,1.0,...,0.5,0.5,1.0,0.0,0.3,8.8,2.5,Sunday,Upper East Side South,Yorkville West
1498278,2.0,2020-03-08 00:00:21,2020-03-08 00:25:46,2.0,18.62,2.0,N,132,42,1.0,...,0.0,0.5,11.78,6.12,0.3,70.7,0.0,Sunday,Kensington,Central Park
1498328,2.0,2020-03-08 00:00:10,2020-03-08 00:06:23,2.0,1.3,1.0,N,239,163,1.0,...,0.5,0.5,2.16,0.0,0.3,12.96,2.5,Sunday,Van Cortlandt Park,Midtown South


I've successfully defined the start and end zones of each trip for the year 2020 and I will now drop the columns I won't be needing for my analysis (to preserve memory). Then, I'll repeat the above steps for 2019.

In [17]:
year_20.drop(columns=['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'congestion_surcharge', 
                      'improvement_surcharge', 'extra', 'mta_tax'], inplace=True)
year_20.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,tip_amount,tolls_amount,total_amount,day_of_week,Start_Zone,End_Zone
1497226,2020-03-08 00:00:33,2020-03-08 00:11:14,2.0,1.06,148,4,1.0,8.0,2.36,0.0,14.16,Sunday,Madison,Arden Heights
1497353,2020-03-08 00:02:10,2020-03-08 00:06:24,1.0,0.73,74,41,2.0,5.0,0.0,0.0,6.3,Sunday,East Harlem South,Central Harlem North
1497609,2020-03-08 00:01:34,2020-03-08 00:04:34,1.0,0.48,236,262,1.0,4.0,1.0,0.0,8.8,Sunday,Upper East Side South,Yorkville West
1498278,2020-03-08 00:00:21,2020-03-08 00:25:46,2.0,18.62,132,42,1.0,52.0,11.78,6.12,70.7,Sunday,Kensington,Central Park
1498328,2020-03-08 00:00:10,2020-03-08 00:06:23,2.0,1.3,239,163,1.0,7.0,2.16,0.0,12.96,Sunday,Van Cortlandt Park,Midtown South


### Year 2019:

In [None]:
year_19 = year_19.join(taxi_zones, on='PULocationID')
year_19 = year_19.rename(columns={'Zone': 'Start_Zone'})

In [None]:
year_19 = year_19.join(taxi_zones, on='DOLocationID')
year_19 = year_19.rename(columns={'Zone': 'End_Zone'})

In [1]:
year_19.drop(columns=['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'congestion_surcharge', 
                      'improvement_surcharge', 'extra', 'mta_tax'], inplace=True)
year_19.head()

NameError: name 'year_19' is not defined

After researching on the NYC.gov website, I've come across the following:

1- The maximum allowed passengers in a taxi is 5 (in a 5 person car), except that an additional 6th passenger may be accepted if such passenger is under the age of seven and is held on the lap of an adult passenger seated in the rear.

2- Every ride has a **minimum** base fare of $2.50.

Based on this information, I will eliminate any rides having less than 1 passenger and more than 6, as well as trips with a fare amount less than $2.50. This will allow us to have as accurate results as possible in our analysis.

In [3]:
year_19 = year_19[(year_19.passenger_count > 0) & (year_19.passenger_count < 6)]
year_19['passenger_count'].value_counts()

1.0    46804352
2.0    10225236
3.0     2868907
5.0     2660401
4.0     1384776
Name: passenger_count, dtype: int64

In [4]:
year_20 = year_20[(year_20.passenger_count > 0) & (year_20.passenger_count < 6)]
year_20['passenger_count'].value_counts()

1.0    7283260
2.0    1244558
3.0     320379
5.0     253864
4.0     123437
Name: passenger_count, dtype: int64

In [5]:
year_19 = year_19[(year_19.fare_amount > 2.50)]
year_19['fare_amount'].value_counts()

6.00      3072133
6.50      3056918
7.00      3007283
5.50      2953877
7.50      2888747
           ...   
8.39            1
85.56           1
85.69           1
273.00          1
47.78           1
Name: fare_amount, Length: 7350, dtype: int64

In [6]:
year_20 = year_20[(year_20.fare_amount > 2.50)]
year_20['fare_amount'].value_counts()

6.00     526457
5.50     516398
6.50     510421
7.00     490163
5.00     485401
          ...  
11.83         1
70.35         1
65.90         1
11.42         1
6.88          1
Name: fare_amount, Length: 3910, dtype: int64

And lastly, I will drop any trips with a trip_distance of 0 since this indicates that no trip was actually taken.

In [4]:
#Checking how many trip distances are 0:
year_19[year_19.trip_distance == 0.0].shape[0]

280181

In [5]:
year_20[year_20.trip_distance == 0.0].shape[0]

117664

In [6]:
year_19 = year_19[year_19.trip_distance > 0.0]
year_20 = year_20[year_20.trip_distance > 0.0]

In [10]:
year_19.trip_distance.value_counts()

0.90     1204091
0.80     1195947
1.00     1187585
1.10     1137221
0.70     1133183
          ...   
40.37          1
40.13          1
40.12          1
41.38          1
40.49          1
Name: trip_distance, Length: 4759, dtype: int64

I will now add a 'month' column and a 'year' column for later use in visualizations.

In [5]:
year_20["tpep_pickup_datetime"] = pd.to_datetime(year_20["tpep_pickup_datetime"],format='%Y-%m-%d %H:%M:%S')
year_19["tpep_pickup_datetime"] = pd.to_datetime(year_19["tpep_pickup_datetime"],format='%Y-%m-%d %H:%M:%S')

In [6]:
year_20['month'] = year_20["tpep_pickup_datetime"].dt.month_name()
year_19['month'] = year_19["tpep_pickup_datetime"].dt.month_name()

In [8]:
year_20['year'] = year_20["tpep_pickup_datetime"].dt.year
year_19['year'] = year_19["tpep_pickup_datetime"].dt.year

In [9]:
year_20.head(2)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,tip_amount,tolls_amount,total_amount,day_of_week,Start_Zone,End_Zone,month,year
0,2020-03-08 00:00:33,2020-03-08 00:11:14,2.0,1.06,148,4,1.0,8.0,2.36,0.0,14.16,Sunday,Madison,Arden Heights,March,2020
1,2020-03-08 00:02:10,2020-03-08 00:06:24,1.0,0.73,74,41,2.0,5.0,0.0,0.0,6.3,Sunday,East Harlem South,Central Harlem North,March,2020


In [10]:
year_19.head(2)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,fare_amount,tip_amount,tolls_amount,total_amount,day_of_week,Start_Zone,End_Zone,month,year
0,2019-03-22 08:22:39,2019-03-22 08:32:50,1.0,1.45,170,107,1.0,8.5,2.95,0.0,14.75,Friday,Murray Hill-Queens,Gravesend,March,2019
1,2019-03-21 15:31:46,2019-03-21 15:55:19,1.0,2.0,186,163,1.0,15.0,1.5,0.0,20.8,Thursday,Port Richmond,Midtown South,March,2019


Now that I've cleaned my datasets, I will drop any nulls and save them both.

In [11]:
year_19.dropna(inplace=True)
year_20.dropna(inplace=True)

In [11]:
year_19.to_csv("taxi_2019.csv", index = False)

In [12]:
year_20.to_csv("taxi_2020.csv", index = False)