In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date
import plotly.graph_objects as go
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Both yearly extracts are read from the same machine-specific local folder.
data_dir = 'C:\\Users\\15164\\Desktop\\nyc-taxis-vs-covid\\data\\'
taxi_2019 = pd.read_csv(data_dir + 'taxi_2019.csv')
taxi_2020 = pd.read_csv(data_dir + 'taxi_2020.csv')

In [None]:
# Peek at the first two 2019 rows.
taxi_2019.iloc[:2]

In [None]:
# Peek at the first two 2020 rows.
taxi_2020.iloc[:2]

In [None]:
# Keep only the columns needed for the total-amount comparison.
amount_cols = ['total_amount', 'month', 'year']
amounts_19 = taxi_2019.loc[:, amount_cols]
amounts_20 = taxi_2020.loc[:, amount_cols]

In [None]:
# The comparison window starts in March (the pandemic began in March 2020),
# so January and February rows are filtered out of both years.
excluded_months = ['January', 'February']
amounts_19 = amounts_19[~amounts_19['month'].isin(excluded_months)]
amounts_20 = amounts_20[~amounts_20['month'].isin(excluded_months)]

In [None]:
# Average total amount per month for 2019, with 'month' as a regular column.
avg_monthly19 = amounts_19.groupby('month')['total_amount'].mean().reset_index()
avg_monthly19.head(10)

In [None]:
# Average total amount per month for 2020, with 'month' as a regular column.
avg_monthly20 = amounts_20.groupby('month')['total_amount'].mean().reset_index()
avg_monthly20.head(10)

In [None]:
# Calendar order for the x axis (the grouped frames come back sorted by month name).
month_order = ["March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

plt.figure(figsize=(10,7))
# Both years are drawn on the same axes; the 2020 bars are painted over the 2019 bars.
for monthly_avg, bar_color, bar_label in (
    (avg_monthly19, 'green', 'Pre Pandemic (2019)'),
    (avg_monthly20, 'yellow', 'Peak Pandemic (2020)'),
):
    sns.barplot(x=monthly_avg['month'], y=monthly_avg['total_amount'],
                color=bar_color, label=bar_label, order=month_order)

plt.title("The Difference in Total Amounts Paid Before & During Pandemic", fontsize=14)
plt.xlabel('Month', fontsize=12, color='blue')
plt.ylabel('Average Fare (Dollars)', fontsize=12, color='blue')
plt.xticks(rotation=25, horizontalalignment='right', fontsize=10)
plt.legend(fontsize=9);

In [None]:
# Line chart of the average total amount per month, 2019 vs 2020.
#
# The grouped frames are sorted alphabetically by month name, so plotting against
# their positional index and relabelling the ticks by hand is fragile: the label
# list must match the row order exactly, and a month missing from either year
# would silently shift every later label. Instead, look each month up by name
# and plot in calendar order.
month_order = ["March", "April", "May", "June", "July", "August",
               "September", "October", "November", "December"]

# reindex() aligns on the month name; a month absent from the data becomes NaN
# (a gap in the line) rather than mislabelling its neighbours.
monthly_amount19 = avg_monthly19.set_index('month')['total_amount'].reindex(month_order)
monthly_amount20 = avg_monthly20.set_index('month')['total_amount'].reindex(month_order)

fig = go.Figure()
fig.add_trace(go.Scatter(x=month_order, y=monthly_amount19,
                    mode='lines',
                    name='Pre-Pandemic'))
fig.add_trace(go.Scatter(x=month_order, y=monthly_amount20,
                    mode='lines',
                    name='Peak Pandemic'))
fig.update_layout(
    legend_title_text = "Year",
    # Pin the categorical axis to calendar order regardless of trace contents.
    xaxis = dict(
        categoryorder = 'array',
        categoryarray = month_order
    )
)

fig.update_xaxes(title_text="Month")
fig.update_yaxes(title_text="Average Fare Per Month")
fig.show()

In [None]:
# Mean of the monthly averages for each year; 2020 is printed first, then 2019.
avg_paid19 = avg_monthly19['total_amount'].mean()
avg_paid20 = avg_monthly20['total_amount'].mean()
print(avg_paid20, avg_paid19, sep='\n')

In [None]:
# Keep only the columns needed for the tip comparison.
tip_cols = ['tip_amount', 'month', 'year']
tips_19 = taxi_2019.loc[:, tip_cols]
tips_20 = taxi_2020.loc[:, tip_cols]

In [None]:
# Restrict both years to March onward by filtering out January and February rows.
excluded_months = ['January', 'February']
tips_19 = tips_19[~tips_19['month'].isin(excluded_months)]
tips_20 = tips_20[~tips_20['month'].isin(excluded_months)]

In [None]:
# Average tip per month for 2019, with 'month' as a regular column.
avg_tips19 = tips_19.groupby('month')['tip_amount'].mean().reset_index()
avg_tips19.head(10)

In [None]:
# Average tip per month for 2020, with 'month' as a regular column.
avg_tips20 = tips_20.groupby('month')['tip_amount'].mean().reset_index()
avg_tips20.head(10)

In [None]:
# Line chart of the average tip per month, 2019 vs 2020.
#
# The grouped frames are sorted alphabetically by month name, so plotting against
# their positional index and relabelling the ticks by hand is fragile: the label
# list must match the row order exactly, and a month missing from either year
# would silently shift every later label. Instead, look each month up by name
# and plot in calendar order.
month_order = ["March", "April", "May", "June", "July", "August",
               "September", "October", "November", "December"]

# reindex() aligns on the month name; a month absent from the data becomes NaN
# (a gap in the line) rather than mislabelling its neighbours.
monthly_tip19 = avg_tips19.set_index('month')['tip_amount'].reindex(month_order)
monthly_tip20 = avg_tips20.set_index('month')['tip_amount'].reindex(month_order)

fig = go.Figure()
fig.add_trace(go.Scatter(x=month_order, y=monthly_tip19,
                    mode='lines',
                    name='Pre-Pandemic'))
fig.add_trace(go.Scatter(x=month_order, y=monthly_tip20,
                    mode='lines',
                    name='Peak Pandemic'))
fig.update_layout(
    legend_title_text = "Year",
    # Pin the categorical axis to calendar order regardless of trace contents.
    xaxis = dict(
        categoryorder = 'array',
        categoryarray = month_order
    )
)

fig.update_xaxes(title_text="Month")
fig.update_yaxes(title_text="Average Tip Per Month")
fig.show()

In [None]:
# Mean of the monthly tip averages for each year; 2020 is printed first, then 2019.
tips19 = avg_tips19['tip_amount'].mean()
tips20 = avg_tips20['tip_amount'].mean()
print(tips20, tips19, sep='\n')

In [None]:
# Keep only the columns needed for the passenger-count comparison.
passenger_cols = ['passenger_count', 'month', 'year']
passengers_19 = taxi_2019.loc[:, passenger_cols]
passengers_20 = taxi_2020.loc[:, passenger_cols]

In [None]:
# Restrict both years to March onward by filtering out January and February rows.
excluded_months = ['January', 'February']
passengers_19 = passengers_19[~passengers_19['month'].isin(excluded_months)]
passengers_20 = passengers_20[~passengers_20['month'].isin(excluded_months)]

In [None]:
# Average passenger count per month for 2019, with 'month' as a regular column.
avg_pass_19 = passengers_19.groupby('month')['passenger_count'].mean().reset_index()
avg_pass_19.head(10)

In [None]:
# Average passenger count per month for 2020, with 'month' as a regular column.
avg_pass_20 = passengers_20.groupby('month')['passenger_count'].mean().reset_index()
avg_pass_20.head(10)

In [None]:
# Bar chart of the average passenger count per month, 2019 vs 2020.
# Calendar order for the x axis (the grouped frames come back sorted by month name).
month_order = ["March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

plt.figure(figsize=(10,7))
# Both years share one set of axes; the 2020 bars are drawn over the 2019 bars.
for monthly_avg, bar_color, bar_label in (
    (avg_pass_19, 'green', 'Pre Pandemic (2019)'),
    (avg_pass_20, 'yellow', 'Peak Pandemic (2020)'),
):
    sns.barplot(x=monthly_avg['month'], y=monthly_avg['passenger_count'],
                color=bar_color, label=bar_label, order=month_order)

plt.xlabel('Month', fontsize=12, color='blue')
plt.title("The Difference in Passenger Count Before & During Pandemic", fontsize=14)
# The plotted quantity is a passenger count, not a dollar fare.
plt.ylabel('Average Passenger Count', fontsize=12, color='blue')
plt.xticks(rotation=25, horizontalalignment='right', fontsize=10)
plt.legend(fontsize=9);

In [None]:
# Mean of the monthly passenger averages for each year; 2020 is printed first, then 2019.
pass19 = avg_pass_19['passenger_count'].mean()
pass20 = avg_pass_20['passenger_count'].mean()
print(pass20, pass19, sep='\n')

In [None]:
# Rank all 2019 trips from highest to lowest total amount and show the top row.
most_expensive = taxi_2019.sort_values(by='total_amount', ascending=False)
most_expensive.iloc[:1]

In [None]:
# Keep only trips whose payment_type code is 1.0 or 2.0, then show the top row.
paid_codes = [1.0, 2.0]
is_paid_trip = most_expensive['payment_type'].isin(paid_codes)
most_expensive = most_expensive[is_paid_trip]
most_expensive.iloc[:1]

In [None]:
# Rank all 2019 trips from lowest to highest total amount and show the first row.
least_expensive = taxi_2019.sort_values(by='total_amount')
least_expensive.iloc[:1]

In [None]:
# Rank all 2020 trips from highest to lowest total amount (reuses the
# `most_expensive` name, replacing the 2019 ranking) and show the top row.
most_expensive = taxi_2020.sort_values(by='total_amount', ascending=False)
most_expensive.iloc[:1]

In [None]:
# Keep only trips whose payment_type code is 1.0 or 2.0, then show the top row.
paid_codes = [1.0, 2.0]
is_paid_trip = most_expensive['payment_type'].isin(paid_codes)
most_expensive2 = most_expensive[is_paid_trip]
most_expensive2.iloc[:1]

In [None]:
# Rank all 2020 trips from lowest to highest total amount and show the first row.
least_expensive = taxi_2020.sort_values(by='total_amount')
least_expensive.iloc[:1]

In [None]:
# Narrow the ranking to trips that (a) end in a different location ID than they
# started and (b) carry payment_type code 1.0 or 2.0, then show the first row.
changed_zone = least_expensive['PULocationID'] != least_expensive['DOLocationID']
is_paid_trip = least_expensive['payment_type'].isin([1.0, 2.0])
least_expensive = least_expensive[changed_zone & is_paid_trip]
least_expensive.iloc[:1]

In [None]:
# Count how many 2020 trips used each payment_type code.
payment_type_count = taxi_2020['payment_type'].value_counts()
# Name the code column and the count column explicitly instead of renaming the
# defaults produced by reset_index(): those default names ('index'/'payment_type'
# vs 'payment_type'/'count') differ between pandas versions, and relying on them
# can leave the frame without a 'Payment Type' column.
payment_types = (
    payment_type_count
    .rename_axis('Payment Type')
    .reset_index(name='Count')
)
payment_types.head(6)

In [None]:
# Lookup table from numeric payment code to its description, indexed by code.
data = {
    'Code': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    'Payment': ['Credit Card', 'Cash', 'No Charge', 'Dispute', 'Unknown', 'Voided Trip'],
}
data_dict = pd.DataFrame(data).set_index('Code')
data_dict.head(6)

In [None]:
# Text form of each payment code -> label shown on the chart.
payment_labels = {
    '1.0': 'CreditCard',
    '2.0': 'Cash',
    '3.0': 'NoCharge',
    '4.0': 'Dispute',
    '5.0': 'Unknown',
    '6.0': 'VoidedTrip',
}
# Cast the codes to strings first so they can be matched against the keys above.
payment_types['Payment Type'] = payment_types['Payment Type'].astype(str).replace(payment_labels)
payment_types.head()

In [None]:
# Count how many 2019 trips used each payment_type code.
payment_type_count = taxi_2019['payment_type'].value_counts()
# Name the code column and the count column explicitly instead of renaming the
# defaults produced by reset_index(): those default names differ between pandas
# versions, and relying on them can leave the frame without a 'Payment Type' column.
payment_types_pre = (
    payment_type_count
    .rename_axis('Payment Type')
    .reset_index(name='Count')
)
# Cast the codes to strings so they can be matched against the '1.0'-style keys below,
# then swap each code for its readable label.
payment_types_pre['Payment Type']= payment_types_pre['Payment Type'].astype(str)
payment_types_pre['Payment Type']= payment_types_pre['Payment Type'].replace({'1.0': 'CreditCard', '2.0': 'Cash',
                                                                      '3.0': 'NoCharge','4.0': 'Dispute', 
                                                                      '5.0': 'Unknown', '6.0': 'VoidedTrip'})
payment_types_pre.head()

In [None]:
plt.figure(figsize=(15,12))
# Both periods share one set of axes; the pandemic bars are drawn over the pre-pandemic bars.
for payment_counts, bar_color, bar_label in (
    (payment_types_pre, 'green', 'Pre Pandemic'),
    (payment_types, 'yellow', 'Peak Pandemic'),
):
    sns.barplot(x=payment_counts['Payment Type'], y=payment_counts['Count'],
                color=bar_color, label=bar_label)

plt.title("Payment Types Before and During the Pandemic", fontsize=14)
plt.xlabel('Payment Type', fontsize=12, color='blue')
plt.ylabel('Count (Millions)', fontsize=12, color='blue')
plt.xticks(rotation=25, horizontalalignment='right', fontsize=10)
plt.legend(fontsize=12);

In [None]:
# Day-of-week and fare columns from 2019, renamed for display.
days_fares = taxi_2019[['day_of_week', 'fare_amount']].rename(
    columns={'day_of_week': 'DayOfWeek', 'fare_amount': 'Fare'}
)
# Average fare per day of the week, ordered from the highest average to the lowest.
day_fare = (
    days_fares.groupby('DayOfWeek')['Fare']
    .mean()
    .to_frame()
    .sort_values('Fare', ascending=False)
    .reset_index()
)
day_fare.head(7)

In [None]:
# Bars are laid out Sunday through Saturday rather than in day_fare's fare-sorted row order.
weekday_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

plt.figure(figsize=(14,7))
ax1 = sns.barplot(data=day_fare, x='DayOfWeek', y='Fare',
                  order=weekday_order, palette='viridis')
plt.title('Most Expensive Day to Travel 2019', fontsize=14)
plt.xlabel('Day of Week', fontsize=12)
plt.ylabel('Average Fare Amount (Dollars)', fontsize=12)
plt.show()

In [None]:
# Day-of-week and fare columns from 2020, renamed for display.
days_fares = taxi_2020[['day_of_week', 'fare_amount']].rename(
    columns={'day_of_week': 'DayOfWeek', 'fare_amount': 'Fare'}
)
# Average fare per day of the week, ordered from the highest average to the lowest.
day_fare2 = (
    days_fares.groupby('DayOfWeek')['Fare']
    .mean()
    .to_frame()
    .sort_values('Fare', ascending=False)
    .reset_index()
)
day_fare2.head()

In [None]:
plt.figure(figsize=(14,7))
# No explicit `order` is passed here, unlike the 2019 chart.
ax1 = sns.barplot(data=day_fare2, x='DayOfWeek', y='Fare', palette='viridis')
plt.title('Most Expensive Day to Travel 2020', fontsize=14)
plt.xlabel('Day of Week', fontsize=12)
plt.ylabel('Average Fare Amount (Dollars)', fontsize=12)
plt.show()

In [None]:
# Shared Sunday-to-Saturday layout for both years.
weekday_order = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

plt.figure(figsize=(10,7))
# Both years share one set of axes; the pandemic bars are drawn over the pre-pandemic bars.
for daily_avg, bar_color, bar_label in (
    (day_fare, 'green', 'Pre Pandemic'),
    (day_fare2, 'yellow', 'Peak Pandemic'),
):
    sns.barplot(x=daily_avg['DayOfWeek'], y=daily_avg['Fare'],
                color=bar_color, label=bar_label, order=weekday_order)

plt.title("Most Expensive Travel Day Before and During the Pandemic", fontsize=14)
plt.xlabel('Day of Week', fontsize=12, color='blue')
plt.ylabel('Average Fare (Dollars)', fontsize=12, color='blue')
plt.xticks(rotation=25, horizontalalignment='right', fontsize=10)
plt.legend(fontsize=9);

In [None]:
# 2019 mean of the per-day average fares minus the 2020 mean.
mean_fare_2019 = day_fare['Fare'].mean()
mean_fare_2020 = day_fare2['Fare'].mean()
mean_fare_2019 - mean_fare_2020