In [None]:
#Importing necessary libraries
import pandas as pd
import numpy as np

In [None]:
# Loading the training set: the zipped CSV path is handed straight to read_csv.
# The path is relative to the notebook's working directory.
nyc_data = pd.read_csv(r"../input/nyc-taxi-trip-duration/train.zip")

In [None]:
# Viewing the first rows of the dataset as a sanity check on the load
nyc_data.head()

In [None]:
# Viewing the last rows of the dataset
nyc_data.tail()

In [None]:
# Getting the dimensions of the dataset as (rows, columns)
nyc_data.shape

In [None]:
# Getting summary statistics of the dataset
nyc_data.describe()

In [None]:
# Counting the missing values (NAs) in each column
nyc_data.isna().sum()

- This is great since there are no missing values 

In [None]:
# Checking the period over which the dataset was collected (earliest and latest pickup).
# NOTE(review): this runs before pickup_datetime is converted with pd.to_datetime,
# so the values are presumably still text and min()/max() compare them as strings.
# That agrees with chronological order only because the timestamps are laid out as
# '%Y-%m-%d %H:%M:%S' (the format used by the conversion cell further down).
nyc_data['pickup_datetime'].min(), nyc_data['pickup_datetime'].max()

In [None]:
# Same check for the drop-off timestamps (earliest and latest drop-off)
nyc_data['dropoff_datetime'].min(), nyc_data['dropoff_datetime'].max()

- This shows that the data was collected from January 1, 2016 to May 31, 2016 - a five-month period

In [None]:
# Checking the data type pandas assigned to each column
nyc_data.dtypes

- From the output, we need to do a few transformations. 
     * vendor_id should be categorical
     * pickup and drop off time should be a datetime object
     * store_and_fwd_flag should be categorical

In [None]:
# Casting vendor_id and store_and_fwd_flag to the categorical data type
for categorical_column in ('vendor_id', 'store_and_fwd_flag'):
    nyc_data[categorical_column] = nyc_data[categorical_column].astype('category')

In [None]:
# Parsing the pickup and drop-off timestamps into datetime objects.
# Both columns share the same explicit layout, so it is spelled out once.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
for timestamp_column in ('pickup_datetime', 'dropoff_datetime'):
    nyc_data[timestamp_column] = pd.to_datetime(nyc_data[timestamp_column], format=DATETIME_FORMAT)

In [None]:
# Checking the data types again to confirm the category/datetime conversions took effect
nyc_data.dtypes

In [None]:
# Creating a pickup_hour column (hour component of the pickup timestamp).
# This will help determine during which hours the taxi business is busy.
nyc_data['pickup_hour'] = nyc_data['pickup_datetime'].dt.hour

In [None]:
# Inspecting the last rows to check the new pickup_hour column
nyc_data.tail()

In [None]:
# Classifying the pickup hour into six 4-hour bands:
#   0-3   Late Night       4-7   Early Morning    8-11  Morning
#   12-15 Afternoon        16-19 Evening          20-23 Night
# (hour % 24 + 4) // 4 yields the band number 1..6. It is evaluated on the whole
# column at once rather than through a per-row Python lambda.
time_of_day_labels = {1: 'Late Night',
                      2: 'Early Morning',
                      3: 'Morning',
                      4: 'Afternoon',
                      5: 'Evening',
                      6: 'Night'}
nyc_data['pickup_time_of_the_day'] = ((nyc_data['pickup_hour'] % 24 + 4) // 4).map(time_of_day_labels)

In [None]:
# Inspecting the last rows to check the new pickup_time_of_the_day column
nyc_data.tail()

**Let's check what times of the day are busy**

---

In [None]:
# importing required libraries
import seaborn as sns
sns.set()
sns.set(style="darkgrid")

# importing matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Number of trips in each time-of-day band, with the bars placed in an explicit order
time_of_day_order = ['Early Morning', 'Morning', 'Afternoon', 'Evening', 'Night', 'Late Night']
ax = sns.countplot(data=nyc_data, x='pickup_time_of_the_day', order=time_of_day_order)
ax.set_title("Trip frequency per the time of the day")
ax.set_xlabel("Time of the Day")
ax.set_ylabel("Frequency");

- From the output, it is clear that the taxi business is busiest in the afternoon, evening, and night

**Let's check which days of the week are busy**

In [None]:
# Getting the name of the weekday ('Monday' ... 'Sunday') of each pickup.
# The .dt accessor computes this for the whole datetime column at once,
# replacing a Python-level lambda call per row.
nyc_data['pickup_day_of_the_week'] = nyc_data['pickup_datetime'].dt.day_name()

In [None]:
# Number of trips on each weekday, with the bars ordered Monday through Sunday
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
ax = sns.countplot(data=nyc_data, x='pickup_day_of_the_week', order=weekday_order)
ax.set_title("Trip frequency per the day of the week")
ax.set_xlabel("Day of the Week")
ax.set_ylabel("Frequency");

- From the output, it can be seen that the busiest days of the week are Thursday, Friday, and Saturday

**Checking which vendor is more popular**

In [None]:
# Number of trips recorded for each vendor
ax = sns.countplot(data=nyc_data, x='vendor_id')
ax.set_title("Vendor Popularity")
ax.set_xlabel("Vendor ID")
ax.set_ylabel("Frequency");

- Vendor 2 is more popular than vendor 1 in NYC.

**Why is vendor 2 more popular?**

In [None]:
# Number of trips for each vendor, split by passenger count
ax = sns.countplot(data=nyc_data, x='vendor_id', hue="passenger_count")
ax.set_title("Passenger count per vendor id")
ax.set_xlabel("Vendor ID")
ax.set_ylabel("Frequency");

- A possible reason for vendor 2's popularity is the type of car: vendor 2 seems to be more popular for trips with more than one passenger, while vendor 1 is more popular with single passengers.

In [None]:
# Bar height is seaborn's aggregate of trip_duration for each vendor
ax = sns.barplot(data=nyc_data, x="vendor_id", y="trip_duration")
ax.set_title("Average Trip Duration (In seconds)")
ax.set_xlabel("Vendor Id")
ax.set_ylabel("Trip Duration");

- The average trip duration of vendor 2 is greater than vendor 1

In [None]:
# Preparing to compute the distance between pickup and drop-off locations:
# pair each trip's coordinates into a (latitude, longitude) tuple, one tuple per row.
nyc_data['pickup_tuple'] = list(zip(nyc_data.pickup_latitude, nyc_data.pickup_longitude))
nyc_data['dropoff_tuple'] = list(zip(nyc_data.dropoff_latitude, nyc_data.dropoff_longitude))

In [None]:
# Importing the geodesic module from the library
from geopy.distance import geodesic

In [None]:
# Geodesic distance in kilometres between the pickup and drop-off point of every trip.
# geodesic() is called once per row, so this cell takes a while on the full dataset.
nyc_data['distance'] = [
    geodesic(pickup_point, dropoff_point).km
    for pickup_point, dropoff_point in zip(nyc_data['pickup_tuple'], nyc_data['dropoff_tuple'])
]

In [None]:
# Inspecting the first rows to check the new tuple and distance columns
nyc_data.head()

In [None]:
# Bar height is seaborn's aggregate of the trip distance for each vendor
ax = sns.barplot(data=nyc_data, x="vendor_id", y="distance")
ax.set_title("Average Trip Distance (in km)")
ax.set_xlabel("Vendor ID")
ax.set_ylabel("Trip Distance");

- The average trip distance of the two vendors is almost the same.

In [None]:
# Converting the trip duration from seconds to hours (3600 seconds per hour).
# The division is applied to the whole column at once instead of element by element.
nyc_data['trip_duration_in_hours'] = nyc_data['trip_duration'] / 3600

In [None]:
# Computing trip speeds in km/h: distance (km) divided by duration (hours).
# A column-wise division replaces the row-by-row apply(axis=1), which builds a
# Series object for every row and is far slower on a dataset of this size.
nyc_data['trip_speed'] = nyc_data['distance'] / nyc_data['trip_duration_in_hours']

In [None]:
# Mean trip speed (km/h) for each vendor
nyc_data.groupby("vendor_id").trip_speed.mean()

In [None]:
# Bar height is seaborn's aggregate of the trip speed for each vendor
ax = sns.barplot(data=nyc_data, x="vendor_id", y="trip_speed")
ax.set_title("Average Trip Speed (in km/h)")
ax.set_xlabel("Vendor ID")
ax.set_ylabel("Trip Speed");

- The average trip speed of both vendors is equal

In [None]:
# Encoding the pickup weekday as an integer - Monday (0) to Sunday (6) -
# so it can be used as a numeric model input
nyc_data['pickup_day_of_the_week_transformed'] = nyc_data['pickup_datetime'].dt.dayofweek 

In [None]:
# Encoding the pickup time-of-day label as an integer code (0..5).
# An explicit label -> code mapping is used rather than np.select: np.select falls
# back to a default of 0 for rows matching no condition, which is the same code as
# "Late Night" and would silently mislabel them. With .map, a label missing from
# the mapping becomes NaN and is therefore visible.
time_of_day_codes = {
    "Late Night": 0,
    "Early Morning": 1,
    "Morning": 2,
    "Afternoon": 3,
    "Evening": 4,
    "Night": 5,
}

nyc_data['pickup_time_of_the_day_transformed'] = nyc_data['pickup_time_of_the_day'].map(time_of_day_codes)

In [None]:
# Creating the frame used for regression: the candidate predictors plus the
# target column (trip_duration_in_hours), selected from the full dataset
nyc_df_features = [
    'vendor_id',
    'passenger_count',
    'pickup_time_of_the_day_transformed',
    'pickup_day_of_the_week_transformed',
    'distance',
    'trip_speed',
    'trip_duration_in_hours',
]
nyc_df = nyc_data.loc[:, nyc_df_features]
nyc_df.head()

In [None]:
# Heatmap of the pairwise correlations between the regression columns,
# with each cell annotated with its coefficient
correlation_matrix = nyc_df.corr()
ax = sns.heatmap(correlation_matrix, annot=True)
ax.set_title('Correlation Matrix for NYC Taxi Dataset');

**LINEAR REGRESSION MODEL**

In [None]:
# Separating independent and dependent variables.
# trip_speed is left out of the predictors: it was computed as
# distance / trip_duration_in_hours, i.e. directly from the target, so keeping it
# would leak the answer into the model and make the error look better than it is.
target_column = 'trip_duration_in_hours'
x = nyc_df.drop(columns=[target_column, 'trip_speed'])
y = nyc_df[target_column]
x.shape, y.shape

In [None]:
# Importing the train test split function and splitting predictors and target
# into train and test subsets. random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y, random_state = 56)

In [None]:
#importing Linear Regression and the mean absolute error metric
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_absolute_error as mae

In [None]:
# Creating an instance of Linear Regression (default settings)
lr = LR()

# Fitting the model on the training split
lr.fit(train_x, train_y)

In [None]:
# Predicting over the train set and calculating the mean absolute error.
# mean_absolute_error is documented as (y_true, y_pred), so the observed targets
# go first. The value is the same either way, but the call now matches the API.
train_predict = lr.predict(train_x)
k = mae(train_y, train_predict)
print('Training Mean Absolute Error', k )

In [None]:
# Predicting over the test set and calculating the mean absolute error.
# mean_absolute_error is documented as (y_true, y_pred), so the observed targets
# go first. The value is the same either way, but the call now matches the API.
test_predict = lr.predict(test_x)
k = mae(test_y, test_predict)
print('Test Mean Absolute Error    ', k )

In [None]:
# Fitted coefficients, one per column of train_x (in column order)
lr.coef_

In [None]:
# Bar chart of the fitted coefficients, one bar per predictor.
# Dedicated names are used for the bar positions so that the feature matrix `x`
# and the target `y` defined earlier are not overwritten; re-running the
# train/test split cell after this one would otherwise split a range and the
# coefficient array. Bars are labelled with the predictor names.
plt.figure(figsize=(8, 6), dpi=120, facecolor='w', edgecolor='b')
coef_names = list(train_x.columns)
coef_positions = range(len(coef_names))
plt.bar(coef_positions, lr.coef_)
plt.xticks(coef_positions, coef_names, rotation=45, ha='right')
plt.xlabel( "Variables")
plt.ylabel('Coefficients')
plt.title('Coefficient plot');

In [None]:
# Arranging the test-set targets next to the model's predictions and calculating
# the residuals (target minus prediction).
# Note: the 'fitted values' column holds test_y, i.e. the observed test targets.
residuals = pd.DataFrame({
    'fitted values' : test_y,
    'predicted values' : test_predict,
}).assign(residuals=lambda frame: frame['fitted values'] - frame['predicted values'])
residuals.head()