In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set(style='whitegrid')


In [4]:
# Step 2: Load the Data
# Load the dataset
url = 'https://www.kaggle.com/kaggle/input/nyc-taxi-trip-data-google-public-data/taxi_trip_data.csv'
"https://www.kaggle.com/datasets/neilclack/nyc-taxi-trip-data-google-public-data/code"
df = pd.read_csv(url, parse_dates=['pickup_date', 'dropoff_date'])


HTTPError: HTTP Error 404: Not Found

In [None]:
# Step 3: Understand the Data
# Display the first few rows
print(df.head())

# Get the basic information
print(df.info())

# Summary statistics
print(df.describe(include='all'))


In [None]:
# Step 4: Handle Missing Data
# Check for missing values
print(df.isnull().sum())

# Handle missing values
df.dropna(inplace=True)


In [None]:
# Step 5: Data Visualization
# Distribution of trip fares
sns.histplot(df['fare_amount'], kde=True)
plt.title('Distribution of Trip Fares')
plt.show()

# Distribution of trip durations
df['trip_duration'] = (df['dropoff_date'] - df['pickup_date']).dt.total_seconds() / 60
sns.histplot(df['trip_duration'], kde=True)
plt.title('Distribution of Trip Durations')
plt.show()

# Number of trips by hour of day
df['pickup_hour'] = df['pickup_date'].dt.hour
sns.countplot(x='pickup_hour', data=df)
plt.title('Number of Trips by Hour of Day')
plt.show()

# Average fare by hour of day
average_fare_by_hour = df.groupby('pickup_hour')['fare_amount'].mean()
sns.lineplot(x=average_fare_by_hour.index, y=average_fare_by_hour.values)
plt.title('Average Fare by Hour of Day')
plt.show()


In [None]:
# Step 6: Univariate Analysis
# Distribution of fare amount
sns.histplot(df['fare_amount'], kde=True)
plt.title('Fare Amount Distribution')
plt.show()

# Distribution of trip duration
sns.histplot(df['trip_duration'], kde=True)
plt.title('Trip Duration Distribution')
plt.show()


In [None]:
# Step 7: Bivariate Analysis
# Trip duration vs Fare amount
sns.scatterplot(x='trip_duration', y='fare_amount', data=df)
plt.title('Trip Duration vs Fare Amount')
plt.show()

# Trip fare by pickup hour
sns.boxplot(x='pickup_hour', y='fare_amount', data=df)
plt.title('Fare Amount by Pickup Hour')
plt.show()


In [None]:
# Step 8: Multivariate Analysis
# Pair plot for numerical features
sns.pairplot(df[['fare_amount', 'trip_duration', 'pickup_hour']])
plt.show()


In [None]:
# Step 9: Identify and Handle Outliers
# Box plot to identify outliers in fare amount
sns.boxplot(x=df['fare_amount'])
plt.title('Boxplot of Fare Amount')
plt.show()

# Removing outliers from fare amount
Q1 = df['fare_amount'].quantile(0.25)
Q3 = df['fare_amount'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['fare_amount'] < (Q1 - 1.5 * IQR)) | (df['fare_amount'] > (Q3 + 1.5 * IQR)))]

# Box plot to identify outliers in trip duration
sns.boxplot(x=df['trip_duration'])
plt.title('Boxplot of Trip Duration')
plt.show()

# Removing outliers from trip duration
Q1 = df['trip_duration'].quantile(0.25)
Q3 = df['trip_duration'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['trip_duration'] < (Q1 - 1.5 * IQR)) | (df['trip_duration'] > (Q3 + 1.5 * IQR)))]


In [None]:
# Step 10: Feature Engineering
# Create features for day of week and month
df['pickup_day_of_week'] = df['pickup_date'].dt.dayofweek
df['pickup_month'] = df['pickup_date'].dt.month

# Create a feature for trip distance (if available)
# For simplicity, let's assume distance is provided in the dataset
# df['distance'] = df['distance']  # Ensure distance feature is available


In [None]:
Step 11: Summary and Insights
# Summarize key findings
print("Key Insights:")

# Average fare by hour
average_fare_by_hour = df.groupby('pickup_hour')['fare_amount'].mean()
print(f"Average Fare by Hour:\n{average_fare_by_hour}")

# Trip duration vs Fare
trip_duration_fare_corr = df[['trip_duration', 'fare_amount']].corr().iloc[0, 1]
print(f"Correlation between Trip Duration and Fare Amount: {trip_duration_fare_corr}")

# Number of trips by hour
trips_by_hour = df['pickup_hour'].value_counts().sort_index()
print(f"Number of Trips by Hour:\n{trips_by_hour}")

# Average fare and trip duration by day of week
average_fare_by_day = df.groupby('pickup_day_of_week')['fare_amount'].mean()
average_duration_by_day = df.groupby('pickup_day_of_week')['trip_duration'].mean()
print(f"Average Fare by Day of Week:\n{average_fare_by_day}")
print(f"Average Trip Duration by Day of Week:\n{average_duration_by_day}")


Findings:
1. Fare Distribution: The distribution of fare amounts shows typical fare ranges, with some high-value outliers.
2. Trip Duration: Most trips have moderate durations, with some extreme values.
3. Trips by Hour: There is a peak in the number of trips during certain hours, which can indicate rush hours.
4. Fare vs Duration: Longer trips tend to have higher fares, indicating a positive correlation.
5. Day of Week Trends: Average fares and trip durations can vary by day of the week, providing insights into weekly patterns.