# Uber Request Data Analysis - EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plot style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later dropped the old names, so choose whichever name this install has.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Load the data (path is relative to the notebook's working directory)
df = pd.read_csv('../data/Uber Request Data.csv')

# Display first few rows (rendered because it is the cell's last expression)
df.head()

## Data Cleaning

In [None]:
# Check for missing values
# The label ends with an escaped newline so the per-column counts start on
# their own line; a raw line break inside the quotes is a syntax error.
print('Missing values:\n', df.isnull().sum())

# Convert timestamp columns to datetime so the .dt accessor works below.
# NOTE(review): no format/dayfirst argument is given, so parsing relies on
# pandas' inference -- confirm the CSV's date layout is read as intended.
# NOTE(review): verify the CSV really contains a 'Pickup timestamp' column.
df['Request timestamp'] = pd.to_datetime(df['Request timestamp'])
df['Pickup timestamp'] = pd.to_datetime(df['Pickup timestamp'])

# Create additional useful columns derived from the request time
df['Hour'] = df['Request timestamp'].dt.hour
df['Day'] = df['Request timestamp'].dt.day_name()
df['Weekday'] = df['Request timestamp'].dt.weekday

# Calculate wait time in minutes (pickup time minus request time)
df['Wait Time'] = (df['Pickup timestamp'] - df['Request timestamp']).dt.total_seconds() / 60

# Rows without a pickup timestamp produce NaN above; store 0 instead, so a
# wait time of 0 means "no wait time available" rather than an instant pickup.
df['Wait Time'] = df['Wait Time'].fillna(0)

# Display cleaned data info
df.info()

## Exploratory Data Analysis

### 1. Request Distribution by Location

In [None]:
# Bar chart of the number of requests per pickup point
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Pickup point', ax=ax)
ax.set_title('Distribution of Requests by Pickup Point')
ax.set_xlabel('Pickup Point')
ax.set_ylabel('Number of Requests')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

### 2. Request Status Analysis

In [None]:
# Tally requests per status once; the same Series feeds the chart and the table
status_counts = df['Status'].value_counts()

# Pie chart showing each status' share of all requests
fig, ax = plt.subplots(figsize=(8, 8))
status_counts.plot.pie(ax=ax, autopct='%1.1f%%')
ax.set_title('Distribution of Request Status')
ax.set_ylabel('')  # no y-axis label on the pie
plt.show()

# Detailed status counts (shown as the cell output)
status_counts

### 3. Time-Based Analysis

In [None]:
# Requests per hour of the day
fig_hour, ax_hour = plt.subplots(figsize=(15, 6))
sns.countplot(data=df, x='Hour', ax=ax_hour)
ax_hour.set_title('Hourly Request Distribution')
ax_hour.set_xlabel('Hour of Day')
ax_hour.set_ylabel('Number of Requests')
ax_hour.tick_params(axis='x', labelrotation=45)
plt.show()

# Requests per day of the week, with bars in calendar order
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig_day, ax_day = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Day', order=day_order, ax=ax_day)
ax_day.set_title('Request Distribution by Day of Week')
ax_day.set_xlabel('Day of Week')
ax_day.set_ylabel('Number of Requests')
ax_day.tick_params(axis='x', labelrotation=45)
plt.show()

### 4. Wait Time Analysis

In [None]:
# Keep only rows with a positive wait time; zero-valued rows are excluded
# from both the statistics and the histogram.
positive_waits = df[df['Wait Time'] > 0]

# Wait time statistics
wait_time_stats = positive_waits['Wait Time'].describe()
# Print explicitly: a bare expression is only rendered when it is the last
# statement of a notebook cell, and here the plot follows it.
print(wait_time_stats)

# Wait time distribution
plt.figure(figsize=(12, 6))
sns.histplot(data=positive_waits, x='Wait Time', bins=30)
plt.title('Distribution of Wait Times')
plt.xlabel('Wait Time (minutes)')
plt.ylabel('Number of Requests')
plt.show()

### 5. Location-Based Analysis

In [None]:
# Cancellation rates by location (percentage of each pickup point's requests
# whose status is 'Cancelled')
total_by_point = df.groupby('Pickup point')['Status'].count()
cancelled_by_point = (
    df[df['Status'] == 'Cancelled']
    .groupby('Pickup point')['Status']
    .count()
    # A pickup point with no cancellations is absent from this count; give it
    # an explicit 0 so its rate is 0% instead of NaN after the division.
    .reindex(total_by_point.index, fill_value=0)
)
cancellation_rates = cancelled_by_point / total_by_point * 100
cancellation_rates = cancellation_rates.sort_values(ascending=False)

plt.figure(figsize=(12, 6))
cancellation_rates.plot(kind='bar')
plt.title('Cancellation Rate by Location')
plt.xlabel('Pickup Point')
plt.ylabel('Cancellation Rate (%)')
plt.xticks(rotation=45)
plt.show()

## Insights and Recommendations

1. Peak Request Times:
   - Identify peak hours and days for resource allocation
   - Schedule more drivers during peak times

2. Cancellation Patterns:
   - Investigate the pickup points with the highest cancellation rates
   - Implement strategies to reduce cancellations

3. Wait Time Optimization:
   - Analyze wait time distribution
   - Set realistic wait time expectations

4. Location-Specific Strategies:
   - Allocate resources based on location demand
   - Implement location-specific marketing

5. Time-Based Strategies:
   - Optimize driver schedules
   - Implement surge pricing during peak times