In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
# Read the ride data and give the column pandas labelled 'Unnamed: 0' a meaningful name.
df = pd.read_csv('uber.csv').rename(columns={'Unnamed: 0': 'unique_id'})
df

Unnamed: 0,unique_id,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1.0
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1.0
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1.0
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3.0
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5.0
...,...,...,...,...,...,...,...,...,...
53584,8361470,2010-11-25 21:15:31.0000001,7.7,2010-11-25 21:15:31 UTC,-73.972790,40.749323,-74.000707,40.741479,1.0
53585,47202705,2011-04-16 18:13:09.0000004,8.5,2011-04-16 18:13:09 UTC,-73.981819,40.762517,-74.000019,40.761710,1.0
53586,48651213,2009-11-30 12:59:00.000000107,10.1,2009-11-30 12:59:00 UTC,-73.980230,40.748618,-73.983753,40.768880,3.0
53587,39962430,2012-05-29 09:37:09.0000001,8.1,2012-05-29 09:37:09 UTC,-73.968240,40.761938,-73.985079,40.768352,1.0


In [None]:
# List the column labels of the loaded frame (after the 'unique_id' rename).
df.columns

Index(['unique_id', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [None]:
# Discard rides that lack any of the four coordinate fields,
# then report how many missing values remain in each column.
coordinate_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_latitude', 'dropoff_longitude']
df = df.dropna(subset=coordinate_columns)
df.isna().sum()

Unnamed: 0,0
unique_id,0
key,0
fare_amount,0
pickup_datetime,0
pickup_longitude,0
pickup_latitude,0
dropoff_longitude,0
dropoff_latitude,0
passenger_count,1


In [None]:
# Inspect the dtype of every column.
df.dtypes


Unnamed: 0,0
unique_id,int64
key,object
fare_amount,float64
pickup_datetime,object
pickup_longitude,float64
pickup_latitude,float64
dropoff_longitude,float64
dropoff_latitude,float64
passenger_count,float64


In [None]:
# Parse the pickup timestamp strings into datetimes.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Derive calendar features from the parsed timestamp.
pickup_parts = df['pickup_datetime'].dt
for feature in ('year', 'month', 'hour', 'minute'):
    df[feature] = getattr(pickup_parts, feature)
df['day_of_week'] = pickup_parts.day_name()
df

Unnamed: 0,unique_id,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,hour,minute,day_of_week
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1.0,2015,5,19,52,Thursday
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1.0,2009,7,20,4,Friday
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1.0,2009,8,21,45,Monday
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3.0,2009,6,8,22,Friday
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5.0,2014,8,17,47,Thursday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53584,8361470,2010-11-25 21:15:31.0000001,7.7,2010-11-25 21:15:31+00:00,-73.972790,40.749323,-74.000707,40.741479,1.0,2010,11,21,15,Thursday
53585,47202705,2011-04-16 18:13:09.0000004,8.5,2011-04-16 18:13:09+00:00,-73.981819,40.762517,-74.000019,40.761710,1.0,2011,4,18,13,Saturday
53586,48651213,2009-11-30 12:59:00.000000107,10.1,2009-11-30 12:59:00+00:00,-73.980230,40.748618,-73.983753,40.768880,3.0,2009,11,12,59,Monday
53587,39962430,2012-05-29 09:37:09.0000001,8.1,2012-05-29 09:37:09+00:00,-73.968240,40.761938,-73.985079,40.768352,1.0,2012,5,9,37,Tuesday


In [None]:
# Number of rides recorded in each calendar year.
rides_per_year = (
    df.groupby('year')
      .size()
      .reset_index(name='ride_count')
)

# Bar chart: one bar per year, coloured by year and annotated with its ride count.
ride_chart_options = dict(
    x='year',
    y='ride_count',
    title='Count of Rides Per Year',
    labels={'year': 'Year', 'ride_count': 'Number of Rides'},
    color='year',
    text='ride_count',
)
fig = px.bar(rides_per_year, **ride_chart_options)

fig.show()

In [None]:
# Total fare amount collected in each calendar year.
revenue_per_year = (
    df.groupby('year')['fare_amount']
      .sum()
      .reset_index(name='Revenue')
)

# Bar chart: one bar per year, coloured by year and annotated with that year's revenue.
revenue_chart_options = dict(
    x='year',
    y='Revenue',
    title='Revenue Per Year',
    labels={'year': 'Year', 'Revenue': 'Amount in Dollars'},
    color='year',
    text='Revenue',
)
fig = px.bar(revenue_per_year, **revenue_chart_options)

fig.show()

In [None]:

# Total fare amount for every (year, month) combination.
revenue_per_month = (
    df.groupby(['year', 'month'])['fare_amount']
      .sum()
      .reset_index(name='Revenue')
)

# Months are listed explicitly so the x-axis runs January (1) through December (12).
month_sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

# Bar chart of revenue by month, coloured by year and annotated with the revenue value.
fig = px.bar(
    revenue_per_month,
    x='month',
    y='Revenue',
    color='year',
    title='Monthly Revenue Per Year',
    labels={'month': 'Month', 'Revenue': 'Amount in Dollars'},
    text='Revenue',
    category_orders={'month': month_sequence},
)

fig.show()

In [None]:
# Keep rides whose pickup point lies inside an approximate USA bounding box
# (both bounds inclusive on each axis).
pickup_lat_in_usa = df['pickup_latitude'].between(24.396308, 49.384358)
pickup_lon_in_usa = df['pickup_longitude'].between(-125.0, -66.93457)
df_usa = df[pickup_lat_in_usa & pickup_lon_in_usa]

In [None]:

# Layout shared by both maps: basemap style and a view centred on (40.7128, -74.0060).
shared_map_layout = dict(
    mapbox_style="carto-positron",
    mapbox_center={"lat": 40.7128, "lon": -74.0060},
)

# Pickup map (red markers)
fig1 = px.scatter_mapbox(
    df_usa,
    lat="pickup_latitude",
    lon="pickup_longitude",
    color_discrete_sequence=["red"],
    zoom=10,
    title="Pickup Points in the USA",
)
fig1.update_layout(**shared_map_layout)
fig1.show()

# Drop-off map (green markers); uses the same pickup-filtered frame
fig2 = px.scatter_mapbox(
    df_usa,
    lat="dropoff_latitude",
    lon="dropoff_longitude",
    color_discrete_sequence=["green"],
    zoom=10,
    title="Drop-off Points in the USA",
)
fig2.update_layout(**shared_map_layout)
fig2.show()

In [None]:
# Helper that draws a pickup scatter map for one regional subset of the rides
def plot_continent_map(df_continent, continent_name, lat_center, lon_center, zoom_level):
    """Display a scatter map of pickup points for a regional subset of rides.

    Args:
        df_continent: DataFrame providing 'pickup_latitude', 'pickup_longitude'
            and 'pickup_datetime' columns.
        continent_name: Region name inserted into the figure title.
        lat_center: Latitude on which the map view is centred.
        lon_center: Longitude on which the map view is centred.
        zoom_level: Initial map zoom level.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    axis_labels = {'pickup_latitude': 'Latitude', 'pickup_longitude': 'Longitude'}
    fig = px.scatter_mapbox(
        df_continent,
        lat='pickup_latitude',
        lon='pickup_longitude',
        title=f"Pickup Points in {continent_name}",
        color_discrete_sequence=["blue"],
        hover_name='pickup_datetime',  # pickup time is the hover heading
        labels=axis_labels,
    )

    # Basemap style plus the initial viewport (zoom and centre)
    viewport_center = {"lat": lat_center, "lon": lon_center}
    fig.update_layout(
        mapbox_style="carto-positron",
        mapbox_zoom=zoom_level,
        mapbox_center=viewport_center,
    )
    fig.show()


# Example for Europe: keep pickups inside a Europe-wide bounding box
# (latitude 35..70, longitude -25..40; adjust the bounds as needed).
# NOTE(review): the filter spans all of Europe, while the title and map centre
# target Spain/Portugal — confirm which scope is intended.
pickup_lat_in_europe = df['pickup_latitude'].between(35, 70)
pickup_lon_in_europe = df['pickup_longitude'].between(-25, 40)
df_europe = df[pickup_lat_in_europe & pickup_lon_in_europe]
plot_continent_map(df_europe, "Spain and Portugal", lat_center=39.5, lon_center=-8.0, zoom_level=6)



In [None]:
# Identify peak hours of the day: count rides for each pickup hour.
rides_per_hour = df.groupby('hour').size().reset_index(name="count")

fig = px.bar(
    rides_per_hour,
    x='hour',
    y='count',
    title='Number of Rides Per Hour',
)
fig.show()

In [None]:
# Weekdays in calendar order (value_counts would otherwise order by frequency)
day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Count rides per weekday and lay the counts out in calendar order
day_counts = (
    df['day_of_week']
    .value_counts()
    .reindex(day_order)
    .rename_axis('day_of_week')
    .reset_index(name='count')
)

# Line chart with a marker and a data label on every point
fig = px.line(
    day_counts,
    x='day_of_week',
    y='count',
    title='Number of Rides Per Day of the Week',
    markers=True,
    text='count',
    labels={'day_of_week': 'Day of the Week', 'count': 'Number of Rides'},
)

# Place each data label above its point
fig.update_traces(textposition="top center")

fig.show()

In [None]:
# Ride counts for every (day of week, hour) pair
hourly_rides = df.groupby(['day_of_week', 'hour']).size().reset_index(name="count")

# Make the weekday an ordered categorical so days follow day_order rather than alphabetical order
hourly_rides['day_of_week'] = pd.Categorical(
    hourly_rides['day_of_week'], categories=day_order, ordered=True
)

# Reshape to a grid: one row per hour, one column per weekday
ride_grid = hourly_rides.pivot(index='hour', columns='day_of_week', values='count')

# Heatmap of the grid
fig = px.imshow(
    ride_grid,
    labels={'x': "Day of the Week", 'y': "Hour of the Day", 'color': "Number of Rides"},
    title="Heatmap of Rides Per Hour for Each Day",
    color_continuous_scale='Plasma',
)

# Square 600x600 figure with a centred title
fig.update_layout(height=600, width=600, title_x=0.5)

fig.show()

In [None]:
# Frequency of each passenger_count value, most common first.
df['passenger_count'].value_counts()

Unnamed: 0_level_0,count
passenger_count,Unnamed: 1_level_1
1.0,37076
2.0,7930
5.0,3752
3.0,2429
4.0,1129
6.0,1086
0.0,186


In [None]:
# Order rides chronologically (presumably so the animation frames advance in date order)
df_sorted = df.sort_values(by='pickup_datetime')

# One animation frame per calendar date, expressed as a string
pickup_day_labels = df_sorted['pickup_datetime'].dt.date.astype(str)

# Animated scatter map of pickup locations
fig = px.scatter_mapbox(
    df_sorted,
    lat='pickup_latitude',
    lon='pickup_longitude',
    animation_frame=pickup_day_labels,
    title="Daily Pickup Locations",
)

# Basemap style, initial viewport and figure size
animation_layout = dict(
    mapbox_style="carto-positron",
    mapbox_zoom=10,  # adjust to suit the dataset's location
    mapbox_center={"lat": 40.7128, "lon": -74.0060},  # New York (example)
    height=600,
    width=800,
)
fig.update_layout(**animation_layout)

fig.show()

In [None]:
# Haversine function to calculate distance between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    # Radius of Earth in km (use 6371 for km, 3958 for miles)
    r = 6371  # kilometers

    return r * c

In [None]:
import numpy as np
# Compute the trip distance for every ride in a single vectorized call.
# haversine() is built from NumPy ufuncs, so it accepts whole columns; this
# avoids the per-row Python overhead of DataFrame.apply(axis=1).
df['trip_distance_km'] = haversine(df['pickup_latitude'], df['pickup_longitude'],
                                   df['dropoff_latitude'], df['dropoff_longitude'])

In [None]:
# Display the frame, which now carries the trip_distance_km column.
df

Unnamed: 0,unique_id,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,hour,minute,day_of_week,trip_distance_km
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1.0,2015,5,19,52,Thursday,1.683323
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1.0,2009,7,20,4,Friday,2.457590
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1.0,2009,8,21,45,Monday,5.036377
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3.0,2009,6,8,22,Friday,1.661683
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5.0,2014,8,17,47,Thursday,4.475450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53584,8361470,2010-11-25 21:15:31.0000001,7.7,2010-11-25 21:15:31+00:00,-73.972790,40.749323,-74.000707,40.741479,1.0,2010,11,21,15,Thursday,2.508346
53585,47202705,2011-04-16 18:13:09.0000004,8.5,2011-04-16 18:13:09+00:00,-73.981819,40.762517,-74.000019,40.761710,1.0,2011,4,18,13,Saturday,1.535465
53586,48651213,2009-11-30 12:59:00.000000107,10.1,2009-11-30 12:59:00+00:00,-73.980230,40.748618,-73.983753,40.768880,3.0,2009,11,12,59,Monday,2.272488
53587,39962430,2012-05-29 09:37:09.0000001,8.1,2012-05-29 09:37:09+00:00,-73.968240,40.761938,-73.985079,40.768352,1.0,2012,5,9,37,Tuesday,1.587391


In [None]:
# Calendar date (without the time component) of each pickup
df['date'] = df['pickup_datetime'].dt.date

# Sum of passenger_count over all rides on each date
passengers_per_day = df.groupby('date')['passenger_count'].sum().reset_index()

fig = px.line(
    passengers_per_day,
    x='date',
    y='passenger_count',
    title='Total Passengers Per Day',
    labels={'date': 'Date', 'passenger_count': 'Total Passengers'},
)
fig.show()


In [None]:
# 3D scatter of trip distance vs. fare amount vs. passenger count
fig = px.scatter_3d(df,
                    x='trip_distance_km',
                    y='fare_amount',
                    z='passenger_count',
                    color='passenger_count',  # Color points by passenger count
                    # No `size` column is mapped, so marker size stays at the Plotly
                    # default (size_max only takes effect together with `size`).
                    opacity=0.7,  # Set transparency for a cleaner look
                    # Label keys must match the plotted column names exactly,
                    # otherwise the axis falls back to the raw column name.
                    labels={'trip_distance_km': 'Trip Distance (km)',
                            'fare_amount': 'Fare Amount ($)',
                            'passenger_count': 'Number of Passengers'},
                    title="Relationship Between Passenger Count, Fare Amount, and Trip Distance",
                    color_continuous_scale='Plasma',  # Continuous scale for the passenger-count colouring
                    height=600, width=800)  # Figure size in pixels

# Show the plot
fig.show()

In [None]:
# Bucket fares into ranges (low, high]. The last bucket is open-ended so that
# fares above $100 are counted under '$51+' instead of silently becoming NaN.
# Fares of $0 or less still fall outside every bucket.
fare_bins = [0, 5, 10, 15, 20, 30, 50, float('inf')]
fare_labels = ['$1-$5', '$6-$10', '$11-$15', '$16-$20', '$21-$30', '$31-$50', '$51+']
df['fare_bucket'] = pd.cut(df['fare_amount'], bins=fare_bins, labels=fare_labels)

# Per bucket: number of rides and total number of passengers carried.
# observed=False keeps every bucket in the result (even empty ones) and makes
# the categorical-groupby behaviour explicit.
fare_summary = df.groupby('fare_bucket', observed=False).agg(
    ride_count=('fare_amount', 'size'),
    total_passenger_count=('passenger_count', 'sum')
).reset_index()

# Bar chart of rides per fare bucket; each bar is annotated with the bucket's
# total passenger count (a sum, not an average).
fig = px.bar(fare_summary,
             x='fare_bucket',
             y='ride_count',
             title='Number of Rides Per Fare Bucket with Total Passenger Count',
             labels={'fare_bucket': 'Fare Bucket', 'ride_count': 'Number of Rides'},
             text='total_passenger_count',  # Display total passenger count as text on bars
             color='fare_bucket')  # Discrete colour per fare bucket

# Show the plot
fig.show()






In [None]:
import pandas as pd
import plotly.express as px

# Trip distance bins, left-closed: [0, 2), [2, 5), ... The last bin is open-ended
# so that trips of 100 km or more are counted under '50+ km' instead of
# silently becoming NaN and disappearing from the plot.
bins = [0, 2, 5, 10, 20, 50, float('inf')]  # Customize bin ranges as needed
labels = ['0-2 km', '2-5 km', '5-10 km', '10-20 km', '20-50 km', '50+ km']

# Create a new column 'distance_range' to categorize each trip distance into a bin
df['distance_range'] = pd.cut(df['trip_distance_km'], bins=bins, labels=labels, right=False)

# Create a violin plot to show the distribution of fare amounts for each range of trip distances
fig = px.violin(df,
                x='distance_range',
                y='fare_amount',
                box=True,  # Add box plot inside the violin plot
                points="all",  # Show all data points
                title='Distribution of Fare Amount for Different Trip Distance Ranges',
                labels={'distance_range': 'Trip Distance Range', 'fare_amount': 'Fare Amount ($)'},
                color='distance_range')  # Color by trip distance range

# Show the plot
fig.show()
