In [29]:
# Select Matplotlib's interactive notebook backend for every figure drawn below.
# (Keep this comment on its own line: text after a magic is parsed as its arguments.)
%matplotlib notebook

In [30]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [31]:
# Input CSV locations, relative to the notebook's working directory.
city_data = "data/city_data.csv"
ride_data = "data/ride_data.csv"

# Per-city table; the columns seen in the output are city, driver_count and type.
city_data_df = pd.read_csv(city_data)

# Show a bounded preview rather than echoing the whole frame, matching the
# .head() preview used for the merged frame further down.
city_data_df.head(10)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban
5,West Anthony,70,Urban
6,West Angela,48,Urban
7,Martinezhaven,25,Urban
8,Karenberg,22,Urban
9,Barajasview,26,Urban


In [32]:
# Per-ride table; the columns seen in the output are city, date, fare and ride_id.
ride_data_df = pd.read_csv(ride_data)

# Show a bounded preview rather than echoing every ride, matching the
# .head() preview used for the merged frame further down.
ride_data_df.head(10)

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2018-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2018-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2018-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2018-02-10 23:22:03,23.44,5149245426178
4,South Jack,2018-03-06 04:28:35,34.58,3908451377344
5,South Latoya,2018-03-11 12:26:48,9.52,1994999424437
6,New Paulville,2018-02-27 11:17:56,43.25,793208410091
7,Simpsonburgh,2018-04-26 00:43:24,35.98,111953927754
8,South Karenland,2018-01-08 03:28:48,35.09,7995623208694
9,North Jasmine,2018-03-09 06:26:29,42.81,5327642267789


In [33]:
# Attach each city's driver_count and type to every ride by joining on the
# shared "city" column; the outer join keeps keys that appear on either side.
combined_data_df = ride_data_df.merge(city_data_df, on="city", how="outer")
combined_data_df.head(5)

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2018-01-14 10:14:22,13.83,5739410935873,5,Urban
1,Lake Jonathanshire,2018-04-07 20:51:11,31.25,4441251834598,5,Urban
2,Lake Jonathanshire,2018-03-09 23:45:55,19.89,2389495660448,5,Urban
3,Lake Jonathanshire,2018-04-07 18:09:21,24.28,7796805191168,5,Urban
4,Lake Jonathanshire,2018-01-02 14:14:50,13.89,424254840012,5,Urban


In [34]:
# One group per (city, type) pair; reused by the ride-count and driver-count cells.
grouped_data = combined_data_df.groupby(by=["city", "type"])

# Mean fare per city, indexed by (city, type).
avg_fare = grouped_data["fare"].mean()

# Split the per-city averages by the "type" index level, one Series per city type.
avg_fare_by_type = avg_fare.groupby(level="type")
avg_fare_urban, avg_fare_suburban, avg_fare_rural = (
    avg_fare_by_type.get_group(city_type)
    for city_type in ("Urban", "Suburban", "Rural")
)

In [35]:
# Number of rides per city (non-null ride_id values in each (city, type) group).
total_rides = grouped_data["ride_id"].count()

# Split the per-city ride counts by the "type" index level, one Series per city type.
total_rides_by_type = total_rides.groupby(level="type")
total_rides_urban, total_rides_suburban, total_rides_rural = (
    total_rides_by_type.get_group(city_type)
    for city_type in ("Urban", "Suburban", "Rural")
)

In [36]:
# Driver count per city, taken as the mean of driver_count within each
# (city, type) group. NOTE(review): presumably driver_count is constant within
# a city, so the mean simply recovers it -- confirm against the source data.
total_drivers = grouped_data["driver_count"].mean()

# Split the per-city driver counts by the "type" index level, one Series per city type.
total_drivers_by_type = total_drivers.groupby(level="type")
total_drivers_urban, total_drivers_suburban, total_drivers_rural = (
    total_drivers_by_type.get_group(city_type)
    for city_type in ("Urban", "Suburban", "Rural")
)

In [37]:
# Bubble plot: one marker per city, x = rides per city, y = average fare,
# marker area = 10 x the city's driver count, colour = city type.
# An explicit figure/axes pair keeps this plot from being drawn onto whatever
# figure happens to be current in the pyplot state machine.
fig, ax = plt.subplots()

urban = ax.scatter(
    total_rides_urban, avg_fare_urban, s=10 * total_drivers_urban,
    alpha=0.75, facecolors="lightcoral", edgecolors="black", label="Urban",
)
suburban = ax.scatter(
    total_rides_suburban, avg_fare_suburban, s=10 * total_drivers_suburban,
    alpha=0.75, facecolors="lightskyblue", edgecolors="black", label="Suburban",
)
rural = ax.scatter(
    total_rides_rural, avg_fare_rural, s=10 * total_drivers_rural,
    alpha=0.75, facecolors="gold", edgecolors="black", label="Rural",
)

legend = ax.legend(handles=[urban, suburban, rural], loc="upper right", title="City Types")

# Legend.legendHandles was deprecated in Matplotlib 3.7 and removed in 3.9 in
# favour of Legend.legend_handles; fall back to the old name on older releases.
legend_handles = getattr(legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = legend.legendHandles

# Give every legend marker the same fixed size so the key is readable
# regardless of the bubble sizes in the plot itself.
for handle in legend_handles:
    handle.set_sizes([20])

# Side note placed in the margin that subplots_adjust frees up on the right.
fig.text(0.83, 0.5, "Note:\nCircle size\ncorrelates with\ndriver count\nper city.", fontsize=9)
ax.grid(True)
fig.subplots_adjust(right=0.82)

# The ride timestamps in the loaded data are all from 2018, so the title
# names that year (it previously said 2016).
ax.set_title("Pyber Ride Sharing Data (2018)")
ax.set_xlabel("Total Number of Rides (Per City)")
ax.set_ylabel("Average Fare ($)")
fig.savefig("BubblePlot_RideSharingData.png")
plt.show()

<IPython.core.display.Javascript object>

In [38]:
# One group per city type; reused by the ride-count cell below.
grouped_type_data = combined_data_df.groupby(by="type")

# Sum of all fares collected in each city type.
total_fare_by_type = grouped_type_data["fare"].agg("sum")
total_fare_by_type.head(5)

type
Rural        4327.93
Suburban    19356.33
Urban       39854.38
Name: fare, dtype: float64

In [39]:
# Pie chart of each city type's share of total fares.
# Start a fresh figure: with the notebook backend the previous figure stays
# current, so bare plt.title/plt.pie calls can land on the earlier plot.
fig, ax = plt.subplots()
ax.set_title("% of Total Fares by City Type")

# Derive labels, colours and the exploded wedge from the Series index so they
# cannot drift out of step with the order the groupby produced.
type_colors = {"Rural": "gold", "Suburban": "lightskyblue", "Urban": "lightcoral"}
labels = list(total_fare_by_type.index)
colors = [type_colors[city_type] for city_type in labels]
explode = [0.1 if city_type == "Urban" else 0 for city_type in labels]  # pull out the Urban wedge

ax.pie(total_fare_by_type, explode=explode, labels=labels, colors=colors,
       autopct="%1.1f%%", shadow=True, startangle=140)
ax.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
fig.savefig("PieChart_%OfTotalFaresByCityType.png")
plt.show()

<IPython.core.display.Javascript object>

In [40]:
# Number of rides in each city type (non-null ride_id values per group).
# Note: this rebinds total_rides_by_type, which an earlier cell used for a
# GroupBy object; from here on it is a Series indexed by city type.
total_rides_by_type = grouped_type_data["ride_id"].agg("count")
total_rides_by_type.head(5)

type
Rural        125
Suburban     625
Urban       1625
Name: ride_id, dtype: int64

In [41]:
# Pie chart of each city type's share of total rides.
# Start a fresh figure: with the notebook backend the previous figure stays
# current, so bare plt.title/plt.pie calls can land on the earlier plot.
fig, ax = plt.subplots()
ax.set_title("% of Total Rides by City Type")

# Derive labels, colours and the exploded wedge from the Series index so they
# cannot drift out of step with the order the groupby produced.
type_colors = {"Rural": "gold", "Suburban": "lightskyblue", "Urban": "lightcoral"}
labels = list(total_rides_by_type.index)
colors = [type_colors[city_type] for city_type in labels]
explode = [0.1 if city_type == "Urban" else 0 for city_type in labels]  # pull out the Urban wedge

ax.pie(total_rides_by_type, explode=explode, labels=labels, colors=colors,
       autopct="%1.1f%%", shadow=True, startangle=140)
ax.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
fig.savefig("PieChart_%OfTotalRidesByCityType.png")
plt.show()

<IPython.core.display.Javascript object>

In [42]:
# Keep one row per city so each city's driver_count is counted exactly once.
unique_cities = combined_data_df.drop_duplicates(subset=["city"])
cities_by_type = unique_cities.groupby(by="type")

# Total drivers in each city type.
# Note: this rebinds total_drivers_by_type, which an earlier cell used for a
# GroupBy object; from here on it is a Series indexed by city type.
total_drivers_by_type = cities_by_type["driver_count"].agg("sum")
total_drivers_by_type.head(5)

type
Rural         78
Suburban     490
Urban       2405
Name: driver_count, dtype: int64

In [43]:
# Pie chart of each city type's share of total drivers.
# Start a fresh figure: with the notebook backend the previous figure stays
# current, so bare plt.title/plt.pie calls can land on the earlier plot.
fig, ax = plt.subplots()
ax.set_title("% of Total Drivers by City Type")

# Derive labels, colours and the exploded wedge from the Series index so they
# cannot drift out of step with the order the groupby produced.
type_colors = {"Rural": "gold", "Suburban": "lightskyblue", "Urban": "lightcoral"}
labels = list(total_drivers_by_type.index)
colors = [type_colors[city_type] for city_type in labels]
explode = [0.1 if city_type == "Urban" else 0 for city_type in labels]  # pull out the Urban wedge

ax.pie(total_drivers_by_type, explode=explode, labels=labels, colors=colors,
       autopct="%1.1f%%", shadow=True, startangle=140)
ax.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
fig.savefig("PieChart_%OfTotalDriversByCityType.png")
plt.show()

<IPython.core.display.Javascript object>