In [1]:
# The Matplotlib inline magic command renders charts directly in the notebook output
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as sts

In [2]:
# Locations of the two input CSV files, relative to the notebook
city_data_to_load = "resources/city_data.csv"
ride_data_to_load = "resources/ride_data.csv"


In [3]:
# Load both CSV files into DataFrames (city data first, then ride data)
city_data_df, ride_data_df = (
    pd.read_csv(csv_path) for csv_path in (city_data_to_load, ride_data_to_load)
)

In [4]:
# Preview the first 10 rows of the city data
city_data_df.head(10)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban
5,West Anthony,70,Urban
6,West Angela,48,Urban
7,Martinezhaven,25,Urban
8,Karenberg,22,Urban
9,Barajasview,26,Urban


In [5]:
# Preview the first 10 rows of the ride data
ride_data_df.head(10)

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344
5,South Latoya,2019-03-11 12:26:48,9.52,1994999424437
6,New Paulville,2019-02-27 11:17:56,43.25,793208410091
7,Simpsonburgh,2019-04-26 00:43:24,35.98,111953927754
8,South Karenland,2019-01-08 03:28:48,35.09,7995623208694
9,North Jasmine,2019-03-09 06:26:29,42.81,5327642267789


In [6]:
# Check that the city data is clean: start with the number of non-null entries in each column
city_data_df.count()


city            120
driver_count    120
type            120
dtype: int64

In [7]:
# Count the nulls per column; 0 everywhere means this DataFrame has no missing values
city_data_df.isnull().sum()


city            0
driver_count    0
type            0
dtype: int64

In [8]:
# No nulls found, so check the column data types next; driver_count should be int64
city_data_df.dtypes


city            object
driver_count     int64
type            object
dtype: object

In [9]:
# Get the unique values of the city type column; the data contains three city types
city_data_df["type"].unique()

array(['Urban', 'Suburban', 'Rural'], dtype=object)

In [10]:
# Number of urban cities: count the city-data rows whose type is "Urban"
int((city_data_df["type"] == "Urban").sum())

66

In [11]:
# Number of suburban cities: count the city-data rows whose type is "Suburban"
int((city_data_df["type"] == "Suburban").sum())

36

In [12]:
# Number of rural cities: count the city-data rows whose type is "Rural"
int((city_data_df["type"] == "Rural").sum())

18

In [13]:
# Number of non-null entries in each column of the ride data
ride_data_df.count()

city       2375
date       2375
fare       2375
ride_id    2375
dtype: int64

In [14]:
# Count the nulls per column of the ride data; 0 everywhere means no missing values
ride_data_df.isnull().sum()

city       0
date       0
fare       0
ride_id    0
dtype: int64

In [15]:
# Check the ride data types: if date shows as object it needs converting to datetime
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

In [16]:
# Parse the date column into datetime values
ride_data_df = ride_data_df.assign(date=pd.to_datetime(ride_data_df["date"]))

In [17]:
# Confirm the date column is now a datetime dtype
ride_data_df.dtypes

city               object
date       datetime64[ns]
fare              float64
ride_id             int64
dtype: object

In [18]:
# Display the ride data after the date conversion (pandas abbreviates the middle rows)
ride_data_df

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344
...,...,...,...,...
2370,Michaelberg,2019-04-29 17:04:39,13.38,8550365057598
2371,Lake Latoyabury,2019-01-30 00:05:47,20.76,9018727594352
2372,North Jaime,2019-02-10 21:03:50,11.11,2781339863778
2373,West Heather,2019-05-07 19:22:15,44.94,4256853490277


In [19]:
# Merge the two DataFrames on their common "city" column.
# A left join keeps every ride and attaches that city's driver_count and type.
# The key is given once; the original list repeated "city" twice.
pyber_city_ride_df = pd.merge(ride_data_df, city_data_df, how="left", on="city")

In [20]:
# The combined DataFrame is ready: each ride row now also carries driver_count and type
pyber_city_ride_df

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban
...,...,...,...,...,...,...
2370,Michaelberg,2019-04-29 17:04:39,13.38,8550365057598,6,Rural
2371,Lake Latoyabury,2019-01-30 00:05:47,20.76,9018727594352,2,Rural
2372,North Jaime,2019-02-10 21:03:50,11.11,2781339863778,1,Rural
2373,West Heather,2019-05-07 19:22:15,44.94,4256853490277,4,Rural


In [21]:
# Scatter plot inputs: average fare per city goes on the y axis,
# total number of rides per city on the x axis.
# Select "fare" before aggregating: averaging the whole frame also tries to
# average the string "type" column, which raises TypeError on pandas >= 2.0.
avg_fare_eachcity_list = pyber_city_ride_df.groupby("city")["fare"].mean()
avg_fare_eachcity_list

city
Amandaburgh         24.641667
Barajasview         25.332273
Barronchester       36.422500
Bethanyland         32.956111
Bradshawfurt        40.064000
                      ...    
West Robert         25.123871
West Samuelburgh    21.767600
Williamsonville     31.875000
Williamsstad        24.362174
Williamsview        26.599000
Name: fare, Length: 120, dtype: float64

In [22]:
# Total number of rides per city (count of ride_id values within each city group)
tot_rides_list = pyber_city_ride_df.groupby("city")["ride_id"].count()
tot_rides_list

city
Amandaburgh         18
Barajasview         22
Barronchester       16
Bethanyland         18
Bradshawfurt        10
                    ..
West Robert         31
West Samuelburgh    25
Williamsonville     14
Williamsstad        23
Williamsview        20
Name: ride_id, Length: 120, dtype: int64

In [23]:
# Average driver count per city.
# Select "driver_count" before aggregating: averaging the whole frame also tries
# to average the string "type" column, which raises TypeError on pandas >= 2.0.
Avg_drivercount_list = pyber_city_ride_df.groupby("city")["driver_count"].mean()
Avg_drivercount_list

city
Amandaburgh         12.0
Barajasview         26.0
Barronchester       11.0
Bethanyland         22.0
Bradshawfurt         7.0
                    ... 
West Robert         39.0
West Samuelburgh    73.0
Williamsonville      2.0
Williamsstad        59.0
Williamsview        46.0
Name: driver_count, Length: 120, dtype: float64

In [24]:
# Split the merged data into three DataFrames (urban, suburban, rural); urban first
urban_cities_df = pyber_city_ride_df.loc[pyber_city_ride_df["type"].eq("Urban")]
urban_cities_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban


In [25]:
# Rides that took place in suburban cities
suburban_cities_df = pyber_city_ride_df.loc[pyber_city_ride_df["type"].eq("Suburban")]
suburban_cities_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
1625,Barronchester,2019-01-27 03:08:01,27.79,6653622887913,11,Suburban
1626,East Kentstad,2019-04-07 19:44:19,18.75,6575961095852,20,Suburban
1627,Lake Omar,2019-01-17 21:33:35,21.71,966911700371,22,Suburban
1628,Myersshire,2019-02-27 17:38:39,17.1,5706770909868,19,Suburban
1629,West Hannah,2019-04-19 01:06:59,37.78,2273047151891,12,Suburban


In [32]:
# Rides that took place in rural cities
rural_cities_df = pyber_city_ride_df.loc[pyber_city_ride_df["type"].eq("Rural")]
rural_cities_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
2250,Randallchester,2019-04-13 11:13:31,43.22,1076079536213,9,Rural
2251,North Holly,2019-02-02 14:54:00,12.42,1985256326182,8,Rural
2252,Michaelberg,2019-03-27 18:27:34,54.85,4421836952718,6,Rural
2253,Lake Latoyabury,2019-02-23 21:12:24,47.9,3269652929887,2,Rural
2254,Lake Latoyabury,2019-05-06 08:57:56,51.8,4018025271936,2,Rural


In [35]:
# Display the rural rides DataFrame (pandas abbreviates the middle rows)
rural_cities_df

Unnamed: 0,city,date,fare,ride_id,driver_count,type
2250,Randallchester,2019-04-13 11:13:31,43.22,1076079536213,9,Rural
2251,North Holly,2019-02-02 14:54:00,12.42,1985256326182,8,Rural
2252,Michaelberg,2019-03-27 18:27:34,54.85,4421836952718,6,Rural
2253,Lake Latoyabury,2019-02-23 21:12:24,47.90,3269652929887,2,Rural
2254,Lake Latoyabury,2019-05-06 08:57:56,51.80,4018025271936,2,Rural
...,...,...,...,...,...,...
2370,Michaelberg,2019-04-29 17:04:39,13.38,8550365057598,6,Rural
2371,Lake Latoyabury,2019-01-30 00:05:47,20.76,9018727594352,2,Rural
2372,North Jaime,2019-02-10 21:03:50,11.11,2781339863778,1,Rural
2373,West Heather,2019-05-07 19:22:15,44.94,4256853490277,4,Rural


In [38]:
# Number of rides in each rural city
Rural_cities = rural_cities_df.groupby("city")["ride_id"].count()
Rural_cities

city
Bradshawfurt         10
Garzaport             3
Harringtonfort        6
Jessicaport           6
Lake Jamie            6
Lake Latoyabury      11
Michaelberg          12
New Ryantown          6
Newtonview            4
North Holly           9
North Jaime           8
Penaborough           5
Randallchester        5
South Jennifer        7
South Marychester     8
South Saramouth       4
Taylorhaven           6
West Heather          9
Name: ride_id, dtype: int64

In [None]:
# Ride count per urban city
urban_ridecount_list = urban_cities_df.groupby("city")["ride_id"].count()
urban_ridecount_list

In [None]:
# Ride count per suburban city
suburban_ridecount_list = suburban_cities_df.groupby("city")["ride_id"].count()
suburban_ridecount_list

In [None]:
# Ride count per rural city
rural_ridecount_list = rural_cities_df.groupby("city")["ride_id"].count()
rural_ridecount_list

In [None]:
# Average fare for each urban city.
# "fare" is selected before aggregating so the string "type" column is never
# averaged (that raises TypeError on pandas >= 2.0).
urban_avgfare_list = urban_cities_df.groupby("city")["fare"].mean()
urban_avgfare_list

In [None]:
# Average fare for each suburban city.
# "fare" is selected before aggregating so the string "type" column is never
# averaged (that raises TypeError on pandas >= 2.0).
suburban_avgfare_list = suburban_cities_df.groupby("city")["fare"].mean()
suburban_avgfare_list

In [None]:
# Average fare for each rural city.
# "fare" is selected before aggregating so the string "type" column is never
# averaged (that raises TypeError on pandas >= 2.0).
rural_avgfare_list = rural_cities_df.groupby("city")["fare"].mean()
rural_avgfare_list

In [None]:
# Get the average number of drivers for each urban city.
# "driver_count" is selected before aggregating so the string "type" column is
# never averaged (that raises TypeError on pandas >= 2.0).
urban_avgdrivercount_list = urban_cities_df.groupby("city")["driver_count"].mean()
urban_avgdrivercount_list

In [None]:
# Get the average number of drivers for each suburban city.
# "driver_count" is selected before aggregating so the string "type" column is
# never averaged (that raises TypeError on pandas >= 2.0).
suburban_avgdrivercount_list = suburban_cities_df.groupby("city")["driver_count"].mean()
suburban_avgdrivercount_list

In [None]:
# Get the average number of drivers for each rural city.
# "driver_count" is selected before aggregating so the string "type" column is
# never averaged (that raises TypeError on pandas >= 2.0).
rural_avgdrivercount_list = rural_cities_df.groupby("city")["driver_count"].mean()
rural_avgdrivercount_list

In [None]:
# Bubble chart for urban cities (the first of three separate per-type charts):
#   x axis      -> number of rides per city
#   y axis      -> average fare per city
#   marker size -> 10x the average driver count of the city
plt.scatter(
    urban_ridecount_list,
    urban_avgfare_list,
    s=urban_avgdrivercount_list.mul(10),
    c="coral",
    edgecolor="black",
    linewidths=1,
    alpha=0.8,
    label="Urban",
)
plt.title("PyBer Ride-Sharing Data (2019)")
plt.xlabel("Total Number of Rides (Per City)")
plt.ylabel("Average Fare ($)")
plt.grid(True)
# Add the legend.
plt.legend()

In [None]:
# Bubble chart for suburban cities: rides per city vs. average fare,
# with marker size equal to 10x the average driver count.
plt.scatter(
    suburban_ridecount_list,
    suburban_avgfare_list,
    s=suburban_avgdrivercount_list.mul(10),
    c="skyblue",
    edgecolor="black",
    linewidths=1,
    alpha=0.8,
    label="Suburban",
)
plt.title("PyBer Ride-Sharing Data (2019)")
plt.xlabel("Total Number of Rides (Per City)")
plt.ylabel("Average Fare ($)")
plt.grid(True)
# Add the legend.
plt.legend()

In [None]:
# Bubble chart for rural cities: rides per city vs. average fare,
# with marker size equal to 10x the average driver count.
plt.scatter(
    rural_ridecount_list,
    rural_avgfare_list,
    s=rural_avgdrivercount_list.mul(10),
    c="gold",
    edgecolor="black",
    linewidths=1,
    alpha=0.8,
    label="Rural",
)
plt.title("PyBer Ride-Sharing Data (2019)")
plt.xlabel("Total Number of Rides (Per City)")
plt.ylabel("Average Fare ($)")
plt.grid(True)
# Add the legend.
plt.legend()

In [None]:
# Combine the three per-type scatter charts (urban, suburban, rural) into one figure.
# Marker size is 10x the average driver count of each city.
plt.scatter(urban_ridecount_list, urban_avgfare_list, s=10*urban_avgdrivercount_list,
      c="coral", edgecolor="black", linewidths=1,
      alpha=0.8, label="Urban")

plt.scatter(suburban_ridecount_list, suburban_avgfare_list, s=10*suburban_avgdrivercount_list,
      c="skyblue", edgecolor="black", linewidths=1,
      alpha=0.8, label="Suburban")

plt.scatter(rural_ridecount_list, rural_avgfare_list, s=10*rural_avgdrivercount_list,
      c="gold", edgecolor="black", linewidths=1,
      alpha=0.8, label="Rural")

# Incorporate the other graph properties
plt.title("PyBer Ride-Sharing Data (2019)", fontsize=20)
plt.ylabel("Average Fare ($)", fontsize=12)
plt.xlabel("Total Number of Rides (Per City)", fontsize=12)
plt.grid(True)

# Build the customised legend once (a second plt.legend() call would simply replace the first).
# mode="Expanded" was dropped: the documented values are "expand" or None, so it had no effect.
lgnd = plt.legend(fontsize=12, scatterpoints=1, loc="best", title="City Types")

# Draw every legend marker at the same size so it does not mirror the bubble sizes.
# `legend_handles` is the public name since Matplotlib 3.7; fall back to the
# older `legendHandles` attribute on earlier versions.
legend_markers = getattr(lgnd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgnd.legendHandles
for legend_marker in legend_markers:
    legend_marker.set_sizes([75])
lgnd.get_title().set_fontsize(12)

# Explanatory note, positioned in data coordinates.
plt.text(42, 35, "Note: Circle size correlates with driver count per city.", fontsize=12)

# Save the figure before showing it, then actually call plt.show().
plt.savefig("resources/pyberscatterplot_fig1.png")
plt.show()

In [None]:
# Display the per-city ride counts for urban cities
urban_ridecount_list

In [None]:
# Summary statistics for all 3 city types, starting with the urban per-city ride counts.
# (Per-type source frames: urban_cities_df, suburban_cities_df, rural_cities_df.)
urban_ridecount_list.describe()


In [None]:
# Summary statistics of the per-city ride counts for suburban cities
suburban_ridecount_list.describe()

In [None]:
# Summary statistics of the per-city ride counts for rural cities
rural_ridecount_list.describe()

In [None]:
# Mean ride count per city for (urban, suburban, rural), rounded to 2 decimals
tuple(
    round(ride_counts.mean(), 2)
    for ride_counts in (urban_ridecount_list, suburban_ridecount_list, rural_ridecount_list)
)

In [None]:
# Median ride count per city for (urban, suburban, rural), rounded to 2 decimals
tuple(
    round(ride_counts.median(), 2)
    for ride_counts in (urban_ridecount_list, suburban_ridecount_list, rural_ridecount_list)
)

In [None]:
# Mode(s) of the ride count per city for (urban, suburban, rural); each entry is a Series
tuple(
    round(ride_counts.mode(), 2)
    for ride_counts in (urban_ridecount_list, suburban_ridecount_list, rural_ridecount_list)
)

In [None]:
# Measures of central tendency for the ride count of the urban cities:
# compute all three first, then report them.
mean_urban_ride_count = np.mean(urban_ridecount_list)
median_urban_ride_count = np.median(urban_ridecount_list)
mode_urban_ride_count = sts.mode(urban_ridecount_list)

print(f"The mean for the ride counts for urban trips is {mean_urban_ride_count:.2f}.")
print(f"The median for the ride counts for urban trips is {median_urban_ride_count}.")
print(f"The mode for the ride counts for urban trips is {mode_urban_ride_count}.")

In [None]:
# Calculate the measures of central tendency for the ride count for the suburban cities.
# The results get their own suburban-named variables so they do not overwrite
# the urban statistics.
mean_suburban_ride_count = np.mean(suburban_ridecount_list)
print(f"The mean for the ride counts for suburban trips is {mean_suburban_ride_count:.2f}.")

median_suburban_ride_count = np.median(suburban_ridecount_list)
print(f"The median for the ride counts for suburban trips is {median_suburban_ride_count}.")

mode_suburban_ride_count = sts.mode(suburban_ridecount_list)
print(f"The mode for the ride counts for suburban trips is {mode_suburban_ride_count}.")

In [None]:
# Calculate the measures of central tendency for the ride count for the rural cities.
# The results get their own rural-named variables so they do not overwrite
# the urban statistics.
mean_rural_ride_count = np.mean(rural_ridecount_list)
print(f"The mean for the ride counts for rural trips is {mean_rural_ride_count:.2f}.")

median_rural_ride_count = np.median(rural_ridecount_list)
print(f"The median for the ride counts for rural trips is {median_rural_ride_count}.")

mode_rural_ride_count = sts.mode(rural_ridecount_list)
print(f"The mode for the ride counts for rural trips is {mode_rural_ride_count}.")

In [None]:
# Individual ride fares for each city type (one value per ride)
urban_fares, suburban_fares, rural_fares = (
    type_frame["fare"]
    for type_frame in (urban_cities_df, suburban_cities_df, rural_cities_df)
)

In [None]:
# Preview the first urban fares
urban_fares.head()

In [None]:
# Preview the first suburban fares
suburban_fares.head()

In [None]:
# Preview the first rural fares
rural_fares.head()

In [None]:
# Measures of central tendency for the fares of urban rides:
# compute all three first, then report them.
mean_urban_fares = np.mean(urban_fares)
median_urban_fares = np.median(urban_fares)
mode_urban_fares = sts.mode(urban_fares)

print(f"The mean fare price for urban trips is ${mean_urban_fares:.2f}.")
print(f"The median fare price for urban trips is ${median_urban_fares:.2f}.")
print(f"The mode fare price for urban trips is {mode_urban_fares}.")

In [None]:
# Measures of central tendency for the fares of suburban rides:
# compute all three first, then report them.
mean_suburban_fares = np.mean(suburban_fares)
median_suburban_fares = np.median(suburban_fares)
mode_suburban_fares = sts.mode(suburban_fares)

print(f"The mean fare price for suburban trips is ${mean_suburban_fares:.2f}.")
print(f"The median fare price for suburban trips is ${median_suburban_fares:.2f}.")
print(f"The mode fare price for suburban trips is {mode_suburban_fares}.")

In [None]:
# Measures of central tendency for the fares of rural rides:
# compute all three first, then report them.
mean_rural_fares = np.mean(rural_fares)
median_rural_fares = np.median(rural_fares)
mode_rural_fares = sts.mode(rural_fares)

print(f"The mean fare price for rural trips is ${mean_rural_fares:.2f}.")
print(f"The median fare price for rural trips is ${median_rural_fares:.2f}.")
print(f"The mode fare price for rural trips is {mode_rural_fares}.")

In [None]:
# Display the urban rides DataFrame before extracting its driver counts
urban_cities_df

In [None]:
# Summary statistics for the number of drivers by city type (module 5.4.3).
# Driver counts are taken from the urban ride rows, so there is one value per ride.
urban_drivers = urban_cities_df.loc[:, "driver_count"]
urban_drivers.head()

In [None]:
# Driver counts taken from the suburban ride rows (one value per ride)
suburban_drivers = suburban_cities_df.loc[:, "driver_count"]
suburban_drivers.head()

In [None]:
# Driver counts taken from the rural ride rows (one value per ride)
rural_drivers = rural_cities_df.loc[:, "driver_count"]
rural_drivers.head()

In [None]:
# Mean and median (NumPy) and mode (SciPy) of the urban driver counts:
# compute all three first, then report them.
mean_urban_drivers = np.mean(urban_drivers)
median_urban_drivers = np.median(urban_drivers)
mode_urban_drivers = sts.mode(urban_drivers)

print(f"The mean driver count for urban trips is {mean_urban_drivers:.2f}.")
print(f"The median driver count for urban trips is {median_urban_drivers:.2f}.")
print(f"The mode driver count for urban trips is {mode_urban_drivers}.")

In [None]:
# Mean and median (NumPy) and mode (SciPy) of the suburban driver counts:
# compute all three first, then report them.
mean_suburban_drivers = np.mean(suburban_drivers)
median_suburban_drivers = np.median(suburban_drivers)
mode_suburban_drivers = sts.mode(suburban_drivers)

print(f"The mean driver count for suburban trips is {mean_suburban_drivers:.2f}.")
print(f"The median driver count for suburban trips is {median_suburban_drivers:.2f}.")
print(f"The mode driver count for suburban trips is {mode_suburban_drivers}.")

In [None]:
# Mean and median (NumPy) and mode (SciPy) of the rural driver counts:
# compute all three first, then report them.
mean_rural_drivers = np.mean(rural_drivers)
median_rural_drivers = np.median(rural_drivers)
mode_rural_drivers = sts.mode(rural_drivers)

print(f"The mean driver count for rural trips is {mean_rural_drivers:.2f}.")
print(f"The median driver count for rural trips is {median_rural_drivers:.2f}.")
print(f"The mode driver count for rural trips is {mode_rural_drivers}.")

In [None]:
# Box-and-whisker plots, used to look for outliers.
# Create a box-and-whisker plot for the urban cities ride count.
x_labels = ["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_ridecount_list)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Count Data - Urban (2019)')
ax.set_ylabel('Number of Rides')
ax.set_yticks(np.arange(10, 41, step=2.0))
ax.grid()
plt.show()

In [None]:
# Summary statistics to read alongside the urban ride-count box plot
urban_ridecount_list.describe()

In [None]:
# Box-and-whisker plot of the ride count for the suburban cities.
# The box is labelled "Suburban" to match the data being plotted.
x_labels = ["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_ridecount_list)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Count Data- suburban (2019)')
ax.set_ylabel('Number of Rides')
ax.set_yticks(np.arange(10, 41, step=2.0))
ax.grid()
plt.show()

In [None]:
# Summary statistics to read alongside the suburban ride-count box plot
suburban_ridecount_list.describe()

In [None]:
# Box-and-whisker plot of the ride count for the rural cities.
# The box is labelled "Rural" to match the data being plotted.
x_labels = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_ridecount_list)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Count Data- Rural (2019)')
ax.set_ylabel('Number of Rides')
# Rural cities have far fewer rides than the 10-40 tick range used for the
# other city types (3 to 12 in the counts shown earlier), so use a low range.
ax.set_yticks(np.arange(0, 15, step=1.0))
ax.grid()
plt.show()

In [None]:
# Summary statistics to read alongside the rural ride-count box plot
rural_ridecount_list.describe()

In [None]:
# Add all ride count box-and-whisker plots to the same graph; the outlier sits at 39 rides.
x_labels = ["Urban", "Suburban","Rural"]
allride_count_data = [urban_ridecount_list, suburban_ridecount_list, rural_ridecount_list]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Ride Count Data (2019)',fontsize=20)
ax.set_ylabel('Number of Rides',fontsize=14)
ax.set_xlabel("City Types",fontsize=14)
ax.boxplot(allride_count_data)
# Label the boxes through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
ax.set_yticks(np.arange(0, 45, step=3.0))
ax.grid()
# Save the figure.
plt.savefig("resources/allboxwhisk.png")
plt.show()

In [None]:
# Name of the urban city with the most rides. idxmax() returns the index label
# (the city) of the largest count, so the value 39 no longer has to be hard-coded.
urban_city_outlier = urban_ridecount_list.idxmax()
print(f"{urban_city_outlier} has the highest rider count.")

In [None]:
# Box-and-whisker plots for the ride fare data, each followed by summary statistics.
# Create a box-and-whisker plot for the urban fare data.
x_labels = ["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_fares)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Fare Data -Urban (2019)')
ax.set_ylabel('Fare($USD)')
ax.set_yticks(np.arange(0, 51, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
urban_fares.describe()

In [None]:
# Box-and-whisker plot for the suburban fare data, followed by summary statistics.
# The label is spelled "Suburban", consistent with the other charts.
x_labels = ["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_fares)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Fare Data -Suburban (2019)')
ax.set_ylabel('Fare($USD)')
ax.set_yticks(np.arange(0, 51, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
suburban_fares.describe()

In [None]:
# Box-and-whisker plot for the rural fare data, followed by summary statistics.
x_labels = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_fares)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Fare Data -rural (2019)')
ax.set_ylabel('Fare($USD)')
ax.set_yticks(np.arange(0, 65, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
rural_fares.describe()

In [None]:
# Add all ride fare box-and-whisker plots to the same graph.
x_labels = ["Urban", "Suburban","Rural"]
allfares_data = [urban_fares, suburban_fares, rural_fares]
fig, ax = plt.subplots(figsize=(10, 6))
# This chart shows fares, so the title and y-axis label describe fares
# (they previously repeated the ride-count chart's text).
ax.set_title('Ride Fare Data (2019)',fontsize=20)
ax.set_ylabel('Fare($USD)',fontsize=14)
ax.set_xlabel("City Types",fontsize=14)
ax.boxplot(allfares_data)
# Label the boxes through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
ax.set_yticks(np.arange(5, 65, step=5))
ax.grid()
# Save the figure.
plt.savefig("resources/allboxwhiskfares.png")
plt.show()

In [None]:
# Create the box-and-whisker plot for the urban driver count data, followed by summary statistics.
x_labels = ["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_drivers)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label (stray ")" removed) and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
urban_drivers.describe()

In [None]:
# Box-and-whisker plot for the suburban driver count data, followed by summary statistics.
# The label is spelled "Suburban", consistent with the other charts.
x_labels = ["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_drivers)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label (stray ")" removed) and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
suburban_drivers.describe()

In [None]:
# Box-and-whisker plot for the rural driver count data, followed by summary statistics.
x_labels = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_drivers)
# Label the box through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
# Add the title, y-axis label (stray ")" removed) and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
rural_drivers.describe()

In [None]:
# Add all driver count box-and-whisker plots to the same graph.
x_labels = ["Urban", "Suburban","Rural"]
# Use a dedicated name so the fares list `allfares_data` is not overwritten with driver data.
alldrivers_data = [urban_drivers, suburban_drivers, rural_drivers]
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Driver Count Data (2019)',fontsize=20)
# The y axis shows driver counts, not rides.
ax.set_ylabel('Number of Drivers',fontsize=14)
ax.set_xlabel("City Types",fontsize=14)
ax.boxplot(alldrivers_data)
# Label the boxes through the axis rather than boxplot's `labels=` argument,
# which Matplotlib 3.9 deprecated in favour of `tick_labels=`.
ax.set_xticklabels(x_labels)
ax.set_yticks(np.arange(0, 85,step=5.0))
ax.grid()
# Save the figure.
plt.savefig("resources/allboxwhiskdrivers.png")
plt.show()

In [None]:
# Pie charts: start by displaying the merged DataFrame the percentages are computed from
pyber_city_ride_df

In [None]:
# Total fares for each city type.
# "fare" is selected before aggregating: summing the whole frame also tries to
# sum the datetime "date" column, which is not a supported operation.
sum_fares_by_type = pyber_city_ride_df.groupby("type")["fare"].sum()
sum_fares_by_type

In [None]:
# Grand total of the fares over every ride
total_fares = pyber_city_ride_df.loc[:, "fare"].sum()
total_fares

In [None]:
# Share of the total fares contributed by each city type, as a percentage
type_percents = sum_fares_by_type.mul(100).div(total_fares)
type_percents

In [None]:
# Alternative kept for reference: calculate the percentage of fare for each city type in one expression.
# type_percents = 100 * pyber_city_ride_df.groupby(["type"]).sum()["fare"] / pyber_city_ride_df["fare"].sum()
# type_percents

In [None]:
# Imported in this cell because the later pie-chart cells reference `mpl` as well.
import matplotlib as mpl

# Pie chart of each city type's share of the total fares.
# Change the font size BEFORE drawing: rcParams only affect text created
# afterwards, so setting it after plt.pie() left this chart at the default size.
mpl.rcParams['font.size'] = 14
plt.subplots(figsize=(10, 6))
plt.pie(type_percents,
    # Labels come from the data so each wedge is named after its own value.
    # Colors and explode are positional and assume the Rural, Suburban, Urban order.
    labels=list(type_percents.index),
    colors=["gold", "lightskyblue", "lightcoral"],
    explode=[0, 0, 0.1],
    autopct='%1.1f%%',
    shadow=True, startangle=150)
plt.title("% of Total Fares by City Type")
# Save the figure, then show it (plt.show must be called, not just referenced).
plt.savefig("resources/pie1.png")
plt.show()

In [None]:
# Get the total rides for all the city types.
total_ride = pyber_city_ride_df["ride_id"].count()
total_ride

In [None]:
# Get the total number of rides for each city type.
total_ride_by_type = pyber_city_ride_df.groupby(["type"]).count()["ride_id"]
total_ride_by_type

In [None]:
# Calculate the percentage of rides for each city type.
ride_percents = 100 * (pyber_city_ride_df.groupby(["type"]).count()["ride_id"] / pyber_city_ride_df["ride_id"].count())
ride_percents

In [None]:
# Build percentage of rides by city type pie chart.
plt.subplots(figsize=(10, 6))
plt.pie(ride_percents,
    labels=["Rural", "Suburban", "Urban"],
    colors=["gold", "lightskyblue", "lightcoral"],
    explode=[0, 0, 0.1],
    autopct='%1.1f%%',
    shadow=True, startangle=150)
plt.title("% of Total Rides by City Type")
# Change the default font size from 10 to 14.
mpl.rcParams['font.size'] = 14
# Save Figure
plt.savefig("resources/pie2.png")
# Show Figure
plt.show()

In [None]:
# Total number of drivers for each city type.
# Summed from the city data (one row per city). The merged frame has one row
# per ride, so summing there would count a city's drivers once for every ride.
total_drivercount_type = city_data_df.groupby("type")["driver_count"].sum()
total_drivercount_type

In [None]:
# Total number of drivers across all cities.
# Summed from the city data (one row per city). The merged frame has one row
# per ride, so summing there would count a city's drivers once for every ride.
Total_all_drivercount = city_data_df["driver_count"].sum()
Total_all_drivercount

In [None]:
# Calculate the percentage of drivers for each city type.
driver_percents = 100 * city_data_df.groupby(["type"]).sum()["driver_count"] /city_data_df["driver_count"].sum()
driver_percents

In [None]:
# Build percentage of rides by city type pie chart.
plt.subplots(figsize=(10, 6))
plt.pie(driver_percents,
    labels=["Rural", "Suburban", "Urban"],
    colors=["gold", "lightskyblue", "lightcoral"],
    explode=[0, 0, 0.1],
    autopct='%1.1f%%',
    shadow=True, startangle=165)
plt.title("% of Total Drivers by City Type")
# Change the default font size from 10 to 14.
mpl.rcParams['font.size'] = 14
# Save Figure
plt.savefig("resources/pie3.png")
# Show Figure
plt.show()