In [225]:
import pandas as pd
import numpy as np

In [245]:
# FILTERS:
state_filter = "chicago"
month_filter = "February"
day_filter = "Wednesday"

if state_filter == "chicago":
    df = pd.read_csv("./data/chicago.csv")
elif state_filter == "new york":
    df = pd.read_csv("./data/new_york_city.csv")
elif state_filter == "new york":
    df = pd.read_csv("./data/washington.csv")

In [252]:
df["Gender"].value_counts().keys().to_list()

['Male', 'Female']

In [228]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     300000 non-null  int64  
 1   Start Time     300000 non-null  object 
 2   End Time       300000 non-null  object 
 3   Trip Duration  300000 non-null  int64  
 4   Start Station  300000 non-null  object 
 5   End Station    300000 non-null  object 
 6   User Type      300000 non-null  object 
 7   Gender         238948 non-null  object 
 8   Birth Year     238981 non-null  float64
dtypes: float64(1), int64(2), object(6)
memory usage: 20.6+ MB


## Clean up dataframe

In [229]:
# Change column names
df = df.rename(columns={
    "Unnamed: 0" : "id", 
    "Start Time" : "start_time", 
    "End Time" : "end_time", 
    "Trip Duration" : "trip_duration", 
    "Start Station" : "start_station", 
    "End Station" : "end_station", 
    "User Type" : "user_type", 
    "Gender" : "gender", 
    "Birth Year" : "birth_year"
})


df["birth_year"] = df["birth_year"].fillna(0)
df["birth_year"] = df["birth_year"].astype(int)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             300000 non-null  int64 
 1   start_time     300000 non-null  object
 2   end_time       300000 non-null  object
 3   trip_duration  300000 non-null  int64 
 4   start_station  300000 non-null  object
 5   end_station    300000 non-null  object
 6   user_type      300000 non-null  object
 7   gender         238948 non-null  object
 8   birth_year     300000 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 20.6+ MB


In [230]:
df["start_time"] = pd.to_datetime(df['start_time'])
df["end_time"] = pd.to_datetime(df['end_time'])

df["year"] = df["start_time"].dt.year
df["month"] = df["start_time"].dt.month_name()
df["day"] = df["start_time"].dt.day
df["weekday"] = df["start_time"].dt.day_name()
df["hour"] = df["start_time"].dt.hour

## 1: Popular times of travel
- most common month
- most common day of the week
- most common hour of the day

In [231]:
counts = df["month"].value_counts()
month = counts.nlargest().index[0]
amount = counts.nlargest().iloc[0]
print(f"The most common month of travel is {month} with {format(amount, ',d')} travels")

The most common month of travel is June with 98,081 travels


In [232]:
counts = df["weekday"].value_counts()
day = counts.nlargest().index[0]
amount = counts.nlargest().iloc[0]
print(f"The most common day of travel is {day} with {format(amount, ',d')} travels")

The most common day of travel is Tuesday with 45,912 travels


In [233]:
counts = df["hour"].value_counts()
hour = counts.nlargest().index[0]
hour = f"{hour - 12}:00 PM" if hour >= 12 else f"{hour}:00 AM"
amount = counts.nlargest().iloc[0]
print(f"The most common hour of travel is {hour} with {format(amount, ',d')} travels")

The most common hour of travel is 5:00 PM with 35,992 travels


In [234]:
# FILTER THE DF
filtered_df = df

if month_filter != "no":
    filtered_df = filtered_df[filtered_df["month"] == month_filter]

if day_filter != "no":
    filtered_df = filtered_df[filtered_df["weekday"] == day_filter]

df = filtered_df
df

Unnamed: 0,id,start_time,end_time,trip_duration,start_station,end_station,user_type,gender,birth_year,year,month,day,weekday,hour
10,175805,2017-02-15 07:09:55,2017-02-15 07:14:56,301,Broadway & Barry Ave,Wilton Ave & Belmont Ave,Subscriber,Female,1993,2017,February,15,Wednesday,7
35,243879,2017-02-22 15:33:56,2017-02-22 15:54:07,1211,Streeter Dr & Grand Ave,Theater on the Lake,Customer,,0,2017,February,22,Wednesday,15
49,141745,2017-02-08 07:13:31,2017-02-08 07:16:17,166,Pine Grove Ave & Irving Park Rd,Clarendon Ave & Gordon Ter,Subscriber,Male,1986,2017,February,8,Wednesday,7
55,239793,2017-02-22 07:51:44,2017-02-22 08:00:14,510,Broadway & Barry Ave,Sedgwick St & Webster Ave,Subscriber,Male,1988,2017,February,22,Wednesday,7
122,178170,2017-02-15 14:45:08,2017-02-15 14:59:11,843,State St & Van Buren St,Clinton St & 18th St,Subscriber,Male,1985,2017,February,15,Wednesday,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299512,245744,2017-02-22 17:10:48,2017-02-22 17:33:07,1339,Lake Shore Dr & Ohio St,Clinton St & Lake St,Subscriber,Female,1984,2017,February,22,Wednesday,17
299579,141346,2017-02-08 04:22:07,2017-02-08 04:28:06,359,Broadway & Belmont Ave,Broadway & Waveland Ave,Subscriber,Male,1989,2017,February,8,Wednesday,4
299727,178512,2017-02-15 16:08:34,2017-02-15 16:23:21,887,Mies van der Rohe Way & Chicago Ave,Green St & Randolph St,Subscriber,Male,1973,2017,February,15,Wednesday,16
299788,145200,2017-02-08 17:40:03,2017-02-08 17:56:06,963,Franklin St & Jackson Blvd,Clark St & Elm St,Subscriber,Male,1986,2017,February,8,Wednesday,17


## 2: Popular stations and trip
- most common start station
- most common end station
- most common trip from start to end (combination of start and end stations)

In [235]:
start_station = df["start_station"]
counts = start_station.value_counts()
station = counts.nlargest().index[0]
amount = counts.nlargest().iloc[0]
print(f"The most common starting station is {station} with {format(amount, ',d')} departures")

The most common starting station is Clinton St & Washington Blvd with 115 departures


In [236]:
start_station = df["end_station"]
counts = start_station.value_counts()
station = counts.nlargest().index[0]
amount = counts.nlargest().iloc[0]
print(f"The most common ending station is {station} with {format(amount, ',d')} arrivals")

The most common ending station is Clinton St & Washington Blvd with 131 arrivals


In [237]:
df["start_end_stations"] = df["start_station"] + " and " + df["end_station"]
start_end_stations = df["start_end_stations"]
counts = start_end_stations.value_counts()
trip = counts.nlargest().index[0]
amount = counts.nlargest().iloc[0]
print(f"The most common trip from start to end is between {trip} with {format(amount, ',d')} travels")

The most common trip from start to end is between Michigan Ave & Washington St and Canal St & Madison St with 9 travels


## 3: Trip Duration
- total travel time
- average travel time

In [238]:
travel_times = df["trip_duration"]
print(f"{format(travel_times.sum() // 60, ',d')} minutes")

60,572 minutes


In [239]:
travel_times = df["trip_duration"]
print(f"{travel_times.mean()/60} minutes on average")

11.926064842160532 minutes on average


## 4: User Info
- counts of each user type
- counts of each gender (only for NYC and Chicago)
- earliest year of birth (only for NYC and Chicago)
- most recent year of birth (only for NYC and Chicago)
- most common year of birth (only for NYC and Chicago)

In [240]:
types = df["user_type"]
types.value_counts()

user_type
Subscriber    4765
Customer       314
Name: count, dtype: int64

In [241]:
genders = df["gender"]
counts = genders.value_counts()

print(f"Males: {format(counts["Male"], ',d')}")
print(f"Females: {format(counts["Female"], ',d')}")

Males: 3,738
Females: 1,026


In [242]:
birth_year = df["birth_year"]
years = birth_year.unique()
years = years[years != 0]
minimum = years.min()
print(f"The earliest year of birth is {minimum}")

The earliest year of birth is 1918


In [243]:
birth_year = df["birth_year"]
years = birth_year.unique()
years = years[years != 0]
most_recent = years.max()
print(f"The most recent year of birth is {most_recent}")

The most recent year of birth is 2000


In [244]:
birth_year = df["birth_year"]
birth_year = birth_year[birth_year != 0]
counts = birth_year.value_counts()
most_common = counts.nlargest().index[1]
print(f"The most common year of birth is {most_common}")

The most common year of birth is 1989
