## Bike Share

Using Python, including NumPy and Pandas, to explore data related to bike share systems for three major cities in the United States -  Chicago, New York City and Washington. 

In [1]:
import pandas as pd
import datetime
from datetime import timedelta
import time

In [2]:
## Filenames
#chicago = 'chicago.csv'
#new_york_city = 'new_york_city.csv'
#washington = 'washington.csv'

In [3]:
CITY_DATA = { 'chicago': 'chicago.csv',
              'new york city': 'new_york_city.csv',
              'washington': 'washington.csv' }

cities = ["chicago", "new york city", "washington"]
months = {"all":0, "january":1, "februrary":2, "march":3, "april":4,"may":5, "june":6}
weekdays = {"monday":0, "tuesday":1, "wednsday":2, "thirsday":3,"friday":4, "saturday":5, "sunday":6}

In [4]:
def get_city():
    """
    Asks user to specify a city to analyze.
    
    Returns:
        (str) city - name of the city to analyze
    """
    print('Hello! Let\'s explore some US bikeshare data!')
    
    # get user input for city (chicago, new york city, washington). HINT: Use a while loop to handle invalid inputs
    city = ' '
    while city.lower() not in cities:
        city = input('Please select one of the following cities: Chicago, New York City and Washington.\n').lower()
        if city in cities:
            return city.lower()
        else:
            print("Sorry, invalid input.")

In [5]:
# get user input for month (all, january, february, ... , june)
def get_month():
    
    month = ' '
    
    while month.lower() not in months:
        month = input("Please select one of the following: 'all' (from jan to june), 'january', 'februrary', 'march', 'april', 'may', 'june' .\n").lower()
        if month == 'all':
            return "No month filter from 2017-01 to 2017-06."
        elif month in ["january", "februrary", "march", "april","may", "june"]:
            return "2017-0{}".format(months[month])
        else:
            print("Sorry, invalid input.")

In [6]:
# get user input for day of week (all, monday, tuesday, ... sunday)
def get_day():
    
    day = ' '
    
    while day.lower() not in weekdays:
        day = input("Please select one of the following: 'all' (the whole week), 'monday', 'tuesday', 'wednsday', 'thirsday', 'friday', 'saturday', 'sunday' .\n").lower() 
        if day == 'all':
            return "The whole week without day filter."
        elif day in ['monday', 'tuesday', 'wednsday', 'thirsday', 'friday', 'saturday', 'sunday']:
            return weekdays[day.lower()]
        else:
            print("Sorry, invalid input.")

    print('-'*40)

In [7]:
def load_data(city, month, day):

    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    """
    df = pd.read_csv(CITY_DATA[city])
    df_weekday = pd.to_datetime(df['Start Time']).dt.weekday.rename("Weekday")
    df = pd.concat([df,df_weekday],axis=1)
    
    if month == "No month filter from 2017-01 to 2017-06.":
        if day == "The whole week without day filter.":
            return df
        else:
            return df.loc[df['Weekday'] == day] #can also use directly df[df['Weekday'] == day]
    else:
        if day == "The whole week without day filter.":
            return df.loc[df['Start Time'].str.contains(month)]
        else:
            return df.loc[df['Start Time'].str.contains(month) & (df['Weekday'] == day)]

In [8]:
def time_stats(df):
    
    df['Start Time'] = pd.to_datetime(df['Start Time'])
    
    # display the most common month
    print('\nCalculating The Most Frequent Month of Travel...\n')
    start_time = time.time()

    df_month = df['Start Time'].dt.month
    y = df_month.value_counts().idxmax(axis=1)
    most_pop_month = [k for k, v in months.items() if v == y]
    print("The most common month is: {}.".format((most_pop_month[0]).title()))

    # display the most common day of week
    print('\nCalculating The Most Frequent Weekday of Travel...\n')
    
    df_weekday = df['Start Time'].dt.weekday
    y = df_weekday.value_counts().idxmax(axis=1)
    most_pop_weekday = [k for k, v in weekdays.items() if v == y]
    print("The most common weekday is: {}.".format((most_pop_weekday[0]).title()))

    # display the most common start hour
    print('\nCalculating The Most Frequent Start Hour of Travel...\n') 
    
    df_starthour = df['Start Time'].dt.hour
    y = df_starthour.value_counts().idxmax(axis=1)
    if y < 12:
        print("The most common start hour is: {}am.".format(y))
    elif y == 12:
        print("The most common start hour is: {}pm.".format(y))
    else:
        print("The most common start hour is: {}pm.".format(y-12))


    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

In [9]:
def station_stats(df):
    """Displays statistics on the most popular stations and trip."""

    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()

    # display most commonly used start station
    start_station = df["Start Station"].value_counts().idxmax(axis=1)
    print("The most common start station is: {}.".format(start_station))

    # display most commonly used end station
    end_station = df["End Station"].value_counts().idxmax(axis=1)
    print("The most common end station is: {}.".format(end_station))

    # display most frequent combination of start station and end station trip
    df_com = pd.concat([df["Start Station"], df["End Station"]])
    pop_com = df_com.value_counts().idxmax(axis=1)
    print("The most common combination station is: {}.".format(pop_com))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

In [10]:
def trip_duration_stats(df, city):
    """Displays statistics on the total and average trip duration."""

    print('\nCalculating Trip Duration...\n')
    start_time = time.time()

    # display total travel time
    total_travel_seconds = int(df["Trip Duration"].sum())
    total_travel_time = datetime.timedelta(seconds = total_travel_seconds)
    print("Total traval time in {} is {}.".format(city.title(),total_travel_time))

    # display mean travel time
    avg_travel_seconds = int(df["Trip Duration"].mean())
    avg_travel_time = datetime.timedelta(seconds = avg_travel_seconds)
    print("Average traval time in {} is {}.".format(city.title(),avg_travel_time))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

In [11]:
def user_stats(df, city):
    """Displays statistics on bikeshare users."""

    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # Display counts of user types
    print(df["User Type"].value_counts())
    print()

    # Display counts of gender
    if city.lower() == "washington":
        print("There is no gender information for {}.".format(city.title()))
        print()
    else:
        print(df["Gender"].value_counts())
        count_nan = len(df) - df["Gender"].count()
        print("There are {} people do not mention their gender".format(count_nan))
        print()

    # Display earliest, most recent, and most common year of birth
    if  city.lower() == "washington":
        print("There is no birth year information for {}.".format(city.title()))
    else:
        earliest_year = int(df["Birth Year"].min())
        recent_year = int(df["Birth Year"].max())
        common_year = int(df["Birth Year"].value_counts().idxmax(axis=1))
        print("The earliest, most recent and most common year of birth are respectively as following: {}, {} and {}".format(earliest_year, recent_year, common_year))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

In [12]:
def main():
    while True:
        city = get_city()
        month = get_month()
        day = get_day()
        df = load_data(city, month, day)

        time_stats(df)
        station_stats(df)
        trip_duration_stats(df, city)
        user_stats(df, city)

        restart = input('\nWould you like to restart? Enter yes or no.\n')
        if restart.lower() != 'yes':
            break

In [None]:
if __name__ == "__main__":
	main()

Hello! Let's explore some US bikeshare data!
Please select one of the following cities: Chicago, New York City and Washington.
Chicago
Please select one of the following: 'all' (from jan to june), 'january', 'februrary', 'march', 'april', 'may', 'june' .
all
Please select one of the following: 'all' (the whole week), 'monday', 'tuesday', 'wednsday', 'thirsday', 'friday', 'saturday', 'sunday' .
all

Calculating The Most Frequent Month of Travel...

The most common month is: June.

Calculating The Most Frequent Weekday of Travel...

The most common weekday is: Tuesday.

Calculating The Most Frequent Start Hour of Travel...

The most common start hour is: 5pm.

This took 0.08686685562133789 seconds.
----------------------------------------

Calculating The Most Popular Stations and Trip...

The most common start station is: Streeter Dr & Grand Ave.
The most common end station is: Streeter Dr & Grand Ave.
The most common combination station is: Streeter Dr & Grand Ave.

This took 0.1900439