In [1]:
import pandas as pd

filename = 'chicago.csv'

# read the CSV named above into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=filename)

In [2]:
# preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0


In [6]:
# convert the Start Time column to datetime.
# The values look like '2017-06-23 15:09:32', so no explicit format is passed:
# the previous '%d%b%Y:%H:%M:%S.%f' pattern does not match these strings and
# makes the conversion fail. pandas parses this layout on its own.
df['Start Time'] = pd.to_datetime(df['Start Time'])

In [7]:
# add an 'hour' column holding the hour-of-day component of each Start Time
start_times = df['Start Time']
df['hour'] = start_times.dt.hour

In [8]:
# preview the first five rows, now including the 'hour' column
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,hour
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0,15
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0,18
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0,8
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0,13
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0,14


In [9]:
# most frequent start hour (0-23); mode() yields a Series, so pick its
# first entry by position
popular_hour = df['hour'].mode().iloc[0]

In [10]:
# report the most common start hour as a single line of text
hour_report = ' '.join(['Most Frequent Start Hour:', str(popular_hour)])
print(hour_report)

Most Frequent Start Hour: 17


In [11]:
# add a 'day_of_week' column with the weekday name of each Start Time.
# day_name() is used because the .dt.weekday_name attribute was deprecated in
# pandas 0.23 and removed in pandas 1.0.
df['day_of_week'] = df['Start Time'].dt.day_name()
print(df['day_of_week'].head())

0       Friday
1     Thursday
2    Wednesday
3       Monday
4      Tuesday
Name: day_of_week, dtype: object


In [25]:
# calendar month number of each trip's Start Time
df['month'] = df['Start Time'].dt.month
# route label built as "<Start Station> to <End Station>"
df['journey'] = df['Start Station'].str.cat(others=df['End Station'], sep=' to ')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,hour,day_of_week,month,journey
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0,15,Friday,6,Wood St & Hubbard St to Damen Ave & Chicago Ave
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0,18,Thursday,5,Theater on the Lake to Sheffield Ave & Wavelan...
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0,8,Wednesday,1,May St & Taylor St to Wood St & Taylor St
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0,13,Monday,3,Christiana Ave & Lawrence Ave to St. Louis Ave...
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0,14,Tuesday,1,Clark St & Randolph St to Desplaines St & Jack...


In [14]:
# count the rows for each user type, most common first
user_types = df['User Type'].value_counts(sort=True, ascending=False)

print(user_types)

Subscriber    238889
Customer       61110
Dependent          1
Name: User Type, dtype: int64


In [23]:
# mode() returns a Series (several entries when there is a tie). Take the first
# entry as a plain value rather than rendering the whole Series with
# to_string(), which would embed multi-line text in the sentence on a tie.
pop_start = df['Start Station'].mode()[0]
pop_end = df['End Station'].mode()[0]
print('The most popular start station is {}.'.format(pop_start))
print('The most popular end station is {}.'.format(pop_end))

The most popular start station is Streeter Dr & Grand Ave.
The most popular end station is Streeter Dr & Grand Ave.


In [26]:
# total travel time: the sum of every value in the Trip Duration column
trip_durations = df['Trip Duration']
total_time = trip_durations.sum()
print(total_time)

280871787


In [None]:
import pandas as pd

# Maps each supported city name to the CSV file holding its data.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}


def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Month and day names are matched case-insensitively and surrounding
    whitespace is ignored, so 'March', ' march ' and 'march' are equivalent.

    Args:
        (str) city - name of the city to analyze; must be a key of CITY_DATA
        (str) month - name of the month to filter by (january through june),
                      or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - pandas DataFrame containing city data filtered by month and day,
             with 'month' (month number) and 'day_of_week' (weekday name)
             columns added
    Raises:
        KeyError - if city is not a key of CITY_DATA
        ValueError - if month is neither "all" nor one of the supported month names
    """

    # load data file into a dataframe
    df = pd.read_csv(CITY_DATA[city])

    # convert the Start Time column to datetime
    df['Start Time'] = pd.to_datetime(df['Start Time'])

    # extract month and day of week from Start Time to create new columns.
    # day_name() is used because the .dt.weekday_name attribute was deprecated
    # in pandas 0.23 and removed in pandas 1.0.
    df['month'] = df['Start Time'].dt.month
    df['day_of_week'] = df['Start Time'].dt.day_name()

    # filter by month if applicable
    month_key = month.strip().lower()
    if month_key != 'all':
        # position in this list (+1) gives the corresponding month number
        months = ['january', 'february', 'march', 'april', 'may', 'june']
        if month_key not in months:
            raise ValueError(
                "month must be 'all' or one of {}; got {!r}".format(months, month)
            )
        month_number = months.index(month_key) + 1

        # filter by month to create the new dataframe
        df = df[df['month'] == month_number]

    # filter by day of week if applicable
    day_key = day.strip().lower()
    if day_key != 'all':
        # day_name() yields capitalised names such as 'Friday', hence title()
        df = df[df['day_of_week'] == day_key.title()]

    return df
    
# load the Chicago data restricted to Fridays in March
df = load_data(city='chicago', month='march', day='friday')

