In [136]:
# Dependencies
import pandas as pd
import os
import math
import time
import datetime

In [137]:
# Create a list of all csv files to clean up
path_name = '../Resources/'
csv_file_list = []
csv_file_list = os.listdir(path_name)

# Remove '.DS_Store' file (the “DS” stands for “Desktop Services”) 
# is important in helping Mac work out how to display folders when you open them. 
if '.DS_Store' in csv_file_list: csv_file_list.remove('.DS_Store')
    
csv_file_list

['201701-citibike-tripdata - TEST FILE.csv']

# Citi Bike Trip Histories. 
The data includes:

Trip Duration (seconds)
Start Time and Date
Stop Time and Date
Start Station Name
End Station Name
Station ID
Station Lat/Long
Bike ID
User Type (Customer = 24-hour pass or 3-day pass user; Subscriber = Annual Member)
Gender (Zero=unknown; 1=male; 2=female)
Year of Birth

Winter: December (12), January (1), February (2),
Spring: March (3), April (4), May (5)
Summer: June (6), July (7), August (8)
Autumn: September (9), October (10), November (11)

In [138]:
# csv files have same columns but different heaers.
# Define the coulmn headers for the dataframe of csv contents
new_column_headers = ['Trip Duration', 'Start Time', 'Stop Time', 'Start Station ID',
                   'Start Station Name', 'Start Station Latitude',
                   'Start Station Longitude', 'End Station ID', 'End Station Name',
                   'End Station Latitude', 'End Station Longitude', 'Bike ID', 'User Type',
                   'Birth Year', 'Gender']

In [139]:
start_time = time.time()
start_time

1546295744.109482

In [140]:
# Define the empty dataframe with only column headers.
raw_df  = pd.DataFrame(columns=new_column_headers)

# Loop through the csv files and append to the row dataframe defined above. 
# At the end of the loop the row dataframe will contain the contents of all the csv files
for file in csv_file_list:
    
    file_name = path_name + file
    print(file_name)
    
    current_df = pd.read_csv(file_name, encoding="ISO-8859-1")
    current_df.columns = new_column_headers
    raw_df = raw_df.append(current_df)
    
    # Add Age column calculated using Birth Year and current year (2017)
    raw_df['Age'] = 2017 - raw_df['Birth Year']
    
    # Add a new column for Season - for now store the month.  
    # Later this value will be convereted to name of the season.
    raw_df['Season'] = pd.DatetimeIndex(raw_df['Start Time']).month 

raw_df.head(50)

../Resources/201701-citibike-tripdata - TEST FILE.csv


Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Age,Season
0,680,1/1/17 0:00,1/1/17 0:11,3226,W 82 St & Central Park West,40.78275,-73.97137,3165,Central Park West & W 72 St,40.775794,-73.976206,25542,Subscriber,1965.0,2,52.0,1
1,1282,1/1/17 0:00,1/1/17 0:22,3263,Cooper Square & E 7 St,40.729236,-73.990868,498,Broadway & W 32 St,40.748549,-73.988084,21136,Subscriber,1987.0,2,30.0,1
2,648,1/1/17 0:00,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,18147,Customer,,0,,1
3,631,1/1/17 0:01,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,21211,Customer,,0,,1
4,621,1/1/17 0:01,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,26819,Customer,,0,,1
5,666,1/1/17 0:01,1/1/17 0:12,3163,Central Park West & W 68 St,40.773407,-73.977825,3163,Central Park West & W 68 St,40.773407,-73.977825,16050,Subscriber,2000.0,1,17.0,1
6,559,1/1/17 0:05,1/1/17 0:14,499,Broadway & W 60 St,40.769155,-73.981918,479,9 Ave & W 45 St,40.760193,-73.991255,27294,Subscriber,1973.0,1,44.0,1
7,826,1/1/17 0:05,1/1/17 0:19,362,Broadway & W 37 St,40.751726,-73.987535,445,E 10 St & Avenue A,40.727408,-73.98142,23288,Subscriber,1977.0,2,40.0,1
8,255,1/1/17 0:05,1/1/17 0:10,430,York St & Jay St,40.701485,-73.986569,242,Carlton Ave & Flushing Ave,40.697787,-73.973736,25041,Subscriber,1989.0,1,28.0,1
9,634,1/1/17 0:07,1/1/17 0:18,3165,Central Park West & W 72 St,40.775794,-73.976206,3164,Columbus Ave & W 72 St,40.777057,-73.978985,16311,Subscriber,1980.0,1,37.0,1


In [141]:
# Get the avreage age and Birth Year to populate the 'NaN' values it corresponding columns. 
avg_birth_year = math.ceil(raw_df['Birth Year'].mean())
avg_age =  math.ceil(raw_df['Age'].mean())

raw_df.loc[pd.isna(raw_df['Birth Year']), 'Birth Year'] = avg_birth_year
raw_df.loc[pd.isna(raw_df['Age']), 'Age'] = avg_age

In [142]:
raw_df.head(50)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Age,Season
0,680,1/1/17 0:00,1/1/17 0:11,3226,W 82 St & Central Park West,40.78275,-73.97137,3165,Central Park West & W 72 St,40.775794,-73.976206,25542,Subscriber,1965.0,2,52.0,1
1,1282,1/1/17 0:00,1/1/17 0:22,3263,Cooper Square & E 7 St,40.729236,-73.990868,498,Broadway & W 32 St,40.748549,-73.988084,21136,Subscriber,1987.0,2,30.0,1
2,648,1/1/17 0:00,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,18147,Customer,1979.0,0,39.0,1
3,631,1/1/17 0:01,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,21211,Customer,1979.0,0,39.0,1
4,621,1/1/17 0:01,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,26819,Customer,1979.0,0,39.0,1
5,666,1/1/17 0:01,1/1/17 0:12,3163,Central Park West & W 68 St,40.773407,-73.977825,3163,Central Park West & W 68 St,40.773407,-73.977825,16050,Subscriber,2000.0,1,17.0,1
6,559,1/1/17 0:05,1/1/17 0:14,499,Broadway & W 60 St,40.769155,-73.981918,479,9 Ave & W 45 St,40.760193,-73.991255,27294,Subscriber,1973.0,1,44.0,1
7,826,1/1/17 0:05,1/1/17 0:19,362,Broadway & W 37 St,40.751726,-73.987535,445,E 10 St & Avenue A,40.727408,-73.98142,23288,Subscriber,1977.0,2,40.0,1
8,255,1/1/17 0:05,1/1/17 0:10,430,York St & Jay St,40.701485,-73.986569,242,Carlton Ave & Flushing Ave,40.697787,-73.973736,25041,Subscriber,1989.0,1,28.0,1
9,634,1/1/17 0:07,1/1/17 0:18,3165,Central Park West & W 72 St,40.775794,-73.976206,3164,Columbus Ave & W 72 St,40.777057,-73.978985,16311,Subscriber,1980.0,1,37.0,1


In [143]:
print(raw_df['Gender'].value_counts())

1    191
2     55
0     54
Name: Gender, dtype: int64


In [144]:
# Gender (Zero=unknown; 1=male; 2=female)
gender_counts = raw_df['Gender'].value_counts().to_dict()
row_count = raw_df.shape
print(row_count[0])
print(gender_counts)

300
{1: 191, 2: 55, 0: 54}


In [145]:
# Determine the number of males, females and missing gender fields 
missing_genders = gender_counts[0]
num_males = gender_counts[1]
num_females = gender_counts[2]
num_males_females = (num_males + num_females)

# Determine ratio of males, and females excluding missing gender fields
males_ratio = num_males / num_males_females
females_ratio = num_females / num_males_females

print("males = " + str(num_males))
print("females = " + str(num_females))
print("num missing gender = " + str(missing_genders))
print("males and females = " + str(num_males_females))
print("male ratio = " + str(males_ratio))
print("female ratio = " + str(females_ratio))

# Determime what ratio of the missing gender feilds should be set to male and female
#missing_males = math.ceil(missing_genders * males_ratio)
#missing_females =  math.ceil(missing_genders * females_ratio)

missing_males = math.floor((missing_genders * males_ratio) + 0.5)
missing_females =  math.floor((missing_genders * females_ratio) + 0.5)

print("male = " + str(missing_males))
print("female = " + str(missing_females))
print("total ratio = " + str(missing_males + missing_females))


males = 191
females = 55
num missing gender = 54
males and females = 246
male ratio = 0.7764227642276422
female ratio = 0.22357723577235772
male = 42
female = 12
total ratio = 54


In [146]:
# Loop through and convert the Gender columns with 0 values to male or female using the ratios
# calculated above.  

gender = []
males_count = 0 
    
for row in raw_df['Gender']:
    
    # 0 -> unknown, 1 -> Male, 2 -> Female 
    if  row == 1:
        gender.append('Male')
    elif  row == 2:
        gender.append('Female')
    elif  row == 0:
        if males_count <= missing_males:
            gender.append('Male')
            males_count += 1
        else: 
            gender.append('Female')
                                 
raw_df['Gender'] = gender

In [147]:
# Add Season column calculated using month portion of Start Time as follows:
# Winter: December (12), January (1), February (2),
# Spring: March (3), April (4), May (5)
# Summer: June (6), July (7), August (8)
# Autumn: September (9), October (10), November (11)

season  = []
    
for row in raw_df['Season']:
    
    if row == 1:
        season.append('Winter')
    if row == 2:
        season.append('Winter')
    if row == 3:
        season.append('Spring')
    if row == 4:
        season.append('Spring')
    if row == 5:
        season.append('Spring')
    if row == 6:
        season.append('Summer')
    if row == 7:
        season.append('Summer')
    if row == 8:
        season.append('Summer')
    if row == 9:
        season.append('Autumn')
    if row == 10:
        season.append('Autumn')
    if row == 11:
        season.append('Autumn')
    if row == 12:
        season.append('Winter')
                                 
raw_df['Season'] = season

In [148]:
raw_df.head(50)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Age,Season
0,680,1/1/17 0:00,1/1/17 0:11,3226,W 82 St & Central Park West,40.78275,-73.97137,3165,Central Park West & W 72 St,40.775794,-73.976206,25542,Subscriber,1965.0,Female,52.0,Winter
1,1282,1/1/17 0:00,1/1/17 0:22,3263,Cooper Square & E 7 St,40.729236,-73.990868,498,Broadway & W 32 St,40.748549,-73.988084,21136,Subscriber,1987.0,Female,30.0,Winter
2,648,1/1/17 0:00,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,18147,Customer,1979.0,Male,39.0,Winter
3,631,1/1/17 0:01,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,21211,Customer,1979.0,Male,39.0,Winter
4,621,1/1/17 0:01,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,26819,Customer,1979.0,Male,39.0,Winter
5,666,1/1/17 0:01,1/1/17 0:12,3163,Central Park West & W 68 St,40.773407,-73.977825,3163,Central Park West & W 68 St,40.773407,-73.977825,16050,Subscriber,2000.0,Male,17.0,Winter
6,559,1/1/17 0:05,1/1/17 0:14,499,Broadway & W 60 St,40.769155,-73.981918,479,9 Ave & W 45 St,40.760193,-73.991255,27294,Subscriber,1973.0,Male,44.0,Winter
7,826,1/1/17 0:05,1/1/17 0:19,362,Broadway & W 37 St,40.751726,-73.987535,445,E 10 St & Avenue A,40.727408,-73.98142,23288,Subscriber,1977.0,Female,40.0,Winter
8,255,1/1/17 0:05,1/1/17 0:10,430,York St & Jay St,40.701485,-73.986569,242,Carlton Ave & Flushing Ave,40.697787,-73.973736,25041,Subscriber,1989.0,Male,28.0,Winter
9,634,1/1/17 0:07,1/1/17 0:18,3165,Central Park West & W 72 St,40.775794,-73.976206,3164,Columbus Ave & W 72 St,40.777057,-73.978985,16311,Subscriber,1980.0,Male,37.0,Winter


In [149]:
end_time = time.time()
end_time

1546295753.739476

In [150]:
elapsed = end_time - start_time
elapsed

9.629993915557861

In [151]:
raw_df.head(50)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Age,Season
0,680,1/1/17 0:00,1/1/17 0:11,3226,W 82 St & Central Park West,40.78275,-73.97137,3165,Central Park West & W 72 St,40.775794,-73.976206,25542,Subscriber,1965.0,Female,52.0,Winter
1,1282,1/1/17 0:00,1/1/17 0:22,3263,Cooper Square & E 7 St,40.729236,-73.990868,498,Broadway & W 32 St,40.748549,-73.988084,21136,Subscriber,1987.0,Female,30.0,Winter
2,648,1/1/17 0:00,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,18147,Customer,1979.0,Male,39.0,Winter
3,631,1/1/17 0:01,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,21211,Customer,1979.0,Male,39.0,Winter
4,621,1/1/17 0:01,1/1/17 0:11,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,26819,Customer,1979.0,Male,39.0,Winter
5,666,1/1/17 0:01,1/1/17 0:12,3163,Central Park West & W 68 St,40.773407,-73.977825,3163,Central Park West & W 68 St,40.773407,-73.977825,16050,Subscriber,2000.0,Male,17.0,Winter
6,559,1/1/17 0:05,1/1/17 0:14,499,Broadway & W 60 St,40.769155,-73.981918,479,9 Ave & W 45 St,40.760193,-73.991255,27294,Subscriber,1973.0,Male,44.0,Winter
7,826,1/1/17 0:05,1/1/17 0:19,362,Broadway & W 37 St,40.751726,-73.987535,445,E 10 St & Avenue A,40.727408,-73.98142,23288,Subscriber,1977.0,Female,40.0,Winter
8,255,1/1/17 0:05,1/1/17 0:10,430,York St & Jay St,40.701485,-73.986569,242,Carlton Ave & Flushing Ave,40.697787,-73.973736,25041,Subscriber,1989.0,Male,28.0,Winter
9,634,1/1/17 0:07,1/1/17 0:18,3165,Central Park West & W 72 St,40.775794,-73.976206,3164,Columbus Ave & W 72 St,40.777057,-73.978985,16311,Subscriber,1980.0,Male,37.0,Winter


In [152]:
print(raw_df['Gender'].value_counts())

Male      234
Female     66
Name: Gender, dtype: int64


In [153]:
raw_df.to_csv('../Output/2017-citibike-tripdata.csv')