# Data Investigation - Stations

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

In [2]:
# `IPython.core.display` is a deprecated location for these names; the public
# import path is `IPython.display`.
from IPython.display import display, HTML

# Inject CSS so the notebook's `.container` element uses 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Global matplotlib font size, plus the label / title sizes referenced by plots.
font = dict(size=50)
matplotlib.rc('font', **font)

LABEL_FONT_SIZE, TITLE_FONT_SIZE = 15, 25

## Key Dates

In [4]:
# First and last service dates (naive datetimes at midnight) used as bounds by
# the cleaning code.
FIRST_SERVICE_DATE = datetime.datetime(2013, 8, 29)
LAST_SERVICE_DATE = datetime.datetime(2016, 8, 31)

## Import Data

In [5]:
print('Loading Station Data...')

# Column names applied to every chunk of every station file.
STATION_COLUMNS = ['station_id', 'name', 'lat', 'long', 'dock_count', 'landmark', 'first_service_date']

file_path_slug = '../../../datasets/bayareabikeshare/*_station_data.csv'

# glob() returns paths in arbitrary, filesystem-dependent order; sorting makes
# station_01..station_04 refer to the same files on every run.
file_list = sorted(glob(file_path_slug))

if not file_list:
    # Fail here with a clear message instead of reporting success with empty frames.
    raise FileNotFoundError('no station data files match %r' % file_path_slug)

station_frames = []

try:
    for counter, file in enumerate(file_list, start=1):
        chunks = []

        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):
            chunk.columns = STATION_COLUMNS
            chunks.append(chunk)

        station_frames.append(pd.concat(chunks))
        print('\tFinished file! (%d of %d)' % (counter, len(file_list)))
except Exception:
    # Keep the friendly message, but re-raise so the real traceback is shown
    # and later cells do not silently run against empty DataFrames.
    print('oops... something went wrong importing the data :(')
    raise

# Later cells use exactly these four names: pad with empty frames when fewer
# than four files were found, and ignore any files beyond the fourth.
station_frames += [pd.DataFrame() for _ in range(4 - len(station_frames))]
station_01, station_02, station_03, station_04 = station_frames[:4]

print('Data Loaded Successfully!')

Loading Station Data...
	Finished file! (1 of 4)
	Finished file! (2 of 4)
	Finished file! (3 of 4)
	Finished file! (4 of 4)
Data Loaded Successfully!


In [7]:
# Sanity check on the raw load: stack the four station frames and show the
# combined row count, dtypes and non-null counts.
tmp = pd.concat([station_01, station_02, station_03, station_04])
tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 69
Data columns (total 7 columns):
station_id            276 non-null float64
name                  276 non-null object
lat                   276 non-null float64
long                  276 non-null float64
dock_count            276 non-null float64
landmark              276 non-null object
first_service_date    276 non-null object
dtypes: float64(4), object(3)
memory usage: 75.1+ KB


## Preview loaded data

In [None]:
# dtypes and non-null counts of the first loaded station frame
station_01.info()

In [None]:
# first five rows of station_01
station_01.head(5)

In [None]:
# dtypes and non-null counts of the second loaded station frame
station_02.info()

In [None]:
# first five rows of station_02
station_02.head(5)

In [None]:
# dtypes and non-null counts of the third loaded station frame
station_03.info()

In [None]:
# first five rows of station_03
station_03.head(5)

In [None]:
# dtypes and non-null counts of the fourth loaded station frame
station_04.info()

In [None]:
# first five rows of station_04
station_04.head(5)

In [None]:
# NOTE(review): same call as the station_04.info() cell two cells earlier --
# this cell looks redundant; confirm and remove.
station_04.info()

## Cleaning Functions

In [None]:
def label_zip(row):
    ''' Return the zip code assigned to the row's landmark.

    row is indexed with the key 'landmark'. Landmarks that are not one of
    the five known service areas get the placeholder 99999.
    '''
    zip_by_landmark = {
        'San Francisco': 94107,
        'Redwood City': 94063,
        'Palo Alto': 94301,
        'Mountain View': 94041,
        'San Jose': 95113,
    }
    return zip_by_landmark.get(row['landmark'], 99999)

def days_in_service(row):
    ''' Return the number of days the station was in service, as an int.

    row must expose `last_service_date` and `first_service_date` attributes;
    the result is the `.days` of their difference.

    Returns the sentinel 999999 when the difference has no usable day count
    (for example, a missing date makes the difference NaT, whose `.days`
    cannot be converted to int).
    '''
    elapsed = row.last_service_date - row.first_service_date

    try:
        result = int(elapsed.days)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Only conversion failures map to the sentinel; a bare `except` would
        # also swallow KeyboardInterrupt / SystemExit.
        result = 999999

    return result

def correct_first_service_date(row):
    ''' Return the row's first_service_date, raised to FIRST_SERVICE_DATE
    when it falls before that date.
    '''
    return FIRST_SERVICE_DATE if row.first_service_date < FIRST_SERVICE_DATE else row.first_service_date


def clean_import_data_types(df):
    ''' Cast the columns of a freshly imported station frame to their working dtypes.

    df is modified in place and also returned. `first_service_date` is parsed
    as month/day/year; values that do not parse become NaT.
    '''
    import_dtypes = (
        ('station_id', 'int'),
        ('name', 'str'),
        ('lat', 'float'),
        ('long', 'float'),
        ('landmark', 'category'),
        ('dock_count', 'int'),
    )
    for column, dtype in import_dtypes:
        df[column] = df[column].astype(dtype)

    df['first_service_date'] = pd.to_datetime(df['first_service_date'], format='%m/%d/%Y', errors='coerce')

    return df

def clean_final_data_types(df):
    ''' Cast every column of the fully cleaned station frame to its final dtype.

    df is modified in place and also returned. Both service-date columns go
    through pd.to_datetime with a month/day/year format and errors='coerce'.
    '''
    final_dtypes = (
        ('station_id', 'int'),
        ('name', 'str'),
        ('lat', 'float'),
        ('long', 'float'),
        ('landmark', 'category'),
        ('dock_count', 'int'),
        ('zip_code', 'str'),
        ('days_in_service', 'int'),
    )
    for column, dtype in final_dtypes:
        df[column] = df[column].astype(dtype)

    for date_column in ('first_service_date', 'last_service_date'):
        df[date_column] = pd.to_datetime(df[date_column], format='%m/%d/%Y', errors='coerce')

    return df


## Clean Data

- Merge Dataframes, drop duplicates
- Corrections provided by Bay Area Bike Share Notes

In [None]:
# NOTE(review): `stations_df` is not assigned anywhere in the visible notebook,
# so this cell would raise NameError on a fresh Restart & Run All. It is
# presumably a leftover from an earlier variable name -- confirm, then rename
# or remove.
stations_df.tail()

In [None]:
def notes_cleaning(df):
    ''' Apply the manual fixes described in the notes included with the published dataset.

    Rows are addressed by hard-coded index label, so the function depends on
    the exact row order of the frame it is given.
    NOTE(review): presumably df is the merged, de-duplicated station frame,
    sorted by station_id with a fresh 0..n-1 index and a `last_service_date`
    column already present -- confirm against the caller.

    df is modified in place and also returned. On return the rows listed for
    removal are dropped, the frame is sorted by station_id and
    first_service_date, and the index is reset.
    '''

    def _day(iso_date):
        ''' parse a 'YYYY-MM-DD' string into a naive datetime '''
        return datetime.datetime.strptime(iso_date, '%Y-%m-%d')

    # Index labels of rows to discard at the end. These rows are simply
    # dropped; their station_id is no longer overwritten with the string
    # 'JUNK' first, because writing a string into the numeric station_id
    # column is an incompatible-dtype assignment on recent pandas and had no
    # effect on the result.
    index_drop_list = []

    # FROM NOTES - correct installation dates prior to 8/29/13, to 8/29/13
    # Cleaning Step : adjust all dates prior to service start dates to FIRST_SERVICE_DATE
    df['first_service_date'] = df.apply(correct_first_service_date, axis=1)

    # FROM NOTES - Station 23: From 9/1/14 – 10/22/14: This station was located at (37.488501, -122.231061).
    # Cleaning Step : this move is across the block, throw out new location record
    index_drop_list.append(17)

    # FROM NOTES - Station 25: From 9/1/14 – 10/22/14: This station was located at (37.486725, -122.225551). It was previously named “Broadway at Main.”
    # Cleaning Step : station is renamed and moved over a mile, set end and start dates for row
    df.loc[19, 'last_service_date'] = _day('2014-09-01')
    df.loc[20, 'first_service_date'] = _day('2014-09-01')

    # FROM NOTES - Station 49: From 9/1/14 - 2/5/15: This station was located at (37.789625, -122.390264).
    # Cleaning Step : station was moved around the block, throw out new location record
    index_drop_list.append(44)

    # FROM NOTES - Station 69: From 9/1/14 – 3/11/15: This station was located at (37.776377,-122.39607).
    # Cleaning Step : station was moved around the block, throw out new location record
    index_drop_list.append(63)

    # FROM NOTES - Station 72: Moved twice. From 9/1/14 – 2/12/15, this station was located at (37.780356, -122.412919).
    #                                       From 2/13/15 to 6/3/15, the station was located at (37.780353, -122.41226).
    # Cleaning Step : the station was only relocated once on 2/13/15, not twice. move was around the corner, toss out latest record
    index_drop_list.append(67)

    # FROM NOTES - Station 80: On 9/1/14, this station changed names from "San Jose Government Center" to "Santa Clara County Civic Center." It did not move.
    # Cleaning Step : name change, second name is better, throw out original name
    index_drop_list.append(74)

    # FROM NOTES - Station 21: On 9/16/15, this station was renamed from "Franklin at Maple" to "Sequoia Hospital" and moved to (37.479303,-122.253755)
    # Cleaning Step : this is a significant move, create a new row, and adjust start and end dates
    df.loc[14, 'last_service_date'] = _day('2015-09-16')
    station_21_copy = df.loc[14, :].copy()
    station_21_copy.lat = 37.479303
    station_21_copy.long = -122.253755
    station_21_copy.first_service_date = _day('2015-09-16')
    station_21_copy.last_service_date = LAST_SERVICE_DATE
    # index label 100 is a new row appended for the relocated station 21
    df.loc[100] = station_21_copy
    df.loc[100, 'name'] = 'Sequoia Hospital'

    # FROM NOTES - Station 26: On 9/16/15, this station was renamed from "Redwood City Medical Center" to "Kaiser Hospital" and moved to (37.489704,-122.224728)
    # Cleaning Step : station was moved around the block, nothing to do here

    # FROM NOTES - Station 30: On 9/28/15, this station was renamed from "Evelyn Park and Ride" to "Middlefield Light Rail Station" and moved to (37.395337,-122.052476)
    # Cleaning Step : this is a substantial move, update start and end dates
    df.loc[25, 'last_service_date'] = _day('2015-09-28')
    df.loc[26, 'first_service_date'] = _day('2015-09-28')

    # FROM NOTES - Station 33: On 9/16/15, this station was renamed from "Rengstorff Avenue / California Street" to "Charleston Park/ North Bayshore Area" and moved to (37.420909,-122.080623)
    # Cleaning Step : this is a substantial move, update start and end dates
    df.loc[29, 'last_service_date'] = _day('2015-09-16')
    df.loc[30, 'first_service_date'] = _day('2015-09-16')

    # FROM NOTES - Station 73: Moved twice. From 3/14/16 – 5/19/16, this station was located at (37.797746, -122.407073). From 5/19/16 to 8/31/16, the station was located at (37.7979, -122.405942). The station name stayed the same for all moves.
    # Cleaning Step : the move is around the block, but more stations were added
    # The note gives the hand-over date as 5/19/16; the year was previously
    # entered as 2015.
    df.loc[68, 'last_service_date'] = _day('2016-05-19')
    df.loc[69, 'first_service_date'] = _day('2016-05-19')

    # FROM NOTES - Station 83: On 9/16/15, this station was renamed from "Mezes Park" to "Mezes" and moved to (37.491405,-122.233051)
    # Cleaning Step : moved around corner, nothing to clean

    # FROM NOTES - Note 2: On 6/30/16, Service in Redwood City was discontinued due to low usage. This included 7 stations: 21, 22, 23, 24, 25, and 26.
    # Cleaning Step : set last_service_date on these stations
    # Row 19 is deliberately left out: its end date was already set to
    # 2014-09-01 above, and closing it again here would overwrite that.
    # Row 17 is left out because it is dropped at the end.
    # Label 100 is the appended station 21 row.
    for closed_row in (15, 16, 18, 20, 21, 77, 100):
        df.loc[closed_row, 'last_service_date'] = _day('2016-06-30')

    # FROM NOTES - Four of these stations have since been moved to either San Francisco or San Jose. (Stations 23, 24, 25 and 26 have become stations 88, 89, 90 and 91 respectively). Although these stations were promptly re-named, there was a delay in assigning them new station IDs. Full details:
    # Cleaning Step : for each pair below, the old row ends and the new row starts on the rename date

    # FROM NOTES - On 7/5/16, Station 23, "San Mateo County Center," was renamed to be "5th S. at E. San Salvador St.” On 8/24/16, the station was reassigned to Station 88.
    # Cleaning Step : new row is 79
    df.loc[16, 'last_service_date'] = _day('2016-07-05')
    df.loc[79, 'first_service_date'] = _day('2016-07-05')

    # FROM NOTES - On 7/5/16, Station 24, "Redwood City Public Library," was renamed to be "S. Market St at Park Ave.” On 8/24/16, the station was reassigned to Station 89.
    # Cleaning Step : new row is 80
    df.loc[18, 'last_service_date'] = _day('2016-07-05')
    df.loc[80, 'first_service_date'] = _day('2016-07-05')

    # FROM NOTES - On 8/4/16, Station 25, "Stanford in Redwood City," was renamed to be "Cyril Magnin St at Ellis St.” On 8/24/16, the station was reassigned to Station 91.
    # Cleaning Step : new row is 82
    df.loc[20, 'last_service_date'] = _day('2016-08-04')
    df.loc[82, 'first_service_date'] = _day('2016-08-04')

    # FROM NOTES - On 8/4/16, Station 26, "Kaiser Hospital," was renamed to be "5th St at Folsom St.” On 8/24/16, the station was reassigned to Station 90.
    # Cleaning Step : new row is 81
    df.loc[21, 'last_service_date'] = _day('2016-08-04')
    df.loc[81, 'first_service_date'] = _day('2016-08-04')

    # drop all rows in drop list and clean/reset index
    df.drop(index_drop_list, inplace=True)
    df.sort_values(['station_id', 'first_service_date'], inplace=True)
    df.reset_index(inplace=True, drop=True)

    return df

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
raw_station_frames = (station_01, station_02, station_03, station_04)
frames_to_clean = [frame.copy() for frame in raw_station_frames]

# remove rows in which every column is empty, reporting each frame's shape afterwards
print('\tdropping completely empty rows')
for frame in frames_to_clean:
    frame.dropna(how='all', inplace=True)
    print(frame.shape)

# clean imported data types
print('\tsetting column data types')
frames_to_clean = [clean_import_data_types(frame) for frame in frames_to_clean]
(station_01_TOCLEAN,
 station_02_TOCLEAN,
 station_03_TOCLEAN,
 station_04_TOCLEAN) = frames_to_clean

# merge data sets, drop duplicate records, then order by station_id with a fresh index
print('\tmerging to single dataframe and droping duplicate rows')
station_data_TOCLEAN = (
    pd.concat([station_01_TOCLEAN, station_02_TOCLEAN, station_03_TOCLEAN, station_04_TOCLEAN])
    .drop_duplicates()
    .sort_values('station_id')
    .reset_index(drop=True)
)

print(station_data_TOCLEAN.shape)

In [None]:
# last ten rows of the merged, de-duplicated frame
station_data_TOCLEAN.tail(10)

In [None]:
# NOTE(review): this cell reassigns station_data_TOCLEAN, and notes_cleaning()
# addresses rows by fixed index label, so running the cell a second time
# without re-running the merge cell would presumably hit the wrong rows --
# confirm before re-executing.
print('Station Data Cleanup Started...')

print('\tadjusting installatin dates and last service dates')
# every row starts with LAST_SERVICE_DATE; notes_cleaning() then overrides
# individual rows
station_data_TOCLEAN['last_service_date'] = LAST_SERVICE_DATE
station_data_TOCLEAN['zip_code'] = station_data_TOCLEAN.apply(label_zip, axis=1)

print('\tspecific notes cleaning started')
station_data_TOCLEAN = notes_cleaning(station_data_TOCLEAN)

print('\tcalculating days in service')
station_data_TOCLEAN['days_in_service'] = station_data_TOCLEAN.apply(days_in_service, axis=1)

# clean data column types a final time
print('\tsetting column data types')
station_data_TOCLEAN = clean_final_data_types(station_data_TOCLEAN)

# independent copy used by the rest of the notebook
station_data = station_data_TOCLEAN.copy()

print('Cleaning complete!')

In [None]:
# dtypes and non-null counts after the final type-cleaning pass
station_data_TOCLEAN.info()

## Preview Final Data Frame

In [None]:
# Total docks per service area. The aggregate must be a sum of dock_count:
# .count() only counts rows per landmark, which is not what the title and
# y-axis label describe.
# NOTE(review): station_data can hold more than one row per station_id, so a
# station with several rows presumably contributes its docks more than once
# -- confirm whether de-duplication is wanted here.
docks_by_service_area = station_data.groupby('landmark')['dock_count'].sum().to_frame()
plt.subplots(figsize=(15,6))
ax = sns.barplot(x=docks_by_service_area.index, y='dock_count', data=docks_by_service_area)
ax.set_title('Total Docks', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Service Area', size=LABEL_FONT_SIZE, rotation=90)
ax.set_xticklabels(sorted(pd.unique(station_data.landmark)), rotation=0)
ax.set_ylabel('Number of Docks', size=LABEL_FONT_SIZE)
plt.show()

In [None]:
# Bar plot of days_in_service per service area.
plt.subplots(figsize=(15,6))
ax = sns.barplot(x='landmark', y='days_in_service', data=station_data)
ax.set_title('Days in Service', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Service Area', size=LABEL_FONT_SIZE)
# the y axis shows days_in_service, not a dock count
ax.set_ylabel('Days in Service', size=LABEL_FONT_SIZE)
plt.show()

In [None]:
# One marker per station row: dock_count grouped by service area.
fig, ax = plt.subplots(figsize=(15,6))
sns.swarmplot(x='landmark', y='dock_count', data=station_data, s=10, ax=ax)
ax.set_title('Dock Per Station', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Service Area', size=LABEL_FONT_SIZE)
ax.set_ylabel('Number of Docks', size=LABEL_FONT_SIZE)
plt.show()

In [None]:
# dtypes and non-null counts of the final station_data frame
station_data.info()

In [None]:

# For every zip code present, print a heading (when the zip code is one of the
# five known ones), a rule, and the sorted unique station names.
heading_by_zip = {
    '94107': '\nSan Francisco',
    '94063': '\nRedwood City',
    '94301': '\nPalo Alto',
    '94041': '\nMountain View',
    '95113': '\nSan Jose',
}

for item in pd.unique(station_data.zip_code):
    t = station_data[station_data.zip_code == item]

    heading = heading_by_zip.get(item)
    if heading is not None:
        print(heading)
    print('-' * 80)
    print(sorted(pd.unique(t.name)))




In [None]:
San Jose
--------------------------------------------------------------------------------
['5th S. at E. San Salvador St', 'Adobe on Almaden', 'Arena Green / SAP Center', 'Japantown', 'MLK Library', 
 'Paseo de San Antonio', 'Ryland Park', 'S. Market st at Park Ave', 'SJSU - San Salvador at 9th', 'SJSU 4th at San Carlos', 
 'San Jose City Hall', 'San Jose Civic Center', 'San Jose Diridon Caltrain Station', 'San Jose Government Center', 'San Pedro Square', 
 'San Salvador at 1st', 'Santa Clara at Almaden', 'St James Park']

Redwood City
--------------------------------------------------------------------------------
['Broadway at Main', 'Franklin at Maple', 'Mezes Park',  'Redwood City Medical Center', 
 'Redwood City Public Library', 'San Mateo County Center', 'Sequoia Hospital', 'Stanford in Redwood City']

Mountain View
--------------------------------------------------------------------------------
['Castro Street and El Camino Real', 'Charleston Park/ North Bayshore Area', 'Evelyn Park and Ride', 
 'Middlefield Light Rail Station',  'Mountain View City Hall', 'Rengstorff Avenue / California Street', 
 'San Antonio Caltrain Station', 'San Antonio Shopping Center']

Palo Alto
--------------------------------------------------------------------------------
[, 'Cowper at University', , 'Park at Olive', 
 'University and Emerson']

San Francisco
--------------------------------------------------------------------------------
['2nd at Folsom', '2nd at South Park', '2nd at Townsend', '5th St at Folsom St', '5th at Howard', 'Beale at Market', 
 'Broadway St at Battery St', , 'Clay at Battery', 'Commercial at Montgomery', 
 'Cyril Magnin St at Ellis St', 'Davis at Jackson', , 'Golden Gate at Polk', 'Grant Avenue at Columbus Avenue',  
 'Howard at 2nd', 'Market at 10th', 'Market at 4th', 'Market at Sansome', 'Mechanics Plaza (Market at Battery)', 'Post at Kearney', 
  'Powell at Post (Union Square)',  
 'San Francisco City Hall', 'South Van Ness at Market', 'Spear at Folsom', 'Steuart at Market',  
 'Townsend at 7th', 'Washington at Kearney', 'Yerba Buena Center of the Arts (3rd @ Howard)']



# Hand-grouped station names.
# NOTE(review): the text above these assignments in the same cell looks like
# pasted output rather than Python, so the cell presumably cannot run as-is --
# confirm and move these lists into their own cell.
caltrian_stations = [
    'San Francisco Caltrain (Townsend at 4th)',
    'San Francisco Caltrain 2 (330 Townsend)',
    'Redwood City Caltrain Station',
    'Mountain View Caltrain Station',
    'California Ave Caltrain Station',
    'Palo Alto Caltrain Station',
]

mass_transit_stations = [
    'Powell Street BART',
    'Civic Center BART (7th at Market)',
    'Harry Bridges Plaza (Ferry Building)',
    'Temporary Transbay Terminal (Howard at Beale)',
]

embarcadero_stations = [
    'Embarcadero at Bryant',
    'Embarcadero at Folsom',
    'Embarcadero at Sansome',
    'Embarcadero at Vallejo',
]

In [None]:
def label_commuter(row):
    ''' Return True when row['name'] equals 1, otherwise False.

    NOTE(review): 'name' is cast to str by the type-cleaning functions, so
    this comparison presumably never matches and every row is labelled
    False. It looks like an unfinished placeholder for a membership test
    against a list of commuter station names -- confirm the intended rule.
    '''
    return True if row['name'] == 1 else False


# add the is_commuter flag, computed row by row with label_commuter
station_data['is_commuter'] = station_data.apply(label_commuter, axis=1)

In [None]:
# first 45 rows, by position
station_data.iloc[:45]

In [None]:
# every row from position 45 onwards
station_data.iloc[45:]

## Write to File

In [None]:
# Write the cleaned frame to disk as UTF-8 CSV.
cleaned_station_csv = '../clean_data/station_data_cleaned_final.csv'
station_data.to_csv(cleaned_station_csv, encoding='utf-8')

## Quick Reference to Special Stations

> Stations that kept their station_id while other information changed: mostly the location and, in one case, the dock count

> All Stations in Redwood City were closed on June 30, 2016

In [None]:
# rows whose station_id occurs more than once (every occurrence is kept)
has_repeated_id = station_data.duplicated(subset=['station_id'], keep=False)
specials = station_data.loc[has_repeated_id]
specials.head(20)

In [None]:
# rows whose landmark is Redwood City
in_redwood_city = station_data.landmark == 'Redwood City'
redwood_city = station_data.loc[in_redwood_city]
redwood_city.head(20)

## Quick Preview All Station Data

In [None]:
# first five rows of the final station_data frame
station_data.head(5)

In [None]:
def _station_ids_for(landmark):
    ''' Return the sorted unique station_id values of rows with the given landmark. '''
    in_area = station_data.landmark == landmark
    return sorted(pd.unique(station_data[in_area]['station_id']))


# One list of station ids per service area, printed in a form that can be
# pasted back in as an assignment.
redwoodCity_stations = _station_ids_for('Redwood City')
print('redwoodCity_stations =', redwoodCity_stations)

mountainView_stations = _station_ids_for('Mountain View')
print('mountainView_stations =', mountainView_stations)

sanJose_stations = _station_ids_for('San Jose')
print('sanJose_stations =', sanJose_stations)

sanFrancisco_stations = _station_ids_for('San Francisco')
print('sanFrancisco_stations =', sanFrancisco_stations)

paloAlto_stations = _station_ids_for('Palo Alto')
print('paloAlto_stations =', paloAlto_stations)
