# Data Investigation - Stations

### Key Take Aways:

- Summer gets warm and windy
- Winter gets cool and calmer
- Year round it is at least a little cloudy (rank 3 out of 8)
- On the rare occurance of rain, primarily in Winter (Early December through Early April), it is in very little volume

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob

import seaborn as sns
sns.set()

### Import Data

In [2]:
print('Loading Station Data...')

try:
    file = 'station_data_clean.csv'
    
    station_import
try:
    file_path_slug = '../../../datasets/bayareabikeshare/*_station_data.csv'
    file_list = glob(file_path_slug)

    station_import = pd.DataFrame()

    counter = 1
    chunks = []

    for file in file_list:
        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):
            chunk.columns = ['station_id', 'name', 'lat', 'long', 'dockcount', 'landmark', 'installation']            
            chunks.append(chunk)
        print('\tFinished file! (%d of %d)' % (counter, len(file_list)))
        counter += 1

    station_import = pd.concat(chunks)
    print('Data Loaded Successfully!')
except:
    print('oops... something went wrong importing the data :(')

Loading Station Data...
	Finished file! (1 of 4)
	Finished file! (2 of 4)
	Finished file! (3 of 4)
	Finished file! (4 of 4)
Data Loaded Successfully!


In [3]:
def label_zip(row):
    if row['landmark'] == 'San Francisco':
       return '94107'
    if row['landmark'] == 'Redwood City':
        return '94063'
    if row['landmark'] == 'Palo Alto':
        return '94301'
    if row['landmark'] == 'Mountain View':
        return '94041'
    if row['landmark'] == 'San Jose':
        return '95113'
    return '99999'

def make_lat_long(row):
    lat = row['lat']
    long = row['long']
    return (lat, long)

In [4]:
print('Station Data Cleanup Started...')

station_data = station_import.copy()

# remove dulplicates
# print('\tremove dulplicates')
# station_data.drop_duplicates(keep='first', inplace=True)
print('\tdropping empty rows')
station_data.dropna(how='all', inplace=True)

# set datatype for each column
print('\tset datatype for each column')
station_data['station_id']   = station_data['station_id'].astype('int')
station_data['name']         = station_data['name'].astype('str')
station_data['lat']          = station_data['lat'].astype('float')
station_data['long']         = station_data['long'].astype('float')
station_data['landmark']     = station_data['landmark'].astype('category')

# add a zipcode column for later comparison with weather data
station_data['zip_code'] = station_data.apply(lambda row: label_zip (row),axis=1)
# station_data['zip_code'] = station_data['landmark'].astype('str')

# create lat,lon tuple column
station_data['lat_long'] = station_data.apply(lambda row: make_lat_long (row),axis=1)

station_data.drop_duplicates(station_data.columns, keep='first', inplace=True)

# # reindex to remove some extra duplicate
# print('\tcorrecting index')
# station_data.reset_index(inplace=True)
# station_data.drop_duplicates(['station_id', 'installation'], keep='first', inplace=True)
# station_data.set_index('station_id', inplace=True)
# station_data.sort_index(inplace=True)

station_data.sort_values(by=['station_id', 'installation'], inplace=True)


# del station_data['index']
print('Cleaning complete!')


Station Data Cleanup Started...
	dropping empty rows
	set datatype for each column
Cleaning complete!


In [6]:
station_data.to_csv('station_data_clean.csv', encoding='utf-8')

In [8]:
station_data.iloc[50:]

Unnamed: 0,station_id,name,lat,long,dockcount,landmark,installation,zip_code,lat_long
45,57,5th at Howard,37.781752,-122.405127,15.0,San Francisco,8/21/2013,94107,"(37.781752000000004, -122.40512700000001)"
46,58,San Francisco City Hall,37.77865,-122.418235,19.0,San Francisco,8/21/2013,94107,"(37.77865, -122.41823500000001)"
47,59,Golden Gate at Polk,37.781332,-122.418603,23.0,San Francisco,8/21/2013,94107,"(37.781332, -122.418603)"
48,60,Embarcadero at Sansome,37.80477,-122.403234,15.0,San Francisco,8/21/2013,94107,"(37.80477, -122.40323400000001)"
49,61,2nd at Townsend,37.780526,-122.390288,27.0,San Francisco,8/22/2013,94107,"(37.780526, -122.39028799999998)"
50,62,2nd at Folsom,37.785299,-122.396236,19.0,San Francisco,8/22/2013,94107,"(37.785299, -122.39623600000002)"
51,63,Howard at 2nd,37.786978,-122.398108,19.0,San Francisco,8/22/2013,94107,"(37.786978000000005, -122.39810800000001)"
52,64,2nd at South Park,37.782259,-122.392738,15.0,San Francisco,8/22/2013,94107,"(37.782259, -122.392738)"
53,65,Townsend at 7th,37.771058,-122.402717,15.0,San Francisco,8/22/2013,94107,"(37.771058000000004, -122.402717)"
54,66,South Van Ness at Market,37.774814,-122.418954,19.0,San Francisco,8/23/2013,94107,"(37.774814, -122.418954)"


In [152]:
# bay area bike share did not become active until 8/23/13, 
# adjust any installation dates prior to then to 8/23/13

def adjust_to_activation_date(row):
    print(row['installation'])
    print(type(row['installation']))
    print()

    
    
    
    






In [155]:
station_clean = station_data.copy()
station_clean['installation'] = pd.to_datetime(station_clean['installation'])
station_clean['installation'] = station_clean.apply(lambda row: adjust_to_activation_date(row),axis=1)

2013-08-06 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-05 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-06 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-05 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-07 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-07 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-05 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-05 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-06 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-06 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-07 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-06 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-05 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-07 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-12 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-15 00:00:00
<class 'pandas._libs.tslib.Timestamp'>

2013-08-15 00:00:00
<class 'pandas._libs