# Data Investigation - Stations

In [12]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob

import seaborn as sns
sns.set()

In [13]:
# Widen the notebook container so wide DataFrame outputs are readable.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

### Import Data

In [14]:
print('Loading Station Data...')

# Column names imposed on every raw station file so that all files share one schema.
STATION_COLUMNS = ['station_id', 'name', 'lat', 'long', 'dockcount', 'landmark', 'installation']

try:
    file_path_slug = '../../../datasets/bayareabikeshare/*_station_data.csv'

    # glob() returns matches in arbitrary order; sorting keeps the concatenation
    # order (and therefore the row order of station_import) reproducible.
    file_list = sorted(glob(file_path_slug))
    if not file_list:
        raise FileNotFoundError('no station files match %s' % file_path_slug)

    chunks = []
    for counter, file in enumerate(file_list, start=1):
        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):
            # Assigning the names (rather than passing names=) raises if a file
            # does not have exactly the expected number of columns.
            chunk.columns = STATION_COLUMNS
            chunks.append(chunk)
        print('\tFinished file! (%d of %d)' % (counter, len(file_list)))

    station_import = pd.concat(chunks)
    print('Data Loaded Successfully!')
except Exception:
    # Keep the friendly message, but re-raise so the real cause is visible and
    # "Run All" stops here instead of failing later on a missing/empty frame.
    print('oops... something went wrong importing the data :(')
    raise

Loading Station Data...
	Finished file! (1 of 4)
	Finished file! (2 of 4)
	Finished file! (3 of 4)
	Finished file! (4 of 4)
Data Loaded Successfully!


In [15]:
def label_zip(row):
    """Return the ZIP code string assigned to the row's landmark.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'landmark' (e.g. a DataFrame row).

    Returns
    -------
    str
        The ZIP code paired with the landmark, or '99999' when the landmark
        is not one of the known cities.
    """
    landmark_zip_pairs = (
        ('San Francisco', '94107'),
        ('Redwood City', '94063'),
        ('Palo Alto', '94301'),
        ('Mountain View', '94041'),
        ('San Jose', '95113'),
    )

    # Checked in order with ==, first match wins.
    for city, city_zip in landmark_zip_pairs:
        if row['landmark'] == city:
            return city_zip

    # Sentinel for landmarks without a known ZIP code.
    return '99999'

def make_lat_long(row):
    """Return the row's coordinates as a (lat, long) tuple.

    `row` is anything indexable by the keys 'lat' and 'long'.
    """
    return (row['lat'], row['long'])

In [20]:
print('Station Data Cleanup Started...')

# Work on a copy so the imported frame is left untouched.
station_data = station_import.copy()

# Discard rows in which every field is missing.
print('\tdropping empty rows')
station_data.dropna(how='all', inplace=True)

# Cast the identifying, naming and location columns to explicit dtypes.
print('\tset datatype for each column')
station_data = station_data.astype({
    'station_id': 'int',
    'name': 'str',
    'lat': 'float',
    'long': 'float',
    'landmark': 'category',
})

# Derive a zip code per row from its landmark, for later comparison with weather data.
station_data['zip_code'] = station_data.apply(label_zip, axis=1)

# Derive a (lat, long) tuple per row.
station_data['lat_long'] = station_data.apply(make_lat_long, axis=1)

print('Cleaning complete!')


Station Data Cleanup Started...
	dropping empty rows
	set datatype for each column
Cleaning complete!


In [21]:
# Turn the concatenated per-file row index into a regular 'index' column.
station_data = station_data.reset_index()

# 'installation' holds date strings such as '8/6/2013'. Sorting those strings
# directly is lexicographic ('8/20/2013' sorts before '8/6/2013'), so order by
# the parsed date instead. Values that do not match month/day/year become NaT
# and sort last.
# NOTE(review): assumes every source file uses the m/d/Y format seen in the
# displayed rows - confirm against the raw files.
installed_on = pd.to_datetime(
    station_data['installation'], format='%m/%d/%Y', errors='coerce'
)
station_data = (
    station_data
    .assign(_installed_on=installed_on)
    .sort_values(by=['station_id', '_installed_on'])
    .drop(columns='_installed_on')
)

# Drop rows that are identical in every column, keeping the first occurrence.
station_data = station_data.drop_duplicates(keep='first')

# Drop remaining rows that share a station_id, keeping the first (earliest
# installation after the sort above).
station_data = station_data.drop_duplicates(subset=['station_id'], keep='first')


In [7]:
# Order the stations and promote station_id from a column to the row index.
station_data = (
    station_data
    .sort_values(by=['station_id', 'installation'])
    .set_index('station_id')
)

station_data.head()

Unnamed: 0_level_0,index,name,lat,long,dockcount,landmark,installation,zip_code,lat_long
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,0,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,8/6/2013,95113,"(37.329732, -121.90178200000001)"
3,1,San Jose Civic Center,37.330698,-121.888979,15.0,San Jose,8/5/2013,95113,"(37.330698, -121.888979)"
4,2,Santa Clara at Almaden,37.333988,-121.894902,11.0,San Jose,8/6/2013,95113,"(37.333988, -121.894902)"
5,3,Adobe on Almaden,37.331415,-121.8932,19.0,San Jose,8/5/2013,95113,"(37.331415, -121.8932)"
6,4,San Pedro Square,37.336721,-121.894074,15.0,San Jose,8/7/2013,95113,"(37.336721000000004, -121.894074)"


In [8]:
# Write the cleaned station table to disk as UTF-8 CSV. station_id is the row
# index at this point, and the leftover 'index' column from the earlier
# reset_index is a regular column, so it is exported as well.
# NOTE(review): the target directory '../clean_data/' is presumably expected to
# exist already - confirm before running on a fresh checkout.
station_data.to_csv('../clean_data/station_data_cleaned.csv', encoding='utf-8')

In [9]:
# Preview the first five rows of the exported table.
station_data.head(5)

Unnamed: 0_level_0,index,name,lat,long,dockcount,landmark,installation,zip_code,lat_long
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,0,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,8/6/2013,95113,"(37.329732, -121.90178200000001)"
3,1,San Jose Civic Center,37.330698,-121.888979,15.0,San Jose,8/5/2013,95113,"(37.330698, -121.888979)"
4,2,Santa Clara at Almaden,37.333988,-121.894902,11.0,San Jose,8/6/2013,95113,"(37.333988, -121.894902)"
5,3,Adobe on Almaden,37.331415,-121.8932,19.0,San Jose,8/5/2013,95113,"(37.331415, -121.8932)"
6,4,San Pedro Square,37.336721,-121.894074,15.0,San Jose,8/7/2013,95113,"(37.336721000000004, -121.894074)"


In [10]:
# Show the first 40 rows by position (not by station_id label).
station_data.iloc[:40]

Unnamed: 0_level_0,index,name,lat,long,dockcount,landmark,installation,zip_code,lat_long
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,0,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,8/6/2013,95113,"(37.329732, -121.90178200000001)"
3,1,San Jose Civic Center,37.330698,-121.888979,15.0,San Jose,8/5/2013,95113,"(37.330698, -121.888979)"
4,2,Santa Clara at Almaden,37.333988,-121.894902,11.0,San Jose,8/6/2013,95113,"(37.333988, -121.894902)"
5,3,Adobe on Almaden,37.331415,-121.8932,19.0,San Jose,8/5/2013,95113,"(37.331415, -121.8932)"
6,4,San Pedro Square,37.336721,-121.894074,15.0,San Jose,8/7/2013,95113,"(37.336721000000004, -121.894074)"
7,5,Paseo de San Antonio,37.333798,-121.886943,15.0,San Jose,8/7/2013,95113,"(37.333798, -121.88694299999999)"
8,6,San Salvador at 1st,37.330165,-121.885831,15.0,San Jose,8/5/2013,95113,"(37.330165, -121.88583100000001)"
9,7,Japantown,37.348742,-121.894715,15.0,San Jose,8/5/2013,95113,"(37.348742, -121.89471499999999)"
10,8,San Jose City Hall,37.337391,-121.886995,15.0,San Jose,8/6/2013,95113,"(37.337391, -121.886995)"
11,9,MLK Library,37.335885,-121.88566,19.0,San Jose,8/6/2013,95113,"(37.335885, -121.88566000000002)"


In [11]:
# Show every row from position 40 onward (positional, not by station_id label).
station_data.iloc[40:]

Unnamed: 0_level_0,index,name,lat,long,dockcount,landmark,installation,zip_code,lat_long
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
50,40,Harry Bridges Plaza (Ferry Building),37.795392,-122.394203,23.0,San Francisco,8/20/2013,94107,"(37.795392, -122.394203)"
51,41,Embarcadero at Folsom,37.791464,-122.391034,19.0,San Francisco,8/20/2013,94107,"(37.791464000000005, -122.391034)"
54,42,Embarcadero at Bryant,37.787152,-122.388013,15.0,San Francisco,8/20/2013,94107,"(37.787152, -122.38801299999999)"
55,43,Temporary Transbay Terminal (Howard at Beale),37.789756,-122.394643,23.0,San Francisco,8/20/2013,94107,"(37.789756, -122.39464299999999)"
56,44,Beale at Market,37.792251,-122.397086,19.0,San Francisco,8/20/2013,94107,"(37.792251, -122.39708600000002)"
57,45,5th at Howard,37.781752,-122.405127,15.0,San Francisco,8/21/2013,94107,"(37.781752000000004, -122.40512700000001)"
58,46,San Francisco City Hall,37.77865,-122.418235,19.0,San Francisco,8/21/2013,94107,"(37.77865, -122.41823500000001)"
59,47,Golden Gate at Polk,37.781332,-122.418603,23.0,San Francisco,8/21/2013,94107,"(37.781332, -122.418603)"
60,48,Embarcadero at Sansome,37.80477,-122.403234,15.0,San Francisco,8/21/2013,94107,"(37.80477, -122.40323400000001)"
61,49,2nd at Townsend,37.780526,-122.390288,27.0,San Francisco,8/22/2013,94107,"(37.780526, -122.39028799999998)"
