# Data Investigation - Stations

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob

import seaborn as sns
sns.set()

### Import Data

In [2]:
print('Loading Trip Data...')

try:
    file_path_slug = '../../datasets/bayareabikeshare/*_station_data.csv'

    # glob all files
    file_list = glob(file_path_slug)

    station_import = pd.DataFrame()

    counter = 1
    chunks = []

    # load data from each file
    for file in file_list:

        # import file in chunks
        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):

            # set chunk index column to 'Trip ID'
#             chunk = chunk.set_index('Trip ID')

            # define Columns
            chunk.columns = ['station_id', 'name', 'lat', 'long', 'dockcount', 'landmark', 'installation']            
            
            # append chunk to chunks list
            chunks.append(chunk)

        print('\tFinished file! (%d of %d)' % (counter, len(file_list)))
        counter += 1

    # concat chunks
    station_import = pd.concat(chunks)

    print('Data Loaded Successfully!')

except:
    print('oops... something went wrong importing the data :(')

Loading Trip Data...
	Finished file! (1 of 4)
	Finished file! (2 of 4)
	Finished file! (3 of 4)
	Finished file! (4 of 4)
Data Loaded Successfully!


In [3]:
station_import.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 69
Data columns (total 7 columns):
station_id      276 non-null float64
name            276 non-null object
lat             276 non-null float64
long            276 non-null float64
dockcount       276 non-null float64
landmark        276 non-null object
installation    276 non-null object
dtypes: float64(4), object(3)
memory usage: 75.1+ KB


### Clean Data

In [4]:
def label_zip(row):
    if row['landmark'] == 'San Francisco':
       return '94107'
    if row['landmark'] == 'Redwood City':
        return '94063'
    if row['landmark'] == 'Palo Alto':
        return '94301'
    if row['landmark'] == 'Mountain View':
        return '94041'
    if row['landmark'] == 'San Jose':
        return '95113'
    return '99999'

In [21]:
def make_lat_long(row):
    lat = row['lat']
    long = row['long']
    return (lat, long)

In [22]:
station_data = station_import.copy()

# remove dulplicates
print('remove dulplicates')
station_data.drop_duplicates(keep='first', inplace=True)
station_data.dropna(how='all', inplace=True)

# set datatype for each column
print('set datatype for each column')
station_data['station_id']   = station_data['station_id'].astype('int')
station_data['name']         = station_data['name'].astype('str')
station_data['lat']          = station_data['lat'].astype('float')
station_data['long']         = station_data['long'].astype('float')
station_data['landmark']     = station_data['landmark'].astype('category')

# add a zipcode column for later comparison with weather data
station_data['zip_code'] = station_data.apply(lambda row: label_zip (row),axis=1)
station_data['zip_code'] = station_data['lat'].astype('str')

# create lat,lon tuple column
station_data['lat_long'] = station_data.apply(lambda row: make_lat_long (row),axis=1)

# reindex to remove some extra duplicate
print('correcting index')
station_data.reset_index(inplace=True)
station_data.drop_duplicates(['station_id', 'installation'], keep='first', inplace=True)
station_data.set_index('station_id', inplace=True)
station_data.sort_index(inplace=True)
del station_data['index']

station_clean = station_data.copy()
print('Cleaning complete!')
station_clean.info()

remove dulplicates
set datatype for each column
correcting index
Cleaning complete!
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 2 to 91
Data columns (total 8 columns):
name            77 non-null object
lat             77 non-null float64
long            77 non-null float64
dockcount       77 non-null float64
landmark        77 non-null category
installation    77 non-null object
zip_code        77 non-null object
lat_long        77 non-null object
dtypes: category(1), float64(3), object(4)
memory usage: 5.1+ KB


In [23]:
station_clean.head()

Unnamed: 0_level_0,name,lat,long,dockcount,landmark,installation,zip_code,lat_long
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,8/6/2013,37.329732,"(37.329732, -121.90178200000001)"
3,San Jose Civic Center,37.330698,-121.888979,15.0,San Jose,8/5/2013,37.330698,"(37.330698, -121.888979)"
4,Santa Clara at Almaden,37.333988,-121.894902,11.0,San Jose,8/6/2013,37.333988,"(37.333988, -121.894902)"
5,Adobe on Almaden,37.331415,-121.8932,19.0,San Jose,8/5/2013,37.331415,"(37.331415, -121.8932)"
6,San Pedro Square,37.336721,-121.894074,15.0,San Jose,8/7/2013,37.336721,"(37.336721000000004, -121.894074)"
