# Data Investigation - Stations

In [61]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob

import seaborn as sns
sns.set()

### Import Data

In [62]:
print('Loading Trip Data...')

try:
    file_path_slug = '../../datasets/bayareabikeshare/*_station_data.csv'

    # glob all files
    file_list = glob(file_path_slug)

    station_import = pd.DataFrame()

    counter = 1
    chunks = []

    # load data from each file
    for file in file_list:

        # import file in chunks
        for chunk in pd.read_csv(file, chunksize=10000, iterator=True):

            # set chunk index column to 'Trip ID'
#             chunk = chunk.set_index('Trip ID')

            # define Columns
            chunk.columns = ['station_id', 'name', 'lat', 'long', 'dockcount', 'landmark', 'installation']            
            
            # append chunk to chunks list
            chunks.append(chunk)

        print('\tFinished file! (%d of %d)' % (counter, len(file_list)))
        counter += 1

    # concat chunks
    station_import = pd.concat(chunks)

    print('Data Loaded Successfully!')

except:
    print('oops... something went wrong importing the data :(')

Loading Trip Data...
	Finished file! (1 of 4)
	Finished file! (2 of 4)
	Finished file! (3 of 4)
	Finished file! (4 of 4)
Data Loaded Successfully!


In [63]:
station_import.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 69
Data columns (total 7 columns):
station_id      276 non-null float64
name            276 non-null object
lat             276 non-null float64
long            276 non-null float64
dockcount       276 non-null float64
landmark        276 non-null object
installation    276 non-null object
dtypes: float64(4), object(3)
memory usage: 75.1+ KB


### Clean Data

In [88]:
def label_zip(row):
    if row['landmark'] == 'San Francisco':
       return '94107'
    if row['landmark'] == 'Redwood City':
        return '94063'
    if row['landmark'] == 'Palo Alto':
        return '94301'
    if row['landmark'] == 'Mountain View':
        return '94041'
    if row['landmark'] == 'San Jose':
        return '95113'
    return '99999'

In [91]:
station_data = station_import.copy()

# remove dulplicates
print('remove dulplicates')
station_data.drop_duplicates(keep='first', inplace=True)
station_data.dropna(how='all', inplace=True)

# set datatype for each column
print('set datatype for each column')
station_data['station_id']   = station_data['station_id'].astype('int')
station_data['name']         = station_data['name'].astype('str')
station_data['lat']          = station_data['lat'].astype('float')
station_data['long']         = station_data['long'].astype('float')
station_data['landmark']     = station_data['landmark'].astype('category')




# add a zipcode column for later comparison with weather data
station_data['zip_code'] = station_data.apply(lambda row: label_zip (row),axis=1)
station_data['zip_code'] = station_data['lat'].astype('str')



# set station_id as index
print('set station_id as index')
station_data.set_index('station_id', inplace=True)





station_data.head()

remove dulplicates
set datatype for each column
set station_id as index


Unnamed: 0_level_0,name,lat,long,dockcount,landmark,installation,zip_code
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,8/6/2013,37.329732
3,San Jose Civic Center,37.330698,-121.888979,15.0,San Jose,8/5/2013,37.330698
4,Santa Clara at Almaden,37.333988,-121.894902,11.0,San Jose,8/6/2013,37.333988
5,Adobe on Almaden,37.331415,-121.8932,19.0,San Jose,8/5/2013,37.331415
6,San Pedro Square,37.336721,-121.894074,15.0,San Jose,8/7/2013,37.336721


In [94]:
# sort by index
station_clean = station_data.copy()
station_clean.sort_index(inplace=True)
station_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83 entries, 2 to 91
Data columns (total 7 columns):
name            83 non-null object
lat             83 non-null float64
long            83 non-null float64
dockcount       83 non-null float64
landmark        83 non-null category
installation    83 non-null object
zip_code        83 non-null object
dtypes: category(1), float64(3), object(3)
memory usage: 4.8+ KB


In [101]:
station_test = station_clean.copy()
station_test.reset_index(inplace=True)
# station_test.set_index(['station_id', 'installation'], inplace=True)

In [102]:
station_test.head(30)

Unnamed: 0,station_id,name,lat,long,dockcount,landmark,installation,zip_code
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27.0,San Jose,8/6/2013,37.329732
1,3,San Jose Civic Center,37.330698,-121.888979,15.0,San Jose,8/5/2013,37.330698
2,4,Santa Clara at Almaden,37.333988,-121.894902,11.0,San Jose,8/6/2013,37.333988
3,5,Adobe on Almaden,37.331415,-121.8932,19.0,San Jose,8/5/2013,37.331415
4,6,San Pedro Square,37.336721,-121.894074,15.0,San Jose,8/7/2013,37.336721
5,7,Paseo de San Antonio,37.333798,-121.886943,15.0,San Jose,8/7/2013,37.333798
6,8,San Salvador at 1st,37.330165,-121.885831,15.0,San Jose,8/5/2013,37.330165
7,9,Japantown,37.348742,-121.894715,15.0,San Jose,8/5/2013,37.348742
8,10,San Jose City Hall,37.337391,-121.886995,15.0,San Jose,8/6/2013,37.337391
9,11,MLK Library,37.335885,-121.88566,19.0,San Jose,8/6/2013,37.335885


In [103]:
station_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 8 columns):
station_id      83 non-null int64
name            83 non-null object
lat             83 non-null float64
long            83 non-null float64
dockcount       83 non-null float64
landmark        83 non-null category
installation    83 non-null object
zip_code        83 non-null object
dtypes: category(1), float64(3), int64(1), object(3)
memory usage: 4.9+ KB


In [106]:
station_test.drop_duplicates(['station_id', 'installation'], keep='first', inplace=True)