# Data Investigation - Stations

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import math
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob

import seaborn as sns
sns.set()

### Import Data

In [2]:
print('Loading Station Data...')

# NOTE(review): `file` names a cached CSV, but the read below targets
# '../junk.csv', so the cached branch only succeeds if that file exists.
# This looks like a deliberate way to force the raw-file rebuild -- confirm
# the intended path before pointing the read at a real cache.
file = 'station_data_clean.csv'

try:
    station_import = pd.read_csv('../junk.csv')
except Exception:
    # `except Exception` (not a bare `except`) so Ctrl-C / kernel interrupts
    # still propagate instead of silently triggering the fallback.
    try:
        file_path_slug = '../../../datasets/bayareabikeshare/*_station_data.csv'
        file_list = glob(file_path_slug)

        # Placeholder kept so `station_import` is bound even if loading fails.
        station_import = pd.DataFrame()

        if not file_list:
            # Without this check the failure would surface later as an
            # opaque "No objects to concatenate" error from pd.concat.
            raise FileNotFoundError('no files match %s' % file_path_slug)

        chunks = []

        for counter, file in enumerate(file_list, start=1):
            # Read each file in 10,000-row chunks and normalise the headers
            # so every chunk shares one schema before concatenation.
            for chunk in pd.read_csv(file, chunksize=10000, iterator=True):
                chunk.columns = ['station_id', 'name', 'lat', 'long', 'dockcount', 'landmark', 'installation']
                chunks.append(chunk)
            print('\tFinished file! (%d of %d)' % (counter, len(file_list)))

        station_import = pd.concat(chunks)
        print('Data Loaded Successfully!')
    except Exception as err:
        # Still best-effort (no re-raise), but report why the load failed
        # instead of hiding the cause.
        print('oops... something went wrong importing the data :(')
        print('\treason: %s: %s' % (type(err).__name__, err))

Loading Station Data...
	Finished file! (1 of 4)
	Finished file! (2 of 4)
	Finished file! (3 of 4)
	Finished file! (4 of 4)
Data Loaded Successfully!


In [3]:
def label_zip(row):
    """Return the zip code string associated with a row's 'landmark' value.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row['landmark']`` (e.g. a DataFrame row).

    Returns
    -------
    str
        The zip code paired with the landmark below, or '99999' when the
        landmark matches none of the listed names.
    """
    landmark_zip_pairs = (
        ('San Francisco', '94107'),
        ('Redwood City', '94063'),
        ('Palo Alto', '94301'),
        ('Mountain View', '94041'),
        ('San Jose', '95113'),
    )
    landmark = row['landmark']
    # Compare with == in the listed order; the first match wins.
    for known_landmark, zip_code in landmark_zip_pairs:
        if landmark == known_landmark:
            return zip_code
    return '99999'

def make_lat_long(row):
    """Return the row's 'lat' and 'long' values as a ``(lat, long)`` tuple."""
    return (row['lat'], row['long'])

In [4]:
print('Station Data Cleanup Started...')

# Work on a copy so the raw import is left untouched and this cell can be
# re-run from the same starting point.
station_data = station_import.copy()

print('\tdropping empty rows')
# Only rows where every column is missing are removed.
station_data.dropna(how='all', inplace=True)

# set datatype for each column
print('\tset datatype for each column')
station_data['station_id']   = station_data['station_id'].astype('int')
station_data['name']         = station_data['name'].astype('str')
station_data['lat']          = station_data['lat'].astype('float')
station_data['long']         = station_data['long'].astype('float')
station_data['landmark']     = station_data['landmark'].astype('category')

# add a zipcode column for later comparison with weather data
station_data['zip_code'] = station_data.apply(label_zip, axis=1)

# create lat,lon tuple column
station_data['lat_long'] = station_data.apply(make_lat_long, axis=1)

# Drop rows that are identical in every column (default subset = all columns).
station_data.drop_duplicates(keep='first', inplace=True)

# Sort each station's rows chronologically by installation date.
# 'installation' is never given a dtype above, so sorting the raw column
# compares text; for M/D/YYYY-style strings that order is not chronological
# (e.g. '12/31/2013' sorts before '8/5/2013'). The later keep='last'
# de-duplication per station relies on this order to pick the newest record.
# NOTE(review): assumes 'installation' holds date strings -- confirm format.
# Unparseable or missing dates become NaT and are placed first so they never
# win over a dated record; the stored 'installation' column is unchanged.
installed_at = pd.to_datetime(station_data['installation'], errors='coerce')
station_data = (
    station_data
    .assign(_installed_at=installed_at.values)  # .values: index is not unique after concat
    .sort_values(by=['station_id', '_installed_at'], na_position='first')
    .drop(columns='_installed_at')
)

print('Cleaning complete!')


Station Data Cleanup Started...
	dropping empty rows
	set datatype for each column
Cleaning complete!


In [5]:
# Keep one row per station_id (the last one in the current row order),
# then use station_id as the index.
last_row_per_station = station_data.drop_duplicates(subset='station_id', keep='last')
station_data = last_row_per_station.set_index('station_id')

In [6]:
# Write the cleaned station table to disk as UTF-8 encoded CSV.
cleaned_csv_path = '../../../datasets/bayareabikeshare/CLEANED/station_data_cleaned.csv'
station_data.to_csv(cleaned_csv_path, encoding='utf-8')