# Data Investigation - Bikes

### Import Data

> Starting from the trips dataset cleaned in 'Data Investigation - Trips'.

In [None]:
%matplotlib inline

import matplotlib
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math

import seaborn as sns
sns.set()

In [None]:
# Widen the notebook container to 95% of the browser window so wide frames
# and figures are easier to read.
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Load the cleaned trips CSV in fixed-size chunks, reporting progress, and
# assemble the result into `trips_data` indexed by 'Trip ID'.
print('Loading Trip Data...')

try:
    trips_data = pd.DataFrame()
    file = '../../../datasets/bayareabikeshare/CLEANED/trip_data_cleaned_master.csv'
    chunks = []
    chunksize = 10000

    # Estimate the number of chunks for the progress messages. The context
    # manager closes the file handle, and the header line is excluded so the
    # total is not one too high when the row count is a multiple of chunksize.
    # NOTE(review): assumes one physical line per record (no embedded newlines).
    with open(file, 'r') as handle:
        num_rows = max(sum(1 for _ in handle) - 1, 0)
    num_chunks = math.ceil(num_rows / chunksize)

    # import file in chunks
    reader = pd.read_csv(file, chunksize=chunksize, parse_dates=['start_date', 'end_date'])
    for counter, chunk in enumerate(reader, start=1):

        # index each chunk by 'Trip ID' before collecting it
        chunks.append(chunk.set_index('Trip ID'))

        # report the first chunk, every tenth chunk, and the last chunk
        if counter == 1 or counter % 10 == 0 or counter == num_chunks:
            print('[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), counter, num_chunks))

    # concat chunks
    trips_data = pd.concat(chunks)

    print('Data Loaded Successfully!')

except Exception as err:
    # Keep the friendly message, but show the real cause and stop here: every
    # later cell needs trips_data, so continuing would only fail further down
    # with a less helpful error.
    print('oops... something went wrong importing the data :(')
    print('%s: %s' % (type(err).__name__, err))
    raise

In [None]:
# Quick sanity check of the loaded trips frame: prints its index, columns,
# dtypes, non-null counts and memory usage.
trips_data.info()

## Isolate Bike Data

In [None]:
# Build a per-bike summary table (one row per bike_id) from the trips frame.
# The grouped views are created once and reused for every aggregate.
trips_by_bike = trips_data.groupby('bike_id')
trip_starts = trips_by_bike['start_date']
trip_durations = trips_by_bike['duration']

bike_data = pd.DataFrame()

# Activity window and number of trips per bike.
bike_data['first_trip'] = trip_starts.min()
bike_data['latest_trip'] = trip_starts.max()
bike_data['trip_count'] = trip_starts.count()

# Whole days between a bike's first and latest trip start.
service_span = bike_data['latest_trip'] - bike_data['first_trip']
bike_data['days_in_service'] = service_span.dt.days

# Whole days between a bike's latest trip and the latest trip of any bike.
last_recorded_trip = bike_data['latest_trip'].max()
idle_span = last_recorded_trip - bike_data['latest_trip']
bike_data['days_since_last_trip'] = idle_span.dt.days

# Duration aggregates per bike. Despite its name, 'total_usage_time_per_trip'
# holds the SUM of duration over all of a bike's trips.
bike_data['total_usage_time_per_trip'] = trip_durations.sum()
bike_data['median_usage_time_per_trip'] = trip_durations.median()
bike_data['mean_usage_time_per_trip'] = trip_durations.mean()

bike_data.head()

In [None]:
# Persist the per-bike summary as a UTF-8 CSV next to the cleaned trips data;
# the frame's index is written as the first column (to_csv default).
bike_data.to_csv('../../../datasets/bayareabikeshare/CLEANED/bike_data_cleaned.csv', encoding='utf-8')

In [None]:
# Heatmap of how many trips each bike made on each calendar day.

# Trips per (day, bike): a Series indexed by (start date, bike_id).
daily_trip_counts = trips_data.groupby(trips_data['start_date'].dt.date)['bike_id'].value_counts()
service_days = daily_trip_counts.to_frame()

# Pivot the date level (level 0) into columns so that rows are bikes and
# columns are days. The heatmap draws rows on the y axis and columns on the
# x axis, which makes the 'Bike ID' / 'Date' labels below true. The default
# unstack() pivots bike_id instead and leaves the two axes swapped.
# Unstacking the Series rather than the one-column frame keeps the tick
# labels plain, without a MultiIndex prefix.
grid = daily_trip_counts.unstack(level=0)

fig, ax = plt.subplots(figsize=(120,70))
sns.heatmap(grid, square=True, cmap="YlOrRd", ax=ax)
ax.set(ylabel='Bike ID')
ax.set(xlabel='Date')
ax.set(title='Bike Usage over Time')

fig.savefig('../Documents/bike_usage_by_day.png')

In [None]:
# Distribution of the number of trips recorded per bike.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider
# histplot/displot once the environment's seaborn version is confirmed.
fig, ax = plt.subplots(figsize=(15, 5))
sns.distplot(bike_data['trip_count'], color='b', bins=50, ax=ax)
ax.set_title('Distribution of Bike Trips Counts')
ax.set_xlim(-100, 3000)
ax.set_xlabel('Number of Trips')
plt.show()

In [None]:
# Distribution of days between each bike's first and latest recorded trip.
fig, ax = plt.subplots(figsize=(15,5))
sns.distplot(bike_data.days_in_service, color='b', bins=100, ax=ax)
# The title names the plotted column (days_in_service), not trip counts.
ax.set_title('Distribution of Bike Days in Service')
ax.set_xlabel('Days in Service')
plt.show()

In [None]:
# Distribution of days between each bike's latest trip and the latest trip
# recorded for any bike in the dataset.
fig, ax = plt.subplots(figsize=(15,5))
sns.distplot(bike_data.days_since_last_trip, color='b', bins=100, ax=ax)
# The title and axis label describe the plotted column (days_since_last_trip).
ax.set_title('Distribution of Days Since Last Trip')
ax.set_xlabel('Days Since Last Trip')
plt.show()

In [None]:
# Distribution of each bike's mean trip duration.
fig, ax = plt.subplots(figsize=(15,5))
sns.distplot(bike_data.mean_usage_time_per_trip, color='b', bins=100, ax=ax)
# The title and axis label describe the plotted column, a duration rather
# than a trip count.
# NOTE(review): the unit is taken to be seconds because the next plot divides
# the same column by 60 and labels it minutes; confirm against the dataset.
ax.set_title('Distribution of Mean Trip Duration per Bike')
ax.set_xlabel('Mean Trip Duration in Seconds')
plt.show()

In [None]:
# Overlay the per-bike median (green) and mean (red) trip durations, converted
# to minutes by dividing by 60, on a single set of axes.
fig, ax = plt.subplots(figsize=(15, 5))

median_minutes = bike_data['median_usage_time_per_trip'] / 60.
mean_minutes = bike_data['mean_usage_time_per_trip'] / 60.

sns.distplot(median_minutes, color='g', bins=100, ax=ax)
sns.distplot(mean_minutes, color='r', bins=100, ax=ax)

ax.set_title('Trip Duration Distribution')
ax.set_xlabel('Length of Trip in Minutes')
# Labels are matched to the plotted artists in drawing order: median, then mean.
ax.legend(['Median', 'Mean'], loc='best')
plt.show()