# Wrangling Data From Bay Area Bike Share Published Data - Bikes

### Import Packages

In [None]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from glob import glob
import datetime
import math
import numpy as np

import seaborn as sns
sns.set()

<p>Set a notebook display option: widen the notebook to 95% of the screen width for easier viewing.</p>

In [None]:
# Widen the notebook container to 95% of the browser window by injecting a
# CSS override into the page.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated and emits a warning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

<p> Set some global font sizes for plots </p>

In [None]:
# Font sizes reused by the title and axis-label calls in the plotting cells.
TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15

# Global matplotlib font size, applied through the 'font' rc group.
font = dict(size=50)
matplotlib.rc('font', **font)

## Import Data

In [None]:
# Load every trip CSV that matches the glob pattern, reading each file in
# fixed-size chunks, and concatenate all chunks into a single `trips_df`.
CHUNK_SIZE = 10000

print('Started Loading Trips Data...')
file_path_slug = '../clean_data/bayareabikeshare/trip_data_*.csv'
# glob() returns matches in arbitrary, OS-dependent order; sort them so the
# row order of trips_df is the same on every run.
file_list = sorted(glob(file_path_slug))

if not file_list:
    # Fail early with a clear message; otherwise pd.concat() below raises an
    # opaque "No objects to concatenate" ValueError.
    raise FileNotFoundError('No trip data files match %r' % file_path_slug)

chunks = []

for counter, file in enumerate(file_list, start=1):

    # Estimate the number of chunks for progress reporting. The file is
    # opened in a `with` block so the handle is closed right after counting.
    with open(file, 'r') as line_source:
        line_count = sum(1 for _ in line_source)
    # read_csv treats the first line as the header, so it is not a data row.
    # Clamp to at least 1 so the progress arithmetic below never divides by 0.
    num_chunks = max(1, math.ceil((line_count - 1) / CHUNK_SIZE))
    # Report roughly every tenth chunk, plus the first and the last one.
    progress_step = math.ceil(num_chunks / 10)

    chunk_counter = 1
    for chunk in pd.read_csv(file, chunksize=CHUNK_SIZE, iterator=True, index_col=0):
        # append chunk to chunks list
        chunks.append(chunk)

        if chunk_counter == 1 or chunk_counter % progress_step == 0 or chunk_counter == num_chunks:
            print('\t[%s] finished chunk %s of %s' % (datetime.datetime.now().time(), chunk_counter, num_chunks))
        chunk_counter += 1

    print('Finished file! (%d of %d)' % (counter, len(file_list)))

# Single concatenation at the end avoids repeatedly copying a growing frame.
trips_df = pd.concat(chunks)
print('Data Loaded Successfully!')

# Isolate Bike Data
<p>Each recorded trip includes the bike_id; extract the per-bike data we need for this smaller dataset.</p>

In [None]:
# Build a one-row-per-bike summary table (indexed by bike_id) from the
# individual trip records in trips_df.

# bikes with at least this many rides are in category 1, else category 0
RIDE_COUNT_THRESHOLD = 1500

# Parse the timestamps BEFORE aggregating. The CSVs are read without date
# parsing, so taking min()/max() on the raw column would compare strings,
# which is only chronologically correct for ISO-style formats.
start_times = pd.to_datetime(trips_df['start_date'])
starts_by_bike = start_times.groupby(trips_df['bike_id'])
durations_by_bike = trips_df.groupby('bike_id')['duration']

bike_df = pd.DataFrame()
bike_df['first_trip'] = starts_by_bike.min()
bike_df['latest_trip'] = starts_by_bike.max()
bike_df['trip_count'] = starts_by_bike.count()

# Whole days between a bike's first and latest recorded trip.
bike_df['days_in_service'] = (bike_df['latest_trip'] - bike_df['first_trip']).dt.days

# Whole days between a bike's latest trip and the latest trip of any bike.
last_recorded_trip = bike_df['latest_trip'].max()
bike_df['days_since_last_trip'] = (last_recorded_trip - bike_df['latest_trip']).dt.days

bike_df['total_usage_time'] = durations_by_bike.sum()
bike_df['median_usage_time_per_trip'] = durations_by_bike.median()
bike_df['mean_usage_time_per_trip'] = durations_by_bike.mean()

# A bike whose first and latest trips are less than a day apart has
# days_in_service == 0; count that as one day so the ratio is finite
# instead of inf (which would also break the regression plot later on).
bike_df['mean_rides_per_day'] = bike_df['trip_count'] / bike_df['days_in_service'].clip(lower=1)

bike_df['ride_num_category'] = (bike_df['trip_count'] >= RIDE_COUNT_THRESHOLD).astype(int)

bike_df.head()

***
# EDA Preview and Quick Reference

In [None]:
# Distribution of the number of trips recorded per bike.
fig, ax = plt.subplots(figsize=(15, 5))
sns.distplot(bike_df['trip_count'], bins=50, color='b', ax=ax)
ax.set_title('Distribution of Bike Trips Counts', size=TITLE_FONT_SIZE, weight='bold')
# Fixed x-range for the view: -100 to 3000 trips.
ax.set_xlim(-100, 3000)
ax.set_xlabel('Number of Trips', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Frequency', size=LABEL_FONT_SIZE, weight='bold')
plt.show()

In [None]:
# Distribution of how many days each bike has been in service
# (first recorded trip to latest recorded trip).
fig, ax = plt.subplots(figsize=(15, 5))
sns.distplot(bike_df['days_in_service'], bins=30, color='b', ax=ax)
# The title names the quantity actually plotted; it previously repeated the
# trip-count title from the preceding cell.
ax.set_title('Distribution of Days in Service', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Days in Service', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Frequency', size=LABEL_FONT_SIZE, weight='bold')
plt.show()

In [None]:
# Distribution of days elapsed between each bike's latest trip and the most
# recent trip recorded for any bike.
fig, ax = plt.subplots(figsize=(15, 5))
sns.distplot(bike_df['days_since_last_trip'], bins=100, color='b', ax=ax)
# Title and x label name the quantity actually plotted; the title previously
# repeated the trip-count title and the label read 'Days in Since Last Trip'.
ax.set_title('Distribution of Days Since Last Trip', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Days Since Last Trip', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Frequency', size=LABEL_FONT_SIZE, weight='bold')
plt.show()

In [None]:
# Overlay the per-bike median and mean trip durations on one set of axes.
fig, ax = plt.subplots(figsize=(15, 5))

# Divide by 60 to match the minutes axis label
# (presumably `duration` is recorded in seconds; confirm against the data).
median_minutes = bike_df['median_usage_time_per_trip'] / 60.
mean_minutes = bike_df['mean_usage_time_per_trip'] / 60.

sns.distplot(median_minutes, bins=70, color='g', ax=ax)
sns.distplot(mean_minutes, bins=70, color='r', ax=ax)

ax.set_title('Trip Duration Distribution', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Length of Trip in Minutes', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Frequency', size=LABEL_FONT_SIZE, weight='bold')
# Legend labels are positional: median was drawn first, mean second.
ax.legend(['Median', 'Mean'], loc='best')
plt.show()

In [None]:
# Compare total ride time per bike for low-use (<1500 trips) and high-use
# (>=1500 trips) bikes.
fig, ax = plt.subplots(figsize=(15, 5))

low_use_bikes = bike_df[bike_df.trip_count < 1500]
high_use_bikes = bike_df[bike_df.trip_count >= 1500]

# Draw order matters: the legend labels below are matched to the curves
# positionally, so the <1500 group (blue) comes first, >=1500 (green) second.
sns.distplot(low_use_bikes.total_usage_time/60., color='b', bins=30, ax=ax)
sns.distplot(high_use_bikes.total_usage_time/60., color='g', bins=30, ax=ax)

ax.set_title('Total Rides Duration Distribution', size=TITLE_FONT_SIZE, weight='bold')
ax.set_xlabel('Total Ride Duration in Minutes', size=LABEL_FONT_SIZE, weight='bold')
ax.set_ylabel('Frequency', size=LABEL_FONT_SIZE, weight='bold')

# Labels listed in draw order; they were previously reversed, which labelled
# the <1500 curve as '>=1500' and vice versa.
ax.legend(['<1500', '>=1500'], loc='best')
plt.show()

In [None]:
# Scatter plot with a fitted regression per ride-count category:
# days in service against average rides per day.
# NOTE(review): newer seaborn releases name the `size` parameter `height`;
# rename it when upgrading seaborn.
sns.lmplot(x='days_in_service', y='mean_rides_per_day', data=bike_df, hue='ride_num_category', size=10)
plt.title('Days in Service vs Average Rides Per Day', size=TITLE_FONT_SIZE, weight='bold')
plt.xlabel('Days in Service', size=LABEL_FONT_SIZE, weight='bold')
plt.ylabel('Average Rides Per Day', size=LABEL_FONT_SIZE, weight='bold')

# Legend labels are positional and hue levels are drawn in ascending order:
# category 0 (<1500 rides) first, category 1 (>=1500 rides) second.
# The labels were previously listed in the opposite order.
plt.legend(['<1500', '>=1500'], loc='best')
plt.show()

# Write to File

In [None]:
# Persist the per-bike summary table as a UTF-8 encoded CSV file.
output_path = '../clean_data/bayareabikeshare/bike_data_cleaned.csv'
bike_df.to_csv(output_path, encoding='utf-8')