# The Effect of Covid-19 on NYC Bike Rentals

TBD - Introduction paragraph
- Set the context
- Introduce the analysis
- Add an image?


## 1. A story in 9+ million rows

TBD - Introduce the data set
- Where does it come from
- What were some of the challenges - # of rows and optimization
- What are the steps in this section



### Import packages


In [None]:
# Import packages
import glob
import numpy as np
import pandas as pd
import math
# import scipy.stats as stats

import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases no longer accept the old names, so use whichever spelling
# this installation actually provides.
poster_style = 'seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in plt.style.available else 'seaborn-poster'
plt.style.use(poster_style) #sets the size of the charts
plt.style.use('ggplot')

import seaborn as sns
sns.set(style='ticks', color_codes=True, font_scale=1.25)

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.tile_providers import CARTODBPOSITRON, get_provider
from bokeh.models import ColumnDataSource, HoverTool

# Show floats in Pandas output with exactly two decimal places
def _two_decimals(value):
    '''Formats a single float with two digits after the decimal point'''
    return '%.2f' % value

pd.set_option('display.float_format', _two_decimals)

# Send Bokeh output straight to the notebook, with the load banner hidden
output_notebook(
    resources=None,
    verbose=False,
    hide_banner=True,
    load_timeout=5000,
    notebook_type='jupyter',
)


### Prep for data import


In [None]:
# Target dtype for each raw CSV column that gets converted, keyed by the
# column's original header
col_types = dict([
    ('usertype', 'category'),
    ('birth year', 'int'),
    ('gender', 'int8'),
])

# New headers for the columns that are kept; they are assigned by position,
# so the order here must match the order of the kept columns
col_names = [
    'tripduration',
    'starttime',
    'bikeid',
    'usertype',
    'birth_year',
    'gender',
]


### Create helper function


In [None]:
# Helper that loads one monthly csv file and trims it to the analysis columns
def read_and_process_simple(filepath):
    '''Loads a single CitiBike monthly csv data file and tidies it

    Rows containing any null value are discarded, `starttime` is parsed to
    datetime, the dtypes in `col_types` are applied, the stop time and
    station columns are removed, and the remaining columns are relabelled
    with `col_names`.

    Args:
      filepath (string): path to csv data file

    Returns:
      rentals (dataframe): dataframe containing the list of rentals for a given month
    '''

    # Columns that are not needed downstream
    unused_cols = ['stoptime',
                   'start station id', 'start station name',
                   'start station latitude', 'start station longitude',
                   'end station id', 'end station name',
                   'end station latitude', 'end station longitude']

    rentals = (
        pd.read_csv(filepath)
        # The null check covers every raw column, including those dropped below
        .dropna()
        # Only the start time is converted; the stop time is discarded
        .assign(starttime=lambda frame: pd.to_datetime(frame['starttime'],
                                                       infer_datetime_format=True))
        .astype(col_types)
        .drop(columns=unused_cols)
    )

    # Positional relabel: relies on the kept columns matching col_names order
    rentals.columns = col_names

    return rentals


### Read in the data


In [None]:
# Create list of years 
years = [2019, 2020]

# Read in and process the data
def gen_df(year, data_dir='../data/'):
    '''Builds one dataframe holding every rental for a given year

    Finds each csv file in `data_dir` whose name starts with `year`, runs it
    through `read_and_process_simple`, and stacks the results.

    Args:
      year (int or string): prefix of the monthly file names, e.g. 2019
      data_dir (string): folder containing the monthly csv files

    Returns:
      df (dataframe): all rentals for the year, re-indexed from 0

    Raises:
      ValueError: if no csv file in `data_dir` starts with `year`
    '''
    import os  # function-scope import so this cell does not depend on other edits

    pattern = os.path.join(data_dir, str(year) + '*.csv')

    # glob returns matches in arbitrary order; sort so the row order of the
    # combined dataframe is the same on every run and every machine
    data_files = sorted(glob.glob(pattern))

    # Fail with the offending pattern instead of pd.concat's generic
    # "No objects to concatenate" message
    if not data_files:
        raise ValueError('No data files found matching ' + pattern)

    rentals_dfs = [read_and_process_simple(file) for file in data_files]
    df = pd.concat(rentals_dfs, ignore_index=True)
    return df

# Build one dataframe per year
df_2019, df_2020 = gen_df(2019), gen_df(2020)

# Memory footprint of each dataframe, converted from bytes to MB
bytes_per_mb = 1024**2
mem_2019 = df_2019.memory_usage().sum() / bytes_per_mb
mem_2020 = df_2020.memory_usage().sum() / bytes_per_mb

# Report row counts and memory usage, 2019 first
mem_template = 'Memory usage after optimization:  {:.2f} MB'
print("2019 Data: " + str(len(df_2019.index)) + " rows")
print(mem_template.format(mem_2019))
print("2020 Data: " + str(len(df_2020.index)) + " rows")
print(mem_template.format(mem_2020))


## 2. Do people rent bikes during a pandemic?

TBD - Intro the section

- Add more date dimensions
- Add a count column
- Trip count by day of week and hour of day (also by gender? age? usertype?)

### Create new columns

In [None]:
# For each year: order the rentals chronologically, tag every row with a
# count of 1, and accumulate those counts into a running total of rentals
df_2019 = (
    df_2019
    .sort_values(by=['starttime'])
    .reset_index(drop=True)
    .assign(rental_count=1)
    .assign(running_total=lambda frame: frame['rental_count'].cumsum())
)

df_2020 = (
    df_2020
    .sort_values(by=['starttime'])
    .reset_index(drop=True)
    .assign(rental_count=1)
    .assign(running_total=lambda frame: frame['rental_count'].cumsum())
)

In [None]:
# Line chart of the running rental total over time, one line per year
fig = plt.figure()
# 2019 line; the Axes that seaborn drew on is kept in `ax`
ax = sns.lineplot(x='starttime', y='running_total', data=df_2019, color='#1E88E5', label='2019')
# 2020 line; no `ax` is passed, so seaborn draws on the current axes —
# presumably the same ones used for 2019 above
# NOTE(review): x is the actual start timestamp, so the two years likely sit
# in different x ranges rather than overlaying — confirm this is intended
sns.lineplot(x='starttime', y='running_total', data=df_2020, color='#FFC107', label='2020')