#### Thanks to Daniel Villaveces' article on *Medium* for helping me scrape the Citi Bike trip data.
#### It saved me a lot of time that otherwise would have been spent downloading multiple zip files for each month.
#### Daniel's article can be found [here](https://medium.com/@dvillaveces/analyzing-citi-bike-data-2202f9da97d7)

In [None]:
# scrape the citi bike index and extract the file names
# the data begins in 2013, but I only want data from 2020

In [2]:
# the bucket also has a second file prefixed with 'JC' for every month from 2015 - 2020 that seems to contain additional
# trip data, so I will have to do this twice: first to get the 'non-JC' files and then to get the 'JC' files

import requests
from bs4 import BeautifulSoup

In [3]:
url = 'https://s3.amazonaws.com/tripdata/'

# load all url content into soup
r = requests.get(url)
# stop here on an HTTP error instead of parsing an error page into an empty file list
r.raise_for_status()
soup = BeautifulSoup(r.text, 'xml')

# extract file names from soup
# data from 2020
# Select the monthly archives by name rather than by position in the listing:
# fixed indices silently pick the wrong files as soon as the bucket gains or
# loses an object. The listing order is preserved.
files = soup.find_all('Key')
names = [key.get_text() for key in files]
extract_files = [
    name
    for name in names
    if name.startswith('2020') and name.endswith('-citibike-tripdata.csv.zip')
]

extract_files

['202001-citibike-tripdata.csv.zip',
 '202002-citibike-tripdata.csv.zip',
 '202003-citibike-tripdata.csv.zip',
 '202004-citibike-tripdata.csv.zip',
 '202005-citibike-tripdata.csv.zip',
 '202006-citibike-tripdata.csv.zip',
 '202007-citibike-tripdata.csv.zip',
 '202008-citibike-tripdata.csv.zip',
 '202009-citibike-tripdata.csv.zip',
 '202010-citibike-tripdata.csv.zip',
 '202011-citibike-tripdata.csv.zip',
 '202012-citibike-tripdata.csv.zip']

In [4]:
# extract the JC files, 2020
# Match on the 'JC-2020' prefix and the archive suffix instead of slicing the
# listing by position, so the selection stays correct when the bucket's
# contents change. The listing order is preserved.
files = soup.find_all('Key')
jc_extract_files = [
    name
    for name in (key.get_text() for key in files)
    if name.startswith('JC-2020') and name.endswith('-citibike-tripdata.csv.zip')
]

jc_extract_files

['JC-202001-citibike-tripdata.csv.zip',
 'JC-202002-citibike-tripdata.csv.zip',
 'JC-202003-citibike-tripdata.csv.zip',
 'JC-202004-citibike-tripdata.csv.zip',
 'JC-202005-citibike-tripdata.csv.zip',
 'JC-202006-citibike-tripdata.csv.zip',
 'JC-202007-citibike-tripdata.csv.zip',
 'JC-202008-citibike-tripdata.csv.zip',
 'JC-202009-citibike-tripdata.csv.zip',
 'JC-202010-citibike-tripdata.csv.zip',
 'JC-202011-citibike-tripdata.csv.zip',
 'JC-202012-citibike-tripdata.csv.zip']

In [6]:
# download and unzip the files using a for loop

import os
import zipfile

In [8]:
# download & unzip non-JC files
# move data up a directory so it's not added to github

for file in extract_files:
    file_url = url + file

    # Request before opening the local file, and check the status, so a failed
    # download neither leaves an empty archive behind nor hands an error page
    # to zipfile.
    response = requests.get(file_url)
    response.raise_for_status()

    try:
        # download files
        with open(file, "wb") as f:
            f.write(response.content)

        # unzip data files
        with zipfile.ZipFile(file, "r") as zip_ref:
            zip_ref.extractall("../tripdata")
    finally:
        # remove zipped file after unzipping, even if writing or extraction failed
        if os.path.exists(file):
            os.remove(file)

In [9]:
# download & unzip JC files

for file in jc_extract_files:
    file_url = url + file

    # Request before opening the local file, and check the status, so a failed
    # download neither leaves an empty archive behind nor hands an error page
    # to zipfile.
    response = requests.get(file_url)
    response.raise_for_status()

    try:
        # download files
        with open(file, "wb") as f:
            f.write(response.content)

        # unzip data files
        with zipfile.ZipFile(file, "r") as zip_ref:
            zip_ref.extractall("../tripdata")
    finally:
        # remove zipped file after unzipping, even if writing or extraction failed
        if os.path.exists(file):
            os.remove(file)

In [10]:
# rename files to be more manageable
# files have long names such as '202001-citibike-tripdata.csv' or 'JC-202001-citibike-tripdata'
# just want the year and month and also include the 'jc' until I can figure out the difference

import numpy as np
import pandas as pd

In [11]:
directory = '../tripdata/'

# Shorten every CSV name to its lowercase prefix before the first 'ci'
# (e.g. 'JC-202001-citibike-tripdata.csv' -> 'jc_202001.csv').
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith('.csv'):
        continue

    # Work on the stem only. An already-shortened name contains no 'ci', so
    # splitting the full name would keep its '.csv' and a re-run of this cell
    # would produce '202001.csv.csv'.
    stem = filename[:-len('.csv')]
    new_filename = stem.replace(' ', '').lower().split('ci', 1)[0].strip('-').replace('-', '_')

    # skip names that would collapse to nothing, and files that are already in final form
    if not new_filename or new_filename + '.csv' == filename:
        continue

    os.rename(os.path.join(directory, filename), os.path.join(directory, new_filename + '.csv'))

In [12]:
# read every CSV in the trip data folder into the dfs dictionary,
# keyed by the part of the file name before its first '.'
directory = '../tripdata/'
dfs = {
    csv_name.partition('.')[0]: pd.read_csv(os.path.join(directory, csv_name))
    for csv_name in map(os.fsdecode, os.listdir(directory))
    if csv_name.endswith('.csv')
}