### Importing Utilities

In [10]:
import pandas as pd
import numpy as np
import copy
import requests
import time
from tqdm.notebook import tqdm

pd.set_option('chained_assignment',None)
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import re 
from datetime import datetime
import os
#os.path.join(save_path)

In [9]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Directory on the mounted Drive; the aggregated CSV exports later in the
# notebook are saved inside this folder.
path = '/content/drive/My Drive/3001TS'

In [3]:
# Sample monthly archive (201411) used to try out the loader functions below.
zipurl = 'https://s3.amazonaws.com/tripdata/201411-citibike-tripdata.zip' 

In [4]:
def preprocess_test(zipurl):
    """Download a trip-data archive, unpack it, and load its CSV as a DataFrame.

    The archive name is the last path component of ``zipurl``. It is echoed to
    stdout, the archive is extracted into a directory of that same name
    (relative to the current working directory), and the CSV whose stem is the
    part of the archive name before its first dot is read from that directory.

    Parameters
    ----------
    zipurl : str
        URL of the archive, e.g.
        'https://s3.amazonaws.com/tripdata/201501-citibike-tripdata.zip'.

    Returns
    -------
    pandas.DataFrame
        The contents of the extracted CSV, unmodified.
    """
    archive_name = zipurl.rsplit('/', 1)[-1]
    print(archive_name)

    # Only names containing 'zip' trigger a download; for anything else the
    # CSV is read straight from the directory named after it.
    if 'zip' in archive_name:
        with urlopen(zipurl) as response:
            payload = BytesIO(response.read())
        with ZipFile(payload) as archive:
            archive.extractall(archive_name)

    stem = archive_name.split('.')[0]
    csv_path = archive_name + '/' + stem + '.csv'
    return pd.read_csv(csv_path)


In [5]:
# Load the sample month and capture its header names. `cols` is the column
# list that preprocess() assigns to every file it reads.
df1 = preprocess_test(zipurl)
cols = df1.columns.tolist()
len(cols)

201411-citibike-tripdata.zip


15

In [6]:
def find_distance(dt, radius=3958.8):
    """Haversine (great-circle) distance between each trip's start and end station.

    Parameters
    ----------
    dt : mapping of column name -> array-like (e.g. a pandas DataFrame)
        Must provide 'start station latitude', 'start station longitude',
        'end station latitude' and 'end station longitude'. Values are treated
        as degrees and converted with ``np.radians``.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 3958.8 (Earth's mean radius in miles), which reproduces the
        previously hard-coded behaviour. Pass 6371.0 for kilometres.

    Returns
    -------
    numpy.ndarray
        One distance per row, in the unit of ``radius``.
    """
    def _radians(column):
        return np.radians(np.array(dt[column]))

    lat1 = _radians('start station latitude')
    lat2 = _radians('end station latitude')
    lon1 = _radians('start station longitude')
    lon2 = _radians('end station longitude')

    # Change in coordinates between the two stations.
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula: `a` is the squared half-chord length, `c` the
    # central angle in radians.
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c


def preprocess(zipurl):
    """Download one monthly archive and reduce it to one summary row per day.

    Parameters
    ----------
    zipurl : str
        URL of the archive, e.g.
        'https://s3.amazonaws.com/tripdata/201501-citibike-tripdata.zip'.

    Returns
    -------
    pandas.DataFrame
        Columns 'starttime_day', 'usertype', 'tripduration', 'tripdistance',
        one row per start date. 'usertype' holds the count of rows for that
        day; 'tripduration' and 'tripdistance' hold the daily means.

    Notes
    -----
    Relies on the notebook-level names ``preprocess_test`` (download, extract
    and read), ``cols`` (reference column names) and ``find_distance``.
    """
    # Reuse the loader instead of repeating its download/extract/read logic;
    # it also prints the archive name, as this function did before.
    trips = preprocess_test(zipurl)

    # Force the reference header names so the column lookups below work even
    # if a file spells its headers differently. Raises ValueError when the
    # file's column count differs from len(cols).
    trips.columns = cols
    trips['starttime_day'] = pd.to_datetime(trips['starttime']).dt.date
    trips['tripdistance'] = find_distance(trips)

    # NOTE(review): 'count' tallies non-null 'usertype' values only, so rows
    # with a missing usertype are not included in the daily total. Confirm
    # this is the intended trip count.
    daily = (
        trips[['starttime_day', 'usertype', 'tripduration', 'tripdistance']]
        .groupby(by='starttime_day')
        .agg({'usertype': 'count', 'tripduration': 'mean', 'tripdistance': 'mean'})
    )
    return daily.reset_index()

In [7]:
# Try the full per-file pipeline on the sample month; the result has one
# aggregated row per start date.
dff = preprocess(zipurl)
dff.head()

201411-citibike-tripdata.zip


Unnamed: 0,starttime_day,usertype,tripduration,tripdistance
0,2014-11-01,6002,671.524992,0.852766
1,2014-11-02,14100,967.441844,0.933885
2,2014-11-03,26757,824.786822,1.027006
3,2014-11-04,31174,798.417175,1.040973
4,2014-11-05,30539,871.663087,1.032553


In [8]:
def load_concat_process(list_):
    """Run ``preprocess`` on every archive name in ``list_`` and stack the results.

    Parameters
    ----------
    list_ : iterable of str
        Archive file names (not full URLs); each is appended to the
        'https://s3.amazonaws.com/tripdata/' prefix.

    Returns
    -------
    pandas.DataFrame
        The per-archive daily summaries concatenated in input order.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``list_`` is empty.
    """
    base_url = 'https://s3.amazonaws.com/tripdata/'
    print(">>> Start Downloading...")

    monthly_frames = []
    for archive_name in list_:
        archive_url = base_url + archive_name
        print(archive_url)
        monthly_frames.append(preprocess(archive_url))

    print(">>> Concatenating...")
    # ignore_index=False keeps each frame's own row labels, so the labels
    # repeat across archives in the combined frame.
    return pd.concat(monthly_frames, ignore_index=False, sort=False)

In [12]:
from bs4 import BeautifulSoup
from xml.etree import ElementTree
import xml.etree.ElementTree as ET
import urllib.request

### Download, unzip, and concatenate Citi Bike data

In [13]:
# Fetch the bucket listing, an XML document whose <Key> elements name the
# available files.
url = 'https://s3.amazonaws.com/tripdata/'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
document = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page into `soup`.
document.raise_for_status()
soup = BeautifulSoup(document.content, "lxml-xml")
print(soup.find("Key"))

<Key>201306-citibike-tripdata.zip</Key>


In [14]:
# Collect the archive names whose leading YYYYMM stamp falls between 201501
# and 202012, skipping keys that contain 'JC' or 'html'.
list_ = []
for key in soup.find_all("Key"):
    # Read the tag's text directly. Regex-splitting the tag's string form
    # would truncate any key that contains a space.
    k = key.get_text()
    if ('JC' not in k) and ('html' not in k):
        print(k)
        prefix = k.split('-')[0]
        # Skip keys that do not start with a numeric stamp rather than
        # letting int() abort the whole loop.
        if not prefix.isdigit():
            continue
        date = int(prefix)
        if 201412 < date < 202101:
            list_.append(k)


201306-citibike-tripdata.zip
201307-201402-citibike-tripdata.zip
201307-citibike-tripdata.zip
201308-citibike-tripdata.zip
201309-citibike-tripdata.zip
201310-citibike-tripdata.zip
201311-citibike-tripdata.zip
201312-citibike-tripdata.zip
201401-citibike-tripdata.zip
201402-citibike-tripdata.zip
201403-citibike-tripdata.zip
201404-citibike-tripdata.zip
201405-citibike-tripdata.zip
201406-citibike-tripdata.zip
201407-citibike-tripdata.zip
201408-citibike-tripdata.zip
201409-citibike-tripdata.zip
201410-citibike-tripdata.zip
201411-citibike-tripdata.zip
201412-citibike-tripdata.zip
201501-citibike-tripdata.zip
201502-citibike-tripdata.zip
201503-citibike-tripdata.zip
201504-citibike-tripdata.zip
201505-citibike-tripdata.zip
201506-citibike-tripdata.zip
201507-citibike-tripdata.zip
201508-citibike-tripdata.zip
201509-citibike-tripdata.zip
201510-citibike-tripdata.zip
201511-citibike-tripdata.zip
201512-citibike-tripdata.zip
201601-citibike-tripdata.zip
201602-citibike-tripdata.zip
201603-

In [15]:
# Inspect the entries from index 48 onward; this slice is the third batch
# processed further down.
list_[48:]

['201901-citibike-tripdata.csv.zip',
 '201902-citibike-tripdata.csv.zip',
 '201903-citibike-tripdata.csv.zip',
 '201904-citibike-tripdata.csv.zip',
 '201905-citibike-tripdata.csv.zip',
 '201906-citibike-tripdata.csv.zip',
 '201907-citibike-tripdata.csv.zip',
 '201908-citibike-tripdata.csv.zip',
 '201909-citibike-tripdata.csv.zip',
 '201910-citibike-tripdata.csv.zip',
 '201911-citibike-tripdata.csv.zip',
 '201912-citibike-tripdata.csv.zip',
 '202001-citibike-tripdata.csv.zip',
 '202002-citibike-tripdata.csv.zip',
 '202003-citibike-tripdata.csv.zip',
 '202004-citibike-tripdata.csv.zip',
 '202005-citibike-tripdata.csv.zip',
 '202006-citibike-tripdata.csv.zip',
 '202007-citibike-tripdata.csv.zip',
 '202008-citibike-tripdata.csv.zip',
 '202009-citibike-tripdata.csv.zip',
 '202010-citibike-tripdata.csv.zip']

### Processing the whole list

In [16]:
# Wall-clock timestamp taken before the batch processing starts.
start_time = datetime.now()

In [17]:
# First batch: the first 24 archive names (the log below starts at 201501).
full_df1 = load_concat_process(list_[:24])

>>> Start Downloading...
https://s3.amazonaws.com/tripdata/201501-citibike-tripdata.zip
201501-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201502-citibike-tripdata.zip
201502-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201503-citibike-tripdata.zip
201503-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201504-citibike-tripdata.zip
201504-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201505-citibike-tripdata.zip
201505-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201506-citibike-tripdata.zip
201506-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201507-citibike-tripdata.zip
201507-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201508-citibike-tripdata.zip
201508-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201509-citibike-tripdata.zip
201509-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201510-citibike-tripdata.zip
201510-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201511-citibike-tripd

In [18]:
# Save the first batch inside the Drive folder held in `path`; this resolves
# to the same file as the previously hard-coded absolute path.
full_df1.to_csv(os.path.join(path, 'citibike_1516.csv'))

In [None]:
# Second batch: archive names at positions 24-47 (the log below starts at 201701).
full_df2 = load_concat_process(list_[24:48])

>>> Start Downloading...
https://s3.amazonaws.com/tripdata/201701-citibike-tripdata.csv.zip
201701-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201702-citibike-tripdata.csv.zip
201702-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201703-citibike-tripdata.csv.zip
201703-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201704-citibike-tripdata.csv.zip
201704-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201705-citibike-tripdata.csv.zip
201705-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201706-citibike-tripdata.csv.zip
201706-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201707-citibike-tripdata.csv.zip
201707-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201708-citibike-tripdata.csv.zip
201708-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201709-citibike-tripdata.csv.zip
201709-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201710-citibike-tripdata.csv.zip
201710-c

In [None]:
# Save the second batch inside the Drive folder held in `path`; this resolves
# to the same file as the previously hard-coded absolute path.
full_df2.to_csv(os.path.join(path, 'citibike_1718.csv'))

In [None]:
# Third batch: every remaining archive name from position 48 onward.
full_df3 = load_concat_process(list_[48:])

In [None]:
# Save the third batch inside the Drive folder held in `path`; this resolves
# to the same file as the previously hard-coded absolute path.
full_df3.to_csv(os.path.join(path, 'citibike_1920.csv'))

In [None]:
# Elapsed wall-clock time since `start_time` was recorded. The name `start`
# used here before was never defined and raised NameError.
print('Run time: ', datetime.now()-start_time)

### Testing final file

In [None]:
# NOTE(review): no cell visible in this notebook writes 'citibike_1520.csv'
# (the exports are citibike_1516/1718/1920.csv in the Drive folder), and this
# path is relative to the working directory. Confirm where the combined file
# is produced before relying on this check.
data = pd.read_csv('citibike_1520.csv')
data.head()