# Citibike Project

My goal is to use the [Citibike dataset](https://ride.citibikenyc.com/system-data) as the subject with which to practice SQL and Python.

First, I will need to download the data from the website. I will attempt to do this with `BeautifulSoup`.

Then I will use SQL within Python to query the dataset.

Future tasks will include creating a map of the rides throughout the city, to start.

## Imports

In [22]:
import requests
from bs4 import BeautifulSoup

### BeautifulSoup webcrawler

In [6]:
# url of the citibike tripdata
url = "https://s3.amazonaws.com/tripdata/index.html"


In [10]:
# sanity check - check object
url

'https://s3.amazonaws.com/tripdata/index.html'

In [11]:
# Send a get request and assign the response to a variable
response = requests.get(url)

In [13]:
# check class object
response

<Response [200]>

In [15]:
# execute 'content' function
# this will spit out a lot of text
response.content

b'<html>\r\n    <head>\r\n  <!--\r\n\r\n  Amazon S3 Bucket listing.\r\n\r\n\r\n  Copyright (C) 2008 Francesco Pasqualini\r\n\r\n      This program is free software: you can redistribute it and/or modify\r\n      it under the terms of the GNU General Public License as published by\r\n      the Free Software Foundation, either version 3 of the License, or\r\n      (at your option) any later version.\r\n\r\n      This program is distributed in the hope that it will be useful,\r\n      but WITHOUT ANY WARRANTY; without even the implied warranty of\r\n      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r\n      GNU General Public License for more details.\r\n\r\n      You should have received a copy of the GNU General Public License\r\n      along with this program.  If not, see <http://www.gnu.org/licenses/>.\r\n\r\n  -->\r\n  <!--\r\n\r\n  Modified by Nolan Lawson!  (http://nolanlawson.com).  I\'m keeping the spirit of the\r\n  GPL alive by issuing this with the same licen

In [17]:
# Turn the undecoded content into 
# a Beautiful Soup object and assign it to a variable
soup = BeautifulSoup(response.content)

In [18]:
# check contents of the `soup` object
soup

<html>
<head>
<!--

  Amazon S3 Bucket listing.


  Copyright (C) 2008 Francesco Pasqualini

      This program is free software: you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation, either version 3 of the License, or
      (at your option) any later version.

      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with this program.  If not, see <http://www.gnu.org/licenses/>.

  -->
<!--

  Modified by Nolan Lawson!  (http://nolanlawson.com).  I'm keeping the spirit of the
  GPL alive by issuing this with the same license!

  -->
<title>Bucket loading...</title>
<link href="//netdna.bootstrapcdn.com/bootstr

In [20]:
# the `soup` object is a BeautifulSoup object
type(soup)

bs4.BeautifulSoup

In [21]:
# find 'href' which has the download link
soup.find('href', class_='<a href="https://s3.amazonaws.com/tripdata/201306-citibike-tripdata.zip">201306-citibike-tripdata.zip</a>')

In [5]:
# url of the citibike tripdata
url = "https://s3.amazonaws.com/tripdata/index.html"

# Make a GET request to the webpage
response = requests.get(url)

# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find the links or elements that contain the files you want to download
file_links = soup.find_all("a", href=True)  # Modify the find_all() method to match the specific elements containing the files

# Iterate over the file links and download each file
for link in file_links:
    file_url = link["href"]
    file_name = file_url.split("/")[-1]  # Extract the file name from the URL

    # Send a GET request to the file URL and save the content to a file
    file_response = requests.get(file_url)
    with open(file_name, "wb") as file:
        file.write(file_response.content)
        print(f"Downloaded: {file_name}")

In [1]:
def separate_dictionaries_by_value(dictionary):
    result = {}
    
    # Iterate over the key-value pairs in the dictionary
    for key, value in dictionary.items():
        if isinstance(value, list):
            value_tuple = tuple(value)
        else:
            value_tuple = value
        
        if value_tuple not in result:
            result[value_tuple] = {}  # Create a new dictionary for the value
        
        result[value_tuple][key] = value  # Add the key-value pair to the respective dictionary
    
    return result

# example usage
dictionary = {'key0': [30, 'abc'], 'key1': [10, 'abc'], 'key2': [10, 'abc'], 'key3': [20, 'abc'],
              'key4': [10, 'abc'], 'key5': [20, 'abc'], 'key6': [30, 'abc'], 'key7': [20, 'def']}
result = separate_dictionaries_by_value(dictionary)
print(result)

{(30, 'abc'): {'key0': [30, 'abc'], 'key6': [30, 'abc']}, (10, 'abc'): {'key1': [10, 'abc'], 'key2': [10, 'abc'], 'key4': [10, 'abc']}, (20, 'abc'): {'key3': [20, 'abc'], 'key5': [20, 'abc']}, (20, 'def'): {'key7': [20, 'def']}}


In [None]:
# for file in file_path:
file_path='/Users/sra/files/projects/citibike_project/tripdata'
counter_csv=0
counter_not_csv=0

for filename in os.listdir(file_path):
    # print('filename:','\n',filename)
    if filename.endswith('.csv'):
        one_file_path = os.path.join(file_path, filename)
        print(one_file_path)
        with open(one_file_path, 'r') as csv_file:
            reader = csv.reader(csv_file)
            headers = next(reader)  # get the header row
            print(headers)
        print('\n')
        counter_csv+=1 # count to make sure all the files are represented
    else:
        counter_not_csv+=1
        print('not csv file:\n',filename)
        continue

print('csv counter:',counter_csv)
print('not csv counter:',counter_not_csv)

In [None]:
for file in file_path:
    columns_dict={}
    columns_dict[file]=get_column_names(directory=file_path)
    
columns_dict

In [None]:
import os
import shutil
import csv

def organize_csv_files_by_header(directory):
    # create a dictionary to store headers and corresponding directories
    header_directories = {}
    print('header_directories:' header_directories)
    
    # scan the directory for CSV files
    csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
    print(csv_files)
    
    # process each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(directory, csv_file)
        print(file_path)
        
        # read the header of the CSV file
        with open(file_path, 'r') as file:
            csv_reader = csv.reader(file)
            header = next(csv_reader)

        # check if the header matches any existing directories
        matching_directory = None
        for directory, existing_header in header_directories.items():
            if header == existing_header:
                matching_directory = directory
                break

        # if a matching directory is found, move the CSV file there
        if matching_directory:
            destination_directory = os.path.join(directory, matching_directory)
        else:
            # create a new directory for the header and move the CSV file there
            new_directory = f"directory_{len(header_directories) + 1}"
            header_directories[new_directory] = header
            destination_directory = os.path.join(directory, new_directory)
            os.makedirs(destination_directory)

        shutil.move(file_path, destination_directory)

    # print the mapping of headers to directories
    for directory, header in header_directories.items():
        print(f"Header: {header}  Directory: {directory}")

# Example usage
# directory_path = '/path/to/csv_directory'
# organize_csv_files_by_header(directory_path)

In [None]:
organize_csv_files_by_header(directory='/Users/sra/files/projects/citibike_project/data/tripdata')

In [None]:
directory = '/Users/sra/files/projects/citibike_project/tripdata'

# scan the directory for CSV files
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
# print(csv_files)
    
# process each CSV file
for csv_file in csv_files:
    file_path = os.path.join(directory, csv_file)
    # print(file_path)
        
    # read the header of the CSV file
    with open(file_path, 'r') as file:
        csv_reader = csv.reader(file)
        header = next(csv_reader)
        # print(header)

In [None]:
# https://www.geeksforgeeks.org/working-csv-files-python/

# importing csv module
import csv
 
# csv file name
filename = "aapl.csv"
# filename=csv_files

# initializing the titles and rows list
fields = []
rows = []
 
# reading csv file
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)

    # get total number of rows
    print("Total no. of rows: %d"%(csvreader.line_num))
 
# printing the field names
print('Field names are:' + ', '.join(field for field in fields))
 
# printing first 5 rows
print('\nFirst 5 rows are:\n')
for row in rows[:5]:
    # parsing each column of a row
    for col in row:
        print("%10s"%col,end=" "),
    print('\n')

In [2]:
directory = '/Users/sra/files/projects/citibike_project/tripdata'

# scan the directory for CSV files
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
# print(csv_files)

NameError: name 'os' is not defined

In [None]:
# remove '.csv' from the filenames
csv_filenames=[]
for i in csv_files:
    csv_filenames.append(i[:-4])
    
csv_filenames

# remove '-' and '.csv' from filenames
# csv_filenames=[]
# for i in csv_files:
#     csv_filenames.append(i[:-4])
#     csv_filenames.append(i.replace('-','') and i.replace('.csv',''))
    
# csv_filenames

# remove all but the date string at the beginning
# for i in csv_filenames:
#     csv_filenames=i[:5]
    
# csv_filenames

In [None]:
directory = '/Users/sra/files/projects/citibike_project/data/tripdata'

# scan the directory for CSV files
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
# print(csv_files)

for i, value in enumerate(csv_files):
    var_name=f'{csv_files}{i+1}

In [None]:
directory = '/Users/sra/files/projects/citibike_project/tripdata'

# scan the directory for CSV files
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
# print(csv_files)

# remove hyphens and retain only the numbers at the beginning of each string
processed_strings = [re.sub(r'[^0-9]', '', i.split('-', 1)[0]) for i in csv_files]

# print(processed_strings)

print('processed_strings:',len(processed_strings))
print('csv_files:',len(csv_files))