# Citibike Project

My goal is to use the [Citibike dataset](https://ride.citibikenyc.com/system-data) as the subject with which to practice SQL and Python.

First, I will need to download the data from the website. I will attempt to do this with `BeautifulSoup`.

Then I will use SQL within Python to query the dataset.

Future tasks will include creating a map of the rides throughout the city, to start.

## Imports

In [22]:
import requests
from bs4 import BeautifulSoup

### BeautifulSoup webcrawler

In [6]:
# url of the citibike tripdata
url = "https://s3.amazonaws.com/tripdata/index.html"


In [10]:
# sanity check - check object
url

'https://s3.amazonaws.com/tripdata/index.html'

In [11]:
# Send a get request and assign the response to a variable
response = requests.get(url)

In [13]:
# check class object
response

<Response [200]>

In [15]:
# execute 'content' function
# this will spit out a lot of text
response.content

b'<html>\r\n    <head>\r\n  <!--\r\n\r\n  Amazon S3 Bucket listing.\r\n\r\n\r\n  Copyright (C) 2008 Francesco Pasqualini\r\n\r\n      This program is free software: you can redistribute it and/or modify\r\n      it under the terms of the GNU General Public License as published by\r\n      the Free Software Foundation, either version 3 of the License, or\r\n      (at your option) any later version.\r\n\r\n      This program is distributed in the hope that it will be useful,\r\n      but WITHOUT ANY WARRANTY; without even the implied warranty of\r\n      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r\n      GNU General Public License for more details.\r\n\r\n      You should have received a copy of the GNU General Public License\r\n      along with this program.  If not, see <http://www.gnu.org/licenses/>.\r\n\r\n  -->\r\n  <!--\r\n\r\n  Modified by Nolan Lawson!  (http://nolanlawson.com).  I\'m keeping the spirit of the\r\n  GPL alive by issuing this with the same licen

In [17]:
# Turn the undecoded content into 
# a Beautiful Soup object and assign it to a variable
soup = BeautifulSoup(response.content)

In [18]:
# check contents of the `soup` object
soup

<html>
<head>
<!--

  Amazon S3 Bucket listing.


  Copyright (C) 2008 Francesco Pasqualini

      This program is free software: you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation, either version 3 of the License, or
      (at your option) any later version.

      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with this program.  If not, see <http://www.gnu.org/licenses/>.

  -->
<!--

  Modified by Nolan Lawson!  (http://nolanlawson.com).  I'm keeping the spirit of the
  GPL alive by issuing this with the same license!

  -->
<title>Bucket loading...</title>
<link href="//netdna.bootstrapcdn.com/bootstr

In [20]:
# the `soup` object is a BeautifulSoup object
type(soup)

bs4.BeautifulSoup

In [21]:
# find 'href' which has the download link
soup.find('href', class_='<a href="https://s3.amazonaws.com/tripdata/201306-citibike-tripdata.zip">201306-citibike-tripdata.zip</a>')

In [5]:
# url of the citibike tripdata
url = "https://s3.amazonaws.com/tripdata/index.html"

# Make a GET request to the webpage
response = requests.get(url)

# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find the links or elements that contain the files you want to download
file_links = soup.find_all("a", href=True)  # Modify the find_all() method to match the specific elements containing the files

# Iterate over the file links and download each file
for link in file_links:
    file_url = link["href"]
    file_name = file_url.split("/")[-1]  # Extract the file name from the URL

    # Send a GET request to the file URL and save the content to a file
    file_response = requests.get(file_url)
    with open(file_name, "wb") as file:
        file.write(file_response.content)
        print(f"Downloaded: {file_name}")

In [1]:
def separate_dictionaries_by_value(dictionary):
    result = {}
    
    # Iterate over the key-value pairs in the dictionary
    for key, value in dictionary.items():
        if isinstance(value, list):
            value_tuple = tuple(value)
        else:
            value_tuple = value
        
        if value_tuple not in result:
            result[value_tuple] = {}  # Create a new dictionary for the value
        
        result[value_tuple][key] = value  # Add the key-value pair to the respective dictionary
    
    return result

# example usage
dictionary = {'key0': [30, 'abc'], 'key1': [10, 'abc'], 'key2': [10, 'abc'], 'key3': [20, 'abc'],
              'key4': [10, 'abc'], 'key5': [20, 'abc'], 'key6': [30, 'abc'], 'key7': [20, 'def']}
result = separate_dictionaries_by_value(dictionary)
print(result)

{(30, 'abc'): {'key0': [30, 'abc'], 'key6': [30, 'abc']}, (10, 'abc'): {'key1': [10, 'abc'], 'key2': [10, 'abc'], 'key4': [10, 'abc']}, (20, 'abc'): {'key3': [20, 'abc'], 'key5': [20, 'abc']}, (20, 'def'): {'key7': [20, 'def']}}


In [None]:
# for file in file_path:
file_path='/Users/sra/files/projects/citibike_project/tripdata'
counter_csv=0
counter_not_csv=0

for filename in os.listdir(file_path):
    # print('filename:','\n',filename)
    if filename.endswith('.csv'):
        one_file_path = os.path.join(file_path, filename)
        print(one_file_path)
        with open(one_file_path, 'r') as csv_file:
            reader = csv.reader(csv_file)
            headers = next(reader)  # get the header row
            print(headers)
        print('\n')
        counter_csv+=1 # count to make sure all the files are represented
    else:
        counter_not_csv+=1
        print('not csv file:\n',filename)
        continue

print('csv counter:',counter_csv)
print('not csv counter:',counter_not_csv)

In [None]:
for file in file_path:
    columns_dict={}
    columns_dict[file]=get_column_names(directory=file_path)
    
columns_dict

In [None]:
import os
import shutil
import csv

def organize_csv_files_by_header(directory):
    # create a dictionary to store headers and corresponding directories
    header_directories = {}
    print('header_directories:' header_directories)
    
    # scan the directory for CSV files
    csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
    print(csv_files)
    
    # process each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(directory, csv_file)
        print(file_path)
        
        # read the header of the CSV file
        with open(file_path, 'r') as file:
            csv_reader = csv.reader(file)
            header = next(csv_reader)

        # check if the header matches any existing directories
        matching_directory = None
        for directory, existing_header in header_directories.items():
            if header == existing_header:
                matching_directory = directory
                break

        # if a matching directory is found, move the CSV file there
        if matching_directory:
            destination_directory = os.path.join(directory, matching_directory)
        else:
            # create a new directory for the header and move the CSV file there
            new_directory = f"directory_{len(header_directories) + 1}"
            header_directories[new_directory] = header
            destination_directory = os.path.join(directory, new_directory)
            os.makedirs(destination_directory)

        shutil.move(file_path, destination_directory)

    # print the mapping of headers to directories
    for directory, header in header_directories.items():
        print(f"Header: {header}  Directory: {directory}")

# Example usage
# directory_path = '/path/to/csv_directory'
# organize_csv_files_by_header(directory_path)

In [None]:
organize_csv_files_by_header(directory='/Users/sra/files/projects/citibike_project/data/tripdata')

In [None]:
directory = '/Users/sra/files/projects/citibike_project/tripdata'

# scan the directory for CSV files
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
# print(csv_files)
    
# process each CSV file
for csv_file in csv_files:
    file_path = os.path.join(directory, csv_file)
    # print(file_path)
        
    # read the header of the CSV file
    with open(file_path, 'r') as file:
        csv_reader = csv.reader(file)
        header = next(csv_reader)
        # print(header)

In [None]:
# https://www.geeksforgeeks.org/working-csv-files-python/

# importing csv module
import csv
 
# csv file name
filename = "aapl.csv"
# filename=csv_files

# initializing the titles and rows list
fields = []
rows = []
 
# reading csv file
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)

    # get total number of rows
    print("Total no. of rows: %d"%(csvreader.line_num))
 
# printing the field names
print('Field names are:' + ', '.join(field for field in fields))
 
# printing first 5 rows
print('\nFirst 5 rows are:\n')
for row in rows[:5]:
    # parsing each column of a row
    for col in row:
        print("%10s"%col,end=" "),
    print('\n')

In [2]:
directory = '/Users/sra/files/projects/citibike_project/tripdata'

# scan the directory for CSV files
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
# print(csv_files)

NameError: name 'os' is not defined

In [None]:
# remove '.csv' from the filenames
csv_filenames=[]
for i in csv_files:
    csv_filenames.append(i[:-4])
    
csv_filenames

# remove '-' and '.csv' from filenames
# csv_filenames=[]
# for i in csv_files:
#     csv_filenames.append(i[:-4])
#     csv_filenames.append(i.replace('-','') and i.replace('.csv',''))
    
# csv_filenames

# remove all but the date string at the beginning
# for i in csv_filenames:
#     csv_filenames=i[:5]
    
# csv_filenames

In [None]:
directory = '/Users/sra/files/projects/citibike_project/data/tripdata'

# scan the directory for CSV files
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
# print(csv_files)

for i, value in enumerate(csv_files):
    var_name=f'{csv_files}{i+1}

In [None]:
directory = '/Users/sra/files/projects/citibike_project/tripdata'

# scan the directory for CSV files
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
# print(csv_files)

# remove hyphens and retain only the numbers at the beginning of each string
processed_strings = [re.sub(r'[^0-9]', '', i.split('-', 1)[0]) for i in csv_files]

# print(processed_strings)

print('processed_strings:',len(processed_strings))
print('csv_files:',len(csv_files))

### Using GeeksForGeeks

In [23]:
import requests
from bs4 import BeautifulSoup

'''
Taken from:
https://www.geeksforgeeks.org/downloading-files-web-using-python/

URL of the archive web-page which provides link to
all video lectures. It would have been tiring to
download each video manually.
In this example, we first crawl the webpage to extract
all the links and then download videos.
'''

# specify the URL of the archive here
archive_url = "https://s3.amazonaws.com/tripdata/index.html"

In [24]:
'''
Taken from:
https://www.geeksforgeeks.org/downloading-files-web-using-python/

URL of the archive web-page which provides link to
all video lectures. It would have been tiring to
download each video manually.
In this example, we first crawl the webpage to extract
all the links and then download videos.
'''

def get_video_links():

    # create response object
    r = requests.get(archive_url)
    
    # create beautiful-soup object
    soup = BeautifulSoup(r.content,'html5lib')
    
    # find all links on web-page
    links = soup.findAll('a')

    # filter the link sending with .mp4
    video_links = [archive_url + link['href'] for link in links if link['href'].endswith('zip')]

    return video_links


def download_video_series(video_links):

    for link in video_links:

        '''iterate through all links in video_links
        and download them one by one'''

        # obtain filename by splitting url and getting
        # last string
        file_name = link.split('/')[-1]

        print( "Downloading file:%s"%file_name)

        # create response object
        r = requests.get(link, stream = True)

        # download started
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)

        print( "%s downloaded!\n"%file_name )

    print ("All videos downloaded!")
    return


if __name__ == "__main__":

    # getting all video links
    video_links = get_video_links()

    # download all videos
    download_video_series(video_links)

FeatureNotFound: Couldn't find a tree builder with the features you requested: html5lib. Do you need to install a parser library?

In [25]:
get_video_links()

FeatureNotFound: Couldn't find a tree builder with the features you requested: html5lib. Do you need to install a parser library?

In [26]:
# download all videos 
download_video_series(video_links) 

All videos downloaded!


### Using ChatGPT

In [27]:
# define url of citibike datasets
url = "https://s3.amazonaws.com/tripdata/index.html"

In [28]:
# make a GET request to the webpage
response = requests.get(url)

# check if response worked (code 200)
response

<Response [200]>

In [37]:
# create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# check type of soup object
type(soup)

# print soup
print(soup)

# prettify soup
print(soup.prettify())

<html>
<head>
<!--

  Amazon S3 Bucket listing.


  Copyright (C) 2008 Francesco Pasqualini

      This program is free software: you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation, either version 3 of the License, or
      (at your option) any later version.

      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with this program.  If not, see <http://www.gnu.org/licenses/>.

  -->
<!--

  Modified by Nolan Lawson!  (http://nolanlawson.com).  I'm keeping the spirit of the
  GPL alive by issuing this with the same license!

  -->
<title>Bucket loading...</title>
<link href="//netdna.bootstrapcdn.com/bootstr

In [32]:
# Find the links on the webpage
links = soup.find_all("a", href=True)
links

[]

In [31]:
# Iterate over the links and download/extract zip files
for link in links:
    file_url = link["href"]
    print(file_url)
    if file_url.endswith(".zip"):
        # Send a GET request to the zip file URL
        response = requests.get(file_url)
        print(response)

        # Extract the content of the zip file
        zip_file = zipfile.ZipFile(BytesIO(response.content))
        print(zip_file)
        
        # Iterate over the files in the zip file and 
        # extract them to a central location
        for file_name in zip_file.namelist():
            # Extract the file to the central location 
            zip_file.extract(file_name, "/Users/sra/files/projects/citibike_project/data")

            print(f"Extracted: {file_name} from {file_url}")

In [2]:
zip_url='https://s3.amazonaws.com/tripdata/201306-citibike-tripdata.zip'

In [3]:
import requests, zipfile, io
r = requests.get(zip_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("/Users/sra/files/projects/citibike_project/data")

In [6]:
path='/Users/sra/files/projects/citibike_project/data/201306-citibike-tripdata.csv'
file=pd.read_csv(path)
file.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,695,2013-06-01 00:00:01,2013-06-01 00:11:36,444,Broadway & W 24 St,40.742354,-73.989151,434.0,9 Ave & W 18 St,40.743174,-74.003664,19678,Subscriber,1983.0,1
1,693,2013-06-01 00:00:08,2013-06-01 00:11:41,444,Broadway & W 24 St,40.742354,-73.989151,434.0,9 Ave & W 18 St,40.743174,-74.003664,16649,Subscriber,1984.0,1
2,2059,2013-06-01 00:00:44,2013-06-01 00:35:03,406,Hicks St & Montague St,40.695128,-73.995951,406.0,Hicks St & Montague St,40.695128,-73.995951,19599,Customer,,0
3,123,2013-06-01 00:01:04,2013-06-01 00:03:07,475,E 15 St & Irving Pl,40.735243,-73.987586,262.0,Washington Park,40.691782,-73.97373,16352,Subscriber,1960.0,1
4,1521,2013-06-01 00:01:22,2013-06-01 00:26:43,2008,Little West St & 1 Pl,40.705693,-74.016777,310.0,State St & Smith St,40.689269,-73.989129,15567,Subscriber,1983.0,1


### Using [this stackoverflow link](https://stackoverflow.com/questions/23376816/python-s3-download-zip-file)

In [None]:
import zipfile
import boto3
import io

# this is just to demo. real use should use the config 
# environment variables or config file.
#
# See: http://boto3.readthedocs.org/en/latest/guide/configuration.html

session = boto3.session.Session(
    aws_access_key_id="ACCESSKEY", 
    aws_secret_access_key="SECRETKEY"
)

s3 = session.resource("s3")
bucket = s3.Bucket('stackoverflow-brice-test')
obj = bucket.Object('smsspamcollection.zip')

with io.BytesIO(obj.get()["Body"].read()) as tf:

    # rewind the file
    tf.seek(0)

    # Read the file as a zipfile and process the members
    with zipfile.ZipFile(tf, mode='r') as zipf:
        for subfile in zipf.namelist():
            print(subfile)

### Using ChatGPT again

In [1]:
import boto3
import zipfile

s3 = boto3.client('s3')

bucket_name = 'tripdata'  # Replace with the name of your S3 bucket
key = 'https://s3.amazonaws.com/tripdata/201306-citibike-tripdata.zip'  # Replace with the key (path) of the zip file in your bucket
local_file_path = '/Users/sra/files/projects/citibike_project/data.zip'  # Replace with the local path where you want to save the file

# Download the zip file from S3
s3.download_file(bucket_name, key, local_file_path)

# Extract the downloaded zip file
with zipfile.ZipFile(local_file_path, 'r') as zip_ref:
    zip_ref.extractall('/Users/sra/files/projects/citibike_project/data')  # Replace with the path where you want to extract the files


ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

### Using this link from GeeksForGeeks:
https://www.geeksforgeeks.org/how-to-scrape-all-the-text-from-body-tag-using-beautifulsoup-in-python/

In [8]:
# Import Beautiful Soup
from bs4 import BeautifulSoup
 
# Create the document
url = text
 
# Initialize the object with the document
soup = BeautifulSoup(url, "html.parser")
 
# Get the whole body tag
tag = soup.body
 
# Print each string recursively
for string in tag.strings:
    print(string)





Index of bucket "tripdata"








Name


Date Modified


Size


Type












201306-citibike-tripdata.zip


Apr 30th 2018, 09:18:55 am


16.79 MB


ZIP file








201307-201402-citibike-tripdata.zip


Jan 18th 2017, 05:23:25 pm


178.26 MB


ZIP file








201307-citibike-tripdata.zip


Jan 18th 2017, 05:23:27 pm


27.07 MB


ZIP file








201308-citibike-tripdata.zip


Jan 18th 2017, 05:23:27 pm


32.09 MB


ZIP file








201309-citibike-tripdata.zip


Jan 18th 2017, 05:23:27 pm


33.16 MB


ZIP file








201310-citibike-tripdata.zip


Jan 18th 2017, 05:23:28 pm


33.07 MB


ZIP file








201311-citibike-tripdata.zip


Jan 18th 2017, 05:23:28 pm


21.62 MB


ZIP file








201312-citibike-tripdata.zip


Jan 18th 2017, 05:23:28 pm


14.31 MB


ZIP file








201401-citibike-tripdata.zip


Jan 18th 2017, 05:23:29 pm


9.70 MB


ZIP file








201402-citibike-tripdata.zip


Jan 18th 2017, 05:23:29 pm


7.25 MB


ZIP file








201403-citibike-tripdata.zip

In [9]:
import requests
from bs4 import BeautifulSoup
 
# url of the website
doc = "https://s3.amazonaws.com/tripdata/index.html"
 
# getting response object
res = requests.get(doc)
 
# Initialize the object with the document
soup = BeautifulSoup(res.content, "html.parser")
 
# Get the whole body tag
tag = soup.body
 
# Print each string recursively
for string in tag.strings:
    print(string)





Bucket loading...








Name


Date Modified


Size


Type




















In [83]:
# folder where both combined csv's will reside
combined_folder='/Users/sra/files/projects/citibike_project/combined'

# folders for each csv group
csv_group1_path='/Users/sra/files/projects/citibike_project/combined/csv_group1'
csv_group2_path='/Users/sra/files/projects/citibike_project/combined/csv_group2'

# setting up lists
groups=[csv_group1,csv_group2]
csv_paths=[csv_group1_path,csv_group2_path]

len(groups[0])+len(groups[1])

csv_groups=[csv_group1_path,csv_group2_path]

In [None]:
original_directory = '/path/to/original_directory'
new_directory = '/path/to/new_directory'
files_to_move = ['file1.txt', 'file2.csv', 'file3.jpg']

# move_files_with_filenames(original_directory, new_directory, files_to_move)

In [None]:
# checking existence of base folder
if not os.path.exists(combined_folder):
    os.makedirs(combined_folder)
    
# two checks needed because there are three sub-folders
if not os.path.exists(csv_group1_path) and os.path.exists(csv_group2_path):
    os.makedirs(csv_group1_path)
    os.makedirs(csv_group2_path)
        
# for-loops
for group in groups:
    for csv in group:
             

In [174]:
original_directory = '/Users/sra/files/projects/citibike_project/combined/csv_group1'
combined_directory = '/Users/sra/files/projects/citibike_project/combined/group1_combined'
combined_filename = 'group1.csv'

combine_csv_files(original_directory, combined_directory, combined_filename)

  df = pd.read_csv(file_path)


AttributeError: 'DataFrame' object has no attribute 'concat'

original:

In [None]:
import os
import pandas as pd

# Directory containing the CSV files
directory = '/path/to/csv/files/'

# Get the list of CSV files in the directory
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]

# Initialize an empty DataFrame for combined data
combined_data = pd.DataFrame()

# Iterate over each CSV file
for index, csv_file in enumerate(csv_files):
    file_path = os.path.join(directory, csv_file)
    
    # Read CSV file
    df = pd.read_csv(file_path)
    
    # Append data to combined DataFrame
    if index == 0:
        # Include header only for the first CSV
        combined_data = combined_data.append(df)
    else:
        # Skip header for other CSVs
        combined_data = combined_data.append(df[1:], ignore_index=True)

# Save the combined data to a new CSV file
combined_data.to_csv('/path/to/combined_file.csv', index=False)


In [1]:
def plot_bargraph_with_groupings(df, groupby, colourby, title, xlabel, ylabel):
    """
    https://stackoverflow.com/questions/48939795/how-to-plot-a-count-bar-chart-with-a-pandas-df-grouping-by-one-categorical-colu
    
    Plots a dataframe showing the frequency of datapoints grouped by one column and coloured by another.
    df : dataframe
    groupby: the column to groupby
    colourby: the column to color by
    title: the graph title
    xlabel: the x label,
    ylabel: the y label
    """

    import matplotlib.patches as mpatches

    # Makes a mapping from the unique colourby column items to a random color:
    ind_col_map = {x:y for x, y in zip(df[colourby].unique(),
                               [plt.cm.Paired(np.arange(len(df[colourby].unique())))][0])}


    # Find when the indicies of the soon to be bar graphs colors:
    unique_comb = df[[groupby, colourby]].drop_duplicates()
    name_ind_map = {x:y for x, y in zip(unique_comb[groupby], unique_comb[colourby])}
    c = df[groupby].value_counts().index.map(lambda x: ind_col_map[name_ind_map[x]])

    # Makes the bargraph:
    ax = df[groupby].value_counts().plot(kind='bar',
                                         figsize=FIG_SIZE,
                                         title=title,
                                         color=[c.values])
    # Makes a legend using the ind_col_map:
    legend_list = []
    for key in ind_col_map.keys():
        legend_list.append(mpatches.Patch(color=ind_col_map[key], label=key))

    # display the graph:
    plt.legend(handles=legend_list)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

In [2]:
plot_bargraph_with_groupings(df=group1_pd['start_station_name'],
                             groupby=group1_pd['start_station_name'],
                             colourby=group1_pd['start_station_name'],
                             title='Frequency of start_station_name',
                             xlabel='Category',
                             ylabel='Frequency')

NameError: name 'group1_pd' is not defined

In [1]:
group1_pd=pd.read_csv(group1_location)
group1_pd.info()

NameError: name 'pd' is not defined

In [2]:
group1_pd['start_station_name'].value_counts()

NameError: name 'group1_pd' is not defined

In [None]:
group1_pd['end_station_name'].value_counts()

There seems to be some junk in the columns - I assumed it was clean when I downloaded it from Citibike, which I guess was a mistake. 

I'll need to clean the CSV before I convert the database to be SQL-ready in a future notebook.

In [3]:
group1_pd.isna()

NameError: name 'group1_pd' is not defined

In [4]:
Displaying the value counts to disentangle the issue here:

SyntaxError: invalid syntax (3175787144.py, line 1)

In [5]:
# Calculate value counts
# counts = df['Column'].value_counts()

# Filter based on value counts
# filtered_df = df[df['Column'].isin(counts[counts >= 100].index)]

# counts=group1_pd['start_station_name'].value_counts()

# group1_pd[group1_pd['start_station_name'].isin(counts[counts >=100].index)].plot(kind='bar')

In [6]:
# group1_pl=group1_pd.

q=(
    pl.scan_csv(group1_location)
    # .filter(pl.col('member_casual')=='member')
    # .groupby('start_station_name')
    # .with_columns([pl.col('count').count().alias('count')])
    # .sort(pl.col(''))
)

df=q.collect()

df

NameError: name 'pl' is not defined

Use `parquvalue_counts store the large dataframes. They are currently in `.CSV` format and we will convert them to `parquet` format now.

In [8]:
group1=pd.read_csv(group1_location)

group1_pq=pa.Table.from_pandas(group1)

parquet_file_path='/Users/sra/files/projects/citibike_project/combined/group1_combined'
pq.write_table(group1_pq, parquet_file_path)

NameError: name 'pd' is not defined

In [9]:
# convert .CSV to .parquet

group1=pd.read_csv(group1_location)
group1=group1.to_parquet(group1_location)
group1.head()

NameError: name 'pd' is not defined

Use the `polars` package to manipulate the large dataframes, `group1` and `group2`.

In [11]:
q=(
    pl.scan_csv(group1_location)
    # .filter(pl.col('member_casual')=='member')
    .groupby('start_station_name')
    # .with_columns([pl.col('count').count().alias('count')])
    # .sort(pl.col(''))
)

df=q.collect()

NameError: name 'pl' is not defined

In [12]:
df

NameError: name 'df' is not defined

In [13]:
q=(
    pl.scan_csv('/Users/sra/files/projects/citibike_project/combined/group1_combined/group1.csv')
    .groupby(by='start_station_name').count()
    .sort(pl.col('count'),descending=True)
)

df=q.collect()

NameError: name 'pl' is not defined

_Important: Change to an environment that has the `mysql` package installed_

In [14]:
import pandas as pd

import mysql.connector

path_to_ride_pq='/Users/sra/files/projects/citibike_project/combined/group1_combined/group1_pl_ridenorm.parquet'
ride_pd=pd.read_parquet(path_to_ride_pq)

# MySQL connection details
host = 'localhost'
user = 'root'
password = 'rootroot'
database = 'citibike_project'
table = 'rides'

# Establish a connection to the MySQL server
cnx = mysql.connector.connect(host=host, user=user, password=password, database=database)


# Create a cursor object to execute SQL queries
cursor = cnx.cursor()

# Create table query
create_table_query = f"CREATE TABLE {table} (Name VARCHAR(50), Age INT, City VARCHAR(50))"
cursor.execute(create_table_query)

# Insert data into the table
for _, row in df.iterrows():
    insert_query = f"INSERT INTO {table} (Name, Age, City) VALUES (%s, %s, %s)"
    values = (row['Name'], row['Age'], row['City'])
    cursor.execute(insert_query, values)

# Commit the changes and close the connection
cnx.commit()
cursor.close()
cnx.close()

print(f"Data has been inserted into table '{table}' in database '{database}'.")

ModuleNotFoundError: No module named 'mysql'

In [None]:
table_name='rides'

flow_control=f'/Users/sra/files/projects/citibike_project/combined/group1_combined/flow_control/{table_name}_made'
flag=False

if not os.path.exists(flow_control):
    os.mkdir(flow_control)
    flag=True

if flag:

    # load the Parquet data into a pandas DataFrame
    parquet_file = '/Users/sra/files/projects/citibike_project/combined/group1_combined/group1_pl_ridenorm.parquet'
    print('parquet_file:',parquet_file,'\n')
    
    parquet_table = pq.read_table(parquet_file)
    print('parquet_table:',parquet_table,'\n')
    
    df = parquet_table.to_pandas()
    print('df.head():',df.head(),'\n')
    
    # connect to the MySQL database
    conn = mysql.connector.connect(
        host='localhost',
        user='root',
        password='rootroot',
        database='citibike_project'
    )
    print('conn:',conn,'\n')
    
    # write the DataFrame to a MySQL table
    df.to_sql(name=table_name, con=conn, if_exists='replace', index=False)
    print('wrote DataFrame to MySQL')
    
    # close the connection
    conn.close()
    print('closed connection')

    print(f"Table '{table_name}' created successfully")
    
else:
    
    print(f"Error: Table '{table_name}' already created.")

In [None]:
table_name='rides'

flow_control='/Users/sra/files/projects/citibike_project/combined/group1_combined/flow_control/rides_made'
flag=False

if not os.path.exists(flow_control):
    os.mkdir(flow_control)
    flag=True

if flag:
    # Establish a connection to the MySQL database
    conn = mysql.connector.connect(
        host='localhost',
        user='root',
        password='rootroot',
        database='citibike_project'
    )

    # Create a cursor object to execute SQL queries
    cursor = conn.cursor()

    # Create the table in the MySQL database with the desired schema
    create_table_query = f'''
        CREATE TABLE IF NOT EXISTS {table_name} (
            id TINYINT,
            type VARCHAR(255)
        )
    '''
    cursor.execute(create_table_query)

    # Define the data as a list of tuples or dictionaries
    data = [
        ('0', 'member'),
        ('1', 'casual')
    ]

    # Insert the data into the table
    insert_query = f'''
        INSERT INTO {table_name} (id, type)
        VALUES (%s, %s)
    '''

    cursor.executemany(insert_query, data)

    # Commit the changes and close the connection
    conn.commit()
    conn.close()
    
    print(f"Table '{table_name}' created successfully.")
    
if not flag:
    print(f"Table '{table_name}' already created.")