# Module 10 Assignment - Scraping a Website
    
    Author: Sameer Beloshe
    version 2.0
    
We will be creating a web scraper to parse a table from the Charities Bureau Website. From the website: “All charitable organizations operating in New York State are required by law to register and file annual financial reports with the Attorney General's Office. This includes any organization that conducts charitable activities, holds property that is used for charitable purposes, or solicits financial or other contributions.”

## Web Scraping

In [8]:
import pandas as pd
from selenium import webdriver

from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

from time import sleep

# Start a Chrome session: ChromeDriverManager supplies the value handed to
# Service, and the Service is handed to webdriver.Chrome.

driver_location = ChromeDriverManager().install()
s = Service(driver_location)
browser = webdriver.Chrome(service=s)

def scrape_current_page(browser):
    """Return the text of every data row in the results table on the current page.

    Parameters
    ----------
    browser
        Object exposing ``find_element(by, selector)``; it is queried for the
        element matching the CSS selector ``table.Bordered``.

    Returns
    -------
    list of list of str
        One inner list per ``tr`` that contains at least one ``td``, holding
        the ``.text`` of each ``td`` in order. The first ``tr`` of the table
        is always skipped (treated as the header row), and any later ``tr``
        without ``td`` cells is dropped.
    """
    results_table = browser.find_element(By.CSS_SELECTOR, 'table.Bordered')
    body_rows = results_table.find_elements(By.CSS_SELECTOR, 'tr')[1:]
    # Lazily look up the cells of each row so rows are visited one at a time
    cells_per_row = (tr.find_elements(By.CSS_SELECTOR, 'td') for tr in body_rows)
    return [[td.text for td in tds] for tds in cells_per_row if tds]

# Rows collected from every results page, in the order the pages are visited.
# Created before the try block so partial results survive a failed run.
all_pages_data = []

# The whole scrape runs inside try/finally so the Chrome session is always
# shut down, even if an element lookup fails or the run is interrupted.
try:
    # Open the registry search form
    browser.get('https://www.charitiesnys.com/RegistrySearch/search_charities.jsp')

    # Type the search term '0' into the input located by this XPath
    inputElement = browser.find_element(By.XPATH,'//*[@id="header"]/div[2]/div/table/tbody/tr/td[2]/div/div/font/font/font/font/font/font/table/tbody/tr[4]/td/form/table/tbody/tr[2]/td[2]/input[1]')
    inputElement.send_keys('0')

    # Click the search button (click() returns None, so nothing is stored)
    browser.find_element(By.XPATH,'//*[@id="header"]/div[2]/div/table/tbody/tr/td[2]/div/div/font/font/font/font/font/font/table/tbody/tr[4]/td/form/table/tbody/tr[10]/td/input[1]').click()
    sleep(4)  # Fixed pause to let the results page load

    while True:
        # Scrape the page currently shown
        all_pages_data.extend(scrape_current_page(browser))

        try:
            next_button = browser.find_element(By.LINK_TEXT, 'Next')
        except NoSuchElementException:
            # No 'Next' link on this page: this was the last page
            break

        next_button.click()
        sleep(4)  # Wait for the next page to load
finally:
    browser.quit()  # closing the browser

In [9]:
# Keep every row except those whose first cell starts with 'Organization Name'
# (a header row repeated in the scraped data). Empty rows pass through.

all_pages_data = [
    record
    for record in all_pages_data
    if not record or not record[0].startswith('Organization Name')
]

# Assemble the DataFrame, labelling the six scraped columns
df = pd.DataFrame(
    all_pages_data,
    columns=[
        "Organization Name",
        "NY Reg #",
        "EIN",
        "Registrant Type",
        "City",
        "State",
    ],
)

# Show the DataFrame as the cell output

df

Unnamed: 0,Organization Name,NY Reg #,EIN,Registrant Type,City,State
0,"""Forever Captain Poodaman"" The Ahmad Butler Fo...",48-07-16,843800926,NFP,PHILADELPHIA,PA
1,"""Incredibly Blessed"" Inc",49-54-61,842071758,NFP,STATEN ISLAND,NY
2,"""R"" S.U.C.C.E.S.S. Foundation Inc.",49-06-59,874012670,NFP,ROCHESTER,NY
3,"""Studio 5404"" Inc.",44-39-58,463180470,NFP,MASSAPAQUA,NY
4,"""THEY ARE HAITIAN"" FUND, INC.",20-63-46,300170128,NFP,HUDSON,NY
...,...,...,...,...,...,...
95,University of Virginia Health Foundtion,40-44-88,412097394,NFP,CHARLOTTESVILLE,VA
96,Violin Player,41-40-19,270773158,NFP,LONDONDERRY,NH
97,"William A. Epps Community Center, Inc.",40-91-11,861074714,NFP,STATEN ISLAND,NY
98,WORLD SOCIETY OF CZESTOCHOWA JEWS AND THEIR DE...,40-46-49,205101779,NFP,NEW YORK,NY


## Creating S3 bucket

In [10]:
import awscli
import boto3

# Initialize the S3 client; no region is passed here, so boto3 resolves it
# from the ambient AWS configuration.

s3 = boto3.client('s3')

# Create a unique bucket name
bucket_name = 'm10-assignment-sameerbeloshe'

# Outside us-east-1, S3 rejects CreateBucket unless the target region is
# supplied as a LocationConstraint, so pass the client's own region along.
create_kwargs = {'Bucket': bucket_name}
region = s3.meta.region_name
if region and region not in ('us-east-1', 'aws-global'):
    create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

# Create S3 bucket. Re-running the cell after the bucket exists must not
# abort the notebook, so an already-owned bucket is reported and reused.
try:
    s3.create_bucket(**create_kwargs)
except s3.exceptions.BucketAlreadyOwnedByYou:
    print(f"Bucket '{bucket_name}' already exists in this account; reusing it.")
else:
    print(f"Bucket '{bucket_name}' created successfully.")

Bucket 'm10-assignment-sameerbeloshe' created successfully.


## Loading the data

In [11]:
import awscli
import boto3
import pandas as pd

from io import StringIO  # For handling CSV content as a string buffer
from datetime import datetime  # For generating timestamps

# Render the scraped DataFrame `df` as CSV text in an in-memory buffer
csv_buffer = StringIO()
df.to_csv(csv_buffer)

# Current local time, to the second, so each run writes a distinctly named object
timestamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

# Destination bucket and object key (the timestamp is embedded in the key)
bucket_name = 'm10-assignment-sameerbeloshe'
file_name = 'charities_bureau_scrape_' + timestamp + '.csv'

# Obtain a boto3 S3 resource and write the CSV text as the object's body
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket_name, file_name).put(Body=csv_buffer.getvalue())

print(f" Succesfully File : {file_name} uploaded to s3 bucket: {bucket_name}")

 Succesfully File : charities_bureau_scrape_2024-04-15_22-26-47.csv uploaded to s3 bucket: m10-assignment-sameerbeloshe


In [12]:
# Build df1 as an independent copy of df whose index is shifted up by one
# (so it starts from 1 when df starts from 0).
#
# df itself is left untouched: a plain `df1 = df` would only alias the same
# object, and incrementing df's index in place would shift it again every
# time this cell is re-run.

df1 = df.copy()
df1.index = df.index + 1

df1

Unnamed: 0,Organization Name,NY Reg #,EIN,Registrant Type,City,State
1,"""Forever Captain Poodaman"" The Ahmad Butler Fo...",48-07-16,843800926,NFP,PHILADELPHIA,PA
2,"""Incredibly Blessed"" Inc",49-54-61,842071758,NFP,STATEN ISLAND,NY
3,"""R"" S.U.C.C.E.S.S. Foundation Inc.",49-06-59,874012670,NFP,ROCHESTER,NY
4,"""Studio 5404"" Inc.",44-39-58,463180470,NFP,MASSAPAQUA,NY
5,"""THEY ARE HAITIAN"" FUND, INC.",20-63-46,300170128,NFP,HUDSON,NY
...,...,...,...,...,...,...
96,University of Virginia Health Foundtion,40-44-88,412097394,NFP,CHARLOTTESVILLE,VA
97,Violin Player,41-40-19,270773158,NFP,LONDONDERRY,NH
98,"William A. Epps Community Center, Inc.",40-91-11,861074714,NFP,STATEN ISLAND,NY
99,WORLD SOCIETY OF CZESTOCHOWA JEWS AND THEIR DE...,40-46-49,205101779,NFP,NEW YORK,NY


## Uploading the updated DataFrame to the S3 bucket

In [13]:
import awscli
import boto3
import pandas as pd
from io import StringIO  # For handling CSV content as a string buffer
from datetime import datetime  # For generating timestamps

# Serialise the re-indexed DataFrame `df1` to CSV text held in memory
csv_buffer = StringIO()
df1.to_csv(csv_buffer)

# Timestamp for the object key (local time, one-second resolution)
timestamp = "{:%Y-%m-%d_%H-%M-%S}".format(datetime.now())

# Where the object goes: bucket name plus a timestamped key
bucket_name = 'm10-assignment-sameerbeloshe'
file_name = 'Updated_csv_charities_bureau_scrape_{}.csv'.format(timestamp)

# Create a boto3 S3 resource and store the CSV text under that key
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket_name, file_name).put(Body=csv_buffer.getvalue())

print(f"File {file_name} uploaded to {bucket_name}")

File Updated_csv_charities_bureau_scrape_2024-04-15_22-27-29.csv uploaded to m10-assignment-sameerbeloshe


## Checking the objects in S3 bucket

In [15]:
# Initialize an S3 client using boto3
s3_client = boto3.client('s3')

# Bucket whose contents are listed
bucket_name = 'm10-assignment-sameerbeloshe'

# A single list_objects_v2 call returns at most 1000 keys, so walk every
# page of results with a paginator instead of reading only the first one.
paginator = s3_client.get_paginator('list_objects_v2')
object_keys = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket_name)
    for obj in page.get('Contents', [])  # 'Contents' is absent on an empty page
]

# Print the list of objects
if object_keys:
    print("Objects in the bucket:")
    for key in object_keys:
        print(key)
else:
    # A missing bucket makes the listing call raise rather than return an
    # empty result, so reaching this branch means the bucket has no objects.
    print("The bucket is empty.")

Objects in the bucket:
Updated_csv_charities_bureau_scrape_2024-04-15_22-27-29.csv
charities_bureau_scrape_2024-04-15_22-26-47.csv
