In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

class XMLDataParser:
    def __init__(self, url):
        self.url = url

    def fetch_xml(self):
        response = requests.get(self.url)
        response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
        return response.content

    def parse_xml_to_df(self, xml_data):
        root = ET.fromstring(xml_data)
        all_records = []
        # Assuming that each 'row' element in the XML contains the data record
        for row in root.findall('.//row'):
            record = {}
            for child in row:
                # Create a dictionary item with the 'row' tag names as keys and text content as values
                record[child.tag] = child.text
            all_records.append(record)
        return pd.DataFrame(all_records)

# Usage
url = 'https://data.lacity.org/api/views/2nrs-mtv8/rows.xml?accessType=DOWNLOAD'
parser = XMLDataParser(url)
xml_content = parser.fetch_xml()
df = parser.parse_xml_to_df(xml_content)

# Display the DataFrame
print(df.head())


   row      dr_no            date_rptd             date_occ time_occ area  \
0  NaN        NaN                  NaN                  NaN      NaN  NaN   
1  NaN  010304468  2020-01-08T00:00:00  2020-01-08T00:00:00     2230   03   
2  NaN  190101086  2020-01-02T00:00:00  2020-01-01T00:00:00     0330   01   
3  NaN  200110444  2020-04-14T00:00:00  2020-02-13T00:00:00     1200   01   
4  NaN  191501505  2020-01-01T00:00:00  2020-01-01T00:00:00     1730   15   

     area_name rpt_dist_no part_1_2 crm_cd  ... status   status_desc crm_cd_1  \
0          NaN         NaN      NaN    NaN  ...    NaN           NaN      NaN   
1    Southwest        0377        2    624  ...     AO   Adult Other      624   
2      Central        0163        2    624  ...     IC   Invest Cont      624   
3      Central        0155        2    845  ...     AA  Adult Arrest      845   
4  N Hollywood        1543        2    745  ...     IC   Invest Cont      745   

                                  location      la

In [2]:
pip install pandas requests xml.etree.ElementTree



SyntaxError: invalid syntax (1057754415.py, line 1)

In [None]:
import requests
import pandas as pd

class ConsumerComplaintsAPIClient:
    def __init__(self, base_url):
        self.base_url = base_url

    def get_data(self, params={}):
        response = requests.get(self.base_url, params=params)
        response.raise_for_status()  # Raises an HTTPError if the status is 4xx or 5xx
        return response.json()

    def to_dataframe(self, json_data):
        # Adjust this if the JSON structure is different
        return pd.DataFrame(json_data.get('hits', {}).get('hits', []))

# Usage example:
base_url = 'https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/'
client = ConsumerComplaintsAPIClient(base_url)

# Example params - customize these as needed based on the API's documentation
params = {
    'size': 10  # Limits the number of results returned
}

json_response = client.get_data(params=params)
# The API nests the records under 'hits' -> 'hits', and each actual record is under the '_source' key
df = client.to_dataframe(json_response)

# Since each complaint is nested under the '_source' key, we extract this into a separate DataFrame
df = pd.json_normalize(df['_source'])

print(df.head())


In [None]:
pip install requests beautifulsoup4 pandas


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URL of the webpage with the table
url = 'https://gaspgroup.org/air-quality/?gad_source=1&gclid=Cj0KCQiAmNeqBhD4ARIsADsYfTdQN2SF83cBgN2EFT0xmngSf21-WoH8fTOfuHobIivaQlaNAyodeH8aAs8tEALw_wcB'

# Send a GET request to the webpage
response = requests.get(url)
response.raise_for_status()  # Ensure we notice bad responses

# Parse the HTML content of the page with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table in the soup object - you would need to identify the correct table by an id or class if there are multiple tables
table = soup.find('table')  # You may need to adjust this if the table has a specific class or id

# Extract the table rows
rows = table.find_all('tr')

# The first row usually contains the header columns
headers = [header.get_text().strip() for header in rows[0].find_all('th')]

# Extract the data from the table into a list of dictionaries
table_data = []
for row in rows[1:]:  # Skip the header row
    cells = row.find_all('td')
    cell_data = {headers[i]: cell.get_text().strip() for i, cell in enumerate(cells)}
    table_data.append(cell_data)

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(table_data)

# Display the DataFrame
print(df)


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

# Configure ChromeOptions to run headless if you don't need a browser UI
options = Options()
options.headless = True

# Set up the Chrome WebDriver
# Make sure you have downloaded the correct version of 'chromedriver' for your Chrome version and placed it in your PATH or specified location.
service = Service(executable_path='path/to/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

# Open the page
driver.get('https://gaspgroup.org/air-quality/?gad_source=1&gclid=Cj0KCQiAmNeqBhD4ARIsADsYfTdQN2SF83cBgN2EFT0xmngSf21-WoH8fTOfuHobIivaQlaNAyodeH8aAs8tEALw_wcB')

# Wait for JavaScript to load. This time might need to be adjusted.
time.sleep(5)

# Now that the page is fully loaded, grab the HTML content
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

# Assuming the table has a unique identifier, find that table
# If the table doesn't have an id or class, you would need to find another way to locate it
table = soup.find('table', {'id': 'unique_table_id'})  # Replace with the actual id or class

# Extract the rows from the table, assuming the table is well structured with <tr> and <td> tags
rows = table.find_all('tr') if table else []

# Proceed with data extraction as before
# ...

# Don't forget to close the driver after you're done
driver.quit()


In [13]:
!pip install selenium


Collecting selenium
  Obtaining dependency information for selenium from https://files.pythonhosted.org/packages/0e/59/aae37fa93e2d4292c3148efcc3066c8ecfe5cfaa72bf8c0b1a5614622cf7/selenium-4.15.2-py3-none-any.whl.metadata
  Downloading selenium-4.15.2-py3-none-any.whl.metadata (6.9 kB)
Collecting trio~=0.17 (from selenium)
  Obtaining dependency information for trio~=0.17 from https://files.pythonhosted.org/packages/39/46/620fbe56f41fa3ccdda2136d947fb9bacce3d1eb163f057f0262a0ddf5e0/trio-0.23.1-py3-none-any.whl.metadata
  Downloading trio-0.23.1-py3-none-any.whl.metadata (4.9 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Obtaining dependency information for trio-websocket~=0.9 from https://files.pythonhosted.org/packages/48/be/a9ae5f50cad5b6f85bd2574c2c923730098530096e170c1ce7452394d7aa/trio_websocket-0.11.1-py3-none-any.whl.metadata
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Obtaining dependency informa

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# The URL of the page you want to scrape
url = 'https://gaspgroup.org/air-quality/'

# Perform the HTTP request to get the webpage content
response = requests.get(url)

# Raise an exception if the request failed
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table you want to scrape
# You'll need to inspect the webpage to determine the correct identifier for the table
# I'll use a placeholder 'table_id' here, but you should replace it with the actual id or class
table = soup.find('table', id='table_id')

# Assuming the first row is the header
headers = [th.get_text(strip=True) for th in table.find_all('th')]

# Extract the table rows, skipping the header
rows = table.find_all('tr')[1:]

# Extract the table data
table_data = []
for row in rows:
    cols = row.find_all('td')
    row_data = [ele.get_text(strip=True) for ele in cols]
    table_data.append(row_data)

# Create the DataFrame using the header and rows
df = pd.DataFrame(table_data, columns=headers)

# Now you have a DataFrame `df` that you can use for analysis
print(df.head())


AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# The URL of the page you want to scrape
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# Perform the HTTP request to get the webpage content
response = requests.get(url)

# Raise an exception if the request failed
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table you want to scrape, Wikipedia tables usually have the 'wikitable' class
table = soup.find('table', {'class': 'wikitable'})

# Check if a table is found
if table:
    # Extract the header names
    headers = [header.get_text(strip=True) for header in table.find_all('th')]

    # Extract the table rows, skipping the header
    rows = table.find_all('tr')[1:]

    # Extract the table data
    table_data = []
    for row in rows:
        cols = row.find_all(['td', 'th'])  # This gets all table data and header cells
        row_data = [ele.get_text(strip=True) for ele in cols]
        table_data.append(row_data)

    # Create the DataFrame using the header and rows
    df = pd.DataFrame(table_data, columns=headers)

    # Now you have a DataFrame `df` that you can use for analysis
    print(df.head())
else:
    print("No wikitable found on the page.")
