# Tax Assistant Data Scraper

## Import Libraries

In [30]:
import requests # pip install requests
from bs4 import BeautifulSoup #pip install beautifulsoup4 #pip install lxml
import os
import re

## Get URLs from sitemap

In [31]:
# Path to the local sitemap.xml file in your project folder.
# The file was obtained from:
# https://www.xml-sitemaps.com/details-www.revenue.ie-492529958.html
sitemap_file_path = './sitemap.xml'

with open(sitemap_file_path, 'r', encoding='utf-8') as file:
    sitemap_content = file.read()

# 'lxml-xml' selects the XML parser backed by the lxml package
soup = BeautifulSoup(sitemap_content, 'lxml-xml')

# Extract URLs from the local sitemap.xml.
# Pretty-printed sitemaps may wrap the URL inside <loc> in whitespace or
# newlines; strip it so the value is directly usable as a request URL, and
# skip <loc> elements that turn out to be empty.
urls = [
    loc_url
    for loc_url in (loc.text.strip() for loc in soup.find_all('loc'))
    if loc_url
]
print(len(urls))

500


## Crawl URLs & Get Text Data

In [32]:
def url_to_filename(url):
    """Turn a URL into a flat ``.txt`` filename.

    A leading ``http://`` or ``https://`` is dropped, then every character
    that is not an ASCII letter or digit is replaced by an underscore, and
    finally ``.txt`` is appended.

    Args:
        url: The URL string to convert.

    Returns:
        The derived filename (no directory component).
    """
    # Drop the scheme prefix, if present
    host_and_path = re.sub(r'^https?://', '', url)
    # Anything outside [a-zA-Z0-9] becomes '_'
    safe_stem = re.sub(r'[^a-zA-Z0-9]', '_', host_and_path)
    return safe_stem + '.txt'

# Create the directory if it doesn't exist (exist_ok makes this a no-op
# when it is already there, without a separate existence check)
save_dir = './text/'
os.makedirs(save_dir, exist_ok=True)

# Upper bound, in seconds, for connecting to the server and for each read.
# Without a timeout a single stalled connection would hang the whole crawl.
request_timeout = 30

# Download text from revenue's website.
# A Session reuses the underlying connection across requests to the same host.
with requests.Session() as session:
    for url in urls:
        try:
            response = session.get(url, timeout=request_timeout)
        except requests.RequestException as exc:
            # Connection errors, timeouts, invalid URLs, ...: report and move
            # on so one bad URL does not abort the remaining downloads.
            print(f"Failed to download {url}. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to download {url}. Status code: {response.status_code}")
            continue

        # Parse the HTML instead of stripping tags with a regex: this lets us
        # drop the *contents* of <script>/<style> blocks (which a tag-only
        # regex leaves behind) and decodes entities such as &euro; or &amp;.
        page = BeautifulSoup(response.text, 'lxml')
        for non_content_tag in page.find_all(['script', 'style', 'noscript']):
            non_content_tag.decompose()

        # A space separator keeps text from adjacent elements from fusing
        # into a single word (e.g. "<li>a</li><li>b</li>" -> "a b").
        text_content = page.get_text(separator=' ')

        # Convert the URL to a valid filename
        filename = os.path.join(save_dir, url_to_filename(url))

        # Save the plain text content to the file
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(text_content)
        print("Downloading text: " + filename)

Downloading text: ./text/www_revenue_ie_en_home_aspx.txt
Downloading text: ./text/www_revenue_ie_en_corporate_press_office_press_releases_2024_pr_010524_headline_results_aspx.txt
Downloading text: ./text/www_revenue_ie_en_jobs_and_pensions_paye_income_tax_returns_index_aspx.txt
Downloading text: ./text/www_revenue_ie_en_employing_people_becoming_an_employer_and_ongoing_obligations_reporting_jan_2024_index_aspx.txt
Downloading text: ./text/www_revenue_ie_en_customs_individuals_buying_online_personal_index_aspx.txt
Downloading text: ./text/www_revenue_ie_en_jobs_and_pensions_index_aspx.txt
Downloading text: ./text/www_revenue_ie_en_personal_tax_credits_reliefs_and_exemptions_index_aspx.txt
Downloading text: ./text/www_revenue_ie_en_life_events_and_personal_circumstances_index_aspx.txt
Downloading text: ./text/www_revenue_ie_en_self_assessment_and_self_employment_index_aspx.txt
Downloading text: ./text/www_revenue_ie_en_gains_gifts_and_inheritance_index_aspx.txt
Downloading text: ./text/w

## Clean Text Data

In [33]:
# Directory holding the downloaded text files, and the directory that
# receives the cleaned copies
input_path = './text/'
output_path = './data/'

# Make sure the output directory exists; without this the first write below
# raises FileNotFoundError on a fresh checkout.
os.makedirs(output_path, exist_ok=True)

# Iterate over each file in the directory
for filename in os.listdir(input_path):
    if not filename.endswith(".txt"):  # or .html, .js, etc, depending on your file type
        continue

    input_file_path = os.path.join(input_path, filename)

    # Read the content of the file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Replace each run of tabs / newlines with a single space. A space (not
    # an empty string) is used so words on adjacent lines are not glued
    # together; ordinary spaces are left untouched.
    modified_content = re.sub(r'[\t\r\n]+', ' ', content)

    output_file_path = os.path.join(output_path, filename)
    # Write the cleaned content to the output directory under the same name
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(modified_content)

    print("cleaned: " + output_file_path)

cleaned: ./data/www_revenue_ie_en_additional_incomes_benefits_from_your_employer_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_dirt_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_dividend_income_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_do_you_need_to_submit_a_tax_return_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_employment_related_shares_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_is_your_extra_income_taxable_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_maintenance_payments_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_overtime_bonuses_and_second_jobs_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_rental_income_index_aspx.txt
cleaned: ./data/www_revenue_ie_en_additional_incomes_social_welfare_payments_index_aspx.txt
cleaned: ./data/www_r

In [34]:
# Directory containing the cleaned text files, and the single file that
# will hold their concatenation
input_path = './data/'
output_file = 'irish_tax_revenue_info.txt'

# Collect one normalised string per file and join once at the end, rather
# than growing a string with += inside the loop.
file_contents = []

# os.listdir() returns entries in arbitrary order; sort so the merged file
# is identical from run to run and across machines.
for filename in sorted(os.listdir(input_path)):
    if filename.endswith(".txt"):
        file_path = os.path.join(input_path, filename)

        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Turn newlines into spaces, drop carriage returns, and trim
        # leading/trailing whitespace. Runs of spaces inside the text are
        # left as they are.
        file_contents.append(content.replace('\n', ' ').replace('\r', '').strip())

# A single space separates the contents of different files
merged_content = ' '.join(file_contents)

# Write the merged content to the output file
with open(output_file, 'w', encoding='utf-8') as merged_file:
    merged_file.write(merged_content.strip())  # Remove any leading/trailing whitespace

print("Merged content saved to " + output_file)

Merged content saved to irish_tax_revenue_info.txt


In [36]:
# Source (merged corpus) and destination for the character-level clean-up
input_file = 'irish_tax_revenue_info.txt'
output_file = 'cleaned_irish_tax_revenue_info.txt'

# Load the whole merged corpus into memory
with open(input_file, 'r', encoding='utf-8') as source:
    content = source.read()

# Two passes, applied inside-out:
#   1. every character that is not an ASCII letter, a digit, a space, or one
#      of the currency symbols $ € £ ¥ is replaced by a space;
#   2. each run of whitespace is collapsed to a single space.
# NOTE(review): pass 1 also removes '.', ',' and '%', so figures such as
# "12.5%" come out as "12 5" - confirm this is acceptable for tax data.
cleaned_content = re.sub(
    r'\s+',
    ' ',
    re.sub(r'[^A-Za-z0-9\$\€\£\¥ ]', ' ', content),
)

# Persist the result without leading/trailing whitespace
with open(output_file, 'w', encoding='utf-8') as sink:
    sink.write(cleaned_content.strip())

print("Cleaned content saved to " + output_file)

Cleaned content saved to cleaned_irish_tax_revenue_info.txt
