<a href="https://colab.research.google.com/github/shraiyanshdugar/hello-world/blob/master/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to extract links from a given URL
def extract_links(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        return links
    except Exception as e:
        print(f"Error extracting links from {url}: {str(e)}")
        return []

# Initialize an empty dictionary to store the data
link_data = {}

# Read the CSV file containing links
csv_filename = 'outfile.csv'  # Change this to your CSV file name
with open(csv_filename, 'r') as csvfile:
    reader = csv.reader(csvfile)

    # Iterate over each row in the CSV file
    for row in reader:
        if len(row) < 2:
            continue  # Skip rows with less than 2 columns

        # Assuming the link is in the second column and the index is in the first column
        index = row[0]
        link = row[1]

        # Extract links from the current URL and store them as values for the key (current link)
        link_data[link] = extract_links(link)

# Print the resulting dictionary
for key, values in link_data.items():
    print(f"Link: {key}")
    print("Links found on the page:")
    for value in values:
        print(f" - {value}")
    print()

# You can also save this data to a new CSV file if needed
# Example:
# with open('link_data.csv', 'w', newline='') as output_csv:
#     writer = csv.writer(output_csv)
#     for key, values in link_data.items():
#         writer.writerow([key] + values)


Error extracting links from : Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error extracting links from http://www.risepizzeria.com/: HTTPConnectionPool(host='www.risepizzeria.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fb3f23e4ac0>: Failed to establish a new connection: [Errno -5] No address associated with hostname'))
Error extracting links from http://www.charliepalmersteak.com/locations/napa: HTTPConnectionPool(host='www.charliepalmersteak.com', port=80): Max retries exceeded with url: /locations/napa (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fb3f208c490>: Failed to establish a new connection: [Errno -5] No address associated with hostname'))
Error extracting links from http://www.lacabana.vista.com/: HTTPConnectionPool(host='www.lacabana.vista.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HT

KeyboardInterrupt: 

In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import re
import time
from requests.exceptions import Timeout  # Import the Timeout exception for links which takes too much time to load

# Function to categorize links on a webpage
def categorize_links(links):
    maps_link = ""
    instagram_link = ""
    facebook_link = ""
    twitter_link = ""
    other_links = []

    for link in links:
        if "maps.google.com" in link:
            maps_link = link
        elif "www.instagram.com" in link:
            instagram_link = link
        elif "www.facebook.com" in link:
            facebook_link = link
        elif "twitter.com" in link:
            twitter_link = link
        else:
            other_links.append(link)

    return [maps_link, instagram_link, facebook_link, twitter_link] + other_links


def extract_links_with_error_handling(url, index):
    retries = 3  # Number of retries before giving up
    retry_delay = 2  # Delay between retries in seconds
    for _ in range(retries):
        try:
            response = requests.get(url, timeout=10)  # Set a timeout of 10 seconds
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            valid_links = [link for link in links if re.match(r'^https?://', link)]
            return list(set(valid_links))  # Remove duplicates by converting to a set and back to a list
        except Timeout:
            print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
            time.sleep(retry_delay)
        except ConnectionError:
            print(f"Connection error occurred for index {index} and URL {url}. Retrying...")
            time.sleep(retry_delay)
        except Exception as e:
            print(f"Error extracting links from index {index} for URL {url}: {str(e)}")
            return []

# Initialize an empty dictionary to store the data
link_data = {}

# Read the CSV file containing links
csv_filename = 'outfile_small.csv'  # Change this to your CSV file name
with open(csv_filename, 'r') as csvfile:
    reader = csv.reader(csvfile)

    # Iterate over each row in the CSV file
    for index, row in enumerate(reader):
        if len(row) < 2:
            continue  # Skip rows with less than 2 columns

        # Assuming the link is in the second column and the index is in the first column
        link_index = row[0]
        link = row[1]
        if(link == ''):
            continue
        print('going for ', link_index, ' : ', link)
        # Extract links from the current URL and store them as values for the key (current link)
        link_data[link] = extract_links(link, link_index)

        # Introduce a delay of 2 seconds before processing the next link
        time.sleep(1)  # You can adjust the sleep duration as needed

# Categorize links after extracting all links
for key, values in link_data.items():
    link_data[key] = categorize_links(values)

# Print the resulting dictionary and save it to an output file
output_filename = 'link_data.csv'
with open(output_filename, 'w', newline='') as output_csv:
    writer = csv.writer(output_csv)
    for key, values in link_data.items():
        print(f"Link: {key}")
        print("Links found on the page:")
        writer.writerow([key] + values)
        for value in values:
            print(f" - {value}")
        print()

print(f"Data saved to {output_filename}")


going for  0  :  http://www.yannistaverna.com/
going for  1  :  http://www.starbellysf.com/
going for  2  :  http://www.manzonirestaurant.com/
going for  3  :  http://www.macaronigrill.com/
going for  4  :  https://www.pfchangs.com/locations/us/ca/pleasanton/1330-stoneridge-mall-rd/9819-pleasanton.html
going for  5  :  https://www.pfchangs.com/locations/us/ca/sunnyvale/390-w-el-camino-real/6900-sunnyvale.html
going for  6  :  https://www.pfchangs.com/locations/us/ca/palo-alto/900-stanford-shopping-center/9911-palo-alto.html
going for  7  :  http://www.kenzonapa.com/
going for  9  :  http://monsoonhimalayancuisine.com/
Error extracting links from index 9 for URL http://monsoonhimalayancuisine.com/: 403 Client Error: Forbidden for url: https://monsoonhimalayancuisine.com/
going for  10  :  http://www.risepizzeria.com/
Link: http://www.yannistaverna.com/
Links found on the page:
 - 
 - 
 - 
 - 
 - https://cutt.ly/daftarmaha4d
 - https://secure.livechatinc.com/licence/12759861/v2/open_chat

In [3]:
import csv
import requests
from bs4 import BeautifulSoup
import re
import time
from requests.exceptions import Timeout, RequestException  # Import the Timeout exception for links which takes too much time to load

# Function to categorize links on a webpage
def categorize_links(links):
    if links is None:
        return []
    maps_link = ""
    instagram_link = ""
    facebook_link = ""
    twitter_link = ""
    other_links = []

    for link in links:
        if "maps.google.com" in link:
            maps_link = link
        elif "www.instagram.com" in link:
            instagram_link = link
        elif "www.facebook.com" in link:
            facebook_link = link
        elif "twitter.com" in link:
            twitter_link = link
        else:
            other_links.append(link)

    return [maps_link, instagram_link, facebook_link, twitter_link] + other_links


def extract_links_with_error_handling(url, index):
    retries = 3  # Number of retries before giving up
    retry_delay = 2  # Delay between retries in seconds
    for _ in range(retries):
        try:
            print('heyaaaa')
            response = requests.get(url, timeout=(5, 5))  # Set a timeout of 10 seconds for both connect and read
            print('aaaayeh')
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            valid_links = [link for link in links if re.match(r'^https?://', link)]
            return list(set(valid_links))  # Remove duplicates by converting to a set and back to a list
        except Timeout:
            print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
            time.sleep(retry_delay)
        except RequestException as e:
            print(f"RequestException occurred for index {index} and URL {url}: {str(e)}")
            return []  # Exit the loop and move to the next link
        except Exception as e:
            print(f"Error extracting links from index {index} for URL {url}: {str(e)}")
            return []

# Initialize an empty dictionary to store the data
link_data = {}

# Read the CSV file containing links
csv_filename = 'outfile.csv'  # Change this to your CSV file name
with open(csv_filename, 'r') as csvfile:
    reader = csv.reader(csvfile)

    # Iterate over each row in the CSV file
    for index, row in enumerate(reader):
        if len(row) < 2:
            continue  # Skip rows with less than 2 columns

        # Assuming the link is in the second column and the index is in the first column
        link_index = row[0]
        link = row[1]
        if(link == ''):
            continue
        print('going for ', link_index, ' : ', link)
        # Extract links from the current URL and store them as values for the key (current link)
        link_data[link] = extract_links_with_error_handling(link, link_index)

        # Introduce a delay of 2 seconds before processing the next link
        time.sleep(1)  # You can adjust the sleep duration as needed

# Categorize links after extracting all links
for key, values in link_data.items():
    link_data[key] = categorize_links(values)

# Print the resulting dictionary and save it to an output file
output_filename = 'link_data_full.csv'
with open(output_filename, 'w', newline='') as output_csv:
    writer = csv.writer(output_csv)
    for key, values in link_data.items():
        print(f"Link: {key}")
        print("Links found on the page:")
        writer.writerow([key] + values)
        for value in values:
            print(f" - {value}")
        print()

print(f"Data saved to {output_filename}")


going for    :  restaurant_website
heyaaaa
RequestException occurred for index  and URL restaurant_website: Invalid URL 'restaurant_website': No scheme supplied. Perhaps you meant https://restaurant_website?
going for  0  :  http://www.yannistaverna.com/
heyaaaa
aaaayeh
going for  1  :  http://www.starbellysf.com/
heyaaaa
aaaayeh
going for  2  :  http://www.manzonirestaurant.com/
heyaaaa
aaaayeh
going for  3  :  http://www.macaronigrill.com/
heyaaaa
aaaayeh
going for  4  :  https://www.pfchangs.com/locations/us/ca/pleasanton/1330-stoneridge-mall-rd/9819-pleasanton.html
heyaaaa
aaaayeh
going for  5  :  https://www.pfchangs.com/locations/us/ca/sunnyvale/390-w-el-camino-real/6900-sunnyvale.html
heyaaaa
aaaayeh
going for  6  :  https://www.pfchangs.com/locations/us/ca/palo-alto/900-stanford-shopping-center/9911-palo-alto.html
heyaaaa
aaaayeh
going for  7  :  http://www.kenzonapa.com/
heyaaaa
aaaayeh
going for  9  :  http://monsoonhimalayancuisine.com/
heyaaaa
aaaayeh
RequestException occur

In [None]:
import re
from bs4 import BeautifulSoup

# Sample HTML content (replace this with your actual HTML content)
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Sample Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <p>Phone: +1 (123) 123-1223</p>
        <p>Email: info@example.com</p>
    </div>
    <div class="content">
        <p class="text">Visit our site for more information.</p>
    </div>
    <footer class="footer" id="page-footer">
        <p>&copy; 2023 Sample Company</p>
    </footer>
</body>
</html>
"""

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Define a regex pattern to match phone numbers
phone_pattern = r'(\+\d{1,2}\s?)?\(\d{3}\)\s?\d{3}[-\s]\d{4}'

# Find elements containing phone numbers and capture their class attributes
phone_number_elements = soup.find_all(text=re.compile(phone_pattern))

for element in phone_number_elements:
    # Find the nearest parent element with a class attribute
    parent_element = element.find_parent(attrs={"class": True})
    element_class = parent_element['class'] if parent_element else None

    phone_number = element.strip()

    # Check if phone_number is not None before calling strip()
    if phone_number is not None:
        phone_number = phone_number.strip()

    print(f"Phone Number: {phone_number}")
    print(f"Class Attribute: {element_class}")


Phone Number: Phone: +1 (123) 123-1223
Class Attribute: ['header']


  phone_number_elements = soup.find_all(text=re.compile(phone_pattern))


In [4]:
import re
from bs4 import BeautifulSoup

# Sample HTML content (replace this with your actual HTML content)
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Complex Page</title>
</head>
<body>
    <div class="header" id="top-header">
        <h1>Contact Us</h1>
        <div class="contact-info">
            <p class="phone">Phone: +1 (123) 123-1223</p>
            <p class="email">Email: info@example.com</p>
        </div>
        <div class="address">
            <p>Main Office:</p>
            <p class="street">123 Main St</p>
            <p class="city">Cityville</p>
            <p class="country">Countryland</p>
        </div>
    </div>
    <div class="content">
        <h2>About Us</h2>
        <p class="description">We are a company specializing in...</p>
        <div class="links">
            <a href="https://www.facebook.com" class="social-link">Facebook</a>
            <a href="https://www.twitter.com" class="social-link">Twitter</a>
            <a href="https://www.instagram.com" class="social-link">Instagram</a>
        </div>
    </div>
    <footer class="footer" id="page-footer">
        <div class="opening-hours">
            <p class="day">Monday - Friday</p>
            <p class="hours">9 AM - 6 PM</p>
        </div>
        <div class="copyright">
            <p>&copy; 2023 Sample Company</p>
        </div>
    </footer>
</body>
</html>
"""

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Define regex patterns for phone num\bgroup\sorder\bbers, email addresses, timings, and addresses
phone_pattern = r'(\+\d{1,2}\s?)?\(\d{3}\)\s?\d{3}[-\s]\d{4}'
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
timings_pattern = r'(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)(?:day)? [0-9]+(?: AM| PM)? - [0-9]+(?: AM| PM)?'
address_pattern = r'\b\d+\s[A-Za-z\s,]+\b'
catering_pattern = re.compile(r'\bcatering\b', re.IGNORECASE)
group_order_pattern = re.compile(r'\bgroup\sorder\b', re.IGNORECASE)
counter_pattern = re.compile(r'\bcounter\b', re.IGNORECASE)
delivery_pattern = re.compile(r'\bdelivery\b', re.IGNORECASE)
bar_pattern = re.compile(r'\bbar\b', re.IGNORECASE)
pickup_pattern = re.compile(r'\bpick\s*-\s*up\b', re.IGNORECASE)


# Find elements containing phone numbers, email addresses, timings, and addresses and capture their class attributes
phone_number_elements = soup.find_all(text=re.compile(phone_pattern))
email_elements = soup.find_all(text=re.compile(email_pattern))
timings_elements = soup.find_all(text=re.compile(timings_pattern))
address_elements = soup.find_all(text=re.compile(address_pattern))

for element in phone_number_elements:
    # Find the nearest parent element with a class attribute
    parent_element = element.find_parent(attrs={"class": True})
    element_class = parent_element['class'] if parent_element else None

    phone_number = element.strip()

    # Check if phone_number is not None before calling strip()
    if phone_number is not None:
        phone_number = phone_number.strip()

    print(f"Phone Number: {phone_number}")
    print(f"Class Attribute: {element_class}")

for element in email_elements:
    # Find the nearest parent element with a class attribute
    parent_element = element.find_parent(attrs={"class": True})
    element_class = parent_element['class'] if parent_element else None

    email = element.strip()

    # Check if email is not None before calling strip()
    if email is not None:
        email = email.strip()

    print(f"Email Address: {email}")
    print(f"Class Attribute: {element_class}")

for element in timings_elements:
    # Find the nearest parent element with a class attribute
    parent_element = element.find_parent(attrs={"class": True})
    element_class = parent_element['class'] if parent_element else None

    timings = element.strip()

    # Check if timings is not None before calling strip()
    if timings is not None:
        timings = timings.strip()

    print(f"Timings: {timings}")
    print(f"Class Attribute: {element_class}")

for element in address_elements:
    # Find the nearest parent element with a class attribute
    parent_element = element.find_parent(attrs={"class": True})
    element_class = parent_element['class'] if parent_element else None

    address = element.strip()

    # Check if address is not None before calling strip()
    if address is not None:
        address = address.strip()

    print(f"Address: {address}")
    print(f"Class Attribute: {element_class}")

Phone Number: Phone: +1 (123) 123-1223
Class Attribute: ['phone']
Email Address: Email: info@example.com
Class Attribute: ['email']
Address: 123 Main St
Class Attribute: ['street']
Address: 9 AM - 6 PM
Class Attribute: ['hours']
Address: Â© 2023 Sample Company
Class Attribute: ['copyright']


  phone_number_elements = soup.find_all(text=re.compile(phone_pattern))
  email_elements = soup.find_all(text=re.compile(email_pattern))
  timings_elements = soup.find_all(text=re.compile(timings_pattern))
  address_elements = soup.find_all(text=re.compile(address_pattern))


In [2]:
import csv
import requests
from bs4 import BeautifulSoup
import re
import time
from requests.exceptions import Timeout, RequestException
import threading

# Function to categorize links on a webpage
def categorize_links(links):
    if links is None:
        return []
    maps_link = ""
    instagram_link = ""
    facebook_link = ""
    twitter_link = ""
    other_links = []

    for link in links:
        if "maps.google.com" in link:
            maps_link = link
        elif "www.instagram.com" in link:
            instagram_link = link
        elif "www.facebook.com" in link:
            facebook_link = link
        elif "twitter.com" in link:
            twitter_link = link
        else:
            other_links.append(link)

    return [maps_link, instagram_link, facebook_link, twitter_link] + other_links

def extract_links_with_error_handling(url, index):
    retries = 3
    retry_delay = 2
    for _ in range(retries):
        try:
            print('heyaaaa')
            response = requests.get(url, timeout=(5, 5))
            print('aaaayeh')
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            valid_links = [link for link in links if re.match(r'^https?://', link)]
            return list(set(valid_links))
        except Timeout:
            print(f"Timeout occurred for index {index} and URL {url}. Retrying...")
            time.sleep(retry_delay)
        except RequestException as e:
            print(f"RequestException occurred for index {index} and URL {url}: {str(e)}")
            return []
        except Exception as e:
            print(f"Error extracting links from index {index} for URL {url}: {str(e)}")
            return []

# Function for multithreading
def process_url(index, link):
    print('going for ', index, ' : ', link)
    link_data[link] = extract_links_with_error_handling(link, index)
    time.sleep(1)

# Initialize an empty dictionary to store the data
link_data = {}

# Read the CSV file containing links
csv_filename = 'outfile.csv'  # Change this to your CSV file name
with open(csv_filename, 'r') as csvfile:
    reader = csv.reader(csvfile)

    # Iterate over each row in the CSV file
    threads = []
    for index, row in enumerate(reader):
        if len(row) < 2:
            continue

        # Assuming the link is in the second column and the index is in the first column
        link_index = row[0]
        link = row[1]
        if(link == ''):
            continue

        # Create a thread to process the URL
        thread = threading.Thread(target=process_url, args=(link_index, link))
        threads.append(thread)

    # Start all threads
    for thread in threads:
        thread.start()

    # Wait for all threads to finish
    for thread in threads:
        thread.join()

# Categorize links after extracting all links
for key, values in link_data.items():
    link_data[key] = categorize_links(values)

# Print the resulting dictionary and save it to an output file
output_filename = 'link_data_full.csv'
with open(output_filename, 'w', newline='') as output_csv:
    writer = csv.writer(output_csv)
    for key, values in link_data.items():
        print(f"Link: {key}")
        print("Links found on the page:")
        writer.writerow([key] + values)
        for value in values:
            print(f" - {value}")
        print()

print(f"Data saved to {output_filename}")


going for    :  restaurant_website
heyaaaa
going for  0  :  http://www.yannistaverna.com/
heyaaaa
going for RequestException occurred for index  and URL restaurant_website: Invalid URL 'restaurant_website': No scheme supplied. Perhaps you meant https://restaurant_website?
 1  :  http://www.starbellysf.com/
heyaaaa
going for  2  :  http://www.manzonirestaurant.com/
heyaaaa
going for  3  :  http://www.macaronigrill.com/
heyaaaa
going for  4  :  https://www.pfchangs.com/locations/us/ca/pleasanton/1330-stoneridge-mall-rd/9819-pleasanton.html
heyaaaa
going for  5  :  https://www.pfchangs.com/locations/us/ca/sunnyvale/390-w-el-camino-real/6900-sunnyvale.html
heyaaaagoing for  6  :  https://www.pfchangs.com/locations/us/ca/palo-alto/900-stanford-shopping-center/9911-palo-alto.html
going for  7  :  http://www.kenzonapa.com/
heyaaaa
heyaaaa

going for  9  :  http://monsoonhimalayancuisine.com/
heyaaaa
going for  10going for  11   :  http://www.risepizzeria.com/
heyaaaa
 :  http://www.primetable

In [None]:
# Initialize an empty set to store found items
found_items = set()

# List of websites to search
websites = ['website1.com', 'website2.com', 'website3.com']

# Item you want to find
item_to_find = 'Your Item'

for website in websites:
    # Check if the item has already been found
    if item_to_find in found_items:
        print(f'{item_to_find} already found. Skipping {website}.')
        continue

    # Search for the item on the website (replace this with your search logic)
    found_on_website = search_item_on_website(website, item_to_find)

    if found_on_website:
        print(f'Found {item_to_find} on {website}.')
        found_items.add(item_to_find)

# You can continue with the next website or perform other tasks as needed.
