diff --git a/websiteurl_scraper/README.md b/websiteurl_scraper/README.md
new file mode 100644
index 000000000..1439b4b0c
--- /dev/null
+++ b/websiteurl_scraper/README.md
@@ -0,0 +1,17 @@
+
+This Python script focuses on retrieving all the web page links from a given URL.
+
+These links can also be present inside a button or an action.
+
+The script uses the ssl module, which ships with Python's standard library.
+It also uses urllib, which is part of the standard library as well.
+BeautifulSoup can be installed by using pip install beautifulsoup4.
+We need to give a valid website link as input.
+The program accesses the website and prints all the links present on the page.
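+
+A typical run might look like this (the URL and output shown are illustrative):
+
+    $ python webUrlscraper.py
+    Enter your URL link: https://example.com
+    Here are your final links
+    https://www.iana.org/domains/example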
diff --git a/websiteurl_scraper/requirements.txt b/websiteurl_scraper/requirements.txt
new file mode 100644
index 000000000..adc79f30b
--- /dev/null
+++ b/websiteurl_scraper/requirements.txt
@@ -0,0 +1,12 @@
+We need three modules in order for this script to work.
+The 1st is ssl.
+This ships with Python's standard library, so no pip install is needed.
+This module helps us to tackle website certificate issues.
+
+The 2nd is urllib.
+This also ships with the standard library, so no pip install is needed.
+This module helps us to access the URL.
+
+The 3rd is bs4, which is BeautifulSoup.
+This can be installed by using pip install beautifulsoup4.
+This helps us to read the page and extract information.
\ No newline at end of file
diff --git a/websiteurl_scraper/webUrlscraper.py b/websiteurl_scraper/webUrlscraper.py
new file mode 100644
index 000000000..1ec4282fa
--- /dev/null
+++ b/websiteurl_scraper/webUrlscraper.py
@@ -0,0 +1,30 @@
+# Using ssl and urllib.request to read the contents of the URL
+# ssl lets us skip certificate verification for sites with bad certificates
+
+import ssl
+from urllib.request import urlopen, Request
+from bs4 import BeautifulSoup
+
+# create an SSL context that skips certificate verification
+ctx = ssl.create_default_context()
+ctx.check_hostname = False
+ctx.verify_mode = ssl.CERT_NONE
+
+# getting the website link
+url = input("Enter your URL link: ")
+try:
+    # trying to access the page
+    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+    page = urlopen(request, context=ctx).read()
+    # using BeautifulSoup to parse the contents of the page
+    soup = BeautifulSoup(page, 'html.parser')
+    # finding all the anchor tags
+    links = soup.find_all('a')
+    # collecting the href attribute of each anchor tag
+    final_links = [link.get('href') for link in links if link.get('href')]
+    print("Here are your final links")
+    # printing the final list of links
+    for link in final_links:
+        print(link)
+except Exception as e:
+    print(str(e))
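
A note on the extracted links: href values are frequently relative paths (for example /about) rather than full URLs. Below is a minimal sketch of how they could be resolved into absolute URLs with the standard library's urllib.parse.urljoin; the HTML snippet and base URL here are illustrative placeholders, not part of the scraper itself.

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    base_url = "https://example.com"  # placeholder: the URL the page was fetched from
    html = '<a href="/about">About</a><a href="https://other.site/">Other</a>'
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            # urljoin resolves relative hrefs against base_url
            # and leaves absolute URLs untouched
            print(urljoin(base_url, href))
    # prints:
    #   https://example.com/about
    #   https://other.site/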