From c88efb14d965e3053af782e10df6b2e0d9b0146f Mon Sep 17 00:00:00 2001 From: "mahmoud.ahmed" Date: Thu, 8 Oct 2020 18:46:59 +0200 Subject: [PATCH 1/3] add: links extractor automation script --- links_extractor/README.md | 13 +++++ links_extractor/links_extractor.py | 57 +++++++++++++++++++ links_extractor/requirements.txt | 1 + links_extractor/sample/sample_text_file.txt | 13 +++++ .../sample/sample_text_file_links.txt | 6 ++ 5 files changed, 90 insertions(+) create mode 100644 links_extractor/README.md create mode 100644 links_extractor/links_extractor.py create mode 100644 links_extractor/requirements.txt create mode 100644 links_extractor/sample/sample_text_file.txt create mode 100644 links_extractor/sample/sample_text_file_links.txt diff --git a/links_extractor/README.md b/links_extractor/README.md new file mode 100644 index 000000000..615c73d78 --- /dev/null +++ b/links_extractor/README.md @@ -0,0 +1,13 @@ +# Links Extractor + +## Objective +This script automates extracting URLs from the content of any ```.txt``` file using a regular expression, then exports the extracted URLs to a ```.txt``` output file, one URL per line. +## Sample +- Sample input is available in ```sample/sample_text_file.txt``` +- Sample output is available in ```sample/sample_text_file_links.txt``` +## Requirements +```pip install -r requirements.txt``` +## How to run the script? 
+``` +python links_extractor.py file_name.txt +``` diff --git a/links_extractor/links_extractor.py b/links_extractor/links_extractor.py new file mode 100644 index 000000000..64c7319d8 --- /dev/null +++ b/links_extractor/links_extractor.py @@ -0,0 +1,57 @@ +import re +import sys + + +def get_urls(file_path): + """[start method to fire extracting urls process] + + Arguments: + file_path {[str]} -- [target text file path] + """ + text = read_text_file(file_path) + urls = extract_urls(text) + export_urls(urls, file_path) + + +def read_text_file(file_path): + """[read the whole content of a text file] + + Arguments: + file_path {[str]} -- [target text file path] + + Returns: + [str] -- [file content to work on] + """ + with open(file_path) as f: + text = f.read() + return text + + +def extract_urls(text): + """[find every substring of the text that matches the url regex pattern] + + Arguments: + text {[str]} -- [file content to work on] + + Returns: + [list] -- [extracted urls] + """ + url_regex_pattern = "(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+" + urls = re.findall(url_regex_pattern, text) + return urls + + +def export_urls(urls, file_path): + """[write the urls, newline separated, to the input path with ".txt" replaced by "_links.txt"] + + Arguments: + urls {[list]} -- [extracted urls] + file_path {[str]} -- [result text file path it follow the same input file but with additional suffix] + """ + with open(file_path.replace(".txt", "_links.txt"), "w") as f: + text = f.write("\n".join(urls)) + + +if __name__ == "__main__": + file_path = sys.argv[1] + get_urls(file_path) diff --git a/links_extractor/requirements.txt b/links_extractor/requirements.txt new file mode 100644 index 000000000..f4508fee8 --- /dev/null +++ b/links_extractor/requirements.txt @@ -0,0 +1 @@ +regex==2020.9.27 \ No newline at end of file diff --git a/links_extractor/sample/sample_text_file.txt b/links_extractor/sample/sample_text_file.txt new file mode 100644 index 000000000..1d0b463eb --- /dev/null +++ b/links_extractor/sample/sample_text_file.txt @@ -0,0 +1,13 @@ +New album 'Heart To Mouth" is out now: https://lp.lnk.to/HeartToMouthID + + +Lost On You: 
http://smarturl.it/LostOnYouAlbum + +---------------------------------- + +Website: http://iamlp.com +Facebook: http://facebook.com/iamLP +Twitter: http://twitter.com/iamlp +Soundcloud: https://soundcloud.com/iamlpmusic +Suggested by WMG +LP - Muddy Waters [Live Session] diff --git a/links_extractor/sample/sample_text_file_links.txt b/links_extractor/sample/sample_text_file_links.txt new file mode 100644 index 000000000..7893d2b46 --- /dev/null +++ b/links_extractor/sample/sample_text_file_links.txt @@ -0,0 +1,6 @@ +https://lp.lnk.to/HeartToMouthID +http://smarturl.it/LostOnYouAlbum +http://iamlp.com +http://facebook.com/iamLP +http://twitter.com/iamlp +https://soundcloud.com/iamlpmusic \ No newline at end of file From 739fc1d4098f9e1dbf73f8d495e91a9f9845f5e2 Mon Sep 17 00:00:00 2001 From: "mahmoud.ahmed" Date: Thu, 8 Oct 2020 18:59:34 +0200 Subject: [PATCH 2/3] update: linting with pep8 online --- links_extractor/links_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/links_extractor/links_extractor.py b/links_extractor/links_extractor.py index 64c7319d8..8a96226ee 100644 --- a/links_extractor/links_extractor.py +++ b/links_extractor/links_extractor.py @@ -46,10 +46,10 @@ def export_urls(urls, file_path): Arguments: urls {[list]} -- [extracted urls] - file_path {[str]} -- [result text file path it follow the same input file but with additional suffix] + file_path {[str]} -- [result text file path] """ with open(file_path.replace(".txt", "_links.txt"), "w") as f: - text = f.write("\n".join(urls)) + f.write("\n".join(urls)) if __name__ == "__main__": From cc6964b7b0061c83470a90fd0c786f72d8ba5d23 Mon Sep 17 00:00:00 2001 From: "mahmoud.ahmed" Date: Sat, 10 Oct 2020 19:41:17 +0200 Subject: [PATCH 3/3] update: linting regular expression --- links_extractor/links_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/links_extractor/links_extractor.py b/links_extractor/links_extractor.py index 8a96226ee..075a88056 
100644 --- a/links_extractor/links_extractor.py +++ b/links_extractor/links_extractor.py @@ -36,7 +36,7 @@ def extract_urls(text): Returns: [list] -- [extracted urls] """ - url_regex_pattern = "(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+" + url_regex_pattern = r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+" urls = re.findall(url_regex_pattern, text) return urls