"""Extract URLs from a text file and export them to a ``*_links.txt`` file.

The input file is scanned with a regular expression and every match is
written to the output file, one URL per line.

Usage:
    python links_extractor.py file_name.txt
"""
import re
import sys

# Optional http/https/ftp scheme, then two runs of URL characters joined by a
# dot. The scheme is optional, so any dotted token in the text also matches.
_URL_PATTERN = re.compile(r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+")

_INPUT_SUFFIX = ".txt"
_OUTPUT_SUFFIX = "_links.txt"


def get_urls(file_path):
    """Run the whole extraction: read the file, find URLs, export them.

    Arguments:
        file_path {[str]} -- [target text file path]
    """
    text = read_text_file(file_path)
    urls = extract_urls(text)
    export_urls(urls, file_path)


def read_text_file(file_path):
    """Read the whole content of a text file.

    Arguments:
        file_path {[str]} -- [target text file path]

    Returns:
        [str] -- [file content to work on]
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, encoding="utf-8") as f:
        return f.read()


def extract_urls(text):
    """Find every URL-like token in the given text.

    Arguments:
        text {[str]} -- [file content to work on]

    Returns:
        [list] -- [extracted urls, in order of appearance, duplicates kept]
    """
    return _URL_PATTERN.findall(text)


def _links_path(file_path):
    """Build the output path that belongs to the given input path."""
    # Only a trailing ".txt" is swapped. A blind str.replace would also touch
    # ".txt" inside directory names, and would leave a path without ".txt"
    # unchanged, making the export overwrite the input file.
    if file_path.endswith(_INPUT_SUFFIX):
        return file_path[: -len(_INPUT_SUFFIX)] + _OUTPUT_SUFFIX
    return file_path + _OUTPUT_SUFFIX


def export_urls(urls, file_path):
    """Write the urls, separated by newlines, next to the input file.

    For ``name.txt`` the output file is ``name_links.txt``; for any other
    input name ``_links.txt`` is appended to the full name.

    Arguments:
        urls {[list]} -- [extracted urls]
        file_path {[str]} -- [input text file path the result path is derived from]
    """
    with open(_links_path(file_path), "w", encoding="utf-8") as f:
        f.write("\n".join(urls))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: python links_extractor.py file_name.txt")
    get_urls(sys.argv[1])
--git a/links_extractor/requirements.txt b/links_extractor/requirements.txt new file mode 100644 index 000000000..f4508fee8 --- /dev/null +++ b/links_extractor/requirements.txt @@ -0,0 +1 @@ +regex==2020.9.27 \ No newline at end of file diff --git a/links_extractor/sample/sample_text_file.txt b/links_extractor/sample/sample_text_file.txt new file mode 100644 index 000000000..1d0b463eb --- /dev/null +++ b/links_extractor/sample/sample_text_file.txt @@ -0,0 +1,13 @@ +New album 'Heart To Mouth" is out now: https://lp.lnk.to/HeartToMouthID + + +Lost On You: http://smarturl.it/LostOnYouAlbum + +---------------------------------- + +Website: http://iamlp.com +Facebook: http://facebook.com/iamLP +Twitter: http://twitter.com/iamlp +Soundcloud: https://soundcloud.com/iamlpmusic +Suggested by WMG +LP - Muddy Waters [Live Session] diff --git a/links_extractor/sample/sample_text_file_links.txt b/links_extractor/sample/sample_text_file_links.txt new file mode 100644 index 000000000..7893d2b46 --- /dev/null +++ b/links_extractor/sample/sample_text_file_links.txt @@ -0,0 +1,6 @@ +https://lp.lnk.to/HeartToMouthID +http://smarturl.it/LostOnYouAlbum +http://iamlp.com +http://facebook.com/iamLP +http://twitter.com/iamlp +https://soundcloud.com/iamlpmusic \ No newline at end of file