From c88efb14d965e3053af782e10df6b2e0d9b0146f Mon Sep 17 00:00:00 2001
From: "mahmoud.ahmed" <mahmoud.ahmed@benchmarklabs.io>
Date: Thu, 8 Oct 2020 18:46:59 +0200
Subject: [PATCH 1/3] add: links extractor automation script

---
 links_extractor/README.md                     | 13 +++++
 links_extractor/links_extractor.py            | 57 +++++++++++++++++++
 links_extractor/requirements.txt              |  1 +
 links_extractor/sample/sample_text_file.txt   | 13 +++++
 .../sample/sample_text_file_links.txt         |  6 ++
 5 files changed, 90 insertions(+)
 create mode 100644 links_extractor/README.md
 create mode 100644 links_extractor/links_extractor.py
 create mode 100644 links_extractor/requirements.txt
 create mode 100644 links_extractor/sample/sample_text_file.txt
 create mode 100644 links_extractor/sample/sample_text_file_links.txt

diff --git a/links_extractor/README.md b/links_extractor/README.md
new file mode 100644
index 000000000..615c73d78
--- /dev/null
+++ b/links_extractor/README.md
@@ -0,0 +1,13 @@
+# Links Extractor
+
+## Objective
+This script automates extracting URLs from the content of any ```.txt``` file using a regular expression, then exports the extracted URLs to a ```.txt``` output file, one URL per line.
+## Sample
+- Sample input available in ```sample/sample_text_file.txt```
+- Sample output available in ```sample/sample_text_file_links.txt```
+## Requirements
+```pip install -r requirements.txt```
+## How to run the script?
+```
+python links_extractor.py file_name.txt
+```
diff --git a/links_extractor/links_extractor.py b/links_extractor/links_extractor.py
new file mode 100644
index 000000000..64c7319d8
--- /dev/null
+++ b/links_extractor/links_extractor.py
@@ -0,0 +1,57 @@
+import re
+import sys
+
+
+def get_urls(file_path):
+    """[read the file, extract its urls and export them]
+
+    Arguments:
+        file_path {[str]} -- [target text file path]
+    """
+    text = read_text_file(file_path)
+    urls = extract_urls(text)
+    export_urls(urls, file_path)  # output path is derived from file_path
+
+
+def read_text_file(file_path):
+    """[read the whole text file and return its content]
+
+    Arguments:
+        file_path {[str]} -- [target text file path]
+
+    Returns:
+        [str] -- [file content to work on]
+    """
+    with open(file_path) as f:  # no encoding passed: platform default
+        text = f.read()
+    return text
+
+
+def extract_urls(text):
+    """[find url-like substrings in the text; the scheme is optional]
+
+    Arguments:
+        text {[str]} -- [file content to work on]
+
+    Returns:
+        [list] -- [extracted urls]
+    """
+    url_regex_pattern = "(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+"
+    urls = re.findall(url_regex_pattern, text)
+    return urls
+
+
+def export_urls(urls, file_path):
+    """[write urls one per line to file_path with .txt -> _links.txt]
+
+    Arguments:
+        urls {[list]} -- [extracted urls]
+        file_path {[str]} -- [result text file path it follow the same input file but with additional suffix]
+    """
+    with open(file_path.replace(".txt", "_links.txt"), "w") as f:
+        text = f.write("\n".join(urls))
+
+
+if __name__ == "__main__":
+    file_path = sys.argv[1]  # first CLI argument: input text file path
+    get_urls(file_path)
diff --git a/links_extractor/requirements.txt b/links_extractor/requirements.txt
new file mode 100644
index 000000000..f4508fee8
--- /dev/null
+++ b/links_extractor/requirements.txt
@@ -0,0 +1 @@
+regex==2020.9.27
\ No newline at end of file
diff --git a/links_extractor/sample/sample_text_file.txt b/links_extractor/sample/sample_text_file.txt
new file mode 100644
index 000000000..1d0b463eb
--- /dev/null
+++ b/links_extractor/sample/sample_text_file.txt
@@ -0,0 +1,13 @@
+New album 'Heart To Mouth" is out now: https://lp.lnk.to/HeartToMouthID
+
+
+Lost On You: http://smarturl.it/LostOnYouAlbum
+
+----------------------------------
+
+Website: http://iamlp.com
+Facebook: http://facebook.com/iamLP
+Twitter: http://twitter.com/iamlp
+Soundcloud: https://soundcloud.com/iamlpmusic
+Suggested by WMG
+LP - Muddy Waters [Live Session]
diff --git a/links_extractor/sample/sample_text_file_links.txt b/links_extractor/sample/sample_text_file_links.txt
new file mode 100644
index 000000000..7893d2b46
--- /dev/null
+++ b/links_extractor/sample/sample_text_file_links.txt
@@ -0,0 +1,6 @@
+https://lp.lnk.to/HeartToMouthID
+http://smarturl.it/LostOnYouAlbum
+http://iamlp.com
+http://facebook.com/iamLP
+http://twitter.com/iamlp
+https://soundcloud.com/iamlpmusic
\ No newline at end of file

From 739fc1d4098f9e1dbf73f8d495e91a9f9845f5e2 Mon Sep 17 00:00:00 2001
From: "mahmoud.ahmed" <mahmoud.ahmed@benchmarklabs.io>
Date: Thu, 8 Oct 2020 18:59:34 +0200
Subject: [PATCH 2/3] update: linting with pep8 online

---
 links_extractor/links_extractor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/links_extractor/links_extractor.py b/links_extractor/links_extractor.py
index 64c7319d8..8a96226ee 100644
--- a/links_extractor/links_extractor.py
+++ b/links_extractor/links_extractor.py
@@ -46,10 +46,10 @@ def export_urls(urls, file_path):
 
     Arguments:
         urls {[list]} -- [extracted urls]
-        file_path {[str]} -- [result text file path it follow the same input file but with additional suffix]
+        file_path {[str]} -- [result text file path]
     """
     with open(file_path.replace(".txt", "_links.txt"), "w") as f:
-        text = f.write("\n".join(urls))
+        f.write("\n".join(urls))
 
 
 if __name__ == "__main__":

From cc6964b7b0061c83470a90fd0c786f72d8ba5d23 Mon Sep 17 00:00:00 2001
From: "mahmoud.ahmed" <mahmoud.ahmed@benchmarklabs.io>
Date: Sat, 10 Oct 2020 19:41:17 +0200
Subject: [PATCH 3/3] update: linting regular expression

---
 links_extractor/links_extractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/links_extractor/links_extractor.py b/links_extractor/links_extractor.py
index 8a96226ee..075a88056 100644
--- a/links_extractor/links_extractor.py
+++ b/links_extractor/links_extractor.py
@@ -36,7 +36,7 @@ def extract_urls(text):
     Returns:
         [list] -- [extracted urls]
     """
-    url_regex_pattern = "(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+"
+    url_regex_pattern = r"(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+"
     urls = re.findall(url_regex_pattern, text)
     return urls