Init commit
ramsy0dev committed Sep 13, 2023
1 parent 5e06646 commit 57eada7
Showing 19 changed files with 1,825 additions and 0 deletions.
Empty file added README.md
365 changes: 365 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions proxycrawler/__main__.py
@@ -0,0 +1,4 @@
from proxycrawler.cli import run

if __name__ == "__main__":
    run()
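For context, this __main__ module makes the package runnable as a module. A minimal sanity check of that wiring, as a hedged sketch (the "version" subcommand is defined in cli.py below; running it this way is only an illustration, not part of the commit):

import subprocess
import sys

# Equivalent to running `python -m proxycrawler version` from a shell
subprocess.run([sys.executable, "-m", "proxycrawler", "version"], check=True)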
196 changes: 196 additions & 0 deletions proxycrawler/cli.py
@@ -0,0 +1,196 @@
import os
import sys
import typer

from rich import print
from rich.console import Console

from proxycrawler import helpers
from proxycrawler import constants
from proxycrawler.messages import (
    info,
    errors
)
from proxycrawler.src.proxycrawler import ProxyCrawler
from proxycrawler.src.database.database_handler import DatabaseHandler

# Init cli
cli = typer.Typer()

@cli.command()
def version():
    """ proxycrawler's version """
    print(f"[bold white]Version [bold cyan]{constants.VERSION}[bold white]")

@cli.command()
def scrap(
    enable_save_on_run: bool = typer.Option(True, "--enable-save-on-run", help="Save valid proxies while proxycrawler is still running (can be useful in case of a bad internet connection)"),
    group_by_protocol: bool = typer.Option(False, "--group-by-protocol", help="Save proxies into separate files based on the supported protocols [http, https, socks4, socks5]"),
    output_file_path: str = typer.Option(None, "--output-file-path", help="Custom output file path to save results (.txt)")
):
    """ Start scraping proxies """
    console = Console()

    # Configure the console
    console._log_render.omit_repeated_times = False # Repeat the timestamp even if entries were logged at the same time

    # Check the output file path
    if output_file_path is not None and not os.path.exists("/".join(output_file_path.split("/")[:-1])):
        console.log(
            errors.UNVALID_OUTPUT_FILE_PATH(
                output_file_path=output_file_path
            )
        )
        sys.exit(1)

    # Init the database handler
    database_handler = DatabaseHandler()

    # Init ProxyCrawler
    proxy_crawler = ProxyCrawler(
        database_handler=database_handler,
        console=console,
    )

    # Fetch proxies and validate them
    proxy_crawler.crawl_proxies(
        enable_save_on_run=enable_save_on_run,
        group_by_protocol=group_by_protocol,
        output_file_path=output_file_path
    )

@cli.command()
def export_db(
    proxies_count: int = typer.Option(None, "--proxies-count", help="Number of proxies to export (exports all by default)"),
    validate_proxies: bool = typer.Option(True, "--validate", help="Validate proxies"),
    group_by_protocol: bool = typer.Option(False, "--group-by-protocol", help="Save proxies into separate files based on the supported protocols [http, https, socks4, socks5]"),
    output_file_path: str = typer.Option(None, "--output-file-path", help="Custom output file path to save results (.txt)")
):
    """ Export proxies from the database """
    console = Console()

    # Configure the console
    console._log_render.omit_repeated_times = False # Repeat the timestamp even if entries were logged at the same time

    # Check the output file path
    if output_file_path is not None and not os.path.exists("/".join(output_file_path.split("/")[:-1])):
        console.log(
            errors.UNVALID_OUTPUT_FILE_PATH(
                output_file_path=output_file_path
            )
        )
        sys.exit(1)

    # Init the database handler
    database_handler = DatabaseHandler()

    # Init ProxyCrawler
    proxy_crawler = ProxyCrawler(
        database_handler=database_handler,
        console=console,
    )

    console.log(
        info.FETCHING_AND_VALIDATING_PROXIES_FROM_DATABASE
    )

    proxy_crawler.export_database_proxies(
        proxies_count=proxies_count,
        group_by_protocol=group_by_protocol,
        validate_proxies=validate_proxies,
        output_file_path=output_file_path
    )

@cli.command()
def validate(
    proxy_file_path: str = typer.Option(None, "--proxy-file", help="Path to the proxy file"),
    protocol: str = typer.Option(None, "--protocol", help="Set a specific protocol to test the proxies on"),
    test_all_protocols: bool = typer.Option(False, "--test-all-protocols", help="Test all the protocols on a proxy"),
    group_by_protocol: bool = typer.Option(False, "--group-by-protocol", help="Save proxies into separate files based on the supported protocols [http, https, socks4, socks5]"),
    output_file_path: str = typer.Option(None, "--output-file-path", help="Custom output file path to save results (.txt)")
):
    """ Validate a proxies list file """
    console = Console()

    # Configure the console
    console._log_render.omit_repeated_times = False # Repeat the timestamp even if entries were logged at the same time

    # Init the database handler
    database_handler = DatabaseHandler()

    # Init ProxyCrawler
    proxy_crawler = ProxyCrawler(
        database_handler=database_handler,
        console=console,
    )

    # Check the output file path
    if output_file_path is not None and not os.path.exists("/".join(output_file_path.split("/")[:-1])):
        console.log(
            errors.UNVALID_OUTPUT_FILE_PATH(
                output_file_path=output_file_path
            )
        )
        sys.exit(1)

    # Check that the proxies file exists
    if proxy_file_path is None or not os.path.exists(proxy_file_path):
        console.log(errors.PROXY_FILE_DOESNT_EXIST)
        sys.exit(1)

    # Check the file's extension
    if not proxy_file_path.endswith(".txt"):
        console.log(errors.FILE_EXTENSION_NOT_SUPPORTED)
        sys.exit(1)

    # Check the format of the proxies
    with open(proxy_file_path, "r") as proxy_file:
        proxies = [proxy.strip() for proxy in proxy_file.readlines()]
    results = []

    for proxy in proxies:
        if not proxy_crawler.check_proxy_fromat(proxy=proxy):
            results.append(proxy)

    if len(results) != 0:
        console.log(errors.UNVALID_PROXY_FORMAT)
        sys.exit(1)

    # Check the protocol
    protocols = [
        "http",
        "https",
        "socks4",
        "socks5"
    ]
    if protocol is not None and protocol not in protocols:
        console.log(
            errors.UNVALID_PROXY_PROTOCOL(
                protocol=protocol,
                protocols=protocols
            )
        )
        sys.exit(1)

    # Validate the list of proxies
    console.log(
        info.VALIDATING_PROXIES_FROM_FILE(
            proxies_count=len(proxies),
            proxy_file_path=proxy_file_path
        )
    )

    proxy_crawler.validate_proxies(
        proxies=proxies,
        protocol=protocol,
        test_all_protocols=test_all_protocols,
        group_by_protocol=group_by_protocol,
        proxy_file_path=proxy_file_path,
        output_file_path=output_file_path
    )

def run():
    """ Runs proxycrawler """
    helpers.banner()
    cli()

if __name__ == "__main__":
    run()
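As an aside, a hedged sketch of how the Typer commands above could be exercised programmatically, using typer.testing.CliRunner (part of Typer itself, not used anywhere in this commit). The proxies.txt path is a made-up placeholder:

from typer.testing import CliRunner

from proxycrawler.cli import cli

runner = CliRunner()

# Print proxycrawler's version without spawning a subprocess
result = runner.invoke(cli, ["version"])
print(result.stdout)

# Validate a local proxy list against every supported protocol
# ("proxies.txt" is a placeholder; the command exits with code 1 if the file doesn't exist)
result = runner.invoke(cli, ["validate", "--proxy-file", "proxies.txt", "--test-all-protocols"])
print(result.exit_code)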
27 changes: 27 additions & 0 deletions proxycrawler/constants.py
@@ -0,0 +1,27 @@
import os

# Package main info
PACKAGE = "proxycrawler"
VERSION = "0.1.0"
AUTHOR = "ramsy0dev"
GITHUB = "https://github.com/ramsy0dev/proxycrawler"

# Banner
BANNER = f"""[bold white]
__
____ _________ _ ____ ________________ __ __/ /__ _____
/ __ \/ ___/ __ \| |/_/ / / / ___/ ___/ __ `/ | /| / / / _ \/ ___/
/ /_/ / / / /_/ /> </ /_/ / /__/ / / /_/ /| |/ |/ / / __/ /
/ .___/_/ \____/_/|_|\__, /\___/_/ \__,_/ |__/|__/_/\___/_/ Version [bold cyan]{VERSION}[bold white]
/_/ /____/
Made by [bold green]`ramsy0dev`[bold white]
"""

# Home path
HOME = os.path.expanduser("~")

# Database URL
DATABASE_URL = f"sqlite+pysqlite:///{HOME}/.config/proxycrawler/database.db"

# Debug proxycrawler
DEBUG = False
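The "sqlite+pysqlite" URL above is SQLAlchemy's URL format, so DatabaseHandler presumably builds its engine from DATABASE_URL; that handler's implementation is not part of this commit. A minimal sketch under that assumption:

import os

from sqlalchemy import create_engine, text

from proxycrawler import constants

# Assumption: the config directory must exist before SQLite can create the database file
os.makedirs(f"{constants.HOME}/.config/proxycrawler", exist_ok=True)

engine = create_engine(constants.DATABASE_URL)

with engine.connect() as connection:
    # Simple connectivity check against the SQLite database
    connection.execute(text("SELECT 1"))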
33 changes: 33 additions & 0 deletions proxycrawler/helpers.py
@@ -0,0 +1,33 @@
import uuid
import string
import random
import hashlib
import datetime

from rich import print

from proxycrawler import constants

def banner() -> None:
    """ proxycrawler's banner """
    print(constants.BANNER)

# def log_json(json_data: str, console) -> None:
#     """ Logs out the json data in a beautified way """
#     splited_json_data = json_data.split("\n")

#     for log_line in splited_json_data:
#         console.log(log_line)

def date() -> datetime.datetime:
    """ Returns the current date """
    return datetime.datetime.now()

def generate_uid(data: str) -> str:
    """ Generates a UID based on the given data """
    data = f"{data}{''.join(random.choices(string.ascii_letters))}"  # append a random letter as a salt (random.choices defaults to k=1)

    hashed_data_salt = hashlib.md5(data.encode()).hexdigest()
    generated_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, hashed_data_salt)

    return str(generated_uuid)
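A brief illustration of the helpers above; the proxy string is a made-up example value:

from proxycrawler import helpers

# generate_uid salts the input with a random letter, hashes it with MD5,
# and derives a UUIDv5 from the hash, so repeated calls give different UIDs
uid = helpers.generate_uid("http://127.0.0.1:8080")
print(uid)             # a UUIDv5 string such as 'xxxxxxxx-xxxx-5xxx-xxxx-xxxxxxxxxxxx'
print(helpers.date())  # the current datetime.datetime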
6 changes: 6 additions & 0 deletions proxycrawler/messages/debug.py
@@ -0,0 +1,6 @@
"""
Debug messages used throughout proxycrawler
to help in debugging
"""

EXCEPTION_RAISED_WHEN_VALIDATING_PROXY = lambda proxy, error: f"[bold blue][DEBUG] [bold white]Exception raised when validating proxy:[bold green]{proxy}[bold white]. Error: {error}"
23 changes: 23 additions & 0 deletions proxycrawler/messages/errors.py
@@ -0,0 +1,23 @@
"""
Errors messages used through out proxycrawler
to log out to the end-user
"""

FILE_EXTENSION_NOT_SUPPORTED = f"[bold red][ERROR] [bold white]The provided proxy file's extension is not supported. Please make sure it's a plain text file (.txt) and try again"

PROXY_FILE_DOESNT_EXIST = f"[bold red][ERROR] [bold white]The provided proxy file path doesn't seem to exists. Please verify it and try again"

UNVALID_OUTPUT_FILE_PATH = lambda output_file_path: f"[bold red][ERROR] [bold white]Unvalid output file path [bold red]'{output_file_path}'[bold white]. Please change it and try again (or you can leave it empty)"

FAILD_TO_REQUEST_GEONODE_API = lambda error: f"[bold red][ERROR] [bold white]Faild to request [bold green]geonode[bold white]'s API. Error: {error}"
FAILD_TO_REQUEST_FREE_PROXY_LIST = lambda error: f"[bold red][ERROR] [bold white]Faild to request [bold green]free-proxy-list.net[bold white]. Error: {error}"

UNVALID_COUNTRY_CODE = lambda country_code, supported_country_code: f"[bold red][ ! ] [bold white]Unvalid country code [bold red]'{country_code}'[bold white]. Supported country code: \n{supported_country_code}"

UNVALID_PROXY_FORMAT = f"[bold red][ERROR] [bold white]Unvalid proxies format. Format should be [bold green]<protocol>://ip:port[bold white]. Please fix it and try again"

UNVALID_PROXY_PROTOCOL = lambda protocol, protocols: f"[bold red][ERROR] [bold white]Unvalid proxy protocol [bold red]'{protocol}'. the supported protocols are [bold green]{protocols}[bold white] (you may keep --protocol null to test it on all protocols)"

NO_PROXIES_WHERE_GATHERED = lambda proxies: f"[bold red][ERROR] [bold white]No proxies where gathered. proxies:[bold red]{proxies}[bold white]"

NO_PROXIES_WHERE_FOUND_IN_THE_DATABASE = "[bold red][ERROR] [bold white]No proxies where found in the database"
20 changes: 20 additions & 0 deletions proxycrawler/messages/info.py
@@ -0,0 +1,20 @@
"""
Info messages used throughout proxycrawler
to log out to the end-user
"""

USING_SERVICE = lambda service_name, service_url: f"[bold green][INFO] [bold white]Using service [bold green]'{service_name}'[bold white] with url:[bold red]'{service_url}'[bold white]"

REQUESTING_GEONODE_API = lambda api_url, payload: f"[bold green][INFO] [bold white]Requesting [bold green]Geonode[bold white]'s API at api_url:[bold green]'{api_url}'[bold white] with payload: {payload}"

REQUESTING_FREE_PROXY_LIST = lambda url: f"[bold green][INFO] [bold white]Scraping [bold green]free-proxy-list[bold white] at url:[bold green]'{url}'[bold white]"

FOUND_A_VALID_PROXY = lambda proxy: f"[bold green][INFO] [bold white]Found a valid proxy: [bold green]{proxy.proxy}[bold white]"

PROXIES_SAVED_IN_PATHS = lambda output_file_paths: "[bold green][INFO] [bold white]Proxies saved in the following files:{}".format("".join([f"\n\t[bold green]->[bold white] {path}" for path in output_file_paths]))

FETCHING_AND_VALIDATING_PROXIES_FROM_DATABASE = f"[bold green][INFO] [bold white]Fetching and validating proxies from the database"

FETCHED_PROXIES_FROM_THE_DATABASE = lambda count: f"[bold green][INFO] [bold white]Fetched [bold green]'{count}'[bold white] proxies from the database. Validating them ..."

VALIDATING_PROXIES_FROM_FILE = lambda proxies_count, proxy_file_path: f"[bold green][INFO] [bold white]Found [bold green]'{proxies_count}'[bold white] proxies from [bold green]'{proxy_file_path}'[bold white]. Validating them..."
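To show how these message lambdas and constants are meant to be consumed, a short hedged example; the service name, URL, and protocol values below are placeholders. The messages are simply passed to rich's Console.log, which renders the [bold ...] markup:

from rich.console import Console

from proxycrawler.messages import info, errors

console = Console()

# Placeholder values purely for demonstration
console.log(info.USING_SERVICE(service_name="example-service", service_url="https://example.com"))
console.log(errors.UNVALID_PROXY_PROTOCOL(protocol="ftp", protocols=["http", "https", "socks4", "socks5"]))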