Init commit
ramsy0dev committed Sep 13, 2023
1 parent 5e06646 commit 57eada7
Showing 19 changed files with 1,825 additions and 0 deletions.
Empty file added README.md
365 changes: 365 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions proxycrawler/__main__.py
@@ -0,0 +1,4 @@
from proxycrawler.cli import run

if __name__ == "__main__":
    run()
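For context, this __main__ module makes the package runnable as a module. A minimal sanity check of that wiring, as a hedged sketch (the "version" subcommand is defined in cli.py below; running it this way is only an illustration, not part of the commit):

import subprocess
import sys

# Equivalent to running `python -m proxycrawler version` from a shell
subprocess.run([sys.executable, "-m", "proxycrawler", "version"], check=True)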
196 changes: 196 additions & 0 deletions proxycrawler/cli.py
@@ -0,0 +1,196 @@
import os
import sys
import typer

from rich import print
from rich.console import Console

from proxycrawler import helpers
from proxycrawler import constants
from proxycrawler.messages import (
    info,
    errors
)
from proxycrawler.src.proxycrawler import ProxyCrawler
from proxycrawler.src.database.database_handler import DatabaseHandler

# Init cli
cli = typer.Typer()

@cli.command()
def version():
    """ proxycrawler's version """
    print(f"[bold white]Version [bold cyan]{constants.VERSION}[bold white]")

@cli.command()
def scrap(
    enable_save_on_run: bool = typer.Option(True, "--enable-save-on-run", help="Save valid proxies while proxycrawler is still running (can be useful in case of a bad internet connection)"),
    group_by_protocol: bool = typer.Option(False, "--group-by-protocol", help="Save proxies into separate files based on the supported protocols [http, https, socks4, socks5]"),
    output_file_path: str = typer.Option(None, "--output-file-path", help="Custom output file path to save results (.txt)")
):
    """ Start scraping proxies """
    console = Console()

    # Configure the console
    console._log_render.omit_repeated_times = False # Repeat the timestamp even if entries were logged at the same time

    # Check the output file path
    if output_file_path is not None and not os.path.exists("/".join(output_file_path.split("/")[:-1])):
        console.log(
            errors.UNVALID_OUTPUT_FILE_PATH(
                output_file_path=output_file_path
            )
        )
        sys.exit(1)

    # Init the database handler
    database_handler = DatabaseHandler()

    # Init ProxyCrawler
    proxy_crawler = ProxyCrawler(
        database_handler=database_handler,
        console=console,
    )

    # Fetch proxies and validate them
    proxy_crawler.crawl_proxies(
        enable_save_on_run=enable_save_on_run,
        group_by_protocol=group_by_protocol,
        output_file_path=output_file_path
    )

@cli.command()
def export_db(
    proxies_count: int = typer.Option(None, "--proxies-count", help="Number of proxies to export (exports all by default)"),
    validate_proxies: bool = typer.Option(True, "--validate", help="Validate proxies"),
    group_by_protocol: bool = typer.Option(False, "--group-by-protocol", help="Save proxies into separate files based on the supported protocols [http, https, socks4, socks5]"),
    output_file_path: str = typer.Option(None, "--output-file-path", help="Custom output file path to save results (.txt)")
):
    """ Export proxies from the database """
    console = Console()

    # Configure the console
    console._log_render.omit_repeated_times = False # Repeat the timestamp even if entries were logged at the same time

    # Check the output file path
    if output_file_path is not None and not os.path.exists("/".join(output_file_path.split("/")[:-1])):
        console.log(
            errors.UNVALID_OUTPUT_FILE_PATH(
                output_file_path=output_file_path
            )
        )
        sys.exit(1)

    # Init the database handler
    database_handler = DatabaseHandler()

    # Init ProxyCrawler
    proxy_crawler = ProxyCrawler(
        database_handler=database_handler,
        console=console,
    )

    console.log(
        info.FETCHING_AND_VALIDATING_PROXIES_FROM_DATABASE
    )

    proxy_crawler.export_database_proxies(
        proxies_count=proxies_count,
        group_by_protocol=group_by_protocol,
        validate_proxies=validate_proxies,
        output_file_path=output_file_path
    )

@cli.command()
def validate(
    proxy_file_path: str = typer.Option(None, "--proxy-file", help="Path to the proxy file"),
    protocol: str = typer.Option(None, "--protocol", help="Set a specific protocol to test the proxies on"),
    test_all_protocols: bool = typer.Option(False, "--test-all-protocols", help="Test all the protocols on a proxy"),
    group_by_protocol: bool = typer.Option(False, "--group-by-protocol", help="Save proxies into separate files based on the supported protocols [http, https, socks4, socks5]"),
    output_file_path: str = typer.Option(None, "--output-file-path", help="Custom output file path to save results (.txt)")
):
    """ Validate a proxies list file """
    console = Console()

    # Configure the console
    console._log_render.omit_repeated_times = False # Repeat the timestamp even if entries were logged at the same time

    # Init the database handler
    database_handler = DatabaseHandler()

    # Init ProxyCrawler
    proxy_crawler = ProxyCrawler(
        database_handler=database_handler,
        console=console,
    )

    # Check the output file path
    if output_file_path is not None and not os.path.exists("/".join(output_file_path.split("/")[:-1])):
        console.log(
            errors.UNVALID_OUTPUT_FILE_PATH(
                output_file_path=output_file_path
            )
        )
        sys.exit(1)

    # Check that the proxies file exists
    if proxy_file_path is None or not os.path.exists(proxy_file_path):
        console.log(errors.PROXY_FILE_DOESNT_EXIST)
        sys.exit(1)

    # Check the file's extension
    if not proxy_file_path.endswith(".txt"):
        console.log(errors.FILE_EXTENSION_NOT_SUPPORTED)
        sys.exit(1)

    # Check the format of the proxies
    with open(proxy_file_path, "r") as proxy_file:
        proxies = [proxy.strip() for proxy in proxy_file.readlines()]
    results = []

    for proxy in proxies:
        if not proxy_crawler.check_proxy_fromat(proxy=proxy):
            results.append(proxy)

    if len(results) != 0:
        console.log(errors.UNVALID_PROXY_FORMAT)
        sys.exit(1)

    # Check the protocol
    protocols = [
        "http",
        "https",
        "socks4",
        "socks5"
    ]
    if protocol is not None and protocol not in protocols:
        console.log(
            errors.UNVALID_PROXY_PROTOCOL(
                protocol=protocol,
                protocols=protocols
            )
        )
        sys.exit(1)

    # Validate the list of proxies
    console.log(
        info.VALIDATING_PROXIES_FROM_FILE(
            proxies_count=len(proxies),
            proxy_file_path=proxy_file_path
        )
    )

    proxy_crawler.validate_proxies(
        proxies=proxies,
        protocol=protocol,
        test_all_protocols=test_all_protocols,
        group_by_protocol=group_by_protocol,
        proxy_file_path=proxy_file_path,
        output_file_path=output_file_path
    )

def run():
    """ Runs proxycrawler """
    helpers.banner()
    cli()

if __name__ == "__main__":
    run()
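As an aside, a hedged sketch of how the Typer commands above could be exercised programmatically, using typer.testing.CliRunner (part of Typer itself, not used anywhere in this commit). The proxies.txt path is a made-up placeholder:

from typer.testing import CliRunner

from proxycrawler.cli import cli

runner = CliRunner()

# Print proxycrawler's version without spawning a subprocess
result = runner.invoke(cli, ["version"])
print(result.stdout)

# Validate a local proxy list against every supported protocol
# ("proxies.txt" is a placeholder; the command exits with code 1 if the file doesn't exist)
result = runner.invoke(cli, ["validate", "--proxy-file", "proxies.txt", "--test-all-protocols"])
print(result.exit_code)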
27 changes: 27 additions & 0 deletions proxycrawler/constants.py
@@ -0,0 +1,27 @@
import os

# Package main info
PACKAGE = "proxycrawler"
VERSION = "0.1.0"
AUTHOR = "ramsy0dev"
GITHUB = "https://github.com/ramsy0dev/proxycrawler"

# Banner
BANNER = f"""[bold white]
__
____ _________ _ ____ ________________ __ __/ /__ _____
/ __ \/ ___/ __ \| |/_/ / / / ___/ ___/ __ `/ | /| / / / _ \/ ___/
/ /_/ / / / /_/ /> </ /_/ / /__/ / / /_/ /| |/ |/ / / __/ /
/ .___/_/ \____/_/|_|\__, /\___/_/ \__,_/ |__/|__/_/\___/_/ Version [bold cyan]{VERSION}[bold white]
/_/ /____/
Made by [bold green]`ramsy0dev`[bold white]
"""

# Home path
HOME = os.path.expanduser("~")

# Database URL
DATABASE_URL = f"sqlite+pysqlite:///{HOME}/.config/proxycrawler/database.db"

# Debug proxycrawler
DEBUG = False
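The "sqlite+pysqlite" URL above is SQLAlchemy's URL format, so DatabaseHandler presumably builds its engine from DATABASE_URL; that handler's implementation is not part of this commit. A minimal sketch under that assumption:

import os

from sqlalchemy import create_engine, text

from proxycrawler import constants

# Assumption: the config directory must exist before SQLite can create the database file
os.makedirs(f"{constants.HOME}/.config/proxycrawler", exist_ok=True)

engine = create_engine(constants.DATABASE_URL)

with engine.connect() as connection:
    # Simple connectivity check against the SQLite database
    connection.execute(text("SELECT 1"))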
33 changes: 33 additions & 0 deletions proxycrawler/helpers.py
@@ -0,0 +1,33 @@
import uuid
import string
import random
import hashlib
import datetime

from rich import print

from proxycrawler import constants

def banner() -> None:
    """ proxycrawler's banner """
    print(constants.BANNER)

# def log_json(json_data: str, console) -> None:
#     """ Logs out the json data in a beautified way """
#     splited_json_data = json_data.split("\n")

#     for log_line in splited_json_data:
#         console.log(log_line)

def date() -> datetime.datetime:
    """ Returns the current date """
    return datetime.datetime.now()

def generate_uid(data: str) -> str:
    """ Generates a UID based on the given data """
    data = f"{data}{''.join(random.choices(string.ascii_letters))}"  # append a random letter as a salt (random.choices defaults to k=1)

    hashed_data_salt = hashlib.md5(data.encode()).hexdigest()
    generated_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, hashed_data_salt)

    return str(generated_uuid)
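A brief illustration of the helpers above; the proxy string is a made-up example value:

from proxycrawler import helpers

# generate_uid salts the input with a random letter, hashes it with MD5,
# and derives a UUIDv5 from the hash, so repeated calls give different UIDs
uid = helpers.generate_uid("http://127.0.0.1:8080")
print(uid)             # a UUIDv5 string such as 'xxxxxxxx-xxxx-5xxx-xxxx-xxxxxxxxxxxx'
print(helpers.date())  # the current datetime.datetime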
6 changes: 6 additions & 0 deletions proxycrawler/messages/debug.py
@@ -0,0 +1,6 @@
"""
Debug messages used throughout proxycrawler
to help in debugging
"""

EXCEPTION_RAISED_WHEN_VALIDATING_PROXY = lambda proxy, error: f"[bold blue][DEBUG] [bold white]Exception raised when validating proxy:[bold green]{proxy}[bold white]. Error: {error}"
23 changes: 23 additions & 0 deletions proxycrawler/messages/errors.py
@@ -0,0 +1,23 @@
"""
Errors messages used through out proxycrawler
to log out to the end-user
"""

FILE_EXTENSION_NOT_SUPPORTED = f"[bold red][ERROR] [bold white]The provided proxy file's extension is not supported. Please make sure it's a plain text file (.txt) and try again"

PROXY_FILE_DOESNT_EXIST = f"[bold red][ERROR] [bold white]The provided proxy file path doesn't seem to exists. Please verify it and try again"

UNVALID_OUTPUT_FILE_PATH = lambda output_file_path: f"[bold red][ERROR] [bold white]Unvalid output file path [bold red]'{output_file_path}'[bold white]. Please change it and try again (or you can leave it empty)"

FAILD_TO_REQUEST_GEONODE_API = lambda error: f"[bold red][ERROR] [bold white]Faild to request [bold green]geonode[bold white]'s API. Error: {error}"
FAILD_TO_REQUEST_FREE_PROXY_LIST = lambda error: f"[bold red][ERROR] [bold white]Faild to request [bold green]free-proxy-list.net[bold white]. Error: {error}"

UNVALID_COUNTRY_CODE = lambda country_code, supported_country_code: f"[bold red][ ! ] [bold white]Unvalid country code [bold red]'{country_code}'[bold white]. Supported country code: \n{supported_country_code}"

UNVALID_PROXY_FORMAT = f"[bold red][ERROR] [bold white]Unvalid proxies format. Format should be [bold green]<protocol>://ip:port[bold white]. Please fix it and try again"

UNVALID_PROXY_PROTOCOL = lambda protocol, protocols: f"[bold red][ERROR] [bold white]Unvalid proxy protocol [bold red]'{protocol}'. the supported protocols are [bold green]{protocols}[bold white] (you may keep --protocol null to test it on all protocols)"

NO_PROXIES_WHERE_GATHERED = lambda proxies: f"[bold red][ERROR] [bold white]No proxies where gathered. proxies:[bold red]{proxies}[bold white]"

NO_PROXIES_WHERE_FOUND_IN_THE_DATABASE = "[bold red][ERROR] [bold white]No proxies where found in the database"
20 changes: 20 additions & 0 deletions proxycrawler/messages/info.py
@@ -0,0 +1,20 @@
"""
Info messages used throughout proxycrawler
to log out to the end-user
"""

USING_SERVICE = lambda service_name, service_url: f"[bold green][INFO] [bold white]Using service [bold green]'{service_name}'[bold white] with url:[bold red]'{service_url}'[bold white]"

REQUESTING_GEONODE_API = lambda api_url, payload: f"[bold green][INFO] [bold white]Requesting [bold green]Geonode[bold white]'s API at api_url:[bold green]'{api_url}'[bold white] with payload: {payload}"

REQUESTING_FREE_PROXY_LIST = lambda url: f"[bold green][INFO] [bold white]Scraping [bold green]free-proxy-list[bold white] at url:[bold green]'{url}'[bold white]"

FOUND_A_VALID_PROXY = lambda proxy: f"[bold green][INFO] [bold white]Found a valid proxy: [bold green]{proxy.proxy}[bold white]"

PROXIES_SAVED_IN_PATHS = lambda output_file_paths: "[bold green][INFO] [bold white]Proxies saved in the following files:{}".format("".join([f"\n\t[bold green]->[bold white] {path}" for path in output_file_paths]))

FETCHING_AND_VALIDATING_PROXIES_FROM_DATABASE = f"[bold green][INFO] [bold white]Fetching and validating proxies from the database"

FETCHED_PROXIES_FROM_THE_DATABASE = lambda count: f"[bold green][INFO] [bold white]Fetched [bold green]'{count}'[bold white] proxies from the database. Validating them ..."

VALIDATING_PROXIES_FROM_FILE = lambda proxies_count, proxy_file_path: f"[bold green][INFO] [bold white]Found [bold green]'{proxies_count}'[bold white] proxies from [bold green]'{proxy_file_path}'[bold white]. Validating them..."
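To show how these message lambdas and constants are meant to be consumed, a short hedged example; the service name, URL, and protocol values below are placeholders. The messages are simply passed to rich's Console.log, which renders the [bold ...] markup:

from rich.console import Console

from proxycrawler.messages import info, errors

console = Console()

# Placeholder values purely for demonstration
console.log(info.USING_SERVICE(service_name="example-service", service_url="https://example.com"))
console.log(errors.UNVALID_PROXY_PROTOCOL(protocol="ftp", protocols=["http", "https", "socks4", "socks5"]))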