diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..174478de1 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,16 @@ +## Proposed changes + + + +### Proof + + + +## Checklist + + + +- [ ] Pull request is created against the [dev](https://github.com/projectdiscovery/httpx/tree/dev) branch +- [ ] All checks passed (lint, unit/integration/regression tests etc.) with my changes +- [ ] I have added tests that prove my fix is effective or that my feature works +- [ ] I have added necessary documentation (if appropriate) \ No newline at end of file diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index f58ebd805..11bda1298 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -15,12 +15,7 @@ jobs: steps: - uses: actions/checkout@v4 - uses: projectdiscovery/actions/setup/go@v1 - - name: Run golangci-lint - uses: golangci/golangci-lint-action@v5 - with: - version: latest - args: --timeout 5m - working-directory: . + - uses: projectdiscovery/actions/golangci-lint/v2@v1 build: name: Test Builds diff --git a/.github/workflows/functional-test.yml b/.github/workflows/functional-test.yml index cd1b90719..f712dddae 100644 --- a/.github/workflows/functional-test.yml +++ b/.github/workflows/functional-test.yml @@ -7,7 +7,7 @@ on: - '**.mod' workflow_dispatch: -jobs: +jobs: functional: name: Functional Test runs-on: ${{ matrix.os }} @@ -15,13 +15,10 @@ jobs: matrix: os: [ubuntu-latest, windows-latest, macOS-latest] steps: - - name: Set up Go - uses: actions/setup-go@v4 - with: - go-version: 1.21.x - - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 + + - uses: projectdiscovery/actions/setup/go@v1 - name: Functional Tests run: | diff --git a/.github/workflows/release-binary.yml b/.github/workflows/release-binary.yml index 884e0f4bb..b906129db 100644 --- a/.github/workflows/release-binary.yml +++ b/.github/workflows/release-binary.yml @@ -11,23 +11,20 @@ jobs: runs-on: ubuntu-latest-16-cores steps: - name: "Check out code" - uses: actions/checkout@v3 - with: + uses: actions/checkout@v4 + with: fetch-depth: 0 - - - name: "Set up Go" - uses: actions/setup-go@v4 - with: - go-version: 1.21.x - + + - uses: projectdiscovery/actions/setup/go@v1 + - name: "Create release on GitHub" uses: goreleaser/goreleaser-action@v4 - with: + with: args: "release --clean" version: latest workdir: . - env: + env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" SLACK_WEBHOOK: "${{ secrets.RELEASE_SLACK_WEBHOOK }}" DISCORD_WEBHOOK_ID: "${{ secrets.DISCORD_WEBHOOK_ID }}" - DISCORD_WEBHOOK_TOKEN: "${{ secrets.DISCORD_WEBHOOK_TOKEN }}" \ No newline at end of file + DISCORD_WEBHOOK_TOKEN: "${{ secrets.DISCORD_WEBHOOK_TOKEN }}" diff --git a/.github/workflows/release-test.yml b/.github/workflows/release-test.yml index c91518530..c163e9c72 100644 --- a/.github/workflows/release-test.yml +++ b/.github/workflows/release-test.yml @@ -12,15 +12,12 @@ jobs: runs-on: ubuntu-latest-16-cores steps: - name: "Check out code" - uses: actions/checkout@v3 - with: + uses: actions/checkout@v4 + with: fetch-depth: 0 - - name: Set up Go - uses: actions/setup-go@v4 - with: - go-version: 1.21.x - + - uses: projectdiscovery/actions/setup/go@v1 + - name: release test uses: goreleaser/goreleaser-action@v4 with: diff --git a/Dockerfile b/Dockerfile index 6cbe235c0..279594ac2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Base -FROM golang:1.24.5-alpine AS builder +FROM golang:1.25.7-alpine AS builder RUN apk add --no-cache git build-base gcc musl-dev WORKDIR /app diff --git a/README.md b/README.md index 98dce07a5..5dddd7379 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ # Installation Instructions -`httpx` requires **go >=1.24.0** to install successfully. Run the following command to get the repo: +`httpx` requires **go >=1.25.0** to install successfully. Run the following command to get the repo: ```sh go install -v github.com/projectdiscovery/httpx/cmd/httpx@latest @@ -98,42 +98,40 @@ INPUT: -im, -input-mode string mode of input file (burp) PROBES: - -sc, -status-code display response status-code - -cl, -content-length display response content-length - -ct, -content-type display response content-type - -location display response redirect location - -favicon display mmh3 hash for '/favicon.ico' file - -hash string display response body hash (supported: md5,mmh3,simhash,sha1,sha256,sha512) - -jarm display jarm fingerprint hash - -rt, -response-time display response time - -lc, -line-count display response body line count - -wc, -word-count display response body word count - -title display page title - -bp, -body-preview display first N characters of response body (default 100) - -server, -web-server display server name + -sc, -status-code display response status-code + -cl, -content-length display response content-length + -ct, -content-type display response content-type + -location display response redirect location + -favicon display mmh3 hash for '/favicon.ico' file + -hash string display response body hash (supported: md5,mmh3,simhash,sha1,sha256,sha512) + -jarm display jarm fingerprint hash + -rt, -response-time display response time + -lc, -line-count display response body line count + -wc, -word-count display response body word count + -title display page title + -bp, -body-preview display first N characters of response body (default 100) + -server, -web-server display server name -td, -tech-detect display technology in use based on wappalyzer dataset -cff, -custom-fingerprint-file string path to a custom fingerprint file for technology detection - -cpe display CPE (Common Platform Enumeration) based on awesome-search-queries - -wp, -wordpress display WordPress plugins and themes -method display http request method - -ws, -websocket display server using websocket - -ip display host ip - -cname display host cname - -extract-fqdn, -efqdn get domain and subdomains from response body and header in jsonl/csv output - -asn display host asn information - -cdn display cdn/waf in use (default true) - -probe display probe status + -ws, -websocket display server using websocket + -ip display host ip + -cname display host cname + -extract-fqdn, -efqdn get domain and subdomains from response body and header in jsonl/csv output + -asn display host asn information + -cdn display cdn/waf in use (default true) + -probe display probe status HEADLESS: -ss, -screenshot enable saving screenshot of the page using headless browser -system-chrome enable using local installed chrome for screenshot -ho, -headless-options string[] start headless chrome with additional options -esb, -exclude-screenshot-bytes enable excluding screenshot bytes from json output - -no-screenshot-full-page disable saving full page screenshot -ehb, -exclude-headless-body enable excluding headless header from json output + -no-screenshot-full-page disable saving full page screenshot -st, -screenshot-timeout value set timeout for screenshot in seconds (default 10s) -sid, -screenshot-idle value set idle time before taking screenshot in seconds (default 1s) - -jsc, -javascript-code string[] execute JavaScript code after navigation + -jsc, -javascript-code string[] execute JavaScript code after navigation MATCHERS: -mc, -match-code string match response with specified status code (-mc 200,302) @@ -143,7 +141,7 @@ MATCHERS: -mfc, -match-favicon string[] match response with specified favicon hash (-mfc 1494302000) -ms, -match-string string[] match response with specified string (-ms admin) -mr, -match-regex string[] match response with specified regex (-mr admin) - -mcdn, -match-cdn string[] match host with specified cdn provider (cloudfront, fastly, google) + -mcdn, -match-cdn string[] match host with specified cdn provider (cloudfront, fastly, google, etc.) -mrt, -match-response-time string match response with specified response time in seconds (-mrt '< 1') -mdc, -match-condition string match response with dsl expression condition @@ -152,19 +150,22 @@ EXTRACTOR: -ep, -extract-preset string[] display response content matched by a pre-defined regex (url,ipv4,mail) FILTERS: - -fc, -filter-code string filter response with specified status code (-fc 403,401) - -fep, -filter-error-page filter response with ML based error page detection - -fd, -filter-duplicates filter out near-duplicate responses (only first response is retained) - -fl, -filter-length string filter response with specified content length (-fl 23,33) - -flc, -filter-line-count string filter response body with specified line count (-flc 423,532) - -fwc, -filter-word-count string filter response body with specified word count (-fwc 423,532) - -ffc, -filter-favicon string[] filter response with specified favicon hash (-ffc 1494302000) - -fs, -filter-string string[] filter response with specified string (-fs admin) - -fe, -filter-regex string[] filter response with specified regex (-fe admin) - -fcdn, -filter-cdn string[] filter host with specified cdn provider (cloudfront, fastly, google) - -frt, -filter-response-time string filter response with specified response time in seconds (-frt '> 1') - -fdc, -filter-condition string filter response with dsl expression condition - -strip strips all tags in response. supported formats: html,xml (default html) + -fc, -filter-code string filter response with specified status code (-fc 403,401) + -fpt, -filter-page-type string[] filter response with specified page type (e.g. -fpt login,captcha,parked) + -fep, -filter-error-page [DEPRECATED: use -fpt] filter response with ML based error page detection + -fd, -filter-duplicates filter out near-duplicate responses (only first response is retained) + -fl, -filter-length string filter response with specified content length (-fl 23,33) + -flc, -filter-line-count string filter response body with specified line count (-flc 423,532) + -fwc, -filter-word-count string filter response body with specified word count (-fwc 423,532) + -ffc, -filter-favicon string[] filter response with specified favicon hash (-ffc 1494302000) + -fs, -filter-string string[] filter response with specified string (-fs admin) + -fe, -filter-regex string[] filter response with specified regex (-fe admin) + -fcdn, -filter-cdn string[] filter host with specified cdn provider (cloudfront, fastly, google, etc.) + -frt, -filter-response-time string filter response with specified response time in seconds (-frt '> 1') + -fdc, -filter-condition string filter response with dsl expression condition + -strip strips all tags in response. supported formats: html,xml (default html) + -lof, -list-output-fields list of fields to output (comma separated) + -eof, -exclude-output-fields string[] exclude output fields output based on a condition RATE-LIMIT: -t, -threads int number of threads to use (default 50) @@ -202,10 +203,16 @@ OUTPUT: -include-chain include redirect http chain in JSON output (-json only) -store-chain include http redirect chain in responses (-sr only) -svrc, -store-vision-recon-cluster include visual recon clusters (-ss and -sr only) - -pr, -protocol string protocol to use (unknown, http11) + -pr, -protocol string protocol to use (unknown, http11, http2, http3) -fepp, -filter-error-page-path string path to store filtered error pages (default "filtered_error_page.json") - -lof, -list-output-fields list available output field names for filtering - -eof, -exclude-output-fields string[] exclude specified output fields from results + -rdb, -result-db store results in database + -rdbc, -result-db-config string path to database config file + -rdbt, -result-db-type string database type (mongodb, postgres, mysql) + -rdbcs, -result-db-conn string database connection string (env: HTTPX_DB_CONNECTION_STRING) + -rdbn, -result-db-name string database name (default "httpx") + -rdbtb, -result-db-table string table/collection name (default "results") + -rdbbs, -result-db-batch-size int batch size for database inserts (default 100) + -rdbor, -result-db-omit-raw omit raw request/response data from database CONFIGURATIONS: -config string path to the httpx configuration file (default $HOME/.config/httpx/config.yaml) @@ -214,9 +221,9 @@ CONFIGURATIONS: -deny string[] denied list of IP/CIDR's to process (file or comma separated) -sni, -sni-name string custom TLS SNI name -random-agent enable Random User-Agent to use (default true) - -auto-referer set the Referer header to the current URL (default false) + -auto-referer set the Referer header to the current URL -H, -header string[] custom http headers to send with request - -http-proxy, -proxy string http proxy to use (eg http://127.0.0.1:8080) + -http-proxy, -proxy string proxy (http|socks) to use (eg http://127.0.0.1:8080) -unsafe send raw requests skipping golang normalization -resume resume scan using resume.cfg -fr, -follow-redirects follow http redirects @@ -252,14 +259,14 @@ DEBUG: OPTIMIZATIONS: -nf, -no-fallback display both probed protocol (HTTPS and HTTP) - -nfs, -no-fallback-scheme probe with protocol scheme specified in input + -nfs, -no-fallback-scheme probe with protocol scheme specified in input -maxhr, -max-host-error int max error count per host before skipping remaining path/s (default 30) -e, -exclude string[] exclude host matching specified filter ('cdn', 'private-ips', cidr, ip, regex) -retries int number of retries -timeout int timeout in seconds (default 10) -delay value duration between each http request (eg: 200ms, 1s) (default -1ns) - -rsts, -response-size-to-save int max response size to save in bytes (default 2147483647) - -rstr, -response-size-to-read int max response size to read in bytes (default 2147483647) + -rsts, -response-size-to-save int max response size to save in bytes (default 512000000) + -rstr, -response-size-to-read int max response size to read in bytes (default 512000000) CLOUD: -auth configure projectdiscovery cloud (pdcp) api key (default true) diff --git a/cmd/httpx/httpx.go b/cmd/httpx/httpx.go index 77e54f4c4..5af2a0d9b 100644 --- a/cmd/httpx/httpx.go +++ b/cmd/httpx/httpx.go @@ -10,6 +10,7 @@ import ( "github.com/logrusorgru/aurora" "github.com/projectdiscovery/gologger" + "github.com/projectdiscovery/httpx/internal/db" "github.com/projectdiscovery/httpx/internal/pdcp" "github.com/projectdiscovery/httpx/runner" pdcpauth "github.com/projectdiscovery/utils/auth/pdcp" @@ -64,6 +65,9 @@ func main() { // setup optional asset upload _ = setupOptionalAssetUpload(options) + // setup optional database output + _ = setupDatabaseOutput(options) + httpxRunner, err := runner.New(options) if err != nil { gologger.Fatal().Msgf("Could not create runner: %s\n", err) @@ -73,21 +77,26 @@ func main() { c := make(chan os.Signal, 1) signal.Notify(c, os.Interrupt) go func() { - for range c { - gologger.Info().Msgf("CTRL+C pressed: Exiting\n") - httpxRunner.Close() - if options.ShouldSaveResume() { - gologger.Info().Msgf("Creating resume file: %s\n", runner.DefaultResumeFile) - err := httpxRunner.SaveResumeConfig() - if err != nil { - gologger.Error().Msgf("Couldn't create resume file: %s\n", err) - } - } - os.Exit(1) - } + // First Ctrl+C: stop dispatching, let in-flight requests finish + <-c + gologger.Info().Msgf("CTRL+C pressed: Exiting\n") + httpxRunner.Interrupt() + // Second Ctrl+C: force exit + <-c + gologger.Info().Msgf("Forcing exit\n") + os.Exit(1) }() httpxRunner.RunEnumeration() + + if httpxRunner.IsInterrupted() && options.ShouldSaveResume() { + gologger.Info().Msgf("Creating resume file: %s\n", runner.DefaultResumeFile) + err := httpxRunner.SaveResumeConfig() + if err != nil { + gologger.Error().Msgf("Couldn't create resume file: %s\n", err) + } + } + httpxRunner.Close() } @@ -143,3 +152,62 @@ func setupOptionalAssetUpload(opts *runner.Options) *pdcp.UploadWriter { } return writer } + +// setupDatabaseOutput sets up database output for storing results +// This is optional and only initialized when explicitly enabled via -rdb flag +func setupDatabaseOutput(opts *runner.Options) *db.Writer { + if !opts.ResultDatabase { + return nil + } + + var cfg *db.Config + var err error + + if opts.ResultDatabaseConfig != "" { + // Load configuration from file + cfg, err = db.LoadConfigFromFile(opts.ResultDatabaseConfig) + if err != nil { + gologger.Fatal().Msgf("Could not load database config: %s\n", err) + } + } else { + // Build configuration from CLI options + dbOpts := &db.Options{ + Enabled: opts.ResultDatabase, + Type: opts.ResultDatabaseType, + ConnectionString: opts.ResultDatabaseConnStr, + DatabaseName: opts.ResultDatabaseName, + TableName: opts.ResultDatabaseTable, + BatchSize: opts.ResultDatabaseBatchSize, + OmitRaw: opts.ResultDatabaseOmitRaw, + } + cfg, err = dbOpts.ToConfig() + if err != nil { + gologger.Fatal().Msgf("Invalid database configuration: %s\n", err) + } + } + + writer, err := db.NewWriter(context.Background(), cfg) + if err != nil { + gologger.Fatal().Msgf("Could not setup database output: %s\n", err) + } + + // Chain with existing OnResult callback if present + existingCallback := opts.OnResult + opts.OnResult = func(r runner.Result) { + if existingCallback != nil { + existingCallback(r) + } + writer.GetWriterCallback()(r) + } + + // Chain with existing OnClose callback if present + existingClose := opts.OnClose + opts.OnClose = func() { + writer.Close() + if existingClose != nil { + existingClose() + } + } + + return writer +} diff --git a/common/pagetypeclassifier/clf.gob b/common/pagetypeclassifier/clf.gob deleted file mode 100644 index 7624ee775..000000000 Binary files a/common/pagetypeclassifier/clf.gob and /dev/null differ diff --git a/common/pagetypeclassifier/dataset.txt b/common/pagetypeclassifier/dataset.txt deleted file mode 100644 index c55ab2efb..000000000 --- a/common/pagetypeclassifier/dataset.txt +++ /dev/null @@ -1,380 +0,0 @@ -The Forum page seems to have a glitch. Our technicians are on it.||error -There was a problem with the Product Details page. Try reloading.||error -Error 500: The E-books page is experiencing a problem.||error -Unfortunately, the Video Tutorials page is down for maintenance.||error -Our Archive page is currently unavailable. We apologize for the inconvenience.||error -We're having trouble loading the Membership Details page.||error -An error occurred while trying to access the Profile Settings page.||error -Error 404: The Team page could not be found.||error -Our Project Highlights page seems to be having some technical issues.||error -We're sorry, but we can't seem to find the Donations page.||error -You've landed on our Forum page. Engage in interesting discussions.||nonerror -Welcome to the Product Details page. Learn more about our products here.||nonerror -You are now on our E-books page. Enjoy a wealth of knowledge.||nonerror -This is the Video Tutorials page. Learn with our easy-to-follow videos.||nonerror -Welcome to our Archive. Dive into our rich history.||nonerror -You're now on the Membership Details page. See the benefits of joining us.||nonerror -This is your Profile Settings page. Update your personal details as needed.||nonerror -You're on the Team page. Meet the people behind our organization.||nonerror -Welcome to our Project Highlights page. See what we've been up to.||nonerror -You've landed on the Donations page. Every contribution helps us do more.||nonerror -500 - Server Error This is highly unusual! Our tech team have been notified and are working on it.||error -Sorry this page is currently under maintenance.||error -Access Denied - You don't have permission to access this page.||error -This page seems to be missing 404 Error!||error -Sorry something went wrong. Please try again later.||error -We're sorry this page could not be found 404.||error -The page you requested could not be found on our site.||error -500 - Internal server error. There is a problem with the resource you are looking for and it cannot be displayed.||error -Error 401 Unauthorized: Access is denied due to invalid credentials.||error -Bad request 400. Your browser sent a request that this server could not understand.||error -This is a 404 error page||error -Sorry this page does not exist||error -Error 500: Internal Server Error||error -Oops! That page can’t be found.Try searching from the field above or go to the home page.||error -An error has occurred while processing your request. It happens to the best of us! Don't worry! There are no bugs without a fix! Let's try again! What were you looking for? If you are an adventurer search this site! If difficulties persist please contact the website administrator and report the error below. 404 Page not found||error -Whoops our bad... The page you requested was not found and we have a fine guess why. If you typed the URL directly please make sure the spelling is correct. If you clicked on a link to get here the link is outdated. What can you do? Have no fear help is near! There are many ways you can get back on track with Magento Store. Go back to the previous page. Use the search bar at the top of the page to search for your products. Follow these links to get you back on track! Store Home My Account||error -404 - Page not found Unfortunately the requested page could not be found.||error -PAGE NOT FOUND The page you're looking for doesn't seem to exist anymore… Return to the homepage||error -Who moved my... lemon? Oh no - looks like we can't find the page you are looking for. But you know the saying; when life gives you lemons... okay we can't find a clever way to end that sentence but we do have 2 suggestions to help you find what you were looking for: Go to the front page Or Search for a specific topic If something you need really is missing we would love it if you would let us know ❤️️||error -404—page not found||error -Apologies but there's a 503 Service Unavailable error. The server cannot handle the request.||error -Sorry you don't have access rights to this page. Error 403: Forbidden.||error -404 - Oops! The page you are looking for has been misplaced.||error -Sorry the server encountered an unexpected condition that prevented it from fulfilling the request. Error 500: Internal Server Error.||error -Whoa! The page you're looking for seems to have vanished. Error 404.||error -Sorry this page has moved or doesn't exist anymore. Error 404.||error -Sorry but your request timed out. Please try again. Error 504: Gateway Timeout.||error -We're sorry but an unknown error occurred while processing your request.||error -Error 502: Bad Gateway. The server encountered a temporary error and could not complete your request.||error -The requested resource could not be found on this server. Please verify your request and try again. Error 404.||error -This Help Center page is temporarily unavailable.||error -Privacy Policy page not found. Please try again later.||error -There seems to be an error on our Services page. We're working to fix it.||error -An error occurred while loading the Search Results page.||error -Category page not found. It might have been removed or relocated.||error -There was a problem loading the Cart page. Please try again.||error -Our Terms of Service page is currently down for maintenance.||error -We're sorry, but the Sitemap is not available at the moment.||error -We're having trouble loading the Reviews page.||error -An error occurred while trying to access the Partners page.||error -Settings page is currently unavailable. We apologize for the inconvenience.||error -Error 404: Resources page not found.||error -Our Press Releases page seems to be having some technical issues.||error -We're sorry, but we can't seem to find the Case Studies page.||error -There was a problem loading the Community page. Please refresh the page.||error -Error 503: The Subscriptions page is temporarily unavailable.||error -There's a problem with our Customer Support page. We're on it.||error -We're having trouble finding the Notifications page. It may have been moved.||error -There was a problem with the Feedback page. Try again later.||error -Our Transactions page is currently experiencing some issues. We appreciate your patience.||error -Your request has been successfully submitted.||nonerror -You have successfully logged out.||nonerror -Congratulations on successfully completing the course!||nonerror -The payment has been processed successfully.||nonerror -Thank you for your feedback!||nonerror -Your download will start shortly.||nonerror -Profile updated successfully.||nonerror -Thanks for contacting us! We'll get back to you as soon as possible.||nonerror -Sign-up successful. Welcome to our community!||nonerror -Your booking has been confirmed. Check your email for details.||nonerror -Welcome! Your registration was successful.||nonerror -Congratulations! You've successfully updated your profile.||nonerror -Great! Your order was placed successfully. We'll send you an email confirmation soon.||nonerror -Welcome back! Your login was successful.||nonerror -Success! You've added the item to your cart.||nonerror -Your request was sent successfully. We'll get back to you as soon as possible.||nonerror -Great job! Your settings have been saved.||nonerror -Your message has been submitted successfully. We appreciate your feedback.||nonerror -Thank you for subscribing to our newsletter!||nonerror -Great news! Your transaction was successful.||nonerror -Welcome to our homepage. Feel free to browse around||nonerror -Thanks for signing up! You're now a registered user.||nonerror -Your order has been placed successfully! You'll receive a confirmation email shortly||nonerror -Congratulations your account has been successfully created||nonerror -Thank you for your inquiry. We will respond to your message within 24 hours||nonerror -You've successfully added the item to your cart!||nonerror -Success! Your password has been updated||nonerror -Welcome back! You have successfully logged in||nonerror -Great job! Your profile has been updated||nonerror -Your message was sent successfully. We'll get back to you shortly||nonerror -Welcome to our website. Explore and enjoy our services.||nonerror -Thank you for visiting our About Us page. Learn more about our journey and team.||nonerror -You are now browsing our Products page. Check out our latest offerings.||nonerror -This is our Contact Us page. Feel free to reach out with any queries or feedback.||nonerror -You have reached the end of the page. Scroll up to continue browsing.||nonerror -Welcome to the News section. Stay updated with our latest announcements.||nonerror -Now viewing: Image Gallery. Enjoy a visual tour of our activities.||nonerror -You're on our FAQ page. Get answers to common questions.||nonerror -Welcome to the Blog section. Engage with our thoughts and insights.||nonerror -This is the Discussion Forum. Join in, ask questions, or help others.||nonerror -You're on the Login page. Enter your credentials to access your account.||login -Welcome to the Sign-Up page. Join our community today.||nonerror -This is your User Dashboard. Manage your account and settings here.||nonerror -You've reached the Checkout page. Review your order and proceed to payment.||nonerror -Welcome to the Download section . Access our digital resources here.||nonerror -This is the Careers page. Explore job opportunities with us.||nonerror -You're viewing the Events Calendar. Keep track of upcoming activities.||nonerror -This is the User Profile page. Update your information as needed.||nonerror -Welcome to our Testimonials page. Read reviews and stories from our users.||nonerror -You are now on the Home page. Start exploring from here.||nonerror -Welcome to home page||nonerror -You're now on our Help Center page. Find answers to common questions here.||nonerror -Welcome to our Privacy Policy page. Learn how we protect your personal information.||nonerror -You've landed on the Services page. Explore what we have to offer.||nonerror -This is the Search Results page. Did you find what you were looking for?||nonerror -Now browsing the Category page. View all items in this category.||nonerror -You're now on the Cart page. Review your selections before proceeding to checkout.||nonerror -Welcome to our Terms of Service page. Understand our conditions for providing services.||nonerror -You are currently on our Sitemap. Navigate our website with ease.||nonerror -You are on the Reviews page. Check out what others have to say about us.||nonerror -Now viewing the Partners page. Meet the organizations we collaborate with.||nonerror -You're on the Settings page. Customize your user experience.||nonerror -This is our Resources page. Access useful documents and guides.||nonerror -You've landed on the Press Releases page. Stay updated with our latest news.||nonerror -Welcome to our Case Studies page. Discover our past projects and achievements.||nonerror -You're now on the Community page. Connect and interact with other members.||nonerror -You are currently on the Subscriptions page. Manage your preferences here.||nonerror -Now viewing the Customer Support page. We're here to help.||nonerror -This is the Notifications page. Keep track of your updates and alerts.||nonerror -You've landed on the Feedback page. Share your thoughts with us.||nonerror -Welcome to the Transactions page. Monitor your past and current transactions.||nonerror -500 - Server Error This is highly unusual! Our tech team have been notified and are working on it.||error -Sorry this page is currently under maintenance.||error -Access Denied - You don't have permission to access this page.||error -This page seems to be missing 404 Error!||error -Sorry something went wrong. Please try again later.||error -We're sorry this page could not be found 404.||error -The page you requested could not be found on our site.||error -500 - Internal server error. There is a problem with the resource you are looking for and it cannot be displayed.||error -Error 401 Unauthorized: Access is denied due to invalid credentials.||error -Bad request 400. Your browser sent a request that this server could not understand.||error -Your request has been successfully submitted.||nonerror -You have successfully logged out.||nonerror -Congratulations on successfully completing the course!||nonerror -The payment has been processed successfully.||nonerror -Thank you for your feedback!||nonerror -Your download will start shortly.||nonerror -Profile updated successfully.||nonerror -Thanks for contacting us! We'll get back to you as soon as possible.||nonerror -Sign-up successful. Welcome to our community!||nonerror -Your booking has been confirmed. Check your email for details.||nonerror -This is a 404 error page||error -Sorry this page does not exist||error -Error 500: Internal Server Error||error -Oops! That page can’t be found.Try searching from the field above or go to the home page.||error -An error has occurred while processing your request. It happens to the best of us! Don't worry! There are no bugs without a fix! Let's try again! What were you looking for? If you are an adventurer search this site! If difficulties persist please contact the website administrator and report the error below. 404 Page not found||error -Whoops our bad... The page you requested was not found and we have a fine guess why. If you typed the URL directly please make sure the spelling is correct. If you clicked on a link to get here the link is outdated. What can you do? Have no fear help is near! There are many ways you can get back on track with Magento Store. Go back to the previous page. Use the search bar at the top of the page to search for your products. Follow these links to get you back on track! Store Home | My Account||error -404 - Page not found Unfortunately the requested page could not be found.||error -PAGE NOT FOUND The page you're looking for doesn't seem to exist anymore… Return to the homepage||error -Who moved my... lemon? Oh no - looks like we can't find the page you are looking for. But you know the saying; when life gives you lemons... okay we can't find a clever way to end that sentence but we do have 2 suggestions to help you find what you were looking for: Go to the front page Or Search for a specific topic If something you need really is missing we would love it if you would let us know ❤️️||error -404—page not found||error -Apologies but there's a 503 Service Unavailable error. The server cannot handle the request.||error -Sorry you don't have access rights to this page. Error 403: Forbidden.||error -404 - Oops! The page you are looking for has been misplaced.||error -Sorry the server encountered an unexpected condition that prevented it from fulfilling the request. Error 500: Internal Server Error.||error -Whoa! The page you're looking for seems to have vanished. Error 404.||error -Sorry this page has moved or doesn't exist anymore. Error 404.||error -Sorry but your request timed out. Please try again. Error 504: Gateway Timeout.||error -We're sorry but an unknown error occurred while processing your request.||error -Error 502: Bad Gateway. The server encountered a temporary error and could not complete your request.||error -The requested resource could not be found on this server. Please verify your request and try again. Error 404.||error -Welcome! Your registration was successful.||nonerror -Congratulations! You've successfully updated your profile.||nonerror -Great! Your order was placed successfully. We'll send you an email confirmation soon.||nonerror -Welcome back! Your login was successful.||nonerror -Success! You've added the item to your cart.||nonerror -Your request was sent successfully. We'll get back to you as soon as possible.||nonerror -Great job! Your settings have been saved.||nonerror -Your message has been submitted successfully. We appreciate your feedback.||nonerror -Thank you for subscribing to our newsletter!||nonerror -Great news! Your transaction was successful.||nonerror -Welcome to our homepage. Feel free to browse around||nonerror -Thanks for signing up! You're now a registered user.||nonerror -Your order has been placed successfully! You'll receive a confirmation email shortly||nonerror -Congratulations your account has been successfully created||nonerror -Thank you for your inquiry. We will respond to your message within 24 hours||nonerror -You've successfully added the item to your cart!||nonerror -Success! Your password has been updated||nonerror -Welcome back! You have successfully logged in||nonerror -Great job! Your profile has been updated||nonerror -Your message was sent successfully. We'll get back to you shortly||nonerror -Welcome to the Login page. Please sign in to continue.||login -Please enter your username and password on the login page.||login -You have reached the login page. Access your account by logging in.||login -Login required. Please authenticate to access this page.||login -Welcome back! Please log in to your account.||login -Sign in to your account on this login page.||login -Secure Login: Enter your credentials to proceed.||login -This is the login page. Please enter your email and password.||login -Access denied. Please log in to continue.||login -You're on the login page. Forgot your password? Click here to reset.||login -User Login: Please provide your username and password.||login -Login to your account to access exclusive features.||login -Authentication required. Please log in.||login -Welcome back! Sign in to access your dashboard.||login -Please log in to proceed to the checkout page.||login -Member login: Enter your credentials below.||login -Staff login portal. Please enter your login details.||login -Customer login: Sign in to view your orders.||login -Partner login: Please authenticate to access partner resources.||login -Administrator login page. Enter your admin credentials.||login -Please log in to access your profile settings.||login -Login successful. Redirecting to your account dashboard.||login -Incorrect password. Please try again.||login -Session expired. Please log in again.||login -Welcome to the secure login page. Your privacy is important to us.||login -Access restricted. Please log in to view this content.||login -Please log in to access the members-only area.||login -Sign in with your social media account on the login page.||login -New user? Register here or log in if you already have an account.||login -Log in to participate in the forum discussions.||login -Access your account by logging in here.||login -Please log in to access your personalized dashboard.||login -Enter your login details to continue.||login -Login Page: Securely enter your credentials.||login -Welcome to the user login portal.||login -Sign in to manage your account settings.||login -This is the login screen. Please authenticate.||login -Returning user? Please log in.||login -Please log in to view your messages.||login -Log in to access premium content.||login -Authentication page: Enter your username and password.||login -Please enter your login information to proceed.||login -User authentication required. Please log in.||login -Log in now to unlock exclusive features.||login -Sign in to check your account balance.||login -Welcome back! Please enter your login credentials.||login -Member login area: Access restricted content by logging in.||login -Please sign in to continue to your profile.||login -Staff members, please log in to access internal resources.||login -Enter your email and password to log in.||login -Login required to view this page. Please sign in.||login -Access your profile by logging into your account.||login -Please provide your login credentials to access the system.||login -Log in to track your order status.||login -Welcome to the employee login page.||login -Secure area: Please log in to continue.||login -Please log in to update your preferences.||login -Sign in to access your learning materials.||login -Please authenticate to proceed to the next step.||login -Login Page: Your session has expired, please log in again.||login -Welcome back! Enter your credentials to sign in.||login -Client login: Access your project details here.||login -Agent login portal: Please sign in with your ID.||login -Enter your user ID and password to log in.||login -Log in to view your subscription details.||login -Login Page: Forgot your password? Click here to reset it.||login -Access restricted to authorized users only. Please log in.||login -Vendor login: Manage your listings by logging in.||login -Please log in to access your saved items.||login -Log in to participate in our online courses.||login -Sign in to view your appointment schedule.||login -Welcome to the admin login page.||login -Please enter your credentials to log in securely.||login -Log in to view your recent activities.||login -Authentication needed. Please sign in to proceed.||login -Member login: Keep me signed in checkbox available.||login -Log in with your email or username.||login -Access your account dashboard by logging in.||login -Sign in to post comments on articles.||login -Please log in to access your billing information.||login -Log in to access your personalized recommendations.||login -Please sign in to view your shopping cart.||login -Enter your credentials to log in and start shopping.||login -Welcome to the customer login page. Sign in to continue.||login -Authentication required. Please log in with your secure ID.||login -Log in to access exclusive member discounts.||login -Please log in to view and manage your wishlist.||login -Sign in to access your event tickets and details.||login -Faculty login: Please enter your staff ID and password.||login -Log in to access your investment portfolio.||login -Access your medical records by logging in securely.||login -Please sign in to continue to the payment gateway.||login -Login required to access your order history.||login -Welcome back! Log in to resume your session.||login -Please log in to submit your application.||login -Enter your username and password to log in to the portal.||login -Student login: Access your course materials by signing in.||login -Log in to customize your news feed preferences.||login -Please authenticate to access your secure messages.||login -Sign in to sync your data across devices.||login -Log in to join the live webinar.||login -Please log in to access your reservation details.||login -Welcome to the supplier login page.||login -Log in to access your support tickets.||login -Enter your credentials to log in and view analytics.||login -Please sign in to access developer resources.||login -Login required to view confidential documents.||login -Log in to participate in the survey.||login -Please authenticate to access the admin dashboard.||login -Sign in to view your loyalty points balance.||login -Log in to manage your email subscriptions.||login -Please log in to proceed with the enrollment process.||login -Access your download history by logging in.||login -Welcome back! Please log in to renew your membership.||login -Enter your employee ID to log in to the time tracking system.||login -Log in to update your security settings.||login -Please sign in to access your saved searches.||login -Authentication required for accessing project files.||login -Log in to collaborate with your team members.||login -Please enter your PIN and password to log in.||login -Sign in to access your fitness progress dashboard.||login -Log in to check your test results.||login -Please log in to schedule your appointments.||login -Welcome to the volunteer login page.||login -Log in to view your donation history.||login -Please authenticate to access the control panel.||login -Sign in to review and accept your job offer.||login -Log in to access premium tutorials and guides.||login -Please log in to manage your API keys.||login -Please log in with your email address and password.||login -Enter your username and password to access your account.||login -Sign in to your account using your email and password.||login -Welcome back! Please enter your login credentials.||login -Email address: [input field] Password: [input field]||login -Login to your account. Don't have one? Sign up now.||login -Username: [input field] Password: [input field] Remember me?||login -Forgot your password? Click here to reset it.||login -Please enter your email and password to continue.||login -Secure login portal. Enter credentials below.||login -Access your account by logging in below.||login -Remember me on this device.||login -Login required. Please sign in to proceed.||login -Forgot password? Reset it here.||login -Sign in with your email address and password.||login -Welcome! Please log in to your account.||login -User login: Enter your email and password.||login -Email: [input field] Password: [input field]||login -Please authenticate by entering your login details.||login -Sign in to your account or register for a new one.||login -Login page: Access restricted to authorized users only.||login -Need help logging in? Click here.||login -Enter your credentials to log in.||login -Keep me signed in.||login -Please sign in to access exclusive content.||login -Welcome back! Sign in to your dashboard.||login -Forgot your username or password? Retrieve them here.||login -Log in using your email or username.||login -Authentication required. Please log in.||login -Password recovery: Reset your password now.||login -Log in to manage your account settings.||login -Sign in to continue to checkout.||login -Enter your login information below.||login -Sign in to access your personalized dashboard.||login -Welcome to the member login page.||login -Already have an account? Log in here.||login -Enter email and password to sign in.||login -Sign in to your profile.||login -Member login: Access your account here.||login -Please log in to continue.||login -Enter your password to log in.||login -Sign in to view your messages.||login -Login to your profile to see updates.||login -Log in to your account to access features.||login -Please provide your username and password.||login -Log in to manage your subscriptions.||login -Sign in using your credentials.||login -Access denied. Please log in first.||login -Authentication portal. Enter login details.||login -Need an account? Sign up or log in if you already have one.||login diff --git a/common/pagetypeclassifier/pagetypeclassifier.go b/common/pagetypeclassifier/pagetypeclassifier.go deleted file mode 100644 index 5e333b1c6..000000000 --- a/common/pagetypeclassifier/pagetypeclassifier.go +++ /dev/null @@ -1,156 +0,0 @@ -package pagetypeclassifier - -import ( - _ "embed" - "fmt" - "strings" - "sync" - - htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2" - "github.com/microcosm-cc/bluemonday" - "github.com/projectdiscovery/utils/ml/naive_bayes" -) - -//go:embed clf.gob -var classifierData []byte - -type PageTypeClassifier struct { - classifier *naive_bayes.NaiveBayesClassifier -} - -func New() (*PageTypeClassifier, error) { - classifier, err := naive_bayes.NewClassifierFromFileData(classifierData) - if err != nil { - return nil, err - } - return &PageTypeClassifier{classifier: classifier}, nil -} - -func (n *PageTypeClassifier) Classify(html string) string { - text, err := htmlToText(html) - if err != nil || text == "" { - return "other" - } - return n.classifier.Classify(text) -} - -var ( - // sanitizerPolicy is an aggressive bluemonday policy that strips most HTML - // to reduce nesting depth and prevent parser stack overflow - sanitizerPolicy *bluemonday.Policy - sanitizerPolicyOnce sync.Once -) - -// getSanitizerPolicy returns an ultra-aggressive HTML sanitizer policy that strips -// almost all elements to minimize nesting depth and prevent parser stack overflow. -func getSanitizerPolicy() *bluemonday.Policy { - sanitizerPolicyOnce.Do(func() { - p := bluemonday.NewPolicy() - // Ultra-aggressive policy: Allow only the most basic text elements - // to minimize nesting and reduce parser stack depth - p.AllowElements("p", "br", "h1", "h2", "h3", "h4", "h5", "h6") - p.AllowElements("strong", "em", "b", "i") - // Remove div, span, ul, ol, li as they can create deep nesting - // No attributes allowed to prevent style-based nesting issues - sanitizerPolicy = p - }) - return sanitizerPolicy -} - -// htmlToText safely converts HTML to text with multiple fallback strategies. -// The 512 node limit in golang.org/x/net/html is hardcoded and cannot be increased. -// Strategy: -// 1. Length limit the input HTML to prevent massive documents -// 2. Sanitize HTML aggressively with bluemonday to reduce nesting -// 3. Convert sanitized HTML to markdown with panic recovery -// 4. If conversion fails, fallback to plain text extraction -func htmlToText(html string) (text string, err error) { - defer func() { - if r := recover(); r != nil { - err = fmt.Errorf("html parser panic: %v", r) - text = "" - } - }() - - // Limit input size to prevent processing extremely large HTML documents - const maxHTMLSize = 1024 * 1024 // 1MB limit - if len(html) > maxHTMLSize { - html = html[:maxHTMLSize] - } - - // First, sanitize HTML with ultra-aggressive bluemonday policy - sanitizedHTML := getSanitizerPolicy().Sanitize(html) - - // If sanitization failed or produced empty result, try plain text fallback - if sanitizedHTML == "" { - return extractPlainText(html), nil - } - - // Convert sanitized HTML to markdown - text, err = htmltomarkdown.ConvertString(sanitizedHTML) - if err != nil { - // If markdown conversion fails, fallback to plain text extraction - return extractPlainText(sanitizedHTML), nil - } - - if text == "" { - // If result is empty, try plain text fallback - return extractPlainText(sanitizedHTML), nil - } - - return text, nil -} - -// extractPlainText is a simple fallback that extracts text content without HTML parsing -// This is used when the HTML parser fails due to complexity or nesting depth -func extractPlainText(html string) string { - // Simple regex-based text extraction as fallback - // Remove script and style tags first - text := html - - // Remove script tags and content - for { - start := strings.Index(text, " - -
-Some important content here
-