projectdiscovery · Mzack9999 · Jul 19, 2023 · Jun 20, 2023 · Jun 22, 2023 · Jun 22, 2023
diff --git a/README.md b/README.md
@@ -131,13 +131,14 @@ EXTRACTOR:
 
 FILTERS:
    -fc, -filter-code string            filter response with specified status code (-fc 403,401)
+   -fep, -filter-error-page            filter response with ML based error page detection
    -fl, -filter-length string          filter response with specified content length (-fl 23,33)
    -flc, -filter-line-count string     filter response body with specified line count (-flc 423,532)
    -fwc, -filter-word-count string     filter response body with specified word count (-fwc 423,532)
    -ffc, -filter-favicon string[]      filter response with specified favicon hash (-ffc 1494302000)
    -fs, -filter-string string          filter response with specified string (-fs admin)
    -fe, -filter-regex string           filter response with specified regex (-fe admin)
-   -fcdn, -filter-cdn string[]         filter host with specified cdn provider (incapsula, oracle, google, azure, cloudflare, cloudfront, fastly, akamai, sucuri, leaseweb)
+   -fcdn, -filter-cdn string[]         filter host with specified cdn provider (google, leaseweb, stackpath, cloudfront, fastly)
    -frt, -filter-response-time string  filter response with specified response time in seconds (-frt '> 1')
    -fdc, -filter-condition string      filter response with dsl expression condition
 
@@ -343,6 +344,26 @@ https://support.hackerone.com [301,302,301,200] [HackerOne] [Cloudflare,Ruby on
 https://resources.hackerone.com [301,301,404] [Sorry, no Folders found.]
 ```
 
+### Error Page Classifier and Filtering
+The error page classifier adds intelligence to httpx by letting it classify the common error pages returned by web applications and filter them out of the results. It is an enhancement to the existing httpx capabilities, enabled with the `-fep` (`-filter-error-page`) flag, and is meant to reduce noise in the results so users can focus on the responses that matter most.
+
+```console
+httpx -l list.txt -fep
+
+    __    __  __       _  __
+   / /_  / /_/ /_____ | |/ /
+  / __ \/ __/ __/ __ \|   /
+ / / / / /_/ /_/ /_/ /   |
+/_/ /_/\__/\__/ .___/_/|_|
+             /_/
+
+                projectdiscovery.io
+
+[INF] Current httpx version v1.3.2 (latest)
+https://projectdiscovery.io
+https://scanme.sh
+```
+
 ### Favicon Hash
 
 

diff --git a/common/errorpageclassifier/classifier.go b/common/errorpageclassifier/classifier.go
@@ -0,0 +1,194 @@
+//ref: https://github.com/sausheong/gonb
+
+package errorpageclassifier
+
+import (
+	"bytes"
+	"encoding/gob"
+	"io"
+	"os"
+	"regexp"
+	"sort"
+	"strings"
+
+	"github.com/kljensen/snowball"
+)
+
+var (
+	// cleaner matches any single character that is neither a word character
+	// (\w) nor whitespace (\s); cleanDocument deletes every match.
+	cleaner   = regexp.MustCompile(`[^\w\s]`)
+	// stopWords is the set of English words that countWords skips. The lookup
+	// is done on tokens after cleaner has run and before they are lower-cased,
+	// so entries that contain an apostrophe, period or hyphen (for example
+	// "ain't", "co." or "no-one") cannot match a cleaned token.
+	stopWords = map[string]struct{}{"a": {}, "able": {}, "about": {}, "above": {}, "abroad": {}, "according": {}, "accordingly": {}, "across": {}, "actually": {}, "adj": {}, "after": {}, "afterwards": {}, "again": {}, "against": {}, "ago": {}, "ahead": {}, "ain't": {}, "all": {}, "allow": {}, "allows": {}, "almost": {}, "alone": {}, "along": {}, "alongside": {}, "already": {}, "also": {}, "although": {}, "always": {}, "am": {}, "amid": {}, "amidst": {}, "among": {}, "amongst": {}, "an": {}, "and": {}, "another": {}, "any": {}, "anybody": {}, "anyhow": {}, "anyone": {}, "anything": {}, "anyway": {}, "anyways": {}, "anywhere": {}, "apart": {}, "appear": {}, "appreciate": {}, "appropriate": {}, "are": {}, "aren't": {}, "around": {}, "as": {}, "a's": {}, "aside": {}, "ask": {}, "asking": {}, "associated": {}, "at": {}, "available": {}, "away": {}, "awfully": {}, "b": {}, "back": {}, "backward": {}, "backwards": {}, "be": {}, "became": {}, "because": {}, "become": {}, "becomes": {}, "becoming": {}, "been": {}, "before": {}, "beforehand": {}, "begin": {}, "behind": {}, "being": {}, "believe": {}, "below": {}, "beside": {}, "besides": {}, "best": {}, "better": {}, "between": {}, "beyond": {}, "both": {}, "brief": {}, "but": {}, "by": {}, "c": {}, "came": {}, "can": {}, "cannot": {}, "cant": {}, "can't": {}, "caption": {}, "cause": {}, "causes": {}, "certain": {}, "certainly": {}, "changes": {}, "clearly": {}, "c'mon": {}, "co": {}, "co.": {}, "com": {}, "come": {}, "comes": {}, "concerning": {}, "consequently": {}, "consider": {}, "considering": {}, "contain": {}, "containing": {}, "contains": {}, "corresponding": {}, "could": {}, "couldn't": {}, "course": {}, "c's": {}, "currently": {}, "d": {}, "dare": {}, "daren't": {}, "definitely": {}, "described": {}, "despite": {}, "did": {}, "didn't": {}, "different": {}, "directly": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "done": {}, "don't": {}, "down": {}, "downwards": {}, "during": {}, "e": {}, "each": {}, "edu": 
{}, "eg": {}, "eight": {}, "eighty": {}, "either": {}, "else": {}, "elsewhere": {}, "end": {}, "ending": {}, "enough": {}, "entirely": {}, "especially": {}, "et": {}, "etc": {}, "even": {}, "ever": {}, "evermore": {}, "every": {}, "everybody": {}, "everyone": {}, "everything": {}, "everywhere": {}, "ex": {}, "exactly": {}, "example": {}, "except": {}, "f": {}, "fairly": {}, "far": {}, "farther": {}, "few": {}, "fewer": {}, "fifth": {}, "first": {}, "five": {}, "followed": {}, "following": {}, "follows": {}, "for": {}, "forever": {}, "former": {}, "formerly": {}, "forth": {}, "forward": {}, "found": {}, "four": {}, "from": {}, "further": {}, "furthermore": {}, "g": {}, "get": {}, "gets": {}, "getting": {}, "given": {}, "gives": {}, "go": {}, "goes": {}, "going": {}, "gone": {}, "got": {}, "gotten": {}, "greetings": {}, "h": {}, "had": {}, "hadn't": {}, "half": {}, "happens": {}, "hardly": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "hello": {}, "help": {}, "hence": {}, "her": {}, "here": {}, "hereafter": {}, "hereby": {}, "herein": {}, "here's": {}, "hereupon": {}, "hers": {}, "herself": {}, "he's": {}, "hi": {}, "him": {}, "himself": {}, "his": {}, "hither": {}, "hopefully": {}, "how": {}, "howbeit": {}, "however": {}, "hundred": {}, "i": {}, "i'd": {}, "ie": {}, "if": {}, "ignored": {}, "i'll": {}, "i'm": {}, "immediate": {}, "in": {}, "inasmuch": {}, "inc": {}, "inc.": {}, "indeed": {}, "indicate": {}, "indicated": {}, "indicates": {}, "inner": {}, "inside": {}, "insofar": {}, "instead": {}, "into": {}, "inward": {}, "is": {}, "isn't": {}, "it": {}, "it'd": {}, "it'll": {}, "its": {}, "it's": {}, "itself": {}, "i've": {}, "j": {}, "just": {}, "k": {}, "keep": {}, "keeps": {}, "kept": {}, "know": {}, "known": {}, "knows": {}, "l": {}, "last": {}, "lately": {}, "later": {}, "latter": {}, "latterly": {}, "least": {}, "less": {}, "lest": {}, "let": {}, "let's": {}, "like": {}, "liked": {}, "likely": {}, 
"likewise": {}, "little": {}, "look": {}, "looking": {}, "looks": {}, "low": {}, "lower": {}, "ltd": {}, "m": {}, "made": {}, "mainly": {}, "make": {}, "makes": {}, "many": {}, "may": {}, "maybe": {}, "mayn't": {}, "me": {}, "mean": {}, "meantime": {}, "meanwhile": {}, "merely": {}, "might": {}, "mightn't": {}, "mine": {}, "minus": {}, "miss": {}, "more": {}, "moreover": {}, "most": {}, "mostly": {}, "mr": {}, "mrs": {}, "much": {}, "must": {}, "mustn't": {}, "my": {}, "myself": {}, "n": {}, "name": {}, "namely": {}, "nd": {}, "near": {}, "nearly": {}, "necessary": {}, "need": {}, "needn't": {}, "needs": {}, "neither": {}, "never": {}, "neverf": {}, "neverless": {}, "nevertheless": {}, "new": {}, "next": {}, "nine": {}, "ninety": {}, "no": {}, "nobody": {}, "non": {}, "none": {}, "nonetheless": {}, "noone": {}, "no-one": {}, "nor": {}, "normally": {}, "not": {}, "nothing": {}, "notwithstanding": {}, "novel": {}, "now": {}, "nowhere": {}, "o": {}, "obviously": {}, "of": {}, "off": {}, "often": {}, "oh": {}, "ok": {}, "okay": {}, "old": {}, "on": {}, "once": {}, "one": {}, "ones": {}, "one's": {}, "only": {}, "onto": {}, "opposite": {}, "or": {}, "other": {}, "others": {}, "otherwise": {}, "ought": {}, "oughtn't": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "outside": {}, "over": {}, "overall": {}, "own": {}, "p": {}, "particular": {}, "particularly": {}, "past": {}, "per": {}, "perhaps": {}, "placed": {}, "please": {}, "plus": {}, "possible": {}, "presumably": {}, "probably": {}, "provided": {}, "provides": {}, "q": {}, "que": {}, "quite": {}, "qv": {}, "r": {}, "rather": {}, "rd": {}, "re": {}, "really": {}, "reasonably": {}, "recent": {}, "recently": {}, "regarding": {}, "regardless": {}, "regards": {}, "relatively": {}, "respectively": {}, "right": {}, "round": {}, "s": {}, "said": {}, "same": {}, "saw": {}, "say": {}, "saying": {}, "says": {}, "second": {}, "secondly": {}, "see": {}, "seeing": {}, "seem": {}, "seemed": {}, "seeming": {}, "seems": {}, 
"seen": {}, "self": {}, "selves": {}, "sensible": {}, "sent": {}, "serious": {}, "seriously": {}, "seven": {}, "several": {}, "shall": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "since": {}, "six": {}, "so": {}, "some": {}, "somebody": {}, "someday": {}, "somehow": {}, "someone": {}, "something": {}, "sometime": {}, "sometimes": {}, "somewhat": {}, "somewhere": {}, "soon": {}, "sorry": {}, "specified": {}, "specify": {}, "specifying": {}, "still": {}, "sub": {}, "such": {}, "sup": {}, "sure": {}, "t": {}, "take": {}, "taken": {}, "taking": {}, "tell": {}, "tends": {}, "th": {}, "than": {}, "thank": {}, "thanks": {}, "thanx": {}, "that": {}, "that'll": {}, "thats": {}, "that's": {}, "that've": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "thence": {}, "there": {}, "thereafter": {}, "thereby": {}, "there'd": {}, "therefore": {}, "therein": {}, "there'll": {}, "there're": {}, "theres": {}, "there's": {}, "thereupon": {}, "there've": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "thing": {}, "things": {}, "think": {}, "third": {}, "thirty": {}, "this": {}, "thorough": {}, "thoroughly": {}, "those": {}, "though": {}, "three": {}, "through": {}, "throughout": {}, "thru": {}, "thus": {}, "till": {}, "to": {}, "together": {}, "too": {}, "took": {}, "toward": {}, "towards": {}, "tried": {}, "tries": {}, "truly": {}, "try": {}, "trying": {}, "t's": {}, "twice": {}, "two": {}, "u": {}, "un": {}, "under": {}, "underneath": {}, "undoing": {}, "unfortunately": {}, "unless": {}, "unlike": {}, "unlikely": {}, "until": {}, "unto": {}, "up": {}, "upon": {}, "upwards": {}, "us": {}, "use": {}, "used": {}, "useful": {}, "uses": {}, "using": {}, "usually": {}, "v": {}, "value": {}, "various": {}, "versus": {}, "very": {}, "via": {}, "viz": {}, "vs": {}, "w": {}, "want": {}, "wants": {}, "was": {}, "wasn't": {}, "way": {}, "we": {}, "we'd": {}, 
"welcome": {}, "well": {}, "we'll": {}, "went": {}, "were": {}, "we're": {}, "weren't": {}, "we've": {}, "what": {}, "whatever": {}, "what'll": {}, "what's": {}, "what've": {}, "when": {}, "whence": {}, "whenever": {}, "where": {}, "whereafter": {}, "whereas": {}, "whereby": {}, "wherein": {}, "where's": {}, "whereupon": {}, "wherever": {}, "whether": {}, "which": {}, "whichever": {}, "while": {}, "whilst": {}, "whither": {}, "who": {}, "who'd": {}, "whoever": {}, "whole": {}, "who'll": {}, "whom": {}, "whomever": {}, "who's": {}, "whose": {}, "why": {}, "will": {}, "willing": {}, "wish": {}, "with": {}, "within": {}, "without": {}, "wonder": {}, "won't": {}, "would": {}, "wouldn't": {}, "x": {}, "y": {}, "yes": {}, "yet": {}, "you": {}, "you'd": {}, "you'll": {}, "your": {}, "you're": {}, "yours": {}, "yourself": {}, "yourselves": {}, "you've": {}, "z": {}, "zero": {}}
+)
+
+// Sorted pairs a category name with its score so that categories can be
+// ordered by probability; Classify builds a slice of these and sorts it in
+// descending order of Probability.
+type Sorted struct {
+	Category    string
+	Probability float64
+}
+
+// Classifier is what we use to classify documents.
+//
+// It stores the word and document counts that the probability helpers in this
+// file read, plus the decision threshold used by Classify. A populated value
+// is obtained by gob-decoding a stored model (see NewClassifierWithReader).
+type Classifier struct {
+	// Words maps category -> stemmed word -> count for that word.
+	Words               map[string]map[string]int
+	// TotalWords is a word total across all categories; it is not read by the
+	// code visible in this file.
+	TotalWords          int
+	// CategoriesDocuments maps category -> number of documents; it is the
+	// numerator of the prior computed in pCategory.
+	CategoriesDocuments map[string]int
+	// TotalDocuments is the denominator of the prior computed in pCategory.
+	TotalDocuments      int
+	// CategoriesWords maps category -> word total; it is the denominator of
+	// the per-word estimate computed in pWordCategory.
+	CategoriesWords     map[string]int
+	// Threshold is the minimum ratio between the best and second-best score
+	// that Classify requires before it returns the best category.
+	Threshold           float64
+}
+
+// NewClassifier builds an empty Classifier for the given categories and uses
+// threshold as the decision ratio applied by Classify.
+//
+// Every category receives its own empty word table and zeroed document and
+// word counters; the global totals start at zero.
+func NewClassifier(categories []string, threshold float64) *Classifier {
+	words := make(map[string]map[string]int)
+	documentsPerCategory := make(map[string]int)
+	wordsPerCategory := make(map[string]int)
+
+	for i := range categories {
+		name := categories[i]
+		words[name] = make(map[string]int)
+		documentsPerCategory[name] = 0
+		wordsPerCategory[name] = 0
+	}
+
+	return &Classifier{
+		Words:               words,
+		CategoriesDocuments: documentsPerCategory,
+		CategoriesWords:     wordsPerCategory,
+		Threshold:           threshold,
+	}
+}
+
+// NewClassifierFromFile loads a gob-encoded Classifier from the file at path.
+//
+// If the file cannot be opened, the open error is returned together with an
+// empty, non-nil Classifier. Otherwise the result is whatever
+// NewClassifierWithReader produces for the file contents. The file is closed
+// before the function returns.
+func NewClassifierFromFile(path string) (*Classifier, error) {
+	file, openErr := os.Open(path)
+	if openErr != nil {
+		return &Classifier{}, openErr
+	}
+	defer file.Close()
+
+	return NewClassifierWithReader(file)
+}
+
+// NewClassifierFromFileData decodes a Classifier from data, the in-memory
+// bytes of a gob-encoded model. It wraps data in a reader and delegates to
+// NewClassifierWithReader.
+func NewClassifierFromFileData(data []byte) (*Classifier, error) {
+	source := bytes.NewReader(data)
+	return NewClassifierWithReader(source)
+}
+
+// NewClassifierWithReader decodes a gob-encoded Classifier from reader.
+//
+// When decoding fails, the error is returned alongside the non-nil Classifier
+// that was being filled in; callers should not rely on its contents.
+func NewClassifierWithReader(reader io.Reader) (*Classifier, error) {
+	model := new(Classifier)
+	if err := gob.NewDecoder(reader).Decode(model); err != nil {
+		return model, err
+	}
+	return model, nil
+}
+
+// save the classifier to a file
+// func (c *Classifier) SaveClassifierToFile(path string) error {
+// 	fl, err := os.Create(path)
+// 	if err != nil {
+// 		return err
+// 	}
+// 	defer fl.Close()
+
+// 	err = gob.NewEncoder(fl).Encode(&c)
+// 	if err != nil {
+// 		return err
+// 	}
+
+// 	return nil
+// }
+
+// Train the classifier
+// func (c *Classifier) Train(category string, document string) {
+// 	for word, count := range countWords(document) {
+// 		c.Words[category][word] += count
+// 		c.CategoriesWords[category] += count
+// 		c.TotalWords += count
+// 	}
+// 	c.CategoriesDocuments[category]++
+// 	c.TotalDocuments++
+// }
+
+// Classify returns the most probable category for document, or "other" when
+// no category is a clear winner.
+//
+// The per-category scores from Probabilities are ranked in descending order;
+// equal scores are ordered by category name so the ranking does not depend on
+// map iteration order. The top category wins only if the ratio between its
+// score and the runner-up's score is strictly greater than c.Threshold.
+//
+// A classifier with no categories (for example the empty value returned when
+// a model file could not be loaded) yields "other" instead of panicking, and
+// a classifier with a single category treats the missing runner-up as a score
+// of zero.
+func (c *Classifier) Classify(document string) (category string) {
+	scores := c.Probabilities(document)
+	if len(scores) == 0 {
+		return "other"
+	}
+
+	// rank the categories according to their scores
+	ranked := make([]Sorted, 0, len(scores))
+	for name, score := range scores {
+		ranked = append(ranked, Sorted{Category: name, Probability: score})
+	}
+	sort.Slice(ranked, func(i, j int) bool {
+		if ranked[i].Probability != ranked[j].Probability {
+			return ranked[i].Probability > ranked[j].Probability
+		}
+		return ranked[i].Category < ranked[j].Category
+	})
+
+	var runnerUp float64
+	if len(ranked) > 1 {
+		runnerUp = ranked[1].Probability
+	}
+
+	// Floating-point division never panics: a positive score over zero is
+	// +Inf (always above the threshold) and 0/0 is NaN, which compares false
+	// and therefore falls through to "other".
+	if ranked[0].Probability/runnerUp > c.Threshold {
+		return ranked[0].Category
+	}
+	return "other"
+}
+
+// Probabilities scores document against every category that has an entry in
+// c.Words and returns the scores keyed by category name.
+//
+// Each score is the value of pCategoryDocument for that category, i.e. the
+// document likelihood multiplied by the category prior; the scores are not
+// normalised to sum to one.
+func (c *Classifier) Probabilities(document string) (p map[string]float64) {
+	scores := make(map[string]float64, len(c.Words))
+	for name := range c.Words {
+		scores[name] = c.pCategoryDocument(name, document)
+	}
+	return scores
+}
+
+// pDocumentCategory estimates p(document | category) as the product of
+// pWordCategory over the distinct tokens that countWords extracts from
+// document.
+//
+// Only the keys of the word-count map are used, so a token contributes a
+// single factor no matter how often it occurs. When countWords yields no
+// tokens the result is 1.0.
+func (c *Classifier) pDocumentCategory(category string, document string) (p float64) {
+	likelihood := 1.0
+	tokens := countWords(document)
+	for token := range tokens {
+		likelihood *= c.pWordCategory(category, token)
+	}
+	return likelihood
+}
+
+// pWordCategory estimates p(word | category): the count stored for the stemmed
+// word in the category, plus one, divided by the category's total word count.
+//
+// The added one keeps words that were never seen in the category from
+// producing a zero factor. The word is passed through stem here as well, even
+// when the caller already supplies stemmed tokens.
+func (c *Classifier) pWordCategory(category string, word string) float64 {
+	occurrences := c.Words[category][stem(word)] + 1
+	categoryTotal := c.CategoriesWords[category]
+	return float64(occurrences) / float64(categoryTotal)
+}
+
+// pCategory estimates the prior p(category): the number of documents recorded
+// for the category divided by the total number of documents.
+func (c *Classifier) pCategory(category string) float64 {
+	inCategory := float64(c.CategoriesDocuments[category])
+	overall := float64(c.TotalDocuments)
+	return inCategory / overall
+}
+
+// pCategoryDocument scores p(category | document) as the document likelihood
+// from pDocumentCategory multiplied by the prior from pCategory. The value is
+// not divided by the probability of the document itself.
+func (c *Classifier) pCategoryDocument(category string, document string) float64 {
+	likelihood := c.pDocumentCategory(category, document)
+	prior := c.pCategory(category)
+	return likelihood * prior
+}
+
+// countWords tokenises document and returns how often each stemmed token
+// occurs.
+//
+// The document is passed through cleanDocument and split on single space
+// characters. Tokens found verbatim in stopWords are skipped; the remaining
+// tokens are lower-cased, stemmed and counted.
+//
+// The stop-word lookup happens before lower-casing, so it is case-sensitive.
+// Because the split is on " " only, other whitespace stays inside tokens and
+// consecutive spaces produce empty tokens.
+// NOTE(review): the stored model was presumably built with this exact
+// tokenisation - confirm before changing it.
+func countWords(document string) (wordCount map[string]int) {
+	counts := make(map[string]int)
+	for _, token := range strings.Split(cleanDocument(document), " ") {
+		if _, isStopWord := stopWords[token]; isStopWord {
+			continue
+		}
+		counts[stem(strings.ToLower(token))]++
+	}
+	return counts
+}
+
+func cleanDocument(text string) string {
+	return cleaner.ReplaceAllString(text, "")
+}
+
+// stem reduces word to its English stem using the Snowball algorithm.
+// If the stemmer reports an error, word is returned unchanged.
+func stem(word string) string {
+	// The third argument (true) is presumably snowball's "stem stop words"
+	// switch - confirm against the library documentation.
+	result, err := snowball.Stem(word, "english", true)
+	if err != nil {
+		return word
+	}
+	return result
+}
diff --git a/common/errorpageclassifier/clf.gob b/common/errorpageclassifier/clf.gob
diff --git a/common/errorpageclassifier/cm.go b/common/errorpageclassifier/cm.go
@@ -0,0 +1,105 @@
+package errorpageclassifier
+
+// import "fmt"
+
+// type ConfusionMatrix struct {
+// 	matrix [][]int
+// 	labels []string
+// }
+
+// func NewConfusionMatrix(actual, predicted []string, labels []string) *ConfusionMatrix {
+// 	n := len(labels)
+// 	matrix := make([][]int, n)
+// 	for i := range matrix {
+// 		matrix[i] = make([]int, n)
+// 	}
+
+// 	labelIndices := make(map[string]int)
+// 	for i, label := range labels {
+// 		labelIndices[label] = i
+// 	}
+
+// 	for i := range actual {
+// 		matrix[labelIndices[actual[i]]][labelIndices[predicted[i]]]++
+// 	}
+
+// 	return &ConfusionMatrix{
+// 		matrix: matrix,
+// 		labels: labels,
+// 	}
+// }
+
+// func (cm *ConfusionMatrix) PrintConfusionMatrix() {
+// 	fmt.Printf("%30s\n", "Confusion Matrix")
+// 	fmt.Println()
+// 	// Print header
+// 	fmt.Printf("%-15s", "")
+// 	for _, label := range cm.labels {
+// 		fmt.Printf("%-15s", label)
+// 	}
+// 	fmt.Println()
+
+// 	// Print rows
+// 	for i, row := range cm.matrix {
+// 		fmt.Printf("%-15s", cm.labels[i])
+// 		for _, value := range row {
+// 			fmt.Printf("%-15d", value)
+// 		}
+// 		fmt.Println()
+// 	}
+// 	fmt.Println()
+// }
+
+// func (cm *ConfusionMatrix) PrintClassificationReport() {
+// 	fmt.Printf("%30s\n", "Classification Report")
+// 	fmt.Println()
+
+// 	fmt.Printf("\n%-15s %-10s %-10s %-10s %-10s\n", "", "precision", "recall", "f1-score", "support")
+
+// 	totals := map[string]float64{"true": 0, "predicted": 0, "correct": 0}
+// 	macroAvg := map[string]float64{"precision": 0, "recall": 0, "f1-score": 0}
+
+// 	for i, label := range cm.labels {
+// 		truePos := cm.matrix[i][i]
+// 		falsePos, falseNeg := 0, 0
+// 		for j := 0; j < len(cm.labels); j++ {
+// 			if i != j {
+// 				falsePos += cm.matrix[j][i]
+// 				falseNeg += cm.matrix[i][j]
+// 			}
+// 		}
+
+// 		precision := float64(truePos) / float64(truePos+falsePos)
+// 		recall := float64(truePos) / float64(truePos+falseNeg)
+// 		f1Score := 2 * precision * recall / (precision + recall)
+// 		support := truePos + falseNeg
+
+// 		fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", label, precision, recall, f1Score, support)
+
+// 		totals["true"] += float64(support)
+// 		totals["predicted"] += float64(truePos + falsePos)
+// 		totals["correct"] += float64(truePos)
+
+// 		macroAvg["precision"] += precision
+// 		macroAvg["recall"] += recall
+// 		macroAvg["f1-score"] += f1Score
+// 	}
+
+// 	accuracy := totals["correct"] / totals["true"]
+// 	fmt.Printf("\n%-26s %-10s %-10.2f %-10d", "accuracy", "", accuracy, int(totals["true"]))
+
+// 	fmt.Printf("\n%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "macro avg",
+// 		macroAvg["precision"]/float64(len(cm.labels)),
+// 		macroAvg["recall"]/float64(len(cm.labels)),
+// 		macroAvg["f1-score"]/float64(len(cm.labels)),
+// 		int(totals["true"]))
+
+// 	precisionWeightedAvg := totals["correct"] / totals["predicted"]
+// 	recallWeightedAvg := totals["correct"] / totals["true"]
+// 	f1ScoreWeightedAvg := 2 * precisionWeightedAvg * recallWeightedAvg / (precisionWeightedAvg + recallWeightedAvg)
+
+// 	fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "weighted avg",
+// 		precisionWeightedAvg, recallWeightedAvg, f1ScoreWeightedAvg, int(totals["true"]))
+
+// 	fmt.Println()
+// }