Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Error page classifier #1245

Merged
merged 16 commits into from
Jul 19, 2023
Merged
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,14 @@ EXTRACTOR:

FILTERS:
-fc, -filter-code string filter response with specified status code (-fc 403,401)
-fep, -filter-error-page filter response with ML based error page detection
-fl, -filter-length string filter response with specified content length (-fl 23,33)
-flc, -filter-line-count string filter response body with specified line count (-flc 423,532)
-fwc, -filter-word-count string filter response body with specified word count (-fwc 423,532)
-ffc, -filter-favicon string[] filter response with specified favicon hash (-ffc 1494302000)
-fs, -filter-string string filter response with specified string (-fs admin)
-fe, -filter-regex string filter response with specified regex (-fe admin)
-fcdn, -filter-cdn string[] filter host with specified cdn provider (incapsula, oracle, google, azure, cloudflare, cloudfront, fastly, akamai, sucuri, leaseweb)
-fcdn, -filter-cdn string[] filter host with specified cdn provider (google, leaseweb, stackpath, cloudfront, fastly)
-frt, -filter-response-time string filter response with specified response time in seconds (-frt '> 1')
-fdc, -filter-condition string filter response with dsl expression condition

Expand Down Expand Up @@ -343,6 +344,26 @@ https://support.hackerone.com [301,302,301,200] [HackerOne] [Cloudflare,Ruby on
https://resources.hackerone.com [301,301,404] [Sorry, no Folders found.]
```

### Error Page Classifier and Filtering
The Error Page Classifier and Filtering feature aims to add intelligence to the tool by enabling it to classify and filter out common error pages returned by web applications. It is an enhancement to the existing httpx capabilities and is geared towards reducing the noise in the results and helping users focus on what matters most.

```console
httpx -l list.txt -fep

__ __ __ _ __
/ /_ / /_/ /_____ | |/ /
/ __ \/ __/ __/ __ \| /
/ / / / /_/ /_/ /_/ / |
/_/ /_/\__/\__/ .___/_/|_|
/_/

projectdiscovery.io

[INF] Current httpx version v1.3.2 (latest)
https://projectdiscovery.io
https://scanme.sh
```

### Favicon Hash


Expand Down
194 changes: 194 additions & 0 deletions common/errorpageclassifier/classifier.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
//ref: https://github.com/sausheong/gonb

package errorpageclassifier

import (
"bytes"
"encoding/gob"
"io"
"os"
"regexp"
"sort"
"strings"

"github.com/kljensen/snowball"
)

var (
// cleaner matches any single character that is neither a word character (\w)
// nor whitespace (\s). cleanDocument uses it to delete punctuation and other
// symbols from a document before the document is split into words.
cleaner = regexp.MustCompile(`[^\w\s]`)
// stopWords is the set of common English words that countWords leaves out of
// its counts. All entries are lowercase; some contain apostrophes or periods.
stopWords = map[string]struct{}{"a": {}, "able": {}, "about": {}, "above": {}, "abroad": {}, "according": {}, "accordingly": {}, "across": {}, "actually": {}, "adj": {}, "after": {}, "afterwards": {}, "again": {}, "against": {}, "ago": {}, "ahead": {}, "ain't": {}, "all": {}, "allow": {}, "allows": {}, "almost": {}, "alone": {}, "along": {}, "alongside": {}, "already": {}, "also": {}, "although": {}, "always": {}, "am": {}, "amid": {}, "amidst": {}, "among": {}, "amongst": {}, "an": {}, "and": {}, "another": {}, "any": {}, "anybody": {}, "anyhow": {}, "anyone": {}, "anything": {}, "anyway": {}, "anyways": {}, "anywhere": {}, "apart": {}, "appear": {}, "appreciate": {}, "appropriate": {}, "are": {}, "aren't": {}, "around": {}, "as": {}, "a's": {}, "aside": {}, "ask": {}, "asking": {}, "associated": {}, "at": {}, "available": {}, "away": {}, "awfully": {}, "b": {}, "back": {}, "backward": {}, "backwards": {}, "be": {}, "became": {}, "because": {}, "become": {}, "becomes": {}, "becoming": {}, "been": {}, "before": {}, "beforehand": {}, "begin": {}, "behind": {}, "being": {}, "believe": {}, "below": {}, "beside": {}, "besides": {}, "best": {}, "better": {}, "between": {}, "beyond": {}, "both": {}, "brief": {}, "but": {}, "by": {}, "c": {}, "came": {}, "can": {}, "cannot": {}, "cant": {}, "can't": {}, "caption": {}, "cause": {}, "causes": {}, "certain": {}, "certainly": {}, "changes": {}, "clearly": {}, "c'mon": {}, "co": {}, "co.": {}, "com": {}, "come": {}, "comes": {}, "concerning": {}, "consequently": {}, "consider": {}, "considering": {}, "contain": {}, "containing": {}, "contains": {}, "corresponding": {}, "could": {}, "couldn't": {}, "course": {}, "c's": {}, "currently": {}, "d": {}, "dare": {}, "daren't": {}, "definitely": {}, "described": {}, "despite": {}, "did": {}, "didn't": {}, "different": {}, "directly": {}, "do": {}, "does": {}, "doesn't": {}, "doing": {}, "done": {}, "don't": {}, "down": {}, "downwards": {}, "during": {}, "e": {}, "each": {}, "edu": {}, 
"eg": {}, "eight": {}, "eighty": {}, "either": {}, "else": {}, "elsewhere": {}, "end": {}, "ending": {}, "enough": {}, "entirely": {}, "especially": {}, "et": {}, "etc": {}, "even": {}, "ever": {}, "evermore": {}, "every": {}, "everybody": {}, "everyone": {}, "everything": {}, "everywhere": {}, "ex": {}, "exactly": {}, "example": {}, "except": {}, "f": {}, "fairly": {}, "far": {}, "farther": {}, "few": {}, "fewer": {}, "fifth": {}, "first": {}, "five": {}, "followed": {}, "following": {}, "follows": {}, "for": {}, "forever": {}, "former": {}, "formerly": {}, "forth": {}, "forward": {}, "found": {}, "four": {}, "from": {}, "further": {}, "furthermore": {}, "g": {}, "get": {}, "gets": {}, "getting": {}, "given": {}, "gives": {}, "go": {}, "goes": {}, "going": {}, "gone": {}, "got": {}, "gotten": {}, "greetings": {}, "h": {}, "had": {}, "hadn't": {}, "half": {}, "happens": {}, "hardly": {}, "has": {}, "hasn't": {}, "have": {}, "haven't": {}, "having": {}, "he": {}, "he'd": {}, "he'll": {}, "hello": {}, "help": {}, "hence": {}, "her": {}, "here": {}, "hereafter": {}, "hereby": {}, "herein": {}, "here's": {}, "hereupon": {}, "hers": {}, "herself": {}, "he's": {}, "hi": {}, "him": {}, "himself": {}, "his": {}, "hither": {}, "hopefully": {}, "how": {}, "howbeit": {}, "however": {}, "hundred": {}, "i": {}, "i'd": {}, "ie": {}, "if": {}, "ignored": {}, "i'll": {}, "i'm": {}, "immediate": {}, "in": {}, "inasmuch": {}, "inc": {}, "inc.": {}, "indeed": {}, "indicate": {}, "indicated": {}, "indicates": {}, "inner": {}, "inside": {}, "insofar": {}, "instead": {}, "into": {}, "inward": {}, "is": {}, "isn't": {}, "it": {}, "it'd": {}, "it'll": {}, "its": {}, "it's": {}, "itself": {}, "i've": {}, "j": {}, "just": {}, "k": {}, "keep": {}, "keeps": {}, "kept": {}, "know": {}, "known": {}, "knows": {}, "l": {}, "last": {}, "lately": {}, "later": {}, "latter": {}, "latterly": {}, "least": {}, "less": {}, "lest": {}, "let": {}, "let's": {}, "like": {}, "liked": {}, "likely": {}, 
"likewise": {}, "little": {}, "look": {}, "looking": {}, "looks": {}, "low": {}, "lower": {}, "ltd": {}, "m": {}, "made": {}, "mainly": {}, "make": {}, "makes": {}, "many": {}, "may": {}, "maybe": {}, "mayn't": {}, "me": {}, "mean": {}, "meantime": {}, "meanwhile": {}, "merely": {}, "might": {}, "mightn't": {}, "mine": {}, "minus": {}, "miss": {}, "more": {}, "moreover": {}, "most": {}, "mostly": {}, "mr": {}, "mrs": {}, "much": {}, "must": {}, "mustn't": {}, "my": {}, "myself": {}, "n": {}, "name": {}, "namely": {}, "nd": {}, "near": {}, "nearly": {}, "necessary": {}, "need": {}, "needn't": {}, "needs": {}, "neither": {}, "never": {}, "neverf": {}, "neverless": {}, "nevertheless": {}, "new": {}, "next": {}, "nine": {}, "ninety": {}, "no": {}, "nobody": {}, "non": {}, "none": {}, "nonetheless": {}, "noone": {}, "no-one": {}, "nor": {}, "normally": {}, "not": {}, "nothing": {}, "notwithstanding": {}, "novel": {}, "now": {}, "nowhere": {}, "o": {}, "obviously": {}, "of": {}, "off": {}, "often": {}, "oh": {}, "ok": {}, "okay": {}, "old": {}, "on": {}, "once": {}, "one": {}, "ones": {}, "one's": {}, "only": {}, "onto": {}, "opposite": {}, "or": {}, "other": {}, "others": {}, "otherwise": {}, "ought": {}, "oughtn't": {}, "our": {}, "ours": {}, "ourselves": {}, "out": {}, "outside": {}, "over": {}, "overall": {}, "own": {}, "p": {}, "particular": {}, "particularly": {}, "past": {}, "per": {}, "perhaps": {}, "placed": {}, "please": {}, "plus": {}, "possible": {}, "presumably": {}, "probably": {}, "provided": {}, "provides": {}, "q": {}, "que": {}, "quite": {}, "qv": {}, "r": {}, "rather": {}, "rd": {}, "re": {}, "really": {}, "reasonably": {}, "recent": {}, "recently": {}, "regarding": {}, "regardless": {}, "regards": {}, "relatively": {}, "respectively": {}, "right": {}, "round": {}, "s": {}, "said": {}, "same": {}, "saw": {}, "say": {}, "saying": {}, "says": {}, "second": {}, "secondly": {}, "see": {}, "seeing": {}, "seem": {}, "seemed": {}, "seeming": {}, "seems": {}, 
"seen": {}, "self": {}, "selves": {}, "sensible": {}, "sent": {}, "serious": {}, "seriously": {}, "seven": {}, "several": {}, "shall": {}, "shan't": {}, "she": {}, "she'd": {}, "she'll": {}, "she's": {}, "should": {}, "shouldn't": {}, "since": {}, "six": {}, "so": {}, "some": {}, "somebody": {}, "someday": {}, "somehow": {}, "someone": {}, "something": {}, "sometime": {}, "sometimes": {}, "somewhat": {}, "somewhere": {}, "soon": {}, "sorry": {}, "specified": {}, "specify": {}, "specifying": {}, "still": {}, "sub": {}, "such": {}, "sup": {}, "sure": {}, "t": {}, "take": {}, "taken": {}, "taking": {}, "tell": {}, "tends": {}, "th": {}, "than": {}, "thank": {}, "thanks": {}, "thanx": {}, "that": {}, "that'll": {}, "thats": {}, "that's": {}, "that've": {}, "the": {}, "their": {}, "theirs": {}, "them": {}, "themselves": {}, "then": {}, "thence": {}, "there": {}, "thereafter": {}, "thereby": {}, "there'd": {}, "therefore": {}, "therein": {}, "there'll": {}, "there're": {}, "theres": {}, "there's": {}, "thereupon": {}, "there've": {}, "these": {}, "they": {}, "they'd": {}, "they'll": {}, "they're": {}, "they've": {}, "thing": {}, "things": {}, "think": {}, "third": {}, "thirty": {}, "this": {}, "thorough": {}, "thoroughly": {}, "those": {}, "though": {}, "three": {}, "through": {}, "throughout": {}, "thru": {}, "thus": {}, "till": {}, "to": {}, "together": {}, "too": {}, "took": {}, "toward": {}, "towards": {}, "tried": {}, "tries": {}, "truly": {}, "try": {}, "trying": {}, "t's": {}, "twice": {}, "two": {}, "u": {}, "un": {}, "under": {}, "underneath": {}, "undoing": {}, "unfortunately": {}, "unless": {}, "unlike": {}, "unlikely": {}, "until": {}, "unto": {}, "up": {}, "upon": {}, "upwards": {}, "us": {}, "use": {}, "used": {}, "useful": {}, "uses": {}, "using": {}, "usually": {}, "v": {}, "value": {}, "various": {}, "versus": {}, "very": {}, "via": {}, "viz": {}, "vs": {}, "w": {}, "want": {}, "wants": {}, "was": {}, "wasn't": {}, "way": {}, "we": {}, "we'd": {}, 
"welcome": {}, "well": {}, "we'll": {}, "went": {}, "were": {}, "we're": {}, "weren't": {}, "we've": {}, "what": {}, "whatever": {}, "what'll": {}, "what's": {}, "what've": {}, "when": {}, "whence": {}, "whenever": {}, "where": {}, "whereafter": {}, "whereas": {}, "whereby": {}, "wherein": {}, "where's": {}, "whereupon": {}, "wherever": {}, "whether": {}, "which": {}, "whichever": {}, "while": {}, "whilst": {}, "whither": {}, "who": {}, "who'd": {}, "whoever": {}, "whole": {}, "who'll": {}, "whom": {}, "whomever": {}, "who's": {}, "whose": {}, "why": {}, "will": {}, "willing": {}, "wish": {}, "with": {}, "within": {}, "without": {}, "wonder": {}, "won't": {}, "would": {}, "wouldn't": {}, "x": {}, "y": {}, "yes": {}, "yet": {}, "you": {}, "you'd": {}, "you'll": {}, "your": {}, "you're": {}, "yours": {}, "yourself": {}, "yourselves": {}, "you've": {}, "z": {}, "zero": {}}
)

// Sorted pairs a category name with the score computed for it, so that the
// categories can be ordered by score. Classify builds values of this type with
// positional literals, so the field order must not change.
type Sorted struct {
// Category is the name of the category.
Category string
// Probability is the score computed for Category.
Probability float64
}

// Classifier is what we use to classify documents. It holds the word and
// document counts of a trained model; the fields are exported so that the
// model can be decoded with encoding/gob (see NewClassifierWithReader).
type Classifier struct {
// Words maps category -> word -> count. Lookups are done with stemmed words.
Words map[string]map[string]int
// TotalWords is the overall word count across categories. It is not read by
// the scoring code in this file.
TotalWords int
// CategoriesDocuments maps category -> number of documents; it is the
// numerator of the category prior computed by pCategory.
CategoriesDocuments map[string]int
// TotalDocuments is the denominator of the category prior.
TotalDocuments int
// CategoriesWords maps category -> total word count for that category; it is
// the denominator used by pWordCategory.
CategoriesWords map[string]int
// Threshold is the ratio between the best and second-best category scores
// that Classify requires to be exceeded before it picks the best category.
Threshold float64
}

// NewClassifier returns an empty Classifier that knows the given categories.
// threshold is stored as the ratio between the best and second-best category
// scores that Classify requires to be exceeded.
func NewClassifier(categories []string, threshold float64) *Classifier {
	c := new(Classifier)
	c.Threshold = threshold
	c.Words = map[string]map[string]int{}
	c.CategoriesDocuments = map[string]int{}
	c.CategoriesWords = map[string]int{}

	// Register every category up front so each one has an entry (with zero
	// counts) in all three per-category maps.
	for i := range categories {
		name := categories[i]
		c.Words[name] = map[string]int{}
		c.CategoriesDocuments[name] = 0
		c.CategoriesWords[name] = 0
	}
	return c
}

// NewClassifierFromFile opens the file at path and decodes a Classifier from
// its contents via NewClassifierWithReader. If the file cannot be opened, the
// open error is returned together with an empty, non-nil Classifier.
func NewClassifierFromFile(path string) (*Classifier, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return &Classifier{}, openErr
	}
	defer file.Close()

	return NewClassifierWithReader(file)
}

// NewClassifierFromFileData decodes a Classifier from data, the in-memory
// bytes of an encoded model, by wrapping them in a reader and delegating to
// NewClassifierWithReader.
func NewClassifierFromFileData(data []byte) (*Classifier, error) {
	src := bytes.NewReader(data)
	return NewClassifierWithReader(src)
}

// NewClassifierWithReader decodes a gob-encoded Classifier from reader.
// When decoding fails, the error is returned together with the non-nil
// Classifier value that was being decoded into.
func NewClassifierWithReader(reader io.Reader) (*Classifier, error) {
	var decoded Classifier
	if err := gob.NewDecoder(reader).Decode(&decoded); err != nil {
		return &decoded, err
	}
	return &decoded, nil
}

// save the classifier to a file
// func (c *Classifier) SaveClassifierToFile(path string) error {
// fl, err := os.Create(path)
// if err != nil {
// return err
// }
// defer fl.Close()

// err = gob.NewEncoder(fl).Encode(&c)
// if err != nil {
// return err
// }

// return nil
// }

// Train the classifier
// func (c *Classifier) Train(category string, document string) {
// for word, count := range countWords(document) {
// c.Words[category][word] += count
// c.CategoriesWords[category] += count
// c.TotalWords += count
// }
// c.CategoriesDocuments[category]++
// c.TotalDocuments++
// }

// Classify returns the category that best matches document, or "other" when
// no category wins clearly.
//
// Every category is scored with Probabilities and the scores are ranked from
// highest to lowest. The top category is returned only if its score divided
// by the runner-up's score is greater than c.Threshold; otherwise the result
// is "other".
//
// Edge cases:
//   - no categories at all (for example a zero-value Classifier): "other";
//   - exactly one category: that category, since there is no runner-up to
//     compare against;
//   - equal scores are ordered by category name so the result does not depend
//     on map iteration order.
func (c *Classifier) Classify(document string) (category string) {
	scores := c.Probabilities(document)

	ranked := make([]Sorted, 0, len(scores))
	for name, score := range scores {
		ranked = append(ranked, Sorted{Category: name, Probability: score})
	}
	sort.Slice(ranked, func(i, j int) bool {
		if ranked[i].Probability != ranked[j].Probability {
			return ranked[i].Probability > ranked[j].Probability
		}
		// Tie-break on the name to keep the ranking deterministic.
		return ranked[i].Category < ranked[j].Category
	})

	// Guard the indexing below: with fewer than two candidates there is no
	// best/second-best ratio to evaluate.
	switch len(ranked) {
	case 0:
		return "other"
	case 1:
		return ranked[0].Category
	}

	if ranked[0].Probability/ranked[1].Probability > c.Threshold {
		return ranked[0].Category
	}
	return "other"
}

// Probabilities scores document against every category present in c.Words
// and returns the scores keyed by category name. Each score is whatever
// pCategoryDocument yields for that category; the returned map is never nil.
func (c *Classifier) Probabilities(document string) (p map[string]float64) {
	scores := map[string]float64{}
	for name := range c.Words {
		scores[name] = c.pCategoryDocument(name, document)
	}
	return scores
}

// pDocumentCategory estimates p(document | category) as the product of
// pWordCategory over the distinct words that countWords extracts from
// document. Each distinct word contributes exactly one factor, however often
// it occurs; a document with no words yields 1.
func (c *Classifier) pDocumentCategory(category string, document string) (p float64) {
	likelihood := 1.0
	for token := range countWords(document) {
		likelihood *= c.pWordCategory(category, token)
	}
	return likelihood
}

// pWordCategory estimates p(word | category): the count recorded for the
// stemmed word in the category, plus one, divided by the category's total
// word count. The added one keeps unseen words from producing a zero factor.
//
// NOTE(review): the denominator carries no matching smoothing term, and a
// category whose total word count is zero makes the result +Inf. The word is
// also stemmed here even though countWords already returns stemmed keys.
// Changing either would shift scores relative to an already trained model,
// so confirm before touching.
func (c *Classifier) pWordCategory(category string, word string) float64 {
	seen := c.Words[category][stem(word)] + 1
	total := c.CategoriesWords[category]
	return float64(seen) / float64(total)
}

// pCategory returns the prior p(category): the number of documents recorded
// for the category divided by the total number of documents.
func (c *Classifier) pCategory(category string) float64 {
	inCategory := float64(c.CategoriesDocuments[category])
	overall := float64(c.TotalDocuments)
	return inCategory / overall
}

// pCategoryDocument returns the score used for p(category | document): the
// likelihood p(document | category) multiplied by the prior p(category).
// The product is not divided by p(document), so scores of different
// categories are comparable with each other but are not normalised.
func (c *Classifier) pCategoryDocument(category string, document string) float64 {
	likelihood := c.pDocumentCategory(category, document)
	prior := c.pCategory(category)
	return likelihood * prior
}

// countWords cleans document with cleanDocument, splits it on single spaces,
// drops stop words and returns how often each remaining stemmed word occurs.
//
// Words are lower-cased before the stop-word lookup: every stopWords key is
// lowercase, so checking the raw token would let capitalised stop words such
// as "The" slip through.
//
// NOTE(review): splitting is on the space character only, so other whitespace
// stays inside tokens and consecutive spaces yield empty tokens. This is kept
// as is because the token set presumably has to match what the stored model
// was built with; confirm before changing.
func countWords(document string) (wordCount map[string]int) {
	wordCount = make(map[string]int)
	for _, raw := range strings.Split(cleanDocument(document), " ") {
		word := strings.ToLower(raw)
		if _, isStop := stopWords[word]; isStop {
			continue
		}
		wordCount[stem(word)]++
	}
	return wordCount
}

// cleanDocument returns text with every match of the package-level cleaner
// pattern removed (each match is replaced by the empty string).
func cleanDocument(text string) string {
	stripped := cleaner.ReplaceAllString(text, "")
	return stripped
}

// stem returns the English Snowball stem of word. Stemming is best effort:
// if the stemmer reports an error, the word is returned unchanged.
func stem(word string) string {
	result, err := snowball.Stem(word, "english", true)
	if err != nil {
		// Fall back to the raw word rather than failing the classification.
		return word
	}
	return result
}
Binary file added common/errorpageclassifier/clf.gob
Binary file not shown.
105 changes: 105 additions & 0 deletions common/errorpageclassifier/cm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package errorpageclassifier

// import "fmt"

// type ConfusionMatrix struct {
// matrix [][]int
// labels []string
// }

// func NewConfusionMatrix(actual, predicted []string, labels []string) *ConfusionMatrix {
// n := len(labels)
// matrix := make([][]int, n)
// for i := range matrix {
// matrix[i] = make([]int, n)
// }

// labelIndices := make(map[string]int)
// for i, label := range labels {
// labelIndices[label] = i
// }

// for i := range actual {
// matrix[labelIndices[actual[i]]][labelIndices[predicted[i]]]++
// }

// return &ConfusionMatrix{
// matrix: matrix,
// labels: labels,
// }
// }

// func (cm *ConfusionMatrix) PrintConfusionMatrix() {
// fmt.Printf("%30s\n", "Confusion Matrix")
// fmt.Println()
// // Print header
// fmt.Printf("%-15s", "")
// for _, label := range cm.labels {
// fmt.Printf("%-15s", label)
// }
// fmt.Println()

// // Print rows
// for i, row := range cm.matrix {
// fmt.Printf("%-15s", cm.labels[i])
// for _, value := range row {
// fmt.Printf("%-15d", value)
// }
// fmt.Println()
// }
// fmt.Println()
// }

// func (cm *ConfusionMatrix) PrintClassificationReport() {
// fmt.Printf("%30s\n", "Classification Report")
// fmt.Println()

// fmt.Printf("\n%-15s %-10s %-10s %-10s %-10s\n", "", "precision", "recall", "f1-score", "support")

// totals := map[string]float64{"true": 0, "predicted": 0, "correct": 0}
// macroAvg := map[string]float64{"precision": 0, "recall": 0, "f1-score": 0}

// for i, label := range cm.labels {
// truePos := cm.matrix[i][i]
// falsePos, falseNeg := 0, 0
// for j := 0; j < len(cm.labels); j++ {
// if i != j {
// falsePos += cm.matrix[j][i]
// falseNeg += cm.matrix[i][j]
// }
// }

// precision := float64(truePos) / float64(truePos+falsePos)
// recall := float64(truePos) / float64(truePos+falseNeg)
// f1Score := 2 * precision * recall / (precision + recall)
// support := truePos + falseNeg

// fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", label, precision, recall, f1Score, support)

// totals["true"] += float64(support)
// totals["predicted"] += float64(truePos + falsePos)
// totals["correct"] += float64(truePos)

// macroAvg["precision"] += precision
// macroAvg["recall"] += recall
// macroAvg["f1-score"] += f1Score
// }

// accuracy := totals["correct"] / totals["true"]
// fmt.Printf("\n%-26s %-10s %-10.2f %-10d", "accuracy", "", accuracy, int(totals["true"]))

// fmt.Printf("\n%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "macro avg",
// macroAvg["precision"]/float64(len(cm.labels)),
// macroAvg["recall"]/float64(len(cm.labels)),
// macroAvg["f1-score"]/float64(len(cm.labels)),
// int(totals["true"]))

// precisionWeightedAvg := totals["correct"] / totals["predicted"]
// recallWeightedAvg := totals["correct"] / totals["true"]
// f1ScoreWeightedAvg := 2 * precisionWeightedAvg * recallWeightedAvg / (precisionWeightedAvg + recallWeightedAvg)

// fmt.Printf("%-15s %-10.2f %-10.2f %-10.2f %-10d\n", "weighted avg",
// precisionWeightedAvg, recallWeightedAvg, f1ScoreWeightedAvg, int(totals["true"]))

// fmt.Println()
// }