Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
.moon/cache
.eslintcache
.compiled
.pnpm-store
.rendered
.test
node_modules
Expand Down
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ This repo is a pnpm workspace and uses Moon as the task runner.

- Repo tasks are run via Moon (not `package.json` scripts). The only root script is `prepare` (Husky).
- `pnpm` is for dependency management only. Never use it as a prefix to run another Node binary (no `pnpm exec`, no `pnpm <tool>`, etc.). The only exception is inside Husky hooks (for example `.husky/pre-commit`).
- Do not bypass pnpm's `minimumReleaseAge` policy. When adding or updating dependencies, choose versions that satisfy the policy in `pnpm-workspace.yaml`.
- Bun is BANNED in this repository. NEVER use it (use `tsx` for running TypeScript scripts).

## Commits and PR titles
Expand Down
26 changes: 26 additions & 0 deletions packages/canispam/THIRD_PARTY_NOTICES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Third-Party Notices

## Spamscanner Classifier Data

`src/classifier/spamscanner-classifier.json` is derived from the `classifier.json`
file published in `spamscanner@6.1.5`.

- Source: https://github.com/spamscanner/spamscanner
- Package: https://www.npmjs.com/package/spamscanner/v/6.1.5
- Copyright: Niftylettuce, LLC
- Original license: Business Source License 1.1
- Change date: 2025-04-20
- Change license: Apache License, Version 2.0

The original BSL 1.1 license text is available at
https://github.com/spamscanner/spamscanner/blob/master/LICENSE.

Relevant upstream license parameters:

```text
Licensor: Niftylettuce, LLC.
Licensed Work: Spam Scanner
The Licensed Work is (c) 2020 Niftylettuce, LLC.
Change Date: 2025-04-20
Change License: Apache License, Version 2.0
```
17 changes: 17 additions & 0 deletions packages/canispam/moon.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# https://moonrepo.dev/docs/config/tasks
$schema: 'https://moonrepo.dev/schemas/project.json'

# Moon task definitions for this project.
tasks:
# Runs the test suite with Vitest, using the project's vitest.config.ts.
test:
command: vitest --config vitest.config.ts
# The `build` task of this same project (`~` scope) must run first.
deps:
- ~:build
# Files and directories declared as inputs of the task.
inputs:
- src
- test
- package.json
# Force coloured output from the test runner.
env:
FORCE_COLOR: '1'
options:
# Do not cache results of this task.
cache: false
# Stream the command's output as it is produced.
outputStyle: 'stream'
47 changes: 47 additions & 0 deletions packages/canispam/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"name": "canispam",
"version": "0.0.0",
"publishConfig": {
"access": "public"
},
"description": "Browser-safe spam analysis for generated email messages",
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/shellscape/jsx-email.git",
"directory": "packages/canispam"
},
"author": "Andrew Powell <andrew@shellscape.org>",
"homepage": "https://jsx.email/",
"type": "module",
"exports": {
"./package.json": "./package.json",
".": {
"types": "./dist/index.d.ts",
"default": "./dist/index.js"
}
},
"engines": {
"node": ">=22.0.0"
},
"files": [
"dist/**",
"THIRD_PARTY_NOTICES.md"
],
"keywords": [
"browser",
"email",
"eml",
"jsx-email",
"spam"
],
"types": "./dist/index.d.ts",
"funding": {
"type": "github",
"url": "https://github.com/sponsors/shellscape"
},
"sideEffects": false,
"dependencies": {
"@ladjs/naivebayes": "^0.1.0"
}
}
20 changes: 20 additions & 0 deletions packages/canispam/src/classifier.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
 * Ambient type declarations for the `@ladjs/naivebayes` package.
 * Only the members declared below are typed here.
 * NOTE(review): presumably the package ships no typings of its own — confirm.
 */
declare module '@ladjs/naivebayes' {
/** A category label paired with the numeric probability value reported for it. */
export interface NaiveBayesProbability {
category: string;
probability: number;
}

/** Instance surface of a classifier, as returned by `NaiveBayesConstructor.fromJson`. */
export interface NaiveBayesClassifier {
/** Overload: with `probability` omitted or `false`, the result is typed as a category string. */
categorize(text: string, probability?: false): string;
/** Overload: with `probability` set to `true`, the result is typed as a category/probability pair. */
categorize(text: string, probability: true): NaiveBayesProbability;
/**
 * Typed as returning an array of category/probability pairs for `text`.
 * NOTE(review): ordering is not expressed by this type — confirm against the
 * library before relying on the element at index 0.
 */
probabilities(text: string): NaiveBayesProbability[];
/** Assignable function that turns input text into a list of tokens. */
tokenizer: (text: string) => string[];
}

/** Static surface of the default export. */
export interface NaiveBayesConstructor {
/**
 * Creates a classifier from previously serialized state.
 * NOTE(review): the meaning of `limit` is not visible here — check the library docs.
 */
fromJson(json: unknown, limit?: number): NaiveBayesClassifier;
}

const NaiveBayes: NaiveBayesConstructor;
export default NaiveBayes;
}
37 changes: 37 additions & 0 deletions packages/canispam/src/classifier/classifier.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import NaiveBayes from '@ladjs/naivebayes';

import classifierData from './spamscanner-classifier.json' with { type: 'json' };
import { getClassifierTokens } from './tokenize.js';
import type { CanispamClassifierResult, CanispamFinding, ParsedEml } from '../types.js';

/** Tokenizer installed on the classifier: splits on runs of whitespace and drops empty pieces. */
const splitOnWhitespace = (text: string): string[] =>
  text.split(/\s+/).filter((piece) => piece.length > 0);

/** Naive Bayes model restored from the bundled classifier JSON. */
const classifier = NaiveBayes.fromJson(classifierData);
classifier.tokenizer = splitOnWhitespace;

/** Score attached to the finding emitted when the classifier predicts spam. */
const classifierScore = 6;

/**
 * Classifies a parsed message with the Naive Bayes model.
 *
 * The message is reduced to classifier tokens, the tokens are joined with
 * single spaces and handed to the model, and the category of the first
 * returned probability entry is taken as the prediction.
 *
 * @param parsed - Parsed message passed through to `getClassifierTokens`.
 * @returns The predicted category (`'ham'` when the first entry is missing or
 * has an empty category), whether that category is `'spam'`, the probabilities
 * returned by the model, and the number of tokens fed to it.
 */
export const classifyEmail = async (parsed: ParsedEml): Promise<CanispamClassifierResult> => {
  const tokens = await getClassifierTokens(parsed);
  const probabilities = classifier.probabilities(tokens.join(' '));

  const top = probabilities[0];
  const category = top?.category || 'ham';
  const isSpam = category === 'spam';

  return { category, isSpam, probabilities, tokenCount: tokens.length };
};

/**
 * Converts a classifier result into findings.
 *
 * @param result - Result produced by `classifyEmail`.
 * @returns A single `naive-bayes` finding scored with `classifierScore` when
 * `result.isSpam` is set; otherwise an empty array.
 */
export const getClassifierFindings = (result: CanispamClassifierResult): CanispamFinding[] => {
  if (result.isSpam) {
    const { category, tokenCount } = result;
    const finding: CanispamFinding = {
      evidence: `category=${category}; tokens=${tokenCount}`,
      message: 'Naive Bayes classifier predicts spam.',
      rule: 'naive-bayes',
      score: classifierScore
    };
    return [finding];
  }

  return [];
};

Large diffs are not rendered by default.

64 changes: 64 additions & 0 deletions packages/canispam/src/classifier/tokenize.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import { stripHtml } from '../html.js';

/**
 * Delimiter pattern used to split text into tokens: matches one or more
 * consecutive characters that are NOT in the allowed token alphabet (the
 * listed letters and letter ranges, digits, and the hyphen). Matching is
 * case-insensitive.
 */
const genericTokenizer =
/[^a-zá-úÁ-Úà-úÀ-Úñü\dа-яёæøåàáảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđäöëïîûœçążśźęćńł-]+/i;
/** Shared encoder that turns token strings into UTF-8 bytes before hashing. */
const encoder = new TextEncoder();

/**
 * Lower-cased English stopwords that are dropped from the token stream
 * before tokens are hashed.
 * NOTE(review): presumably mirrors the stopword list used upstream by
 * Spam Scanner — confirm before editing.
 */
// prettier-ignore
const spamScannerStopwords = new Set<string>([
  'a', 'an', 'and', 'are', 'as', 'at',
  'be', 'by', 'for', 'from', 'has', 'in',
  'is', 'it', 'of', 'on', 'or', 'that',
  'the', 'to', 'was', 'were', 'will', 'with'
]);

/**
 * Renders a buffer as a lower-case hexadecimal string, two characters per byte.
 *
 * @param bytes - Raw bytes to render.
 * @returns The hex string; empty when the buffer is empty.
 */
const toHex = (bytes: ArrayBuffer) => {
  let hex = '';
  for (const byte of new Uint8Array(bytes)) {
    // Pad so values below 0x10 still occupy two characters.
    hex += byte.toString(16).padStart(2, '0');
  }
  return hex;
};

/**
 * Hashes a token with SHA-256 via Web Crypto.
 *
 * @param token - Token text; encoded with the shared `TextEncoder` before hashing.
 * @returns The first 16 hexadecimal characters of the digest.
 */
const hashToken = async (token: string) => {
  const encoded = encoder.encode(token);
  const digest = await globalThis.crypto.subtle.digest('SHA-256', encoded);
  const hex = toHex(digest);
  return hex.slice(0, 16);
};

/**
 * Replaces dates, http(s) URLs, e-mail addresses and dotted-quad IP addresses
 * with space-padded placeholder words.
 *
 * Substitutions are applied in the listed order, each one over the output of
 * the previous one.
 *
 * @param value - Raw text.
 * @returns The text with every match replaced by its placeholder.
 */
const preprocessText = (value: string) => {
  const substitutions: ReadonlyArray<readonly [RegExp, string]> = [
    [/\b(?:\d{1,2}[/-]){2}\d{2,4}\b/g, ' DATE_PATTERN '],
    [/\bhttps?:\/\/[^\s"'<>]+/gi, ' URL_LINK '],
    [/\b[\w.+-]+@[\w.-]+\.[a-z]{2,}\b/gi, ' EMAIL_ADDRESS '],
    [/\b(?:\d{1,3}\.){3}\d{1,3}\b/g, ' IP_ADDRESS ']
  ];

  // Every pattern is global, so `replace` substitutes all occurrences.
  return substitutions.reduce(
    (current, [pattern, placeholder]) => current.replace(pattern, placeholder),
    value
  );
};

/**
 * Builds the hashed token list that is fed to the classifier.
 *
 * The plain text, the HTML with markup stripped, and the subject are joined
 * (in that order), placeholders are substituted by `preprocessText`, and the
 * result is NFKC-normalized and split on `genericTokenizer`. Each piece is
 * lower-cased and trimmed; empty pieces, pieces longer than 50 characters and
 * stopwords are dropped. At most the first 2000 surviving tokens are hashed.
 *
 * @param params - The message's `html`, `subject` and `text`.
 * @returns A promise of the hashed tokens, in document order.
 */
export const getClassifierTokens = async (params: {
  html: string;
  subject: string;
  text: string;
}) => {
  const maxTokens = 2000;
  const maxTokenLength = 50;

  const combined = `${params.text} ${stripHtml(params.html)} ${params.subject}`;
  const pieces = preprocessText(combined).normalize('NFKC').split(genericTokenizer);

  const kept: string[] = [];
  for (const piece of pieces) {
    if (kept.length === maxTokens) break;

    const token = piece.toLowerCase().trim();
    if (token.length === 0 || token.length > maxTokenLength) continue;
    if (spamScannerStopwords.has(token)) continue;

    kept.push(token);
  }

  return Promise.all(kept.map(hashToken));
};
75 changes: 75 additions & 0 deletions packages/canispam/src/content-rules.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { stripHtml } from './html.js';
import type { CanispamFinding } from './types.js';

/**
 * Keyword/score pairs, in the order findings are reported.
 * Keywords are lower-case because they are matched against lower-cased content.
 */
const keywordScores: ReadonlyArray<readonly [string, number]> = [
  ['free', 1],
  ['winner', 2],
  ['prize', 2],
  ['lottery', 3],
  ['act now', 2],
  ['click here', 1],
  ['verify your account', 2],
  ['suspended', 2],
  ['wire transfer', 3],
  ['western union', 3],
  ['viagra', 3],
  ['cialis', 3],
  ['bitcoin', 1],
  ['guaranteed', 1],
  ['risk free', 2]
];

/** Lookup from spam keyword to the score added when the keyword is present. */
const spamKeywords = new Map(keywordScores);

/**
 * Case-insensitive phrase patterns; each one that matches the message content
 * produces one `spam-phrase` finding.
 */
const phrasePatterns: readonly RegExp[] = [
  // "congratulations" followed later on the same line by "won"
  /\bcongratulations.*won\b/i,
  // account / identity prompts
  /\bconfirm your identity\b/i,
  /\bupdate your account\b/i,
  // sales and investment pitches
  /\blimited time offer\b/i,
  /\binvestment opportunity\b/i
];

/**
 * Applies the content heuristics to a message body.
 *
 * Findings are returned in this order:
 * 1. `gtube` — the GTUBE test string appears in the raw text or raw HTML.
 * 2. `spam-keyword` — one per entry of `spamKeywords` that occurs as a
 *    substring of the lower-cased text plus stripped HTML.
 * 3. `spam-phrase` — one per entry of `phrasePatterns` that matches that same
 *    lower-cased content.
 * 4. `missing-plain-text` — HTML is present but the text part is blank.
 *
 * @param text - Plain-text part of the message.
 * @param html - HTML part of the message.
 * @returns All findings; empty when nothing matched.
 */
export const scanContent = (text: string, html: string): CanispamFinding[] => {
  const gtubeSignature = 'XJS*C4JDBQADN1.NSBN3*2IDNEN*GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X';

  const searchable = `${text} ${stripHtml(html)}`.toLowerCase();
  // GTUBE is case-sensitive and is looked up in the unmodified input.
  const unmodified = `${text} ${html}`;

  const gtubeFindings: CanispamFinding[] = unmodified.includes(gtubeSignature)
    ? [{ message: 'GTUBE spam test pattern detected.', rule: 'gtube', score: 10 }]
    : [];

  const keywordFindings = [...spamKeywords]
    .filter(([keyword]) => searchable.includes(keyword))
    .map(
      ([keyword, score]): CanispamFinding => ({
        evidence: keyword,
        message: `Spam keyword detected: ${keyword}.`,
        rule: 'spam-keyword',
        score
      })
    );

  const phraseFindings = phrasePatterns.flatMap((pattern): CanispamFinding[] => {
    const hit = searchable.match(pattern);
    if (!hit) return [];
    return [
      { evidence: hit[0], message: 'Spam phrase pattern detected.', rule: 'spam-phrase', score: 2 }
    ];
  });

  const plainTextFindings: CanispamFinding[] =
    html && !text.trim()
      ? [{ message: 'HTML email is missing a plain text part.', rule: 'missing-plain-text', score: 1 }]
      : [];

  return [...gtubeFindings, ...keywordFindings, ...phraseFindings, ...plainTextFindings];
};
Loading
Loading