From 9beda5e76b42b8218d6b0149b76c8ef5b5297bcd Mon Sep 17 00:00:00 2001
From: Harald Schilly
Date: Fri, 28 Jun 2024 18:23:21 +0200
Subject: [PATCH 1/2] llm/abuse: change monitoring, rewrite requirements

---
 src/packages/server/llm/abuse.ts | 53 ++++++++++----------------
 1 file changed, 17 insertions(+), 36 deletions(-)

diff --git a/src/packages/server/llm/abuse.ts b/src/packages/server/llm/abuse.ts
index aa5a91847c..49ad96acdf 100644
--- a/src/packages/server/llm/abuse.ts
+++ b/src/packages/server/llm/abuse.ts
@@ -1,28 +1,17 @@
 /*
-We initially just implement some very simple rate limitations to prevent very
-blatant abuse.
-
-- at most $10^5$ tokens per signed in user per hour \(that's \$0.20\); that allows for major usage...
-  but if somebody tried to do something really abusive, it would stop it. Nobody
-  would hit this in practice unless they are really trying to abuse cocalc...
-  WRONG: it's very easy to hit this due to large inputs, e.g., analyzing a paper.
-- at most $10^6$ tokens per hour across all users \-\- that's \$2/hour. That would
-  come out to a bit more if sustained than my budget, but allows for bursts.
-
-See https://help.openai.com/en/articles/7039783-chatgpt-api-faq for the upstream rate limits,
-where they limit per minute, not per hour (like below):
-
-    What's the rate limits for the ChatGPT API?
-
-    Free trial users: 20 RPM 40000 TPM
-    Pay-as-you-go users (first 48 hours): 60 RPM 60000 TPM
-    Pay-as-you-go users (after 48 hours): 3500 RPM 90000 TPM
-
-    RPM = requests per minute
-    TPM = tokens per minute
+This is a basic rate limitation for free and metered usage of LLMs.
+- any call must be identified by an account (we had by a token, but it got abused)
+- There is a distinction between "cocalc.com" and "on-prem":
+  - cocalc.com has some models (the more expensive ones) which are metered per token and some which are free
+  - on-prem: there is only rate limiting, no metered usage
+- quotas are adjustable
+- at its core, this should limit each individual user's free usage and cap the overall usage
+- monitoring as necessary, to give feedback for tweaking the parameters
 */

-import { newCounter, newHistogram } from "@cocalc/backend/metrics";
+import { isObject } from "lodash";
+
+import { newCounter, newGauge } from "@cocalc/backend/metrics";
 import { process_env_int } from "@cocalc/backend/misc";
 import getPool, { CacheTime } from "@cocalc/database/pool";
 import { getServerSettings } from "@cocalc/database/settings";
@@ -41,7 +30,6 @@ import {
 } from "@cocalc/util/db-schema/llm-utils";
 import { KUCALC_COCALC_COM } from "@cocalc/util/db-schema/site-defaults";
 import { isValidUUID } from "@cocalc/util/misc";
-import { isObject } from "lodash";

 // These are tokens over a given period of time – summed by account/analytics_cookie or global.
 const QUOTAS = {
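Note on the requirements in the new header comment: they boil down to a two-level token budget, checked per account and globally over a trailing window. A minimal sketch of that control flow, reusing this file's own QUOTAS and recentUsage (the function name, simplified signature, and error wording here are illustrative assumptions, not the exact implementation):

    // Sketch: enforce the two-level quota described in the header comment.
    async function assertWithinQuotas(account_id: string): Promise<void> {
      // Tokens this account used recently (cached DB query; cache choice is illustrative).
      const usage = await recentUsage({ cache: "long", period: "1 hour", account_id });
      if (usage > QUOTAS.account) {
        throw new Error("You have used too many tokens recently. Please try again later.");
      }
      // Tokens used by all users combined, capping overall spend.
      const overall = await recentUsage({ cache: "long", period: "1 hour" });
      if (overall > QUOTAS.global) {
        throw new Error("Global usage limit reached. Please try again later.");
      }
    }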
@@ -50,18 +38,11 @@ const QUOTAS = {
   global: process_env_int("COCALC_LLM_QUOTA_GLOBAL", 10 ** 6),
 } as const;

-const prom_quotas = newHistogram(
+const prom_quotas = newGauge(
   "llm",
-  "abuse_usage",
-  "Language model abuse usage",
-  {
-    buckets:
-      // 10 buckets evenly spaced from 0 to QUOTAS.global
-      Array.from({ length: 10 }, (_, i) =>
-        Math.floor((i * QUOTAS.global) / 10),
-      ),
-    labels: ["usage"],
-  },
+  "abuse_usage_pct",
+  "Language model abuse, 0 to 100 percent of limit",
+  ["quota"],
 );

 const prom_rejected = newCounter(
   "llm",
   "abuse_rejected_total",
@@ -122,7 +103,7 @@ export async function checkForAbuse({
     analytics_cookie,
   });

-  prom_quotas.labels("recent").observe(usage);
+  prom_quotas.labels("account").set(100 * (usage / QUOTAS.account));
   // console.log("usage = ", usage);

   if (account_id) {
@@ -146,7 +127,7 @@ export async function checkForAbuse({
   // Prevent more sophisticated abuse, e.g., changing analytics_cookie or account frequently,
   // or just a general huge surge in usage.
   const overallUsage = await recentUsage({ cache: "long", period: "1 hour" });
-  prom_quotas.labels("global").observe(overallUsage);
+  prom_quotas.labels("global").set(100 * (overallUsage / QUOTAS.global));
   // console.log("overallUsage = ", usage);
   if (overallUsage > QUOTAS.global) {
     prom_rejected.labels("global").inc();

From 51e77b98c99238ba2599813d98cd081d5c5ea488 Mon Sep 17 00:00:00 2001
From: Harald Schilly
Date: Tue, 9 Jul 2024 11:57:35 +0200
Subject: [PATCH 2/2] server/llm/abuse: use existing isValidAccount with
 caching and explicit histogram buckets for accounts

---
 src/packages/server/llm/abuse.ts | 34 +++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/packages/server/llm/abuse.ts b/src/packages/server/llm/abuse.ts
index 49ad96acdf..e75192547b 100644
--- a/src/packages/server/llm/abuse.ts
+++ b/src/packages/server/llm/abuse.ts
@@ -1,6 +1,6 @@
 /*
 This is a basic rate limitation for free and metered usage of LLMs.
-- any call must be identified by an account (we had by a token, but it got abused)
+- any call must be identified by an account (we used to allow identification by just a cookie ID, but that got abused, hence noAccount=0)
 - There is a distinction between "cocalc.com" and "on-prem":
   - cocalc.com has some models (the more expensive ones) which are metered per token and some which are free
   - on-prem: there is only rate limiting, no metered usage
 - quotas are adjustable
 - at its core, this should limit each individual user's free usage and cap the overall usage
 - monitoring as necessary, to give feedback for tweaking the parameters
 */

 import { isObject } from "lodash";

-import { newCounter, newGauge } from "@cocalc/backend/metrics";
+import { newCounter, newGauge, newHistogram } from "@cocalc/backend/metrics";
 import { process_env_int } from "@cocalc/backend/misc";
 import getPool, { CacheTime } from "@cocalc/database/pool";
 import { getServerSettings } from "@cocalc/database/settings";
@@ -30,6 +30,7 @@ import {
 } from "@cocalc/util/db-schema/llm-utils";
 import { KUCALC_COCALC_COM } from "@cocalc/util/db-schema/site-defaults";
 import { isValidUUID } from "@cocalc/util/misc";
+import isValidAccount from "../accounts/is-valid-account";

 // These are tokens over a given period of time – summed by account/analytics_cookie or global.
 const QUOTAS = {
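Note on the gauge introduced by the first patch and refined below: reporting usage as a percentage of the quota keeps the metric readable even when the configured limits change. A standalone sketch of the same pattern; that @cocalc/backend/metrics wraps prom-client is an assumption here, so this uses the prom-client package directly with hypothetical names:

    import { Gauge } from "prom-client";

    // Sketch: one gauge, labeled per quota, set to percent-of-limit.
    const quotaPct = new Gauge({
      name: "llm_abuse_usage_pct",
      help: "Language model usage, 0 to 100 percent of limit",
      labelNames: ["quota"],
    });

    // After summing recent token usage, record it relative to the configured quota:
    const usage = 250_000; // hypothetical tokens used in the last hour
    quotaPct.labels("global").set(Math.round(100 * (usage / 10 ** 6)));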
@@ -38,13 +39,20 @@ const QUOTAS = {
   global: process_env_int("COCALC_LLM_QUOTA_GLOBAL", 10 ** 6),
 } as const;

-const prom_quotas = newGauge(
+const prom_quota_global = newGauge(
   "llm",
-  "abuse_usage_pct",
-  "Language model abuse, 0 to 100 percent of limit",
+  "abuse_usage_global_pct",
+  "Language model abuse, global usage as 0 to 100 percent of the limit, rounded",
   ["quota"],
 );

+const prom_quota_per_account = newHistogram(
+  "llm",
+  "abuse_usage_account_pct",
+  "Language model usage per account, to see if users reach certain usage thresholds.",
+  { buckets: [25, 50, 75, 100, 110] },
+);
+
 const prom_rejected = newCounter(
   "llm",
   "abuse_rejected_total",
@@ -85,7 +93,6 @@ export async function checkForAbuse({
     (await getServerSettings()).kucalc === KUCALC_COCALC_COM;

   if (!isFreeModel(model, is_cocalc_com)) {
-    // we exclude Ollama (string), because it is free.
     const service = model2service(model) as LanguageServiceCore;
     // This is a for-pay product, so let's make sure user can purchase it.
     await assertPurchaseAllowed({ account_id, service });
@@ -103,7 +110,9 @@ export async function checkForAbuse({
     analytics_cookie,
   });

-  prom_quotas.labels("account").set(100 * (usage / QUOTAS.account));
+  // This fluctuates for each account; we tally up how often users end up in certain usage buckets,
+  // which is more explicit than a gauge.
+  prom_quota_per_account.observe(100 * (usage / QUOTAS.account));
   // console.log("usage = ", usage);

   if (account_id) {
@@ -127,8 +136,9 @@ export async function checkForAbuse({
   // Prevent more sophisticated abuse, e.g., changing analytics_cookie or account frequently,
   // or just a general huge surge in usage.
   const overallUsage = await recentUsage({ cache: "long", period: "1 hour" });
-  prom_quotas.labels("global").set(100 * (overallUsage / QUOTAS.global));
-  // console.log("overallUsage = ", usage);
+  prom_quota_global
+    .labels("global")
+    .set(Math.round(100 * (overallUsage / QUOTAS.global)));
   if (overallUsage > QUOTAS.global) {
     prom_rejected.labels("global").inc();
     throw new Error(
@@ -156,11 +166,7 @@
   const pool = getPool(cache);
   let query, args;
   if (account_id) {
-    const { rows } = await pool.query(
-      "SELECT COUNT(*) FROM accounts WHERE account_id=$1",
-      [account_id],
-    );
-    if (rows.length == 0) {
+    if (!(await isValidAccount(account_id))) {
       throw Error(`invalid account_id ${account_id}`);
     }
     query = `SELECT SUM(total_tokens) AS usage FROM openai_chatgpt_log WHERE account_id=$1 AND time >= NOW() - INTERVAL '${period}'`;
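A closing note on the explicit buckets [25, 50, 75, 100, 110]: Prometheus histogram buckets are cumulative, each counting observations less than or equal to its bound, so observing each request's percent-of-quota directly answers how many requests happened while an account was under 25%, 50%, 75%, at, or past its limit (110 catches overshoot beyond the quota). A standalone sketch, again under the assumption of a prom-client backend:

    import { Histogram } from "prom-client";

    // Sketch: tally per-account usage (percent of quota) into fixed buckets.
    const accountPct = new Histogram({
      name: "llm_abuse_usage_account_pct",
      help: "Language model usage per account, percent of quota",
      buckets: [25, 50, 75, 100, 110],
    });

    // A request made at 80% of quota increments the cumulative le=100, le=110 and +Inf buckets.
    accountPct.observe(80);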