From 9beda5e76b42b8218d6b0149b76c8ef5b5297bcd Mon Sep 17 00:00:00 2001
From: Harald Schilly
Date: Fri, 28 Jun 2024 18:23:21 +0200
Subject: [PATCH 1/2] llm/abuse: change monitoring, rewrite requirements

---
 src/packages/server/llm/abuse.ts | 53 ++++++++++----------------
 1 file changed, 17 insertions(+), 36 deletions(-)

diff --git a/src/packages/server/llm/abuse.ts b/src/packages/server/llm/abuse.ts
index aa5a91847c..49ad96acdf 100644
--- a/src/packages/server/llm/abuse.ts
+++ b/src/packages/server/llm/abuse.ts
@@ -1,28 +1,17 @@
 /*
-We initially just implement some very simple rate limitations to prevent very
-blatant abuse.
-
-- at most $10^5$ tokens per signed in user per hour \(that's \$0.20\); that allows for major usage...
-  but if somebody tried to do something really abusive, it would stop it. Nobody
-  would hit this in practice unless they are really trying to abuse cocalc...
-  WRONG: it's very easy to hit this due to large inputs, e.g., analyzing a paper.
-- at most $10^6$ tokens per hour across all users \-\- that's \$2/hour. That would
-  come out to a bit more if sustained than my budget, but allows for bursts.
-
-See https://help.openai.com/en/articles/7039783-chatgpt-api-faq for the upstream rate limits,
-where they limit per minute, not per hour (like below):
-
-    What's the rate limits for the ChatGPT API?
-
-    Free trial users: 20 RPM 40000 TPM
-    Pay-as-you-go users (first 48 hours): 60 RPM 60000 TPM
-    Pay-as-you-go users (after 48 hours): 3500 RPM 90000 TPM
-
-    RPM = requests per minute
-    TPM = tokens per minute
+This is a basic rate limitation for free and metered usage of LLMs.
+- any call must be identified by an account (we had by a token, but it got abused)
+- There is a distinction between "cocalc.com" and "on-prem":
+  - cocalc.com has some models (the more expensive ones) which are metered per token and some which are free
+  - on-prem: there is only rate limiting, no metered usage
+- quotas are adjustable
+- at its core, this should limit each individual user's free usage and cap the overall usage
+- monitoring as necessary, to give feedback for tweaking the parameters
 */

-import { newCounter, newHistogram } from "@cocalc/backend/metrics";
+import { isObject } from "lodash";
+
+import { newCounter, newGauge } from "@cocalc/backend/metrics";
 import { process_env_int } from "@cocalc/backend/misc";
 import getPool, { CacheTime } from "@cocalc/database/pool";
 import { getServerSettings } from "@cocalc/database/settings";
@@ -41,7 +30,6 @@ import {
 } from "@cocalc/util/db-schema/llm-utils";
 import { KUCALC_COCALC_COM } from "@cocalc/util/db-schema/site-defaults";
 import { isValidUUID } from "@cocalc/util/misc";
-import { isObject } from "lodash";

 // These are tokens over a given period of time – summed by account/analytics_cookie or global.
 const QUOTAS = {
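Note on the requirements in the new header comment: they boil down to a two-level token budget, checked per account and globally over a trailing window. A minimal sketch of that control flow, reusing this file's own QUOTAS and recentUsage (the function name, simplified signature, and error wording here are illustrative assumptions, not the exact implementation):

    // Sketch: enforce the two-level quota described in the header comment.
    async function assertWithinQuotas(account_id: string): Promise<void> {
      // Tokens this account used recently (cached DB query; cache choice is illustrative).
      const usage = await recentUsage({ cache: "long", period: "1 hour", account_id });
      if (usage > QUOTAS.account) {
        throw new Error("You have used too many tokens recently. Please try again later.");
      }
      // Tokens used by all users combined, capping overall spend.
      const overall = await recentUsage({ cache: "long", period: "1 hour" });
      if (overall > QUOTAS.global) {
        throw new Error("Global usage limit reached. Please try again later.");
      }
    }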
@@ -50,18 +38,11 @@ const QUOTAS = {
   global: process_env_int("COCALC_LLM_QUOTA_GLOBAL", 10 ** 6),
 } as const;

-const prom_quotas = newHistogram(
+const prom_quotas = newGauge(
   "llm",
-  "abuse_usage",
-  "Language model abuse usage",
-  {
-    buckets:
-      // 10 buckets evenly spaced from 0 to QUOTAS.global
-      Array.from({ length: 10 }, (_, i) =>
-        Math.floor((i * QUOTAS.global) / 10),
-      ),
-    labels: ["usage"],
-  },
+  "abuse_usage_pct",
+  "Language model abuse, 0 to 100 percent of limit",
+  ["quota"],
 );

 const prom_rejected = newCounter(
   "llm",
   "abuse_rejected_total",
@@ -122,7 +103,7 @@ export async function checkForAbuse({
     analytics_cookie,
   });

-  prom_quotas.labels("recent").observe(usage);
+  prom_quotas.labels("account").set(100 * (usage / QUOTAS.account));
   // console.log("usage = ", usage);

   if (account_id) {
@@ -146,7 +127,7 @@ export async function checkForAbuse({
   // Prevent more sophisticated abuse, e.g., changing analytics_cookie or account frequently,
   // or just a general huge surge in usage.
   const overallUsage = await recentUsage({ cache: "long", period: "1 hour" });
-  prom_quotas.labels("global").observe(overallUsage);
+  prom_quotas.labels("global").set(100 * (overallUsage / QUOTAS.global));
   // console.log("overallUsage = ", usage);
   if (overallUsage > QUOTAS.global) {
     prom_rejected.labels("global").inc();

From 51e77b98c99238ba2599813d98cd081d5c5ea488 Mon Sep 17 00:00:00 2001
From: Harald Schilly
Date: Tue, 9 Jul 2024 11:57:35 +0200
Subject: [PATCH 2/2] server/llm/abuse: use existing isValidAccount with
 caching and explicit histogram buckets for accounts

---
 src/packages/server/llm/abuse.ts | 34 +++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/packages/server/llm/abuse.ts b/src/packages/server/llm/abuse.ts
index 49ad96acdf..e75192547b 100644
--- a/src/packages/server/llm/abuse.ts
+++ b/src/packages/server/llm/abuse.ts
@@ -1,6 +1,6 @@
 /*
 This is a basic rate limitation for free and metered usage of LLMs.
-- any call must be identified by an account (we had by a token, but it got abused)
+- any call must be identified by an account (we used to allow identification by just a cookie ID, but that got abused, hence noAccount=0)
 - There is a distinction between "cocalc.com" and "on-prem":
   - cocalc.com has some models (the more expensive ones) which are metered per token and some which are free
   - on-prem: there is only rate limiting, no metered usage
 - quotas are adjustable
 - at its core, this should limit each individual user's free usage and cap the overall usage
 - monitoring as necessary, to give feedback for tweaking the parameters
 */

 import { isObject } from "lodash";

-import { newCounter, newGauge } from "@cocalc/backend/metrics";
+import { newCounter, newGauge, newHistogram } from "@cocalc/backend/metrics";
 import { process_env_int } from "@cocalc/backend/misc";
 import getPool, { CacheTime } from "@cocalc/database/pool";
 import { getServerSettings } from "@cocalc/database/settings";
@@ -30,6 +30,7 @@ import {
 } from "@cocalc/util/db-schema/llm-utils";
 import { KUCALC_COCALC_COM } from "@cocalc/util/db-schema/site-defaults";
 import { isValidUUID } from "@cocalc/util/misc";
+import isValidAccount from "../accounts/is-valid-account";

 // These are tokens over a given period of time – summed by account/analytics_cookie or global.
 const QUOTAS = {
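Note on the gauge introduced by the first patch and refined below: reporting usage as a percentage of the quota keeps the metric readable even when the configured limits change. A standalone sketch of the same pattern; that @cocalc/backend/metrics wraps prom-client is an assumption here, so this uses the prom-client package directly with hypothetical names:

    import { Gauge } from "prom-client";

    // Sketch: one gauge, labeled per quota, set to percent-of-limit.
    const quotaPct = new Gauge({
      name: "llm_abuse_usage_pct",
      help: "Language model usage, 0 to 100 percent of limit",
      labelNames: ["quota"],
    });

    // After summing recent token usage, record it relative to the configured quota:
    const usage = 250_000; // hypothetical tokens used in the last hour
    quotaPct.labels("global").set(Math.round(100 * (usage / 10 ** 6)));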
@@ -38,13 +39,20 @@ const QUOTAS = {
   global: process_env_int("COCALC_LLM_QUOTA_GLOBAL", 10 ** 6),
 } as const;

-const prom_quotas = newGauge(
+const prom_quota_global = newGauge(
   "llm",
-  "abuse_usage_pct",
-  "Language model abuse, 0 to 100 percent of limit",
+  "abuse_usage_global_pct",
+  "Language model abuse, global usage as 0 to 100 percent of the limit, rounded",
   ["quota"],
 );

+const prom_quota_per_account = newHistogram(
+  "llm",
+  "abuse_usage_account_pct",
+  "Language model usage per account, to see if users reach certain usage thresholds.",
+  { buckets: [25, 50, 75, 100, 110] },
+);
+
 const prom_rejected = newCounter(
   "llm",
   "abuse_rejected_total",
@@ -85,7 +93,6 @@ export async function checkForAbuse({
     (await getServerSettings()).kucalc === KUCALC_COCALC_COM;

   if (!isFreeModel(model, is_cocalc_com)) {
-    // we exclude Ollama (string), because it is free.
     const service = model2service(model) as LanguageServiceCore;
     // This is a for-pay product, so let's make sure user can purchase it.
     await assertPurchaseAllowed({ account_id, service });
@@ -103,7 +110,9 @@ export async function checkForAbuse({
     analytics_cookie,
   });

-  prom_quotas.labels("account").set(100 * (usage / QUOTAS.account));
+  // This fluctuates for each account; we tally up how often users end up in certain usage buckets,
+  // which is more explicit than a gauge.
+  prom_quota_per_account.observe(100 * (usage / QUOTAS.account));
   // console.log("usage = ", usage);

   if (account_id) {
@@ -127,8 +136,9 @@ export async function checkForAbuse({
   // Prevent more sophisticated abuse, e.g., changing analytics_cookie or account frequently,
   // or just a general huge surge in usage.
   const overallUsage = await recentUsage({ cache: "long", period: "1 hour" });
-  prom_quotas.labels("global").set(100 * (overallUsage / QUOTAS.global));
-  // console.log("overallUsage = ", usage);
+  prom_quota_global
+    .labels("global")
+    .set(Math.round(100 * (overallUsage / QUOTAS.global)));
   if (overallUsage > QUOTAS.global) {
     prom_rejected.labels("global").inc();
     throw new Error(
@@ -156,11 +166,7 @@
   const pool = getPool(cache);
   let query, args;
   if (account_id) {
-    const { rows } = await pool.query(
-      "SELECT COUNT(*) FROM accounts WHERE account_id=$1",
-      [account_id],
-    );
-    if (rows.length == 0) {
+    if (!(await isValidAccount(account_id))) {
       throw Error(`invalid account_id ${account_id}`);
     }
     query = `SELECT SUM(total_tokens) AS usage FROM openai_chatgpt_log WHERE account_id=$1 AND time >= NOW() - INTERVAL '${period}'`;
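A closing note on the explicit buckets [25, 50, 75, 100, 110]: Prometheus histogram buckets are cumulative, each counting observations less than or equal to its bound, so observing each request's percent-of-quota directly answers how many requests happened while an account was under 25%, 50%, 75%, at, or past its limit (110 catches overshoot beyond the quota). A standalone sketch, again under the assumption of a prom-client backend:

    import { Histogram } from "prom-client";

    // Sketch: tally per-account usage (percent of quota) into fixed buckets.
    const accountPct = new Histogram({
      name: "llm_abuse_usage_account_pct",
      help: "Language model usage per account, percent of quota",
      buckets: [25, 50, 75, 100, 110],
    });

    // A request made at 80% of quota increments the cumulative le=100, le=110 and +Inf buckets.
    accountPct.observe(80);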