In [None]:
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "851d188e",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "851d188e",
        "outputId": "c88db24d-fc96-4949-fd89-2331b2274863"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Torch 2.3.1+cu121 CUDA True\n",
            "Triton 2.3.0\n",
            "Versions 2.0.1 4.43.3 0.34.2 0.11.1 2.20.0\n"
          ]
        }
      ],
      "source": [
        "# Environment bootstrap: reinstall a pinned training stack, then report what got loaded.\n",
        "import os,sys,subprocess\n",
        "# find_spec lives in importlib.util; import the submodule explicitly instead of relying on\n",
        "# some other module having loaded it as a side effect.\n",
        "import importlib,importlib.util\n",
        "\n",
        "GPU_TRAINING=True            # True: CUDA 12.1 torch wheel (+ triton); False: CPU torch wheel\n",
        "FORCE_DISABLE_TRITON=False   # True: skip the triton install and export the disable flags below\n",
        "TORCH_VERSION='2.3.1'\n",
        "TRITON_VERSION='2.3.0'\n",
        "UNINSTALL_UNUSED=True        # True: also uninstall the extra packages listed below\n",
        "\n",
        "def _pip(*args,check=True):\n",
        "    \"\"\"Run `python -m pip <args>` with the kernel's own interpreter.\n",
        "\n",
        "    check=True  -> raises subprocess.CalledProcessError on a non-zero exit status.\n",
        "    check=False -> best effort: output is discarded and a failing exit status is ignored.\n",
        "    \"\"\"\n",
        "    cmd=[sys.executable,'-m','pip',*args]\n",
        "    if check:\n",
        "        subprocess.check_call(cmd)\n",
        "    else:\n",
        "        subprocess.run(cmd,stdout=subprocess.DEVNULL,stderr=subprocess.DEVNULL)\n",
        "\n",
        "# Remove prior torch to avoid mix\n",
        "_pip('uninstall','-y','torch','torchvision','torchaudio','torchtext','triton',check=False)\n",
        "if UNINSTALL_UNUSED:\n",
        "    _pip('uninstall','-y','fastai','timm','sentence-transformers','bigframes','cudf-cu12','cuml-cu12','dask-cudf-cu12','torchtune','spacy','albumentations','albucore','xgboost',check=False)\n",
        "_pip('install','-q','--upgrade','pip')\n",
        "_pip('install','-q','numpy==2.0.1')\n",
        "if GPU_TRAINING:\n",
        "    _pip('install','-q',f'torch=={TORCH_VERSION}','--index-url','https://download.pytorch.org/whl/cu121')\n",
        "    if not FORCE_DISABLE_TRITON:\n",
        "        _pip('install','-q',f'triton=={TRITON_VERSION}')\n",
        "else:\n",
        "    _pip('install','-q',f'torch=={TORCH_VERSION}+cpu','-f','https://download.pytorch.org/whl/torch_stable.html')\n",
        "_pip('install','-q','fsspec==2024.5.0','gcsfs==2024.5.0','transformers==4.43.3','accelerate==0.34.2','datasets==2.20.0','trl==0.10.1','peft==0.11.1','bitsandbytes==0.44.0','safetensors==0.4.3','sentencepiece==0.2.0')\n",
        "\n",
        "import torch\n",
        "print('Torch',torch.__version__,'CUDA',torch.cuda.is_available())\n",
        "has_triton=importlib.util.find_spec('triton') is not None\n",
        "if has_triton:\n",
        "    import triton\n",
        "    print('Triton',getattr(triton,'__version__','?'))\n",
        "if FORCE_DISABLE_TRITON:\n",
        "    os.environ['USE_TRITON']='0'\n",
        "    os.environ['TRITON_DISABLE']='1'\n",
        "\n",
        "# 4-bit is only considered usable with GPU training, a visible CUDA device and an importable\n",
        "# bitsandbytes exposing `nn`; the verdict is exported through AUTO_DISABLE_4BIT.\n",
        "USE_4BIT_RUNTIME_OK=False\n",
        "if GPU_TRAINING and torch.cuda.is_available():\n",
        "    try:\n",
        "        import bitsandbytes as bnb\n",
        "        USE_4BIT_RUNTIME_OK=hasattr(bnb,'nn')\n",
        "    except Exception as err:\n",
        "        # Keep going without 4-bit, but say why instead of failing silently.\n",
        "        print('bitsandbytes unavailable ->',type(err).__name__,str(err)[:120])\n",
        "os.environ['AUTO_DISABLE_4BIT']='1' if not USE_4BIT_RUNTIME_OK else '0'\n",
        "import numpy,transformers,accelerate,peft,datasets\n",
        "print('Versions',numpy.__version__,transformers.__version__,accelerate.__version__,peft.__version__,datasets.__version__)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "828cca11",
      "metadata": {
        "id": "828cca11"
      },
      "outputs": [],
      "source": [
        "import os,random\n",
        "import torch\n",
        "\n",
        "# --- Model and output locations ---\n",
        "BASE_MODEL='microsoft/Phi-3-mini-4k-instruct'\n",
        "OUTPUT_DIR='phi3_lora_adapter'\n",
        "MERGED_OUTPUT_DIR='phi3_merged'\n",
        "\n",
        "# --- LoRA hyper-parameters ---\n",
        "LORA_R,LORA_ALPHA,LORA_DROPOUT=32,64,0.05\n",
        "\n",
        "# --- Optimisation schedule ---\n",
        "NUM_EPOCHS,BATCH_SIZE,GR_ACCUM=5,4,4\n",
        "LEARNING_RATE=2e-4\n",
        "WARMUP_RATIO=0.05\n",
        "MAX_SEQ_LEN=1024\n",
        "SEED=42\n",
        "\n",
        "# 4-bit loading is requested here and dropped again when no CUDA device is visible.\n",
        "USE_4BIT=True\n",
        "USE_4BIT=bool(USE_4BIT and torch.cuda.is_available())\n",
        "\n",
        "# Optional Hugging Face token (set env HF_TOKEN or assign directly below)\n",
        "HF_TOKEN=os.environ.get('HF_TOKEN')\n",
        "if not HF_TOKEN:\n",
        "    HF_TOKEN=None\n",
        "\n",
        "# Candidate public model IDs to try\n",
        "CANDIDATE_MODELS=[BASE_MODEL]\n",
        "\n",
        "# Seed Python's and torch's RNGs, and make sure the adapter directory exists.\n",
        "random.seed(SEED)\n",
        "torch.manual_seed(SEED)\n",
        "os.makedirs(OUTPUT_DIR,exist_ok=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ba01a788",
      "metadata": {
        "id": "ba01a788",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 125
        },
        "outputId": "984aa03f-5aa9-4427-8a74-8511bc275fd0"
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "..."
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saving kql_eval.jsonl to kql_eval.jsonl\n",
            "Saving kql_train.jsonl to kql_train.jsonl\n",
            "Train records: 49  Eval records (external): 5\n"
          ]
        }
      ],
      "source": [
        "import os,sys,json\n",
        "\n",
        "# Record pools and the list of uploaded paths; all three are filled later in this cell.\n",
        "train_records,eval_records=[],[]\n",
        "DATA_PATHS=[]\n",
        "# Upload JSONL files. Any filename containing 'eval' or 'validation' (case-insensitive) treated as eval set.\n",
        "running_in_colab=('google.colab' in sys.modules) or ('COLAB_RELEASE_TAG' in os.environ)\n",
        "if not running_in_colab:\n",
        "    raise RuntimeError('Colab upload environment not detected.')\n",
        "from google.colab import files\n",
        "uploaded=files.upload()\n",
        "for upload_name,payload in uploaded.items():\n",
        "    # Write each uploaded payload to the working directory under its own name.\n",
        "    with open(upload_name,'wb') as sink:\n",
        "        sink.write(payload)\n",
        "    DATA_PATHS.append(upload_name)\n",
        "if not DATA_PATHS:\n",
        "    raise ValueError('No files uploaded.')\n",
        "\n",
        "def load_jsonl(path):\n",
        "    \"\"\"Read instruction/KQL pairs from a JSON Lines file.\n",
        "\n",
        "    Every non-blank line is parsed as JSON. The instruction is the first truthy value among\n",
        "    the keys 'instruction', 'prompt', 'input'; the query is the first truthy value among\n",
        "    'kql', 'output'. Lines that are not valid JSON, are not JSON objects, or lack either\n",
        "    field are skipped, and the number of skipped lines is printed.\n",
        "\n",
        "    Args:\n",
        "        path: path of a UTF-8 text file (a leading byte-order mark is tolerated).\n",
        "    Returns:\n",
        "        List of {'instruction': ..., 'kql': ...} dicts in file order.\n",
        "    \"\"\"\n",
        "    out=[]\n",
        "    skipped=0\n",
        "    # utf-8-sig drops a leading BOM, which would otherwise make the first record unparsable.\n",
        "    with open(path,'r',encoding='utf-8-sig') as f:\n",
        "        for line in f:\n",
        "            line=line.strip()\n",
        "            if not line: continue\n",
        "            try:\n",
        "                obj=json.loads(line)\n",
        "            except (ValueError,RecursionError):\n",
        "                skipped+=1\n",
        "                continue\n",
        "            if not isinstance(obj,dict):\n",
        "                # Valid JSON but not an object (list, string, number): nothing to look up.\n",
        "                skipped+=1\n",
        "                continue\n",
        "            instr=obj.get('instruction') or obj.get('prompt') or obj.get('input')\n",
        "            kql=obj.get('kql') or obj.get('output')\n",
        "            if instr and kql:\n",
        "                out.append({'instruction':instr,'kql':kql})\n",
        "            else:\n",
        "                skipped+=1\n",
        "    if skipped:\n",
        "        print(f'{path}: skipped {skipped} malformed or incomplete line(s)')\n",
        "    return out\n",
        "# Send each uploaded file's records to the eval or the train pool, decided by its file name.\n",
        "EVAL_NAME_MARKERS=('eval','validation')\n",
        "for data_path in DATA_PATHS:\n",
        "    if not os.path.exists(data_path):\n",
        "        continue\n",
        "    loaded=load_jsonl(data_path)\n",
        "    lowered=data_path.lower()\n",
        "    is_eval_file=any(marker in lowered for marker in EVAL_NAME_MARKERS)\n",
        "    target_pool=eval_records if is_eval_file else train_records\n",
        "    target_pool.extend(loaded)\n",
        "print(f'Train records: {len(train_records)}  Eval records (external): {len(eval_records)}')\n",
        "if len(train_records)==0:\n",
        "    raise ValueError('No valid training instruction/kql records found.')\n",
        "# If no external eval records, a split will be created later."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# !rm -r *"
      ],
      "metadata": {
        "id": "iQ0g0wwL9eBC"
      },
      "id": "iQ0g0wwL9eBC",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d2e44d53",
      "metadata": {
        "id": "d2e44d53"
      },
      "outputs": [],
      "source": [
        "# Prompt template used for the training texts and for the post-training sample generation.\n",
        "# It holds the task rules, one worked example, a literal {instruction} placeholder on the\n",
        "# final 'User Request:' line, and ends right after the closing 'KQL Output:' header.\n",
        "INSTRUCTION_PREFIX= \"\"\"You are an expert KQL assistant for Microsoft Sentinel. Your task is to convert a user's natural language request into a single, valid KQL query.\n",
        "\n",
        "Rules:\n",
        "1.  Output ONLY the raw KQL query. Do not include any commentary, explanations, or markdown backticks.\n",
        "2.  Always include a bounded time filter (e.g., `... | where TimeGenerated > ago(24h)`).\n",
        "3.  If the user provides a specific entity (like a username, IP, or filename), use it directly in the query.\n",
        "4.  If the user's request is generic (e.g., \"a user\" or \"an IP\"), use a realistic but clearly example entity like `'john.doe@example.com'` or `'198.51.100.99'`.\n",
        "5.  Use the most relevant and common fields for the specified log table and task.\n",
        "\n",
        "---\n",
        "Example:\n",
        "\n",
        "User Request: Show me failed logins for john.doe@example.com in the last day.\n",
        "\n",
        "KQL Output:\n",
        "SigninLogs | where UserPrincipalName =~ 'john.doe@example.com' and ResultType != 0 and TimeGenerated > ago(1d) | project TimeGenerated, IPAddress, Location, AppDisplayName, ResultType, ResultDescription\n",
        "---\n",
        "\n",
        "User Request: {instruction}\n",
        "\n",
        "KQL Output:\n",
        "\"\"\"\n",
        "\n",
        "def build_training_example(rec,template=None):\n",
        "    \"\"\"Render one record into the full text used for causal-LM training.\n",
        "\n",
        "    Args:\n",
        "        rec: mapping with 'instruction' and 'kql' entries.\n",
        "        template: prompt template; defaults to the module-level INSTRUCTION_PREFIX.\n",
        "    Returns:\n",
        "        The template with its {instruction} placeholder replaced by the record's instruction,\n",
        "        immediately followed by the KQL answer. A template without the placeholder is treated\n",
        "        as a plain prefix: the instruction, a 'KQL:' header line and the answer are appended.\n",
        "        Passing an empty 'kql' yields the matching inference prompt.\n",
        "    \"\"\"\n",
        "    if template is None:\n",
        "        template=INSTRUCTION_PREFIX\n",
        "    placeholder='{instruction}'\n",
        "    instruction,answer=str(rec['instruction']),str(rec['kql'])\n",
        "    if placeholder in template:\n",
        "        # str.replace instead of str.format: any other literal braces in the template stay untouched.\n",
        "        return template.replace(placeholder,instruction)+answer\n",
        "    return f\"{template}{instruction}\\nKQL:\\n{answer}\"\n",
        "\n",
        "# Render every record to its training text; an empty eval pool simply yields an empty list.\n",
        "formatted_train=list(map(build_training_example,train_records))\n",
        "formatted_eval=list(map(build_training_example,eval_records))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d4e016c8",
      "metadata": {
        "id": "d4e016c8"
      },
      "outputs": [],
      "source": [
        "from datasets import Dataset,DatasetDict\n",
        "\n",
        "def _text_dataset(texts):\n",
        "    \"\"\"Wrap a list of strings as a Dataset with a single 'text' column.\"\"\"\n",
        "    return Dataset.from_list([{'text':text} for text in texts])\n",
        "\n",
        "train_ds=_text_dataset(formatted_train)\n",
        "if formatted_eval:\n",
        "    # External eval texts exist: use them as the eval split unchanged.\n",
        "    eval_ds=_text_dataset(formatted_eval)\n",
        "    ds=DatasetDict({'train':train_ds,'eval':eval_ds})\n",
        "elif len(train_ds)>5:\n",
        "    # No eval texts: hold out 10% of the training rows with a fixed seed.\n",
        "    split=train_ds.train_test_split(test_size=0.1,seed=42)\n",
        "    ds=DatasetDict({'train':split['train'],'eval':split['test']})\n",
        "else:\n",
        "    # Five rows or fewer: no split, the training rows double as the eval split.\n",
        "    ds=DatasetDict({'train':train_ds,'eval':train_ds})"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "from huggingface_hub import login\n",
        "\n",
        "# Resolve the Hugging Face token: Colab secret first, then the HF_TOKEN environment variable.\n",
        "# The token is optional here, so a missing secret must not abort the notebook.\n",
        "try:\n",
        "    from google.colab import userdata\n",
        "    HF_TOKEN = userdata.get('HF_TOKEN')\n",
        "except Exception as err:\n",
        "    # Covers running outside Colab as well as a missing or inaccessible secret.\n",
        "    print('Colab secret HF_TOKEN unavailable ->', type(err).__name__)\n",
        "    HF_TOKEN = os.environ.get('HF_TOKEN')\n",
        "HF_TOKEN = HF_TOKEN or None\n",
        "\n",
        "if HF_TOKEN:\n",
        "    login(token=HF_TOKEN)\n",
        "else:\n",
        "    # login(token=None) would open an interactive prompt; skip it and stay anonymous.\n",
        "    print('No Hugging Face token found; continuing without login.')"
      ],
      "metadata": {
        "id": "ahVVsMVj6o8M"
      },
      "id": "ahVVsMVj6o8M",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "944d3a17",
      "metadata": {
        "id": "944d3a17",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 153
        },
        "outputId": "668d6ff2-43d1-49ed-834e-f6c96843b1d1"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "..."
          ]
        }
      ],
      "source": [
        "from transformers import AutoTokenizer\n",
        "\n",
        "# Use 1024 tokens only when MAX_SEQ_LEN has not been defined yet.\n",
        "try:\n",
        "    MAX_SEQ_LEN\n",
        "except NameError:\n",
        "    MAX_SEQ_LEN=1024\n",
        "\n",
        "# Load a tokenizer once per kernel: try each candidate in order and keep the first that loads.\n",
        "if 'tokenizer' not in globals():\n",
        "    last_err=None\n",
        "    for mid in CANDIDATE_MODELS:\n",
        "        try:\n",
        "            tokenizer=AutoTokenizer.from_pretrained(mid,use_fast=True,token=HF_TOKEN)\n",
        "        except Exception as err:\n",
        "            last_err=err\n",
        "            print('Tokenizer load failed for',mid,'->',type(err).__name__,str(err)[:120])\n",
        "            continue\n",
        "        print('Loaded tokenizer from',mid)\n",
        "        BASE_MODEL=mid\n",
        "        break\n",
        "    else:\n",
        "        # Loop finished without a successful load: surface the most recent failure.\n",
        "        raise last_err\n",
        "\n",
        "# Reuse the end-of-sequence token for padding when the tokenizer defines no pad token.\n",
        "if tokenizer.pad_token is None:\n",
        "    tokenizer.pad_token=tokenizer.eos_token\n",
        "print('Train examples:',len(ds['train']),' Eval examples:',len(ds['eval']))\n",
        "\n",
        "def tokenize(batch):\n",
        "    \"\"\"Tokenize a batch's 'text' entries, truncating each one to MAX_SEQ_LEN tokens.\"\"\"\n",
        "    texts=batch['text']\n",
        "    return tokenizer(texts,truncation=True,max_length=MAX_SEQ_LEN)\n",
        "\n",
        "# Apply to every split in batches, dropping the raw 'text' column from the result.\n",
        "tokenized=ds.map(tokenize,remove_columns=['text'],batched=True)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# !pip install -U bitsandbytes"
      ],
      "metadata": {
        "id": "lNWJohTRAZHA",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 690
        },
        "outputId": "5c6a2775-c062-4a04-c009-3ab3c3a7ec75"
      },
      "id": "lNWJohTRAZHA",
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "..."
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b59e3473",
      "metadata": {
        "id": "b59e3473",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 813
        },
        "outputId": "35ead144-38a8-415e-c921-7f84ec2ca701"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "..."
          ]
        }
      ],
      "source": [
        "from transformers import BitsAndBytesConfig,DataCollatorForLanguageModeling,TrainingArguments,Trainer,AutoModelForCausalLM,set_seed\n",
        "from peft import LoraConfig,get_peft_model,prepare_model_for_kbit_training\n",
        "import math,torch,os,shutil\n",
        "\n",
        "# ---- Precision ----\n",
        "# bfloat16 is only used when the GPU reports support for it; otherwise fall back to float16 on\n",
        "# CUDA and float32 on CPU. Requesting bf16 on an unsupported GPU makes TrainingArguments fail.\n",
        "cuda_ok=torch.cuda.is_available()\n",
        "bf16_ok=cuda_ok and torch.cuda.is_bf16_supported()\n",
        "compute_dtype=torch.bfloat16 if bf16_ok else (torch.float16 if cuda_ok else torch.float32)\n",
        "\n",
        "bnb_config=None\n",
        "if USE_4BIT and cuda_ok:\n",
        "    bnb_config=BitsAndBytesConfig(load_in_4bit=True,bnb_4bit_quant_type='nf4',bnb_4bit_use_double_quant=True,bnb_4bit_compute_dtype=compute_dtype)\n",
        "# fp16 mixed precision is enabled only on the 4-bit path.\n",
        "# NOTE(review): assumes the adapter weights stay float32 there, as fp16 grad scaling requires - confirm for the installed PEFT.\n",
        "use_fp16_amp=cuda_ok and (not bf16_ok) and (bnb_config is not None)\n",
        "\n",
        "# ---- Base model ----\n",
        "# NOTE(review): trust_remote_code=True runs modelling code shipped with the checkpoint; use trusted repos only.\n",
        "load_kwargs=dict(torch_dtype=compute_dtype,trust_remote_code=True,token=HF_TOKEN)\n",
        "if bnb_config is not None:\n",
        "    # Quantized loading also lets the loader place the weights (device_map='auto').\n",
        "    load_kwargs.update(quantization_config=bnb_config,device_map='auto')\n",
        "model=None\n",
        "last_err=None\n",
        "for mid in CANDIDATE_MODELS:\n",
        "    try:\n",
        "        model=AutoModelForCausalLM.from_pretrained(mid,**load_kwargs)\n",
        "        print('Loaded model from',mid)\n",
        "        BASE_MODEL=mid\n",
        "        break\n",
        "    except Exception as e:\n",
        "        last_err=e\n",
        "        print('Model load failed for',mid,'->',type(e).__name__,str(e)[:160])\n",
        "if model is None:\n",
        "    raise (last_err if last_err is not None else ValueError('CANDIDATE_MODELS is empty.'))\n",
        "if bnb_config is not None:\n",
        "    model=prepare_model_for_kbit_training(model)\n",
        "\n",
        "# ---- LoRA adapter ----\n",
        "# 'qkv_proj' targets models with a fused attention projection (Phi-3 names it that way); the\n",
        "# separate q/k/v names and 'dense' are kept for candidate models that expose those instead.\n",
        "lora_cfg=LoraConfig(r=LORA_R,lora_alpha=LORA_ALPHA,lora_dropout=LORA_DROPOUT,bias='none',task_type='CAUSAL_LM',target_modules=['q_proj','k_proj','v_proj','qkv_proj','o_proj','dense'])\n",
        "model=get_peft_model(model,lora_cfg)\n",
        "set_seed(SEED if 'SEED' in globals() else 42)\n",
        "\n",
        "# ---- Training ----\n",
        "collator=DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)\n",
        "steps_per_epoch=math.ceil(len(tokenized['train'])/(BATCH_SIZE*GR_ACCUM))\n",
        "warmup_steps=2  # fixed warm-up length; WARMUP_RATIO is not consulted here\n",
        "training_args=TrainingArguments(\n",
        "    output_dir='train_out',\n",
        "    per_device_train_batch_size=BATCH_SIZE,\n",
        "    per_device_eval_batch_size=BATCH_SIZE,\n",
        "    gradient_accumulation_steps=GR_ACCUM,\n",
        "    learning_rate=LEARNING_RATE,\n",
        "    warmup_steps=warmup_steps,\n",
        "    num_train_epochs=NUM_EPOCHS,\n",
        "    logging_steps=1,\n",
        "    evaluation_strategy='epoch',\n",
        "    save_strategy='epoch',\n",
        "    bf16=bf16_ok,\n",
        "    fp16=use_fp16_amp,\n",
        "    gradient_checkpointing=cuda_ok,\n",
        "    report_to=[],\n",
        "    optim='paged_adamw_8bit' if (bnb_config is not None) else 'adamw_torch',\n",
        ")\n",
        "trainer=Trainer(model=model,args=training_args,train_dataset=tokenized['train'],eval_dataset=tokenized['eval'],data_collator=collator,tokenizer=tokenizer)\n",
        "trainer.train()\n",
        "trainer.save_state()\n",
        "model.save_pretrained(OUTPUT_DIR)\n",
        "tokenizer.save_pretrained(OUTPUT_DIR)\n",
        "\n",
        "# ---- Sample generation ----\n",
        "model.eval()\n",
        "prompt='failed logins last 1 hour'\n",
        "# Render the prompt with the same builder as the training texts (empty answer) so the\n",
        "# inference layout always matches the training layout.\n",
        "inputs=tokenizer(build_training_example({'instruction':prompt,'kql':''}),return_tensors='pt')\n",
        "if cuda_ok:\n",
        "    inputs=inputs.to(model.device)\n",
        "with torch.no_grad():\n",
        "    # Greedy decoding (do_sample=False); no temperature is passed because it would have no effect.\n",
        "    out=model.generate(**inputs,max_new_tokens=120,do_sample=False,pad_token_id=tokenizer.eos_token_id)\n",
        "full=tokenizer.decode(out[0],skip_special_tokens=True)\n",
        "with open('gen_sample.txt','w',encoding='utf-8') as sample_file:\n",
        "    sample_file.write(full)\n",
        "\n",
        "# ---- Export ----\n",
        "zip_name=OUTPUT_DIR+'.zip'\n",
        "if os.path.exists(zip_name):\n",
        "    os.remove(zip_name)\n",
        "shutil.make_archive(OUTPUT_DIR,'zip',OUTPUT_DIR)\n",
        "try:\n",
        "    from google.colab import files\n",
        "    files.download(zip_name)\n",
        "except Exception as err:\n",
        "    # Download is best effort; the archive stays on disk either way.\n",
        "    print('Automatic download skipped ->',type(err).__name__,'- archive left at',zip_name)"
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "SwPm6xJ6hbwQ"
      },
      "id": "SwPm6xJ6hbwQ",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "language_info": {
      "name": "python"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "accelerator": "GPU",
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "..."
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}