In [1]:
from pathlib import Path

cfg_path = Path.home().resolve() / "config"

if cfg_path.is_file():
    print(f"Found config: {cfg_path}")
else: 
    cfg_dir = input("Enter directory with config: ")
    cfg_path = Path(cfg_dir).resolve() / "config"
    if cfg_path.is_file():
        print(f"Found config: {cfg_path}")
    else:
        print("ERROR: Re-run this cell and give correct directory")

Found config: /home/user1-selab3/Documents/research-shradha/kube/kube1-sklearn/config


In [2]:
from shutil import copy
from pathlib import Path

dest_dir = Path.home().resolve() / ".kube"
dest_dir.mkdir(exist_ok=True)
dest_path = dest_dir / "config"

if dest_path.is_file():
    print("SUCCESS: Config correctly configured")
elif cfg_path.is_file() and dest_dir.is_dir():
    copy(cfg_path, dest_path)
    print("SUCCESS: Copied config")
else:
    print("ERROR: Ensure you have correct config path")

SUCCESS: Config correctly configured


In [2]:
! kubectl config view --minify -o jsonpath='{..namespace}'

gp-engine-unoselab01

In [3]:
from jinja2 import Template

# read in the template
with open('/home/user1-selab3/Documents/research-shradha/kube/kube2/example1-kube/CODE-RajulShakywar/CODE/example/yaml/yaml_templates/pvc_template.yml') as file_:
    template = Template(file_.read())

In [4]:
# replace None 
PVC_NAME = 'pvc-shradha-mlm-gp-engine-unoselab01'

pvc_spec = template.render(name=PVC_NAME)

In [5]:
print(pvc_spec)

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-shradha-mlm-gp-engine-unoselab01
spec:
  storageClassName: rook-cephfs-central
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 50Gi


In [6]:
with open("/home/user1-selab3/Documents/research-shradha/kube/kube1-sklearn/1-kube-pvc-mlm.yml", "w") as file:
    file.write(pvc_spec)

In [7]:
! kubectl create -f ../kube1-sklearn/1-kube-pvc-mlm.yml

persistentvolumeclaim/pvc-shradha-mlm-gp-engine-unoselab01 created


In [1]:
! kubectl get pvc

NAME                                   STATUS   VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS          AGE
pvc-gp-engine-unoselab01               Bound    pvc-3689372b-bf1f-40b3-b2aa-f2ed83257150   50Gi       RWX            rook-cephfs-central   56d
pvc-shradha-gp-engine-unoselab01       Bound    pvc-d12e42d5-6421-4540-ac64-3beeae853d13   50Gi       RWX            rook-cephfs-central   12d
pvc-shradha-mlm-gp-engine-unoselab01   Bound    pvc-9c211907-c650-4926-9235-9e064b050d5d   50Gi       RWX            rook-cephfs-central   18h


In [11]:
from jinja2 import Template

# read in the template
with open('/home/user1-selab3/Documents/research-shradha/kube/kube2/example1-kube/CODE-RajulShakywar/CODE/example/yaml/yaml_templates/pod_pvc.yml') as file_:
    template = Template(file_.read())

In [15]:
# render the pod spec
pod_pvc_spec = template.render(
    pod_name="shradha-mlm-gp-engine-unoselab01-pod",
    persistentVolume_name="pvc-shradha-mlm-gp-engine-unoselab01"
)
# print the pod spec
print(pod_pvc_spec)

apiVersion: v1
kind: Pod
metadata:
  name: pod-name-sso
spec:
  automountServiceAccountToken: false
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: topology.kubernetes.io/region
            operator: In
            values:
            - us-central 
  containers:
  - name: pod-name-sso
    image: ubuntu:20.04
    command: ["sh", "-c", "echo 'Im a new pod' && sleep infinity"]
    resources:
      limits:
        memory: 12Gi
        cpu: 2
      requests:
        memory: 10Gi
        cpu: 2
    volumeMounts:
    - mountPath: /data
      name: persistentVolume-name
  volumes:
    - name: persistentVolume-name
      persistentVolumeClaim:
        claimName: persistentVolume-name


In [14]:

with open("/home/user1-selab3/Documents/research-shradha/kube/kube1-sklearn/2-kube-pod-mlm.yml", "w") as file:
    file.write(pod_pvc_spec)

In [8]:
! kubectl create -f ../kube1-sklearn/2-kube-pod-mlm.yml

pod/shradha-mlm-gp-engine-unoselab01-pod created


In [12]:
! kubectl get pvc

NAME                                   STATUS        VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS          AGE
pvc-shradha-mlm-gp-engine-unoselab01   Terminating   pvc-9c211907-c650-4926-9235-9e064b050d5d   50Gi       RWX            rook-cephfs-central   27h


In [13]:
! kubectl get pods

NAME                                   READY   STATUS   RESTARTS   AGE
shradha-mlm-gp-engine-unoselab01-pod   0/1     Error    0          8h


In [9]:
! kubectl delete pvc pvc-shradha-mlm-gp-engine-unoselab01

persistentvolumeclaim "pvc-shradha-mlm-gp-engine-unoselab01" deleted
^C


In [11]:
! kubectl delete job job02-mlm-shradha-gp-engine-unoselab01

job.batch "job02-mlm-shradha-gp-engine-unoselab01" deleted


In [10]:
! kubectl cp /home/user1-selab3/Documents/research-shradha/kube/kube1-sklearn/train_mlm.py shradha-mlm-gp-engine-unoselab01-pod:/data/train_mlm.py

In [11]:
! kubectl cp /home/user1-selab3/Documents/research-shradha/kube/kube1-sklearn/run_install.sh shradha-mlm-gp-engine-unoselab01-pod:/data/run_install.sh

In [12]:
! kubectl exec shradha-mlm-gp-engine-unoselab01-pod -- cat /data/train_mlm.py

import os
from collections import defaultdict
from tqdm import tqdm
import datasets
from datasets import Dataset, load_dataset, DatasetDict
import transformers
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator,notebook_launcher
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from transformers import AutoModelForMaskedLM
from transformers import default_data_collator
import collections
import numpy as np
import math
import time
import argparse
import logging
import sys
from prettytable import PrettyTable 
import platform
import multiprocessing


parser = argparse.ArgumentParser(description="Data Splitting")
parser.add_argument("-train", type=int, required=True, help="Size 

In [13]:
! kubectl exec shradha-mlm-gp-engine-unoselab01-pod -- cat /data/run_install.sh

#!/bin/bash

pip3 install tqdm
conda install -y -c conda-forge git-lfs
git config --global user.email 'myoungkyu@unomaha.edu'
git config --global user.name 'Myoungkyu Song'
python -m pip install scikit-learn transformers datasets sentencepiece sacremoses accelerate
pip3 install --upgrade huggingface_hub
pip3 install 'huggingface_hub[cli,torch]'
pip3 install ipywidgets
pip3 install numpy pandas matplotlib
pip3 install ipykernel
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip3 install prettytable
echo '--------------------------------------------------------'
echo 'Installation completed'
echo '--------------------------------------------------------'

# conda env create -f env_mlm.yml
# echo $SHELL
# conda init bash
# # source .bashrc
# conda activate env_mlm


In [23]:
from jinja2 import Template

# read in the template
with open('/home/user1-selab3/Documents/research-shradha/kube/kube2/example2-kube-mlm/masked-lang-model/3-kube-job-mlm.yml') as file_:
    template = Template(file_.read())

In [24]:
# render the job spec
job_spec = template.render(
    job_name="job02-mlm-shradha-gp-engine-unoselab01",
    pvc_name="pvc-shradha-mlm-gp-engine-unoselab01",
    runAsUser="0"
)
# print the job spec
print(job_spec)

apiVersion: batch/v1
kind: Job
metadata:
  name: job09-train-mlm-model-gp-engine-unoselab01
spec:
  ttlSecondsAfterFinished: 86400 # a day
  template:
    spec:
      containers:
        - name: job-mlm-model-train-container
          image: gitlab-registry.nrp-nautilus.io/msong/research/env_mlm:v1
          workingDir: /data
          command: ["/bin/bash","-c"]
          args: ["cd /data;
                source activate env_mlm;
                pip install psutil gputil;
                accelerate env;
                accelerate launch --multi_gpu --num_processes=${NGPU} train_mlm.py -train ${TRAIN} -test ${TEST} -ngpu ${NGPU} -epoch ${EPOCH} -logfile ${LOGFILE}"]
                # python py_ver.py"]
          env:
            - name: TRAIN
              value: "100000"
            - name: TEST
              value: "10000"
            - name: NGPU
              value: "8"
            - name: EPOCH
              value: "10"
            - name: LOGFILE
              value: "log.txt"
  

In [25]:
with open("/home/user1-selab3/Documents/research-shradha/kube/kube1-sklearn/3-kube-job-mlm.yml", "w") as file:
    file.write(job_spec)

In [19]:
! kubectl create -f ../kube1-sklearn/3-kube-job-mlm.yml

job.batch/job02-mlm-shradha-gp-engine-unoselab01 created


In [23]:
! kubectl get pods

NAME                                           READY   STATUS    RESTARTS   AGE
gp-engine-unoselab01-pod1                      0/1     Error     0          7d4h
job02-mlm-shradha-gp-engine-unoselab01-j8vcv   1/1     Running   0          36m
shradha-gp-engine-unoselab01-pod               0/1     Error     0          6d
shradha-mlm-gp-engine-unoselab01-pod           1/1     Running   0          41m


In [24]:
!kubectl get jobs

NAME                                     COMPLETIONS   DURATION   AGE
job02-mlm-shradha-gp-engine-unoselab01   0/1           36m        36m


In [25]:
! kubectl logs -f job02-mlm-shradha-gp-engine-unoselab01-j8vcv

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py): started
  Building wheel for gputil (setup.py): finished with status 'done'
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7394 sha256=d273611e059d0876fcdade182eb0d245666b82398f56766f6e0ff1e19bf5b975
  Stored in directory: /root/.cache/pip/wheels/2b/4d/8f/55fb4f7b9b591891e8d3f72977c4ec6c7763b39c19f0861595
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0

Copy-and-paste the text below in your GitHub issue

- `Accelerate` version: 0.23.0
- Platform: Linux-5.4.0-189-generic-x86_64-with-glibc2.31
- Python version: 3.11.3
- Numpy version: 1.26.4
- PyTorch version (GPU?): 2.1.0+cu121 (True)
- PyTorch XPU available: False
- PyTorch NPU available: False
- System RAM: 251.75

In [32]:
!kubectl logs -f job02-mlm-shradha-gp-engine-unoselab01-6mxtt > train_mlm.txt

^C


In [42]:
! kubectl delete pod job02-mlm-shradha-gp-engine-unoselab01-6mxtt

pod "job02-mlm-shradha-gp-engine-unoselab01-6mxtt" deleted


In [35]:
! kubectl delete pod job02-mlm-shradha-gp-engine-unoselab01-9gljk

pod "job02-mlm-shradha-gp-engine-unoselab01-9gljk" deleted


In [18]:
! kubectl delete job job02-mlm-shradha-gp-engine-unoselab01

job.batch "job02-mlm-shradha-gp-engine-unoselab01" deleted


In [9]:
! kubectl create -f ../kube1-sklearn/2-kube-pod-mlm.yml

error: no objects passed to create


In [17]:
! kubectl get pods

NAME                               READY   STATUS    RESTARTS   AGE
gp-engine-unoselab01-pod1          0/1     Error     0          27h
shradha-gp-engine-unoselab01-pod   1/1     Running   0          30s


In [23]:
! kubectl cp /home/user1-selab3/Documents/research-shradha/kube/kube1-sklearn/RandomForestMNIST.py shradha-gp-engine-unoselab01-pod:/data/RandomForestMNIST.py

In [24]:
! kubectl exec shradha-gp-engine-unoselab01-pod -- cat /data/RandomForestMNIST.py

from torchvision.datasets import MNIST
from skimage.feature import hog
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from tqdm import tqdm
import numpy as np
import os

NUM_TREES = int(os.environ.get("SK_NUM_TREES", "3"))
NUM_JOBS = int(os.environ.get("SK_NUM_JOBS", "1"))

print(f"Running random forest with {NUM_TREES} trees and {NUM_JOBS} jobs")

######
# Download MNIST
######
train_dataset = MNIST(download=True, root="~/data", train=True)
test_dataset = MNIST(download=True, root="~/data", train=False)

##### 
# Generate Train Features
#####
print("Generating Train Features")
train_features = np.empty((len(train_dataset), 108))
train_labels = np.empty(len(train_dataset), np.int32)
for i, (img, label) in tqdm(enumerate(train_dataset), ncols=80, total=len(train_dataset)):
    train_features[i] = hog(np.asarray(img), orientations=12, cells_per_block=(3,3))
    train_labels[i] = label

#####
# Generate Test Features
#####
print("Generating Test Features")

In [25]:
! ls

1-kube-pvc-create-sklean.yml  3-kube-job-sklean.yml  RandomForestMNIST.py
1-kube-pvc-sklean.yml	      config
2-kube-pod-sklean.yml	      learn.ipynb


In [26]:
from jinja2 import Template

# read in the template
with open('/home/user1-selab3/Documents/research-shradha/kube/kube2/example1-kube/CODE-RajulShakywar/CODE/example/yaml/yaml_templates/sklearn_job_template.yml') as file_:
    template = Template(file_.read())

In [27]:
# render the job spec
job_spec = template.render(
    job_name="job01-sklearn-shradha-gp-engine-unoselab01",
    pvc_name="pvc-shradha-gp-engine-unoselab01",
    num_trees=1,
    num_jobs=1
)

# print the job spec
print(job_spec)

apiVersion: batch/v1
kind: Job
metadata:
  name: job01-sklearn-shradha-gp-engine-unoselab01
spec:
  template:
    spec:
      automountServiceAccountToken: false
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: topology.kubernetes.io/region
                operator: In
                values:
                - us-central 
      containers:
      - name: sklearn-train-container
        image: gitlab-registry.nrp-nautilus.io/gp-engine/jupyter-stacks/bigdata-2023:latest
        workingDir: /data
        env:
            - name: SK_NUM_TREES
              value: "1"
            - name: SK_NUM_JOBS
              value: "1"
        command: ["python3", "/data/RandomForestMNIST.py"]
        volumeMounts:
            - name: pvc-shradha-gp-engine-unoselab01
              mountPath: /data
        resources:
            limits:
              memory: 1Gi
             

In [28]:
with open("/home/user1-selab3/Documents/research-shradha/kube/kube1-sklearn/3-kube-job-sklean.yml", "w") as file:
    file.write(job_spec)

In [29]:
! kubectl create -f ../kube1-sklearn/3-kube-job-sklean.yml

job.batch/job01-sklearn-shradha-gp-engine-unoselab01 created


In [6]:
! kubectl get pods

NAME                                   READY   STATUS   RESTARTS   AGE
gp-engine-unoselab01-pod1              0/1     Error    0          7d3h
shradha-gp-engine-unoselab01-pod       0/1     Error    0          5d23h
shradha-mlm-gp-engine-unoselab01-pod   0/1     Error    0          18h


In [32]:
! kubectl logs --tail=1 job01-sklearn-shradha-gp-engine-unoselab01-72mr8

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [7]:
! kubectl delete pod shradha-mlm-gp-engine-unoselab01-pod

pod "shradha-mlm-gp-engine-unoselab01-pod" deleted


In [None]:
# ! kubectl delete pod sklearn-gp-engine-unoselab01

In [None]:
# !kubectl get pods