-
Notifications
You must be signed in to change notification settings - Fork 0
/
device_status.go
179 lines (156 loc) · 3.71 KB
/
device_status.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
package dcgm
/*
#include "dcgm_agent.h"
#include "dcgm_structs.h"
*/
import "C"
import (
"fmt"
"math/rand"
)
// PerfState is a numeric device performance state (P-state).
type PerfState uint

// Bounds of the P-state range that String renders numerically, plus the
// value reserved for an unknown state.
const (
	PerfStateMax     = 0  // lowest-numbered state, rendered as "P0"
	PerfStateMin     = 15 // highest-numbered state, rendered as "P15"
	PerfStateUnknown = 32 // sentinel outside the P0..P15 range
)

// String implements fmt.Stringer. States in the inclusive range
// PerfStateMax..PerfStateMin are rendered as "P<n>"; every other value,
// including PerfStateUnknown, is rendered as "Unknown".
func (p PerfState) String() string {
	// PerfState is unsigned and PerfStateMax is zero, so only the upper
	// bound can ever be violated.
	if p > PerfStateMin {
		return "Unknown"
	}
	return fmt.Sprintf("P%d", uint(p))
}
// UtilizationInfo holds the utilization readings of a device, each expressed
// as a percentage. latestValuesForDevice fills it from the DCGM GPU,
// memory-copy, encoder and decoder utilization fields.
type UtilizationInfo struct {
GPU int64 // GPU utilization, in percent.
Memory int64 // Memory-copy utilization, in percent.
Encoder int64 // Encoder utilization, in percent.
Decoder int64 // Decoder utilization, in percent.
}
// ECCErrorsInfo holds the ECC error counters of a device.
// latestValuesForDevice fills it from the DCGM aggregate-total SBE and DBE
// fields.
type ECCErrorsInfo struct {
SingleBit int64 // Single-bit ECC error count (aggregate total).
DoubleBit int64 // Double-bit ECC error count (aggregate total).
}
// MemoryInfo groups the memory-related readings of a device.
type MemoryInfo struct {
// GlobalUsed is not populated by latestValuesForDevice in this file and so
// stays at zero there.
// NOTE(review): unit and intended source are not visible here — confirm.
GlobalUsed int64
ECCErrors ECCErrorsInfo // ECC error counters.
}
// ClockInfo holds the clock frequencies of a device, in MHz.
// latestValuesForDevice fills it from the DCGM SM-clock and memory-clock
// fields.
type ClockInfo struct {
Cores int64 // SM clock, in MHz.
Memory int64 // Memory clock, in MHz.
}
// PCIThroughputInfo holds the PCIe traffic readings of a device.
// latestValuesForDevice fills it from the DCGM PCIe RX/TX throughput and
// replay-counter fields.
type PCIThroughputInfo struct {
Rx int64 // PCIe receive throughput, in MB.
Tx int64 // PCIe transmit throughput, in MB.
Replays int64 // PCIe replay counter.
}
// PCIStatusInfo groups the PCI-related readings of a device.
type PCIStatusInfo struct {
BAR1Used int64 // BAR1 memory in use, in MB.
Throughput PCIThroughputInfo // PCIe throughput and replay readings.
// FBUsed is filled by latestValuesForDevice from the DCGM FB_USED field.
// NOTE(review): presumably in MB like BAR1Used — confirm against the DCGM
// field documentation.
FBUsed int64
}
// DeviceStatus is the set of readings for one device, as assembled by
// latestValuesForDevice from the latest DCGM field values.
type DeviceStatus struct {
Power float64 // Power usage, in watts.
Temperature int64 // GPU temperature, in degrees Celsius.
Utilization UtilizationInfo // Utilization percentages.
Memory MemoryInfo // Memory-related readings.
Clocks ClockInfo // Clock frequencies.
PCI PCIStatusInfo // PCI-related readings.
Performance PerfState // Performance state (P-state).
FanSpeed int64 // Fan speed, in percent.
}
// latestValuesForDevice reads the latest value of a fixed set of DCGM device
// fields for the GPU identified by gpuId and assembles them into a
// DeviceStatus.
//
// It creates a temporary field group and a temporary watch group, both with
// randomly generated names, queries the latest values, and destroys both
// groups again before returning. Teardown is best effort: errors from
// destroying the groups are ignored so they never mask the primary result.
//
// On any failure the zero DeviceStatus is returned together with the error.
// Errors from FieldGroupCreate, WatchFields and GetLatestValuesForFields are
// returned unchanged. An error is also returned when fewer values come back
// than fields were requested.
func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) {
	// Positions of the individual fields inside deviceFields; the returned
	// values slice is indexed with the same positions.
	const (
		pwr int = iota
		temp
		sm
		mem
		enc
		dec
		smClock
		memClock
		bar1Used
		pcieRxThroughput
		pcieTxThroughput
		pcieReplay
		fbUsed
		sbe
		dbe
		pstate
		fanSpeed
		fieldsCount
	)

	deviceFields := make([]Short, fieldsCount)
	deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE
	deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP
	deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL
	deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL
	deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL
	deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL
	deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK
	deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK
	deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED
	deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT
	deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT
	deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
	deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED
	deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL
	deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL
	deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE
	deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED

	// Random suffixes keep the temporary group names distinct between calls.
	fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64())
	fieldsId, err := FieldGroupCreate(fieldsName, deviceFields)
	if err != nil {
		return status, err
	}

	groupName := fmt.Sprintf("devStatus%d", rand.Uint64())
	groupId, err := WatchFields(gpuId, fieldsId, groupName)
	if err != nil {
		// Only the field group exists at this point.
		_ = FieldGroupDestroy(fieldsId)
		return status, err
	}

	// From here on both temporary groups exist. Tear them down on every exit
	// path, including a panic while decoding. The destroy order is field
	// group first, then watch group. Errors are deliberately ignored.
	defer func() {
		_ = FieldGroupDestroy(fieldsId)
		_ = DestroyGroup(groupId)
	}()

	values, err := GetLatestValuesForFields(gpuId, deviceFields)
	if err != nil {
		return status, err
	}
	// Guard the positional indexing below against a short result.
	if len(values) < fieldsCount {
		return status, fmt.Errorf("expected %d field values for GPU %d, got %d",
			fieldsCount, gpuId, len(values))
	}

	// NOTE(review): Memory.GlobalUsed is not populated; the FB_USED reading
	// is reported through PCI.FBUsed instead — confirm this is intended.
	status = DeviceStatus{
		Power:       values[pwr].Float64(),
		Temperature: values[temp].Int64(),
		Utilization: UtilizationInfo{
			GPU:     values[sm].Int64(),
			Memory:  values[mem].Int64(),
			Encoder: values[enc].Int64(),
			Decoder: values[dec].Int64(),
		},
		Memory: MemoryInfo{
			ECCErrors: ECCErrorsInfo{
				SingleBit: values[sbe].Int64(),
				DoubleBit: values[dbe].Int64(),
			},
		},
		Clocks: ClockInfo{
			Cores:  values[smClock].Int64(),
			Memory: values[memClock].Int64(),
		},
		PCI: PCIStatusInfo{
			BAR1Used: values[bar1Used].Int64(),
			Throughput: PCIThroughputInfo{
				Rx:      values[pcieRxThroughput].Int64(),
				Tx:      values[pcieTxThroughput].Int64(),
				Replays: values[pcieReplay].Int64(),
			},
			FBUsed: values[fbUsed].Int64(),
		},
		Performance: PerfState(values[pstate].Int64()),
		FanSpeed:    values[fanSpeed].Int64(),
	}
	return status, nil
}