/
performance.py
216 lines (196 loc) · 7.59 KB
/
performance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
__copyright__ = (
"Copyright 2020, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
import pandas as pd
from sqlalchemy import (
create_engine,
select,
distinct,
MetaData,
Table,
func,
or_,
false,
)
from pyiron_base.state import state
__author__ = "Muhammad Hassani"
__version__ = "1.0"
__maintainer__ = "Muhammad Hassani"
__email__ = "hassani@mpie.de"
def _checkpoints_interval(conn):
"""
returns the number of checkpoints and their intervals
"""
stmt = """
SELECT
total_checkpoints,
seconds_since_start / total_checkpoints / 60 AS minutes_between_checkpoints
FROM
(SELECT
EXTRACT(EPOCH FROM (now() - pg_postmaster_start_time())) AS seconds_since_start,
(checkpoints_timed+checkpoints_req) AS total_checkpoints
FROM pg_stat_bgwriter
) AS sub;
"""
check_points = conn.execute(stmt).fetchone()
return {"num. checkpoints": check_points[0], "checkpoint interval": check_points[1]}
def _duplicate_indices(conn):
"""
returns the duplicates in indices
"""
stmt = """
SELECT pg_size_pretty(sum(pg_relation_size(idx))::bigint) as size,
(array_agg(idx))[1] as idx1, (array_agg(idx))[2] as idx2,
(array_agg(idx))[3] as idx3, (array_agg(idx))[4] as idx4
FROM (
SELECT indexrelid::regclass as idx,
(indrelid::text ||E'\n'|| indclass::text ||E'\n'|| indkey::text ||E'\n'||
coalesce(indexprs::text,'')||E'\n' || coalesce(indpred::text,'')) as key
FROM pg_index) sub
GROUP BY key HAVING count(*)>1
ORDER BY sum(pg_relation_size(idx)) DESC;
"""
overlapping_indices = conn.execute(stmt).fetchall()
output_dict = {"duplicated indices": []}
if len(overlapping_indices) > 0:
for pair in overlapping_indices:
output_dict["duplicated indices"].append(
str(pair[1])
+ ", and "
+ str(pair[2])
+ " with total size: "
+ str(pair[0])
)
return output_dict
class DatabaseStatistics:
    """
    Collect performance statistics of a pyiron PostgreSQL database.

    The use case is:
    >>> from pyiron_base import DatabaseStatistics
    >>> db_stat = DatabaseStatistics()
    >>> df = db_stat.performance()
    >>> df
    >>> df['duplicated indices'].values[0]
    """

    # Conversion factors from the units emitted by pg_size_pretty() to MB;
    # used by _index_size to avoid summing numbers with mixed units.
    _SIZE_UNIT_TO_MB = {
        "bytes": 1.0 / (1024.0 * 1024.0),
        "kB": 1.0 / 1024.0,
        "MB": 1.0,
        "GB": 1024.0,
        "TB": 1024.0 * 1024.0,
    }

    def __init__(self):
        connection_string = state.database.sql_connection_string
        self._job_table = state.database.sql_view_table_name
        if "postgresql" not in connection_string:
            # Fixed typo in the user-facing message ("detabase" -> "database").
            raise RuntimeError(
                """
                The database statistics is only available for a Postgresql database
                """
            )
        self._engine = create_engine(connection_string)
        self._performance_dict = {}
        metadata = MetaData()
        # Reflect the two postgres system views used by the statistics queries.
        self._stat_view = Table("pg_stat_activity", metadata, autoload_with=self._engine)
        self._locks_view = Table("pg_locks", metadata, autoload_with=self._engine)

    def _num_conn(self, conn):
        """
        Return the total number of connections.
        """
        stmt = select(func.count()).select_from(self._stat_view)
        result = conn.execute(stmt)
        return {"total num. connection": result.fetchone()[0]}

    def _num_conn_by_state(self, conn):
        """
        Return the number of connections, categorized by their state:
        active, idle, idle in transaction, idle in transaction (aborted)
        """
        stmt = (
            select(self._stat_view.c.state, func.count())
            .select_from(self._stat_view)
            .group_by(self._stat_view.c.state)
        )
        results = conn.execute(stmt).fetchall()
        output_dict = {}
        for result in results:
            key = "Number of " + str(result[0]) + " connection"
            val = int(result[1])
            output_dict[key] = val
        return output_dict

    def _num_conn_waiting_locks(self, conn):
        """
        Return the number of connections waiting for locks.
        """
        stmt = select(func.count(distinct(self._locks_view.c.pid))).where(
            self._locks_view.c.granted == false()
        )
        return {"num. of conn. waiting for locks": conn.execute(stmt).fetchone()[0]}

    def _max_trans_age(self, conn):
        """
        Return the maximum age of a transaction (idle-in-transaction or active).
        """
        stmt = (
            select(func.max(func.now() - self._stat_view.c.xact_start))
            .select_from(self._stat_view)
            .where(
                or_(
                    self._stat_view.c.state == "idle in transaction",
                    self._stat_view.c.state == "active",
                )
            )
        )
        return {"max. transaction age": str(conn.execute(stmt).fetchone()[0])}

    def _index_size(self, conn):
        """
        Return the total size (in MB) of the indexes of the pyiron job table.

        Bug fix: the previous implementation summed the leading number of the
        pg_size_pretty() output regardless of its unit, mixing kB/MB/GB values
        as if they were equal; sizes are now normalized to MB before summing.
        """
        # NOTE(review): raw SQL strings passed to ``conn.execute`` require
        # ``sqlalchemy.text()`` with SQLAlchemy >= 2.0 -- confirm pinned version.
        stmt = """
        SELECT
            t.schemaname,
            t.tablename,
            c.reltuples::bigint AS num_rows,
            pg_size_pretty(pg_relation_size(c.oid)) AS table_size,
            psai.indexrelname AS index_name,
            pg_size_pretty(pg_relation_size(i.indexrelid)) AS index_size,
            CASE WHEN i.indisunique THEN 'Y' ELSE 'N' END AS "unique",
            psai.idx_scan AS number_of_scans,
            psai.idx_tup_read AS tuples_read,
            psai.idx_tup_fetch AS tuples_fetched
        FROM
            pg_tables t
            LEFT JOIN pg_class c ON t.tablename = c.relname
            LEFT JOIN pg_index i ON c.oid = i.indrelid
            LEFT JOIN pg_stat_all_indexes psai ON i.indexrelid = psai.indexrelid
        WHERE
            t.schemaname NOT IN ('pg_catalog', 'information_schema')
        ORDER BY 1, 2;
        """
        rows = conn.execute(stmt).fetchall()
        index_usage = 0.0
        for row in rows:
            # row[1] = tablename, row[5] = pg_size_pretty output, e.g. '2048 kB'
            if row[1] != self._job_table:
                continue
            value, _, unit = str(row[5]).partition(" ")
            # Unknown units fall back to a factor of 1.0 (previous behavior).
            index_usage += float(value) * self._SIZE_UNIT_TO_MB.get(unit, 1.0)
        return {"index size/usage (MB)": int(round(index_usage))}

    def performance(self):
        """
        Return a pandas dataframe with the essential statistics of a pyiron
        postgres database, as a single row labeled "performance".
        """
        with self._engine.connect() as conn:
            self._performance_dict.update(self._num_conn(conn))
            self._performance_dict.update(self._num_conn_by_state(conn))
            self._performance_dict.update(self._num_conn_waiting_locks(conn))
            self._performance_dict.update(self._max_trans_age(conn))
            self._performance_dict.update(_checkpoints_interval(conn))
            self._performance_dict.update(self._index_size(conn))
            self._performance_dict.update(_duplicate_indices(conn))
        return pd.DataFrame(self._performance_dict, index=["performance"])
def get_database_statistics():
    """
    Return the statistics of a pyiron postgres database as a pandas dataframe.

    The dataframe includes:
        - total number of connection
        - number of connection categorized by their state
        - maximum age of a transaction
        - number of checkpoints and their interval
        - size of indices
        - pair of duplicate indices and their total size

    usage:
    >>> from pyiron_base import get_database_statistics
    >>> get_database_statistics()
    """
    statistics = DatabaseStatistics()
    return statistics.performance()