Create a "relation mapping" infrastructure to support changing the re…

…lfilenodes of shared or nailed system catalogs. This has two key benefits: * The new CLUSTER-based VACUUM FULL can be applied safely to all catalogs. * We no longer have to use an unsafe reindex-in-place approach for reindexing shared catalogs. CLUSTER on nailed catalogs now works too, although I left it disabled on shared catalogs because the resulting pg_index.indisclustered update would only be visible in one database. Since reindexing shared system catalogs is now fully transactional and crash-safe, the former special cases in REINDEX behavior have been removed; shared catalogs are treated the same as non-shared. This commit does not do anything about the recently-discussed problem of deadlocks between VACUUM FULL/CLUSTER on a system catalog and other concurrent queries; will address that in a separate patch. As a stopgap, parallel_schedule has been tweaked to run vacuum.sql by itself, to avoid such failures during the regression tests.
postgres · Feb 7, 2010 · b9b8831 · b9b8831
1 parent 7fc30c4
commit b9b8831
Show file tree

Hide file tree

Showing 54 changed files with 2,315 additions and 584 deletions.
diff --git a/contrib/oid2name/oid2name.c b/contrib/oid2name/oid2name.c
@@ -5,7 +5,7 @@
  * Originally by
  * B. Palmer, bpalmer@crimelabs.net 1-17-2001
  *
- * $PostgreSQL: pgsql/contrib/oid2name/oid2name.c,v 1.36 2009/06/11 14:48:51 momjian Exp $
+ * $PostgreSQL: pgsql/contrib/oid2name/oid2name.c,v 1.37 2010/02/07 20:48:08 tgl Exp $
  */
 #include "postgres_fe.h"
 
@@ -440,7 +440,7 @@ sql_exec_dumpalldbs(PGconn *conn, struct options * opts)
 	/* get the oid and database name from the system pg_database table */
 	snprintf(todo, sizeof(todo),
 			 "SELECT d.oid AS \"Oid\", datname AS \"Database Name\", "
-	  "spcname AS \"Tablespace\" FROM pg_database d JOIN pg_tablespace t ON "
+	  "spcname AS \"Tablespace\" FROM pg_catalog.pg_database d JOIN pg_catalog.pg_tablespace t ON "
 			 "(dattablespace = t.oid) ORDER BY 2");
 
 	sql_exec(conn, todo, opts->quiet);
@@ -456,10 +456,10 @@ sql_exec_dumpalltables(PGconn *conn, struct options * opts)
 	char	   *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" ";
 
 	snprintf(todo, sizeof(todo),
-		  "SELECT relfilenode as \"Filenode\", relname as \"Table Name\" %s "
+		  "SELECT pg_catalog.pg_relation_filenode(c.oid) as \"Filenode\", relname as \"Table Name\" %s "
 			 "FROM pg_class c "
 		   "	LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace "
-	"	LEFT JOIN pg_catalog.pg_database d ON d.datname = current_database(),"
+	"	LEFT JOIN pg_catalog.pg_database d ON d.datname = pg_catalog.current_database(),"
 			 "	pg_catalog.pg_tablespace t "
 			 "WHERE relkind IN ('r'%s%s) AND "
 			 "	%s"
@@ -477,7 +477,7 @@ sql_exec_dumpalltables(PGconn *conn, struct options * opts)
 }
 
 /*
- * Show oid, relfilenode, name, schema and tablespace for each of the
+ * Show oid, filenode, name, schema and tablespace for each of the
  * given objects in the current database.
  */
 void
@@ -492,7 +492,7 @@ sql_exec_searchtables(PGconn *conn, struct options * opts)
 	bool		written = false;
 	char	   *addfields = ",c.oid AS \"Oid\", nspname AS \"Schema\", spcname as \"Tablespace\" ";
 
-	/* get tables qualifiers, whether names, relfilenodes, or OIDs */
+	/* get tables qualifiers, whether names, filenodes, or OIDs */
 	comma_oids = get_comma_elts(opts->oids);
 	comma_tables = get_comma_elts(opts->tables);
 	comma_filenodes = get_comma_elts(opts->filenodes);
@@ -511,7 +511,7 @@ sql_exec_searchtables(PGconn *conn, struct options * opts)
 	{
 		if (written)
 			ptr += sprintf(ptr, " OR ");
-		ptr += sprintf(ptr, "c.relfilenode IN (%s)", comma_filenodes);
+		ptr += sprintf(ptr, "pg_catalog.pg_relation_filenode(c.oid) IN (%s)", comma_filenodes);
 		written = true;
 	}
 	if (opts->tables->num > 0)
@@ -527,10 +527,10 @@ sql_exec_searchtables(PGconn *conn, struct options * opts)
 	/* now build the query */
 	todo = (char *) myalloc(650 + strlen(qualifiers));
 	snprintf(todo, 650 + strlen(qualifiers),
-		 "SELECT relfilenode as \"Filenode\", relname as \"Table Name\" %s\n"
-			 "FROM pg_class c \n"
+		 "SELECT pg_catalog.pg_relation_filenode(c.oid) as \"Filenode\", relname as \"Table Name\" %s\n"
+			 "FROM pg_catalog.pg_class c \n"
 		 "	LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace \n"
-			 "	LEFT JOIN pg_catalog.pg_database d ON d.datname = current_database(),\n"
+			 "	LEFT JOIN pg_catalog.pg_database d ON d.datname = pg_catalog.current_database(),\n"
 			 "	pg_catalog.pg_tablespace t \n"
 			 "WHERE relkind IN ('r', 'i', 'S', 't') AND \n"
 			 "		t.oid = CASE\n"
@@ -554,7 +554,7 @@ sql_exec_dumpalltbspc(PGconn *conn, struct options * opts)
 
 	snprintf(todo, sizeof(todo),
 			 "SELECT oid AS \"Oid\", spcname as \"Tablespace Name\"\n"
-			 "FROM pg_tablespace");
+			 "FROM pg_catalog.pg_tablespace");
 
 	sql_exec(conn, todo, opts->quiet);
 }

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.220 2010/02/03 17:25:05 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.221 2010/02/07 20:48:09 tgl Exp $ -->
 <!--
  Documentation of the system catalogs, directed toward PostgreSQL developers
  -->
@@ -1473,7 +1473,9 @@
       <entry><structfield>relfilenode</structfield></entry>
       <entry><type>oid</type></entry>
       <entry></entry>
-      <entry>Name of the on-disk file of this relation; 0 if none</entry>
+      <entry>Name of the on-disk file of this relation; zero means this
+       is a <quote>mapped</> relation whose disk file name is determined
+       by low-level state</entry>
      </row>
 
      <row>

diff --git a/doc/src/sgml/diskusage.sgml b/doc/src/sgml/diskusage.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/diskusage.sgml,v 1.19 2010/02/03 17:25:05 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/diskusage.sgml,v 1.20 2010/02/07 20:48:09 tgl Exp $ -->
 
 <chapter id="diskusage">
  <title>Monitoring Disk Usage</title>
@@ -29,30 +29,31 @@
   </para>
 
   <para>
-   You can monitor disk space three ways:  using
-   SQL functions listed in <xref linkend="functions-admin-dbsize">,
-   using <command>VACUUM</> information, and from the command line 
-   using the tools in <filename>contrib/oid2name</>.  The SQL functions
-   are the easiest to use and report information about tables, tables with
-   indexes and long value storage (TOAST), databases, and tablespaces.
+   You can monitor disk space in three ways:
+   using the SQL functions listed in <xref linkend="functions-admin-dbsize">,
+   using the tools in <filename>contrib/oid2name</>, or
+   using manual inspection of the system catalogs.
+   The SQL functions are the easiest to use and are generally recommended.
+   <filename>contrib/oid2name</> is described in <xref linkend="oid2name">.
+   The remainder of this section shows how to do it by inspection of the
+   system catalogs.
   </para>
 
   <para>
    Using <application>psql</> on a recently vacuumed or analyzed database,
    you can issue queries to see the disk usage of any table:
 <programlisting>
-SELECT relfilenode, relpages FROM pg_class WHERE relname = 'customer';
+SELECT pg_relation_filepath(oid), relpages FROM pg_class WHERE relname = 'customer';
 
- relfilenode | relpages 
--------------+----------
-       16806 |       60
+ pg_relation_filepath | relpages 
+----------------------+----------
+ base/16384/16806     |       60
 (1 row)
 </programlisting>
    Each page is typically 8 kilobytes. (Remember, <structfield>relpages</>
    is only updated by <command>VACUUM</>, <command>ANALYZE</>, and
-   a few DDL commands such as <command>CREATE INDEX</>.)  The
-   <structfield>relfilenode</> value is of interest if you want to examine
-   the table's disk file directly.
+   a few DDL commands such as <command>CREATE INDEX</>.)  The file pathname
+   is of interest if you want to examine the table's disk file directly.
   </para>
 
   <para>
@@ -107,11 +108,6 @@ ORDER BY relpages DESC;
  customer             |     3144
 </programlisting>
   </para>
-
-  <para>
-   You can also use <filename>contrib/oid2name</> to show disk usage; see
-   <xref linkend="oid2name"> for more details and examples.
-  </para>
  </sect1>
 
  <sect1 id="disk-full">

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.500 2010/02/01 15:38:21 rhaas Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.501 2010/02/07 20:48:09 tgl Exp $ -->
 
  <chapter id="functions">
   <title>Functions and Operators</title>
@@ -13434,6 +13434,68 @@ postgres=# SELECT * FROM pg_xlogfile_name_offset(pg_stop_backup());
     appropriate.
    </para>
 
+   <para>
+    The functions shown in <xref linkend="functions-admin-dblocation"> assist
+    in identifying the specific disk files associated with database objects.
+   </para>
+
+   <indexterm>
+    <primary>pg_relation_filenode</primary>
+   </indexterm>
+   <indexterm>
+    <primary>pg_relation_filepath</primary>
+   </indexterm>
+
+   <table id="functions-admin-dblocation">
+    <title>Database Object Location Functions</title>
+    <tgroup cols="3">
+     <thead>
+      <row><entry>Name</entry> <entry>Return Type</entry> <entry>Description</entry>
+      </row>
+     </thead>
+
+     <tbody>
+      <row>
+       <entry>
+        <literal><function>pg_relation_filenode</function>(<parameter>relation</parameter> <type>regclass</type>)</literal>
+        </entry>
+       <entry><type>oid</type></entry>
+       <entry>
+        Filenode number of the relation with the specified OID or name
+       </entry>
+      </row>
+      <row>
+       <entry>
+        <literal><function>pg_relation_filepath</function>(<parameter>relation</parameter> <type>regclass</type>)</literal>
+        </entry>
+       <entry><type>text</type></entry>
+       <entry>
+        File path name of the relation with the specified OID or name
+       </entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
+   <para>
+    <function>pg_relation_filenode</> accepts the OID or name of a table,
+    index, sequence, or toast table, and returns the <quote>filenode</> number
+    currently assigned to it.  The filenode is the base component of the file
+    name(s) used for the relation (see <xref linkend="storage-file-layout">
+    for more information).  For most tables the result is the same as
+    <structname>pg_class</>.<structfield>relfilenode</>, but for certain
+    system catalogs <structfield>relfilenode</> is zero and this function must
+    be used to get the correct value.  The function returns NULL if passed
+    a relation that does not have storage, such as a view.
+   </para>
+
+   <para>
+    <function>pg_relation_filepath</> is similar to
+    <function>pg_relation_filenode</>, but it returns the entire file pathname
+    (relative to the database cluster's data directory <varname>PGDATA</>) of
+    the relation.
+   </para>
+
    <para>
     The functions shown in <xref
     linkend="functions-admin-genfile"> provide native access to

diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/pgbuffercache.sgml,v 2.5 2009/05/18 11:08:24 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/pgbuffercache.sgml,v 2.6 2010/02/07 20:48:09 tgl Exp $ -->
 
 <sect1 id="pgbuffercache">
  <title>pg_buffercache</title>
@@ -56,7 +56,7 @@
       <entry><structfield>relfilenode</structfield></entry>
       <entry><type>oid</type></entry>
       <entry><literal>pg_class.relfilenode</literal></entry>
-      <entry>Relfilenode of the relation</entry>
+      <entry>Filenode number of the relation</entry>
      </row>
 
      <row>
@@ -137,7 +137,7 @@
   <programlisting>
   regression=# SELECT c.relname, count(*) AS buffers
                FROM pg_buffercache b INNER JOIN pg_class c
-               ON b.relfilenode = c.relfilenode AND
+               ON b.relfilenode = pg_relation_filenode(c.oid) AND
                   b.reldatabase IN (0, (SELECT oid FROM pg_database
                                         WHERE datname = current_database()))
                GROUP BY c.relname

diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/cluster.sgml,v 1.47 2009/09/19 10:23:26 petere Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/cluster.sgml,v 1.48 2010/02/07 20:48:09 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -30,12 +30,12 @@ CLUSTER [VERBOSE]
   <title>Description</title>
 
   <para>
-   <command>CLUSTER</command> instructs <productname>PostgreSQL</productname> 
+   <command>CLUSTER</command> instructs <productname>PostgreSQL</productname>
    to cluster the table specified
    by <replaceable class="parameter">table_name</replaceable>
    based on the index specified by
    <replaceable class="parameter">index_name</replaceable>. The index must
-   already have been defined on 
+   already have been defined on
    <replaceable class="parameter">table_name</replaceable>.
   </para>
 
@@ -46,9 +46,9 @@ CLUSTER [VERBOSE]
    not clustered.  That is, no attempt is made to store new or
    updated rows according to their index order.  (If one wishes, one can
    periodically recluster by issuing the command again.  Also, setting
-   the table's <literal>FILLFACTOR</literal> storage parameter to less than 100% can aid
-   in preserving cluster ordering during updates, since updated rows
-   are preferentially kept on the same page.)
+   the table's <literal>FILLFACTOR</literal> storage parameter to less than
+   100% can aid in preserving cluster ordering during updates, since updated
+   rows are kept on the same page if enough space is available there.)
   </para>
 
   <para>

diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/reindex.sgml,v 1.38 2008/11/14 10:22:47 petere Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/reindex.sgml,v 1.39 2010/02/07 20:48:09 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -77,7 +77,7 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
    </itemizedlist>
   </para>
  </refsect1>
-  
+
  <refsect1>
   <title>Parameters</title>
 
@@ -106,9 +106,9 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
     <listitem>
      <para>
       Recreate all indexes within the current database.
-      Indexes on shared system catalogs are skipped except in stand-alone mode
-      (see below). This form of <command>REINDEX</command> cannot be executed 
-      inside a transaction block.
+      Indexes on shared system catalogs are also processed.
+      This form of <command>REINDEX</command> cannot be executed inside a
+      transaction block.
      </para>
     </listitem>
    </varlistentry>
@@ -118,8 +118,8 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
     <listitem>
      <para>
       Recreate all indexes on system catalogs within the current database.
-      Indexes on user tables are not processed.  Also, indexes on shared
-      system catalogs are skipped except in stand-alone mode (see below).
+      Indexes on shared system catalogs are included.
+      Indexes on user tables are not processed.
       This form of <command>REINDEX</command> cannot be executed inside a
       transaction block.
      </para>
@@ -134,7 +134,7 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
       reindexed.  Index and table names can be schema-qualified.
       Presently, <command>REINDEX DATABASE</> and <command>REINDEX SYSTEM</>
       can only reindex the current database, so their parameter must match
-      the current database's name. 
+      the current database's name.
      </para>
     </listitem>
    </varlistentry>
@@ -156,7 +156,7 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
   <para>
    If you suspect corruption of an index on a user table, you can
    simply rebuild that index, or all indexes on the table, using
-   <command>REINDEX INDEX</command> or <command>REINDEX TABLE</command>.  
+   <command>REINDEX INDEX</command> or <command>REINDEX TABLE</command>.
   </para>
 
   <para>
@@ -197,30 +197,6 @@ REINDEX { INDEX | TABLE | DATABASE | SYSTEM } <replaceable class="PARAMETER">nam
    have been completed.
   </para>
 
-  <para>
-   If corruption is suspected in the indexes of any of the shared
-   system catalogs (which are <structname>pg_authid</structname>,
-   <structname>pg_auth_members</structname>,
-   <structname>pg_database</structname>,
-   <structname>pg_pltemplate</structname>,
-   <structname>pg_shdepend</structname>,
-   <structname>pg_shdescription</structname>, and
-   <structname>pg_tablespace</structname>), then a standalone server
-   must be used to repair it.  <command>REINDEX</> will not process
-   shared catalogs in multiuser mode.
-  </para>
-
-  <para>
-   For all indexes except the shared system catalogs, <command>REINDEX</>
-   is crash-safe and transaction-safe.  <command>REINDEX</> is not
-   crash-safe for shared indexes, which is why this case is disallowed
-   during normal operation.  If a failure occurs while reindexing one
-   of these catalogs in standalone mode, it will not be possible to
-   restart the regular server until the problem is rectified.  (The
-   typical symptom of a partially rebuilt shared index is <quote>index is not
-   a btree</> errors.)
-  </para>
-
   <para>
    <command>REINDEX</command> is similar to a drop and recreate of the index
    in that the index contents are rebuilt from scratch.  However, the locking
@@ -290,7 +266,7 @@ broken_db=&gt; \q
 </programlisting>
   </para>
  </refsect1>
- 
+
  <refsect1>
   <title>Compatibility</title>