Skip to content

Commit

Permalink
HAMMER VFS - Refactor merged search function to try to avoid missed entries
Browse files Browse the repository at this point in the history

Refactor the merged B-Tree + In-Memory search function to try to avoid races
where an in-memory record is flushed to the media during a search, causing
the search to miss the record.

Add another flag to hammer_record_t to indicate that the record was deleted
because it was committed to the media (versus simply being deleted).

Better separate HAMMER_RECF_DELETED_FE and HAMMER_RECF_DELETED_BE.  These
flags indicate whether the frontend or backend deleted an in-memory record.
The backend ignores frontend deletions that occur after the record has been
associated with a flush group.

Remove some console warnings that are no longer applicable.
  • Loading branch information
Matthew Dillon committed May 6, 2009
1 parent 22c7946 commit 3214ade
Show file tree
Hide file tree
Showing 7 changed files with 381 additions and 175 deletions.
3 changes: 1 addition & 2 deletions sys/vfs/hammer/hammer.h
Expand Up @@ -296,6 +296,7 @@ struct hammer_inode {
struct hammer_btree_leaf_elm ino_leaf; /* in-memory cache */
struct hammer_inode_data ino_data; /* in-memory cache */
struct hammer_rec_rb_tree rec_tree; /* in-memory cache */
int rec_generation;
struct hammer_node_cache cache[2]; /* search initiate cache */

/*
Expand Down Expand Up @@ -1069,8 +1070,6 @@ hammer_off_t hammer_blockmap_alloc(hammer_transaction_t trans, int zone,
int bytes, int *errorp);
hammer_reserve_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone,
int bytes, hammer_off_t *zone_offp, int *errorp);
void hammer_blockmap_reserve_undo(hammer_mount_t hmp, hammer_reserve_t resv,
hammer_off_t zone_offset, int bytes);
void hammer_blockmap_reserve_complete(hammer_mount_t hmp,
hammer_reserve_t resv);
void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv);
Expand Down
13 changes: 0 additions & 13 deletions sys/vfs/hammer/hammer_blockmap.c
Expand Up @@ -565,19 +565,6 @@ hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes,
return(resv);
}

#if 0
/*
* Backend function - undo a portion of a reservation.
*/
void
hammer_blockmap_reserve_undo(hammer_mount_t hmp, hammer_reserve_t resv,
hammer_off_t zone_offset, int bytes)
{
resv->bytes_freed += bytes;
}

#endif

/*
* Dereference a reservation structure. Upon the final release the
* underlying big-block is checked and if it is entirely free we delete
Expand Down
26 changes: 20 additions & 6 deletions sys/vfs/hammer/hammer_btree.c
Expand Up @@ -614,6 +614,10 @@ hammer_btree_lookup(hammer_cursor_t cursor)
* Execute the logic required to start an iteration. The first record
* located within the specified range is returned and iteration control
* flags are adjusted for successive hammer_btree_iterate() calls.
*
* Set ATEDISK so a low-level caller can call btree_first/btree_iterate
* in a loop without worrying about it. Higher-level merged searches will
* adjust the flag appropriately.
*/
int
hammer_btree_first(hammer_cursor_t cursor)
Expand All @@ -634,6 +638,10 @@ hammer_btree_first(hammer_cursor_t cursor)
*
* Set ATEDISK when iterating backwards to skip the current entry,
* which after an ENOENT lookup will be pointing beyond our end point.
*
* Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse
* in a loop without worrying about it. Higher-level merged searches will
* adjust the flag appropriately.
*/
int
hammer_btree_last(hammer_cursor_t cursor)
Expand Down Expand Up @@ -2159,16 +2167,22 @@ btree_remove(hammer_cursor_t cursor)
hammer_flush_node(node);
hammer_delete_node(cursor->trans, node);
} else {
kprintf("Warning: BTREE_REMOVE: Defering "
"parent removal1 @ %016llx, skipping\n",
node->node_offset);
/*
* Defer parent removal because we could not
* get the lock, just let the leaf remain
* empty.
*/
/**/
}
hammer_unlock(&node->lock);
hammer_rel_node(node);
} else {
kprintf("Warning: BTREE_REMOVE: Defering parent "
"removal2 @ %016llx, skipping\n",
node->node_offset);
/*
* Defer parent removal because we could not
* get the lock, just let the leaf remain
* empty.
*/
/**/
}
} else {
KKASSERT(parent->ondisk->count > 1);
Expand Down
2 changes: 2 additions & 0 deletions sys/vfs/hammer/hammer_cursor.h
Expand Up @@ -107,6 +107,7 @@ struct hammer_cursor {
* Iteration and extraction control variables
*/
int flags;
int rec_generation;

/*
* Merged in-memory/on-disk iterations also use these fields.
Expand Down Expand Up @@ -138,6 +139,7 @@ typedef struct hammer_cursor *hammer_cursor_t;
#define HAMMER_CURSOR_REBLOCKING 0x00020000
#define HAMMER_CURSOR_TRACKED 0x00040000
#define HAMMER_CURSOR_TRACKED_RIPOUT 0x00080000
#define HAMMER_CURSOR_LASTWASMEM 0x00100000 /* hammer_ip_next logic */

/*
* Flags we can clear when reusing a cursor (we can clear all of them)
Expand Down
70 changes: 51 additions & 19 deletions sys/vfs/hammer/hammer_inode.c
Expand Up @@ -924,6 +924,9 @@ hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
cursor.asof = HAMMER_MAX_TID;
cursor.flags |= HAMMER_CURSOR_ASOF;

/*
* Replace any in-memory version of the record.
*/
error = hammer_ip_lookup(&cursor);
if (error == 0 && hammer_cursor_inmem(&cursor)) {
record = cursor.iprec;
Expand All @@ -937,6 +940,11 @@ hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
error = 0;
}
}

/*
* Allocate replacement general record. The backend flush will
* delete any on-disk version of the record.
*/
if (error == 0 || error == ENOENT) {
record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
record->type = HAMMER_MEM_RECORD_GENERAL;
Expand Down Expand Up @@ -1159,12 +1167,13 @@ hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
}

/*
* The record isn't managed by the inode's record tree,
* destroy it whether we succeed or fail.
* Note: The record was never on the inode's record tree
* so just wave our hands importantly and destroy it.
*/
record->flags |= HAMMER_RECF_COMMITTED;
record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
record->flags |= HAMMER_RECF_DELETED_FE | HAMMER_RECF_COMMITTED;
record->flush_state = HAMMER_FST_IDLE;
++ip->rec_generation;
hammer_rel_mem_record(record);

/*
Expand Down Expand Up @@ -1391,8 +1400,9 @@ hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
KKASSERT(rec->lock.refs == 1);
rec->flush_state = HAMMER_FST_IDLE;
rec->flush_group = NULL;
rec->flags |= HAMMER_RECF_DELETED_FE;
rec->flags |= HAMMER_RECF_DELETED_BE;
rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
++ip->rec_generation;
hammer_rel_mem_record(rec);
}
ip->flags &= ~HAMMER_INODE_MODMASK;
Expand Down Expand Up @@ -1949,16 +1959,20 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
int r;

/*
* Deleted records are ignored. Note that the flush detects deleted
* front-end records at multiple points to deal with races. This is
* just the first line of defense. The only time DELETED_FE cannot
* be set is when HAMMER_RECF_INTERLOCK_BE is set.
* Records deleted or committed by the backend are ignored.
* Note that the flush detects deleted frontend records at
* multiple points to deal with races. This is just the first
* line of defense. The only time HAMMER_RECF_DELETED_FE cannot
* be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
* messes up link-count calculations.
*
* Don't get confused between record deletion and, say, directory
* entry deletion. The deletion of a directory entry that is on
* the media has nothing to do with the record deletion flags.
* NOTE: Don't get confused between record deletion and, say,
* directory entry deletion. The deletion of a directory entry
* which is on-media has nothing to do with the record deletion
* flags.
*/
if (rec->flags & (HAMMER_RECF_DELETED_FE|HAMMER_RECF_DELETED_BE)) {
if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
HAMMER_RECF_COMMITTED)) {
if (rec->flush_state == HAMMER_FST_FLUSH) {
KKASSERT(rec->flush_group == rec->ip->flush_group);
r = 1;
Expand Down Expand Up @@ -2343,9 +2357,9 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
record->flags |= HAMMER_RECF_INTERLOCK_BE;

/*
* The backend may have already disposed of the record.
* The backend has already disposed of the record.
*/
if (record->flags & HAMMER_RECF_DELETED_BE) {
if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
error = 0;
goto done;
}
Expand All @@ -2369,8 +2383,13 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
*/
/* fall through */
case HAMMER_MEM_RECORD_GENERAL:
record->flags |= HAMMER_RECF_DELETED_FE;
/*
* Set deleted-by-backend flag. Do not set the
* backend committed flag, because we are throwing
* the record away.
*/
record->flags |= HAMMER_RECF_DELETED_BE;
++record->ip->rec_generation;
error = 0;
goto done;
case HAMMER_MEM_RECORD_ADD:
Expand Down Expand Up @@ -2408,10 +2427,19 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
*/
if (record->flags & HAMMER_RECF_DELETED_FE) {
if (record->type == HAMMER_MEM_RECORD_ADD) {
/*
* Convert a front-end deleted directory-add to
* a directory-delete entry later.
*/
record->flags |= HAMMER_RECF_CONVERT_DELETE;
} else {
/*
* Dispose of the record (race case). Mark as
* deleted by backend (and not committed).
*/
KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
record->flags |= HAMMER_RECF_DELETED_BE;
++record->ip->rec_generation;
error = 0;
goto done;
}
Expand All @@ -2421,9 +2449,10 @@ hammer_sync_record_callback(hammer_record_t record, void *data)
* Assign the create_tid for new records. Deletions already
* have the record's entire key properly set up.
*/
if (record->type != HAMMER_MEM_RECORD_DEL)
if (record->type != HAMMER_MEM_RECORD_DEL) {
record->leaf.base.create_tid = trans->tid;
record->leaf.create_ts = trans->time32;
}
for (;;) {
error = hammer_ip_sync_record_cursor(cursor, record);
if (error != EDEADLK)
Expand Down Expand Up @@ -2515,7 +2544,10 @@ hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
}
} else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
/*
* Not part of our flush group
* Not part of our flush group and not deleted by
* the front-end, adjust the link count synced to
* the media (undo what the frontend did when it
* queued the record).
*/
KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
switch(depend->type) {
Expand Down Expand Up @@ -2719,8 +2751,8 @@ hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
hammer_record_t record = RB_ROOT(&ip->rec_tree);
hammer_ref(&record->lock);
KKASSERT(record->lock.refs == 1);
record->flags |= HAMMER_RECF_DELETED_FE;
record->flags |= HAMMER_RECF_DELETED_BE;
++record->ip->rec_generation;
hammer_rel_mem_record(record);
}
break;
Expand Down
3 changes: 2 additions & 1 deletion sys/vfs/hammer/hammer_io.c
Expand Up @@ -1252,7 +1252,8 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record,
KKASSERT(error == 0);
} else {
/*
* Major suckage occured.
* Major suckage occured. Also note: The record was never added
* to the tree so we do not have to worry about the backend.
*/
kprintf("hammer_direct_write: failed @ %016llx\n",
leaf->data_offset);
Expand Down

0 comments on commit 3214ade

Please sign in to comment.