Skip to content

Commit

Permalink
edac: avoid mce decoding crash after edac driver unloaded
Browse files Browse the repository at this point in the history
commit e35fca4 upstream.

Some edac drivers register themselves as mce decoders via
notifier_chain. But in current notifier_chain implementation logic,
it doesn't accept same notifier registered twice. If so, it will be
wrong when adding/removing the element from the list. For example,
on one SandyBridge platform, remove module sb_edac and then trigger
one error, it will hit oops because it has no mce decoder registered
but related notifier_chain still points to an invalid callback
function. Here is an example:

Call Trace:
 [<ffffffff8150ef6a>] atomic_notifier_call_chain+0x1a/0x20
 [<ffffffff8102b936>] mce_log+0x46/0x180
 [<ffffffff8102eaea>] apei_mce_report_mem_error+0x4a/0x60
 [<ffffffff812e19d2>] ghes_do_proc+0x192/0x210
 [<ffffffff812e2066>] ghes_proc+0x46/0x70
 [<ffffffff812e20d8>] ghes_notify_sci+0x48/0x80
 [<ffffffff8150ef05>] notifier_call_chain+0x55/0x80
 [<ffffffff81076f1a>] __blocking_notifier_call_chain+0x5a/0x80
 [<ffffffff812aea11>] ? acpi_os_wait_events_complete+0x23/0x23
 [<ffffffff81076f56>] blocking_notifier_call_chain+0x16/0x20
 [<ffffffff812ddc4d>] acpi_hed_notify+0x19/0x1b
 [<ffffffff812b16bd>] acpi_device_notify+0x19/0x1b
 [<ffffffff812beb38>] acpi_ev_notify_dispatch+0x67/0x7f
 [<ffffffff812aea3a>] acpi_os_execute_deferred+0x29/0x36
 [<ffffffff81069dc2>] process_one_work+0x132/0x450
 [<ffffffff8106bbcb>] worker_thread+0x17b/0x3c0
 [<ffffffff8106ba50>] ? manage_workers+0x120/0x120
 [<ffffffff81070aee>] kthread+0x9e/0xb0
 [<ffffffff81514724>] kernel_thread_helper+0x4/0x10
 [<ffffffff81070a50>] ? kthread_freezable_should_stop+0x70/0x70
 [<ffffffff81514720>] ? gs_change+0x13/0x13
Code: f3 49 89 d4 45 85 ed 4d 89 c6 48 8b 0f 74 48 48 85 c9 75 17 eb 41
0f 1f 80 00 00 00 00 41 83 ed 01 4c 89 f9 74 22 4d 85 ff 74 1d <4c> 8b
79 08 4c 89 e2 48 89 de 48 89 cf ff 11 4d 85 f6 74 04 41
RIP  [<ffffffff8150eef6>] notifier_call_chain+0x46/0x80
 RSP <ffff88042868fb20>
CR2: ffffffffa01af838
---[ end trace 0100930068e73e6f ]---
BUG: unable to handle kernel paging request at fffffffffffffff8
IP: [<ffffffff810705b0>] kthread_data+0x10/0x20
PGD 1a0d067 PUD 1a0e067 PMD 0
Oops: 0000 [aosp-mirror#2] SMP

Only i7core_edac and sb_edac have such issues because they have more
than one memory controller which means they have to register mce
decoder many times.

Signed-off-by: Chen Gong <gong.chen@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  • Loading branch information
Chen Gong authored and poondog committed Nov 19, 2014
1 parent f1cda8a commit afc9d9e
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 15 deletions.
15 changes: 4 additions & 11 deletions drivers/edac/i7core_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -1932,12 +1932,6 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val,
if (mce->bank != 8)
return NOTIFY_DONE;

#ifdef CONFIG_SMP
/* Only handle if it is the right mc controller */
if (mce->socketid != pvt->i7core_dev->socket)
return NOTIFY_DONE;
#endif

smp_rmb();
if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) {
smp_wmb();
Expand Down Expand Up @@ -2234,8 +2228,6 @@ static void i7core_unregister_mci(struct i7core_dev *i7core_dev)
if (pvt->enable_scrub)
disable_sdram_scrub_setting(mci);

mce_unregister_decode_chain(&i7_mce_dec);

/* Disable EDAC polling */
i7core_pci_ctl_release(pvt);

Expand Down Expand Up @@ -2336,8 +2328,6 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev)
/* DCLK for scrub rate setting */
pvt->dclk_freq = get_dclk_freq();

mce_register_decode_chain(&i7_mce_dec);

return 0;

fail0:
Expand Down Expand Up @@ -2481,8 +2471,10 @@ static int __init i7core_init(void)

pci_rc = pci_register_driver(&i7core_driver);

if (pci_rc >= 0)
if (pci_rc >= 0) {
mce_register_decode_chain(&i7_mce_dec);
return 0;
}

i7core_printk(KERN_ERR, "Failed to register device with error %d.\n",
pci_rc);
Expand All @@ -2498,6 +2490,7 @@ static void __exit i7core_exit(void)
{
debugf2("MC: " __FILE__ ": %s()\n", __func__);
pci_unregister_driver(&i7core_driver);
mce_unregister_decode_chain(&i7_mce_dec);
}

module_init(i7core_init);
Expand Down
8 changes: 4 additions & 4 deletions drivers/edac/sb_edac.c
Original file line number Diff line number Diff line change
Expand Up @@ -1669,8 +1669,6 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev)
debugf0("MC: " __FILE__ ": %s(): mci = %p, dev = %p\n",
__func__, mci, &sbridge_dev->pdev[0]->dev);

mce_unregister_decode_chain(&sbridge_mce_dec);

/* Remove MC sysfs nodes */
edac_mc_del_mc(mci->dev);

Expand Down Expand Up @@ -1738,7 +1736,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev)
goto fail0;
}

mce_register_decode_chain(&sbridge_mce_dec);
return 0;

fail0:
Expand Down Expand Up @@ -1867,8 +1864,10 @@ static int __init sbridge_init(void)

pci_rc = pci_register_driver(&sbridge_driver);

if (pci_rc >= 0)
if (pci_rc >= 0) {
mce_register_decode_chain(&sbridge_mce_dec);
return 0;
}

sbridge_printk(KERN_ERR, "Failed to register device with error %d.\n",
pci_rc);
Expand All @@ -1884,6 +1883,7 @@ static void __exit sbridge_exit(void)
{
debugf2("MC: " __FILE__ ": %s()\n", __func__);
pci_unregister_driver(&sbridge_driver);
mce_unregister_decode_chain(&sbridge_mce_dec);
}

module_init(sbridge_init);
Expand Down

0 comments on commit afc9d9e

Please sign in to comment.