From f92c0b4c113bc8ea5c143a94c0ab9c8bec259d40 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 12 Aug 2016 10:15:34 -0700
Subject: [PATCH 01/17] BCM270X: Connect V3D to its power domain.

We were doing this with manual firmware calls before, but to backport
runtime PM we want to be using a proper power domain.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 arch/arm/boot/dts/bcm2708_common.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/bcm2708_common.dtsi b/arch/arm/boot/dts/bcm2708_common.dtsi
index 52d49d3181ce9c..bf76c283b16294 100644
--- a/arch/arm/boot/dts/bcm2708_common.dtsi
+++ b/arch/arm/boot/dts/bcm2708_common.dtsi
@@ -366,6 +366,7 @@
 		v3d: v3d@7ec00000 {
 			compatible = "brcm,vc4-v3d";
 			reg = <0x7ec00000 0x1000>;
+			power-domains = <&power RPI_POWER_DOMAIN_V3D>;
 			status = "disabled";
 		};
 

From 860f6235aa2d56c7294b91fde2d716b52139b31b Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 5 Feb 2016 17:41:49 -0800
Subject: [PATCH 02/17] drm/vc4: Enable runtime PM.

This may actually get us a feature that the closed driver didn't have:
turning off the GPU in between rendering jobs, while the V3D device is
still opened by the client.

There may be some tuning to be applied here to use autosuspend so that
we don't bounce the device's power so much, but in steady-state
GPU-bound rendering we keep the power on (since we keep multiple jobs
outstanding) and even if we power cycle on every job we can still
manage at least 680 fps.

More importantly, though, runtime PM will allow us to power off the
device to do a GPU reset.

v2: Switch #ifdef to CONFIG_PM not CONFIG_PM_SLEEP (caught by kbuild
    test robot)

Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 001bdb55d9eb72a9e2d5b623bacfc52da74ae03e)
---
 drivers/gpu/drm/vc4/vc4_drv.h |  1 +
 drivers/gpu/drm/vc4/vc4_gem.c | 10 ++++++
 drivers/gpu/drm/vc4/vc4_v3d.c | 59 +++++++++++++++++++++--------------
 3 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 4dad5abf42dede..cf48c8d49e59d7 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -155,6 +155,7 @@ struct vc4_seqno_cb {
 };
 
 struct vc4_v3d {
+	struct vc4_dev *vc4;
 	struct platform_device *pdev;
 	void __iomem *regs;
 };
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 9b03a900c8391e..bb9513bf1a4030 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -23,6 +23,7 @@
 
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/device.h>
 #include <linux/io.h>
 
@@ -689,6 +690,7 @@ vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
 static void
 vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 {
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	unsigned i;
 
 	/* Need the struct lock for drm_gem_object_unreference(). */
@@ -707,6 +709,8 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 	}
 	mutex_unlock(&dev->struct_mutex);
 
+	pm_runtime_put(&vc4->v3d->pdev->dev);
+
 	kfree(exec);
 }
 
@@ -860,6 +864,12 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 		return -ENOMEM;
 	}
 
+	ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+	if (ret < 0) {
+		kfree(exec);
+		return ret;
+	}
+
 	exec->args = args;
 	INIT_LIST_HEAD(&exec->unref_list);
 
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 3422306049b2f9..e6d3c6028341e4 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -17,7 +17,7 @@
  */
 
 #include "linux/component.h"
-#include "soc/bcm2835/raspberrypi-firmware.h"
+#include "linux/pm_runtime.h"
 #include "vc4_drv.h"
 #include "vc4_regs.h"
 
@@ -145,22 +145,6 @@ int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused)
 }
 #endif /* CONFIG_DEBUG_FS */
 
-/*
- * Asks the firmware to turn on power to the V3D engine.
- *
- * This may be doable with just the clocks interface, though this
- * packet does some other register setup from the firmware, too.
- */
-int
-vc4_v3d_set_power(struct vc4_dev *vc4, bool on)
-{
-	u32 packet = on;
-
-	return rpi_firmware_property(vc4->firmware,
-				     RPI_FIRMWARE_SET_ENABLE_QPU,
-				     &packet, sizeof(packet));
-}
-
 static void vc4_v3d_init_hw(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
@@ -172,6 +156,29 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
 	V3D_WRITE(V3D_VPMBASE, 0);
 }
 
+#ifdef CONFIG_PM
+static int vc4_v3d_runtime_suspend(struct device *dev)
+{
+	struct vc4_v3d *v3d = dev_get_drvdata(dev);
+	struct vc4_dev *vc4 = v3d->vc4;
+
+	vc4_irq_uninstall(vc4->dev);
+
+	return 0;
+}
+
+static int vc4_v3d_runtime_resume(struct device *dev)
+{
+	struct vc4_v3d *v3d = dev_get_drvdata(dev);
+	struct vc4_dev *vc4 = v3d->vc4;
+
+	vc4_v3d_init_hw(vc4->dev);
+	vc4_irq_postinstall(vc4->dev);
+
+	return 0;
+}
+#endif
+
 static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 {
 	struct platform_device *pdev = to_platform_device(dev);
@@ -184,6 +191,8 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 	if (!v3d)
 		return -ENOMEM;
 
+	dev_set_drvdata(dev, v3d);
+
 	v3d->pdev = pdev;
 
 	v3d->regs = vc4_ioremap_regs(pdev, 0);
@@ -191,10 +200,7 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		return PTR_ERR(v3d->regs);
 
 	vc4->v3d = v3d;
-
-	ret = vc4_v3d_set_power(vc4, true);
-	if (ret)
-		return ret;
+	v3d->vc4 = vc4;
 
 	if (V3D_READ(V3D_IDENT0) != V3D_EXPECTED_IDENT0) {
 		DRM_ERROR("V3D_IDENT0 read 0x%08x instead of 0x%08x\n",
@@ -216,6 +222,8 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		return ret;
 	}
 
+	pm_runtime_enable(dev);
+
 	return 0;
 }
 
@@ -225,6 +233,8 @@ static void vc4_v3d_unbind(struct device *dev, struct device *master,
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = to_vc4_dev(drm);
 
+	pm_runtime_disable(dev);
+
 	drm_irq_uninstall(drm);
 
 	/* Disable the binner's overflow memory address, so the next
@@ -234,11 +244,13 @@ static void vc4_v3d_unbind(struct device *dev, struct device *master,
 	V3D_WRITE(V3D_BPOA, 0);
 	V3D_WRITE(V3D_BPOS, 0);
 
-	vc4_v3d_set_power(vc4, false);
-
 	vc4->v3d = NULL;
 }
 
+static const struct dev_pm_ops vc4_v3d_pm_ops = {
+	SET_RUNTIME_PM_OPS(vc4_v3d_runtime_suspend, vc4_v3d_runtime_resume, NULL)
+};
+
 static const struct component_ops vc4_v3d_ops = {
 	.bind   = vc4_v3d_bind,
 	.unbind = vc4_v3d_unbind,
@@ -267,5 +279,6 @@ struct platform_driver vc4_v3d_driver = {
 	.driver = {
 		.name = "vc4_v3d",
 		.of_match_table = vc4_v3d_dt_match,
+		.pm = &vc4_v3d_pm_ops,
 	},
 };

From 0de214b9f0bbee993ae3b7d2685100a87beaf672 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 12 Aug 2016 10:45:04 -0700
Subject: [PATCH 03/17] drm/vc4: Drop firmware node getting now that we use the
 power domain.

---
 drivers/gpu/drm/vc4/vc4_drv.c | 10 ----------
 drivers/gpu/drm/vc4/vc4_drv.h |  1 -
 2 files changed, 11 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 696210843c948e..2a660a763ae030 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -15,7 +15,6 @@
 #include <linux/module.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
-#include <soc/bcm2835/raspberrypi-firmware.h>
 #include "drm_fb_cma_helper.h"
 
 #include "uapi/drm/vc4_drm.h"
@@ -204,7 +203,6 @@ static int vc4_drm_bind(struct device *dev)
 	struct drm_device *drm;
 	struct drm_connector *connector;
 	struct vc4_dev *vc4;
-	struct device_node *firmware_node;
 	int ret = 0;
 
 	dev->coherent_dma_mask = DMA_BIT_MASK(32);
@@ -213,14 +211,6 @@ static int vc4_drm_bind(struct device *dev)
 	if (!vc4)
 		return -ENOMEM;
 
-	firmware_node = of_parse_phandle(dev->of_node, "firmware", 0);
-	vc4->firmware = rpi_firmware_get(firmware_node);
-	if (!vc4->firmware) {
-		DRM_DEBUG("Failed to get Raspberry Pi firmware reference.\n");
-		return -EPROBE_DEFER;
-	}
-	of_node_put(firmware_node);
-
 	drm = drm_dev_alloc(&vc4_drm_driver, dev);
 	if (!drm)
 		return -ENOMEM;
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index cf48c8d49e59d7..700b8cf9eb24d4 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -23,7 +23,6 @@ struct vc4_dev {
 	struct vc4_dsi *dsi1;
 
 	struct drm_fbdev_cma *fbdev;
-	struct rpi_firmware *firmware;
 
 	struct vc4_hang_state *hang_state;
 

From 4effd4d6b9a0b827f918c96b98ca1172f3101e5f Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 8 Feb 2016 12:59:02 -0800
Subject: [PATCH 04/17] drm/vc4: Use runtime PM to power cycle the device when
 the GPU hangs.

This gets us functional GPU reset again, like we had until a refactor
at merge time.  Tested with a little patch to stuff in a broken binner
job every 100 frames.

Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 36cb6253f9383fd9a59ee7b8458c6232ef48577c)
---
 drivers/gpu/drm/vc4/vc4_drv.h |  6 +++++-
 drivers/gpu/drm/vc4/vc4_gem.c | 26 +++++++++++++++++++++-----
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 700b8cf9eb24d4..24e269f136d9ba 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -104,6 +104,11 @@ struct vc4_dev {
 	struct vc4_bo *overflow_mem;
 	struct work_struct overflow_mem_work;
 
+	int power_refcount;
+
+	/* Mutex controlling the power refcount. */
+	struct mutex power_lock;
+
 	struct {
 		struct timer_list timer;
 		struct work_struct reset_work;
@@ -495,7 +500,6 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
 extern struct platform_driver vc4_v3d_driver;
 int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
 int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
-int vc4_v3d_set_power(struct vc4_dev *vc4, bool on);
 
 /* vc4_validate.c */
 int
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index bb9513bf1a4030..aa4517c294540a 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -261,8 +261,16 @@ vc4_reset(struct drm_device *dev)
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 
 	DRM_INFO("Resetting GPU.\n");
-	vc4_v3d_set_power(vc4, false);
-	vc4_v3d_set_power(vc4, true);
+
+	mutex_lock(&vc4->power_lock);
+	if (vc4->power_refcount) {
+		/* Power the device off and back on the by dropping the
+		 * reference on runtime PM.
+		 */
+		pm_runtime_put_sync_suspend(&vc4->v3d->pdev->dev);
+		pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+	}
+	mutex_unlock(&vc4->power_lock);
 
 	vc4_irq_reset(dev);
 
@@ -709,7 +717,10 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 	}
 	mutex_unlock(&dev->struct_mutex);
 
-	pm_runtime_put(&vc4->v3d->pdev->dev);
+	mutex_lock(&vc4->power_lock);
+	if (--vc4->power_refcount == 0)
+		pm_runtime_put(&vc4->v3d->pdev->dev);
+	mutex_unlock(&vc4->power_lock);
 
 	kfree(exec);
 }
@@ -851,7 +862,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	struct drm_vc4_submit_cl *args = data;
 	struct vc4_exec_info *exec;
-	int ret;
+	int ret = 0;
 
 	if ((args->flags & ~VC4_SUBMIT_CL_USE_CLEAR_COLOR) != 0) {
 		DRM_ERROR("Unknown flags: 0x%02x\n", args->flags);
@@ -864,7 +875,10 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
 		return -ENOMEM;
 	}
 
-	ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+	mutex_lock(&vc4->power_lock);
+	if (vc4->power_refcount++ == 0)
+		ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+	mutex_unlock(&vc4->power_lock);
 	if (ret < 0) {
 		kfree(exec);
 		return ret;
@@ -925,6 +939,8 @@ vc4_gem_init(struct drm_device *dev)
 		    (unsigned long)dev);
 
 	INIT_WORK(&vc4->job_done_work, vc4_job_done_work);
+
+	mutex_init(&vc4->power_lock);
 }
 
 void

From b4fc3f7aeced31c6d1217361667e410682feed13 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 12 Aug 2016 10:55:53 -0700
Subject: [PATCH 05/17] drm/panel: Drop debug printf from the Raspberry Pi
 touchscreen.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
index dbc507cf2ef125..be2c19ba827408 100644
--- a/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
+++ b/drivers/gpu/drm/panel/panel-raspberrypi-touchscreen.c
@@ -106,7 +106,6 @@ struct regdump {
 static int rpi_touchscreen_disable(struct drm_panel *panel)
 {
 	struct rpi_touchscreen *ts = panel_to_ts(panel);
-	pr_err("disable\n");
 
 	if (!ts->enabled)
 		return 0;

From aa8cbd1ff120c27ce2c3206b1ad57cbf231b4572 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 12 Aug 2016 10:57:41 -0700
Subject: [PATCH 06/17] drm/vc4: Replace HDMI force-connected with an EDID
 probe.

The force-connected started out because I didn't know how to read the
HPD pin successfully, which required the hpd_active_low check and
getting the correct active level into the DTs.  It stayed because we
don't have the Pi3's HPD line exposed to Linux, so this was the only
way to bring up graphics on it.

However, with the DSI panel support now present, users want to be able
to run DSI-only systems, and forcing HDMI on is interfering with
default screen configurations.  Work around the Pi3's missing HPD by
probing the DDC on I2C and see if it's present at all.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_hdmi.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c
index c0e6b8d36ffce8..f562f3de0e3f0a 100644
--- a/drivers/gpu/drm/vc4/vc4_hdmi.c
+++ b/drivers/gpu/drm/vc4/vc4_hdmi.c
@@ -166,8 +166,6 @@ vc4_hdmi_connector_detect(struct drm_connector *connector, bool force)
 	struct drm_device *dev = connector->dev;
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 
-	return connector_status_connected;
-
 	if (vc4->hdmi->hpd_gpio) {
 		if (gpio_get_value_cansleep(vc4->hdmi->hpd_gpio) ^
 		    vc4->hdmi->hpd_active_low)
@@ -176,6 +174,9 @@ vc4_hdmi_connector_detect(struct drm_connector *connector, bool force)
 			return connector_status_disconnected;
 	}
 
+	if (drm_probe_ddc(vc4->hdmi->ddc))
+		return connector_status_connected;
+
 	if (HDMI_READ(VC4_HDMI_HOTPLUG) & VC4_HDMI_HOTPLUG_CONNECTED)
 		return connector_status_connected;
 	else

From e0d06fc4a7ffe8d03b34e4f61c8379f3453c359f Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 15 Aug 2016 10:08:58 -0700
Subject: [PATCH 07/17] BCM270X: Drop HPD setting from the common dtsi.

The HPD is quite board-specific, so we need to set it in the per-board
DT.
---
 arch/arm/boot/dts/bcm2708_common.dtsi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm/boot/dts/bcm2708_common.dtsi b/arch/arm/boot/dts/bcm2708_common.dtsi
index bf76c283b16294..449428027992c1 100644
--- a/arch/arm/boot/dts/bcm2708_common.dtsi
+++ b/arch/arm/boot/dts/bcm2708_common.dtsi
@@ -347,7 +347,6 @@
 			reg = <0x7e902000 0x600>,
 			      <0x7e808000 0x100>;
 			ddc = <&i2c2>;
-			hpd-gpios = <&gpio 46 GPIO_ACTIVE_HIGH>;
 			clocks = <&cprman BCM2835_PLLH_PIX>,
 				 <&cprman BCM2835_CLOCK_HSM>;
 			clock-names = "pixel", "hdmi";

From 6f91ee02c2e014f424b820eb789f9523ba411d1d Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 12 Aug 2016 11:02:14 -0700
Subject: [PATCH 08/17] BCM2710: Drop incorrect HDMI HPD line from the DT.

It's actually off on the GPIO expander, which I don't think we have
access to.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 arch/arm/boot/dts/bcm2710-rpi-3-b.dts | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/arm/boot/dts/bcm2710-rpi-3-b.dts b/arch/arm/boot/dts/bcm2710-rpi-3-b.dts
index a72e6e5a0ac0b2..98352b51f96730 100644
--- a/arch/arm/boot/dts/bcm2710-rpi-3-b.dts
+++ b/arch/arm/boot/dts/bcm2710-rpi-3-b.dts
@@ -166,10 +166,6 @@
 	};
 };
 
-&hdmi {
-	hpd-gpios = <&gpio 46 GPIO_ACTIVE_LOW>;
-};
-
 &audio {
 	pinctrl-names = "default";
 	pinctrl-0 = <&audio_pins>;

From 27eb1a7703406b11b2779d2a92695fe8f9dff0f5 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 1 Jul 2016 13:10:38 -0700
Subject: [PATCH 09/17] drm/vc4: Add a getparam ioctl for getting the V3D
 identity regs.

As I extend the driver to support different V3D revisions, userspace
needs to know what version it's targeting.  This is most easily
detected using the V3D identity registers.

v2: Make sure V3D is runtime PM on when reading the registers.
v3: Switch to a 64-bit param value (suggested by Rob Clark in review)

Signed-off-by: Eric Anholt <eric@anholt.net>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> (v2)
Reviewed-by: Rob Clark <robdclark@gmail.com> (v3, over irc)
(cherry picked from commit af713795c59fea36161a7debf97dbc10bf652cf7)

v4: Squashed in "drm/vc4: Fix handling of a pm_runtime_get_sync() success case."
---
 drivers/gpu/drm/vc4/vc4_drv.c | 42 +++++++++++++++++++++++++++++++++++
 include/uapi/drm/vc4_drm.h    | 12 ++++++++++
 2 files changed, 54 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 2a660a763ae030..8edcf9a6fd1cbf 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include "drm_fb_cma_helper.h"
 
 #include "uapi/drm/vc4_drm.h"
@@ -64,6 +65,46 @@ void vc4_dump_regs32(const struct debugfs_reg32 *regs, unsigned int num_regs,
 	}
 }
 
+static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
+			       struct drm_file *file_priv)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_vc4_get_param *args = data;
+	int ret;
+
+	if (args->pad != 0)
+		return -EINVAL;
+
+	switch (args->param) {
+	case DRM_VC4_PARAM_V3D_IDENT0:
+		ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+		if (ret < 0)
+			return ret;
+		args->value = V3D_READ(V3D_IDENT0);
+		pm_runtime_put(&vc4->v3d->pdev->dev);
+		break;
+	case DRM_VC4_PARAM_V3D_IDENT1:
+		ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+		if (ret < 0)
+			return ret;
+		args->value = V3D_READ(V3D_IDENT1);
+		pm_runtime_put(&vc4->v3d->pdev->dev);
+		break;
+	case DRM_VC4_PARAM_V3D_IDENT2:
+		ret = pm_runtime_get_sync(&vc4->v3d->pdev->dev);
+		if (ret < 0)
+			return ret;
+		args->value = V3D_READ(V3D_IDENT2);
+		pm_runtime_put(&vc4->v3d->pdev->dev);
+		break;
+	default:
+		DRM_DEBUG("Unknown parameter %d\n", args->param);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static void vc4_lastclose(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
@@ -95,6 +136,7 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(VC4_GET_HANG_STATE, vc4_get_hang_state_ioctl,
 			  DRM_ROOT_ONLY),
+	DRM_IOCTL_DEF_DRV(VC4_GET_PARAM, vc4_get_param_ioctl, DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver vc4_drm_driver = {
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index af12e8a184c82b..1143e954048d1f 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -37,6 +37,7 @@ extern "C" {
 #define DRM_VC4_MMAP_BO                           0x04
 #define DRM_VC4_CREATE_SHADER_BO                  0x05
 #define DRM_VC4_GET_HANG_STATE                    0x06
+#define DRM_VC4_GET_PARAM                         0x07
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -45,6 +46,7 @@ extern "C" {
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 #define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
 #define DRM_IOCTL_VC4_GET_HANG_STATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state)
+#define DRM_IOCTL_VC4_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_PARAM, struct drm_vc4_get_param)
 
 struct drm_vc4_submit_rcl_surface {
 	__u32 hindex; /* Handle index, or ~0 if not present. */
@@ -280,6 +282,16 @@ struct drm_vc4_get_hang_state {
 	__u32 pad[16];
 };
 
+#define DRM_VC4_PARAM_V3D_IDENT0		0
+#define DRM_VC4_PARAM_V3D_IDENT1		1
+#define DRM_VC4_PARAM_V3D_IDENT2		2
+
+struct drm_vc4_get_param {
+	__u32 param;
+	__u32 pad;
+	__u64 value;
+};
+
 #if defined(__cplusplus)
 }
 #endif

From fc7b1c9ec5732ed06778ced102c483c4d3a00a70 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Sat, 2 Jul 2016 09:57:07 -0700
Subject: [PATCH 10/17] drm/vc4: Move validation's current/max ip into the
 validation struct.

Reduces the argument count for some of the functions, and will be used
more with the upcoming looping support.

Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit d0566c2a2f2baacefe1eb75be8a001fdd6fe84a3)
---
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 54 ++++++++++++----------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
index aae96fe4302291..464113adf8244c 100644
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -40,6 +40,14 @@
 #include "vc4_qpu_defines.h"
 
 struct vc4_shader_validation_state {
+	/* Current IP being validated. */
+	uint32_t ip;
+
+	/* IP at the end of the BO, do not read shader[max_ip] */
+	uint32_t max_ip;
+
+	uint64_t *shader;
+
 	struct vc4_texture_sample_info tmu_setup[2];
 	int tmu_write_count[2];
 
@@ -129,11 +137,11 @@ record_texture_sample(struct vc4_validated_shader_info *validated_shader,
 }
 
 static bool
-check_tmu_write(uint64_t inst,
-		struct vc4_validated_shader_info *validated_shader,
+check_tmu_write(struct vc4_validated_shader_info *validated_shader,
 		struct vc4_shader_validation_state *validation_state,
 		bool is_mul)
 {
+	uint64_t inst = validation_state->shader[validation_state->ip];
 	uint32_t waddr = (is_mul ?
 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
@@ -228,11 +236,11 @@ check_tmu_write(uint64_t inst,
 }
 
 static bool
-check_reg_write(uint64_t inst,
-		struct vc4_validated_shader_info *validated_shader,
+check_reg_write(struct vc4_validated_shader_info *validated_shader,
 		struct vc4_shader_validation_state *validation_state,
 		bool is_mul)
 {
+	uint64_t inst = validation_state->shader[validation_state->ip];
 	uint32_t waddr = (is_mul ?
 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
@@ -261,7 +269,7 @@ check_reg_write(uint64_t inst,
 	case QPU_W_TMU1_T:
 	case QPU_W_TMU1_R:
 	case QPU_W_TMU1_B:
-		return check_tmu_write(inst, validated_shader, validation_state,
+		return check_tmu_write(validated_shader, validation_state,
 				       is_mul);
 
 	case QPU_W_HOST_INT:
@@ -294,10 +302,10 @@ check_reg_write(uint64_t inst,
 }
 
 static void
-track_live_clamps(uint64_t inst,
-		  struct vc4_validated_shader_info *validated_shader,
+track_live_clamps(struct vc4_validated_shader_info *validated_shader,
 		  struct vc4_shader_validation_state *validation_state)
 {
+	uint64_t inst = validation_state->shader[validation_state->ip];
 	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
 	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
@@ -369,10 +377,10 @@ track_live_clamps(uint64_t inst,
 }
 
 static bool
-check_instruction_writes(uint64_t inst,
-			 struct vc4_validated_shader_info *validated_shader,
+check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
 			 struct vc4_shader_validation_state *validation_state)
 {
+	uint64_t inst = validation_state->shader[validation_state->ip];
 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
 	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
 	bool ok;
@@ -382,12 +390,10 @@ check_instruction_writes(uint64_t inst,
 		return false;
 	}
 
-	ok = (check_reg_write(inst, validated_shader, validation_state,
-			      false) &&
-	      check_reg_write(inst, validated_shader, validation_state,
-			      true));
+	ok = (check_reg_write(validated_shader, validation_state, false) &&
+	      check_reg_write(validated_shader, validation_state, true));
 
-	track_live_clamps(inst, validated_shader, validation_state);
+	track_live_clamps(validated_shader, validation_state);
 
 	return ok;
 }
@@ -417,30 +423,30 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 {
 	bool found_shader_end = false;
 	int shader_end_ip = 0;
-	uint32_t ip, max_ip;
-	uint64_t *shader;
+	uint32_t ip;
 	struct vc4_validated_shader_info *validated_shader;
 	struct vc4_shader_validation_state validation_state;
 	int i;
 
 	memset(&validation_state, 0, sizeof(validation_state));
+	validation_state.shader = shader_obj->vaddr;
+	validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);
 
 	for (i = 0; i < 8; i++)
 		validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
 	for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
 		validation_state.live_min_clamp_offsets[i] = ~0;
 
-	shader = shader_obj->vaddr;
-	max_ip = shader_obj->base.size / sizeof(uint64_t);
-
 	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
 	if (!validated_shader)
 		return NULL;
 
-	for (ip = 0; ip < max_ip; ip++) {
-		uint64_t inst = shader[ip];
+	for (ip = 0; ip < validation_state.max_ip; ip++) {
+		uint64_t inst = validation_state.shader[ip];
 		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 
+		validation_state.ip = ip;
+
 		switch (sig) {
 		case QPU_SIG_NONE:
 		case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -450,7 +456,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		case QPU_SIG_LOAD_TMU1:
 		case QPU_SIG_PROG_END:
 		case QPU_SIG_SMALL_IMM:
-			if (!check_instruction_writes(inst, validated_shader,
+			if (!check_instruction_writes(validated_shader,
 						      &validation_state)) {
 				DRM_ERROR("Bad write at ip %d\n", ip);
 				goto fail;
@@ -467,7 +473,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 			break;
 
 		case QPU_SIG_LOAD_IMM:
-			if (!check_instruction_writes(inst, validated_shader,
+			if (!check_instruction_writes(validated_shader,
 						      &validation_state)) {
 				DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip);
 				goto fail;
@@ -487,7 +493,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 			break;
 	}
 
-	if (ip == max_ip) {
+	if (ip == validation_state.max_ip) {
 		DRM_ERROR("shader failed to terminate before "
 			  "shader BO end at %zd\n",
 			  shader_obj->base.size);

From ac0196772c6d88d70055ed3cbbaa9d0c29628687 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Sat, 2 Jul 2016 10:10:24 -0700
Subject: [PATCH 11/17] drm/vc4: Add a bitmap of branch targets during shader
 validation.

This isn't used yet, it's just a first step toward loop validation.
During the main parsing of instructions, we need to know when we hit a
new basic block so that we can reset validated state.

v2: Fix a stray semicolon after an if block.  (caught by kbuild test).

Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 93aa9ae3e5523e49e4e5abacd4dbee0e4ab2d931)
---
 drivers/gpu/drm/vc4/vc4_qpu_defines.h      |  12 +++
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 114 ++++++++++++++++++++-
 2 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_qpu_defines.h b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
index 736fb9d8cd1f1c..f4f2f0dd523e3d 100644
--- a/drivers/gpu/drm/vc4/vc4_qpu_defines.h
+++ b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
@@ -230,6 +230,15 @@ enum qpu_unpack_r4 {
 #define QPU_COND_MUL_SHIFT              46
 #define QPU_COND_MUL_MASK               QPU_MASK(48, 46)
 
+#define QPU_BRANCH_COND_SHIFT           52
+#define QPU_BRANCH_COND_MASK            QPU_MASK(55, 52)
+
+#define QPU_BRANCH_REL                  ((uint64_t)1 << 51)
+#define QPU_BRANCH_REG                  ((uint64_t)1 << 50)
+
+#define QPU_BRANCH_RADDR_A_SHIFT        45
+#define QPU_BRANCH_RADDR_A_MASK         QPU_MASK(49, 45)
+
 #define QPU_SF                          ((uint64_t)1 << 45)
 
 #define QPU_WADDR_ADD_SHIFT             38
@@ -261,4 +270,7 @@ enum qpu_unpack_r4 {
 #define QPU_OP_ADD_SHIFT                24
 #define QPU_OP_ADD_MASK                 QPU_MASK(28, 24)
 
+#define QPU_BRANCH_TARGET_SHIFT         0
+#define QPU_BRANCH_TARGET_MASK          QPU_MASK(31, 0)
+
 #endif /* VC4_QPU_DEFINES_H */
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
index 464113adf8244c..9940674126e086 100644
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -59,6 +59,13 @@ struct vc4_shader_validation_state {
 	 */
 	uint32_t live_min_clamp_offsets[32 + 32 + 4];
 	bool live_max_clamp_regs[32 + 32 + 4];
+
+	/* Bitfield of which IPs are used as branch targets.
+	 *
+	 * Used for validation that the uniform stream is updated at the right
+	 * points and clearing the texturing/clamping state.
+	 */
+	unsigned long *branch_targets;
 };
 
 static uint32_t
@@ -418,13 +425,104 @@ check_instruction_reads(uint64_t inst,
 	return true;
 }
 
+/* Make sure that all branches are absolute and point within the shader, and
+ * note their targets for later.
+ */
+static bool
+vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
+{
+	uint32_t max_branch_target = 0;
+	bool found_shader_end = false;
+	int ip;
+	int shader_end_ip = 0;
+	int last_branch = -2;
+
+	for (ip = 0; ip < validation_state->max_ip; ip++) {
+		uint64_t inst = validation_state->shader[ip];
+		int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
+		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+		uint32_t after_delay_ip = ip + 4;
+		uint32_t branch_target_ip;
+
+		if (sig == QPU_SIG_PROG_END) {
+			shader_end_ip = ip;
+			found_shader_end = true;
+			continue;
+		}
+
+		if (sig != QPU_SIG_BRANCH)
+			continue;
+
+		if (ip - last_branch < 4) {
+			DRM_ERROR("Branch at %d during delay slots\n", ip);
+			return false;
+		}
+		last_branch = ip;
+
+		if (inst & QPU_BRANCH_REG) {
+			DRM_ERROR("branching from register relative "
+				  "not supported\n");
+			return false;
+		}
+
+		if (!(inst & QPU_BRANCH_REL)) {
+			DRM_ERROR("relative branching required\n");
+			return false;
+		}
+
+		/* The actual branch target is the instruction after the delay
+		 * slots, plus whatever byte offset is in the low 32 bits of
+		 * the instruction.  Make sure we're not branching beyond the
+		 * end of the shader object.
+		 */
+		if (branch_imm % sizeof(inst) != 0) {
+			DRM_ERROR("branch target not aligned\n");
+			return false;
+		}
+
+		branch_target_ip = after_delay_ip + (branch_imm >> 3);
+		if (branch_target_ip >= validation_state->max_ip) {
+			DRM_ERROR("Branch at %d outside of shader (ip %d/%d)\n",
+				  ip, branch_target_ip,
+				  validation_state->max_ip);
+			return false;
+		}
+		set_bit(branch_target_ip, validation_state->branch_targets);
+
+		/* Make sure that the non-branching path is also not outside
+		 * the shader.
+		 */
+		if (after_delay_ip >= validation_state->max_ip) {
+			DRM_ERROR("Branch at %d continues past shader end "
+				  "(%d/%d)\n",
+				  ip, after_delay_ip, validation_state->max_ip);
+			return false;
+		}
+		set_bit(after_delay_ip, validation_state->branch_targets);
+		max_branch_target = max(max_branch_target, after_delay_ip);
+
+		/* There are two delay slots after program end is signaled
+		 * that are still executed, then we're finished.
+		 */
+		if (found_shader_end && ip == shader_end_ip + 2)
+			break;
+	}
+
+	if (max_branch_target > shader_end_ip) {
+		DRM_ERROR("Branch landed after QPU_SIG_PROG_END");
+		return false;
+	}
+
+	return true;
+}
+
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 {
 	bool found_shader_end = false;
 	int shader_end_ip = 0;
 	uint32_t ip;
-	struct vc4_validated_shader_info *validated_shader;
+	struct vc4_validated_shader_info *validated_shader = NULL;
 	struct vc4_shader_validation_state validation_state;
 	int i;
 
@@ -437,9 +535,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 	for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
 		validation_state.live_min_clamp_offsets[i] = ~0;
 
+	validation_state.branch_targets =
+		kcalloc(BITS_TO_LONGS(validation_state.max_ip),
+			sizeof(unsigned long), GFP_KERNEL);
+	if (!validation_state.branch_targets)
+		goto fail;
+
 	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
 	if (!validated_shader)
-		return NULL;
+		goto fail;
+
+	if (!vc4_validate_branches(&validation_state))
+		goto fail;
 
 	for (ip = 0; ip < validation_state.max_ip; ip++) {
 		uint64_t inst = validation_state.shader[ip];
@@ -508,9 +615,12 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		(validated_shader->uniforms_size +
 		 4 * validated_shader->num_texture_samples);
 
+	kfree(validation_state.branch_targets);
+
 	return validated_shader;
 
 fail:
+	kfree(validation_state.branch_targets);
 	if (validated_shader) {
 		kfree(validated_shader->texture_samples);
 		kfree(validated_shader);

From c7ffcfac16add2977b14ed1487c7b9858953029e Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Sat, 2 Jul 2016 12:17:10 -0700
Subject: [PATCH 12/17] drm/vc4: Add support for branching in shader
 validation.

We're already checking that branch instructions are between the start
of the shader and the proper PROG_END sequence.  The other thing we
need to make branching safe is to verify that the shader doesn't read
past the end of the uniforms stream.

To do that, we require that at any basic block reading uniforms have
the following instructions:

load_imm temp, <next offset within uniform stream>
add unif_addr, temp, unif

The instructions are generated by userspace, and the kernel verifies
that the load_imm is of the expected offset, and that the add adds it
to a uniform.  We track which uniform in the stream that is, and at
draw call time fix up the uniform stream to have the address of the
start of the shader's uniforms at that location.

Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 6d45c81d229d71da54d374143e7d6abad4c0cf31)
---
 drivers/gpu/drm/vc4/vc4_drv.h              |   3 +
 drivers/gpu/drm/vc4/vc4_qpu_defines.h      |   3 +
 drivers/gpu/drm/vc4/vc4_validate.c         |  13 +-
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 281 +++++++++++++++++++--
 4 files changed, 283 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 24e269f136d9ba..f1f82735d3f8d7 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -363,6 +363,9 @@ struct vc4_validated_shader_info {
 	uint32_t uniforms_src_size;
 	uint32_t num_texture_samples;
 	struct vc4_texture_sample_info *texture_samples;
+
+	uint32_t num_uniform_addr_offsets;
+	uint32_t *uniform_addr_offsets;
 };
 
 /**
diff --git a/drivers/gpu/drm/vc4/vc4_qpu_defines.h b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
index f4f2f0dd523e3d..f4e795a0d3f612 100644
--- a/drivers/gpu/drm/vc4/vc4_qpu_defines.h
+++ b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
@@ -270,6 +270,9 @@ enum qpu_unpack_r4 {
 #define QPU_OP_ADD_SHIFT                24
 #define QPU_OP_ADD_MASK                 QPU_MASK(28, 24)
 
+#define QPU_LOAD_IMM_SHIFT              0
+#define QPU_LOAD_IMM_MASK               QPU_MASK(31, 0)
+
 #define QPU_BRANCH_TARGET_SHIFT         0
 #define QPU_BRANCH_TARGET_MASK          QPU_MASK(31, 0)
 
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index 24c2c746e8f397..9ce1d0adf8826e 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -802,7 +802,7 @@ validate_gl_shader_rec(struct drm_device *dev,
 		uint32_t src_offset = *(uint32_t *)(pkt_u + o);
 		uint32_t *texture_handles_u;
 		void *uniform_data_u;
-		uint32_t tex;
+		uint32_t tex, uni;
 
 		*(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset;
 
@@ -840,6 +840,17 @@ validate_gl_shader_rec(struct drm_device *dev,
 			}
 		}
 
+		/* Fill in the uniform slots that need this shader's
+		 * start-of-uniforms address (used for resetting the uniform
+		 * stream in the presence of control flow).
+		 */
+		for (uni = 0;
+		     uni < validated_shader->num_uniform_addr_offsets;
+		     uni++) {
+			uint32_t o = validated_shader->uniform_addr_offsets[uni];
+			((uint32_t *)exec->uniforms_v)[o] = exec->uniforms_p;
+		}
+
 		*(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
 
 		exec->uniforms_u += validated_shader->uniforms_src_size;
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
index 9940674126e086..46527e989ce378 100644
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -39,6 +39,8 @@
 #include "vc4_drv.h"
 #include "vc4_qpu_defines.h"
 
+#define LIVE_REG_COUNT (32 + 32 + 4)
+
 struct vc4_shader_validation_state {
 	/* Current IP being validated. */
 	uint32_t ip;
@@ -57,8 +59,9 @@ struct vc4_shader_validation_state {
 	 *
 	 * This is used for the validation of direct address memory reads.
 	 */
-	uint32_t live_min_clamp_offsets[32 + 32 + 4];
-	bool live_max_clamp_regs[32 + 32 + 4];
+	uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
+	bool live_max_clamp_regs[LIVE_REG_COUNT];
+	uint32_t live_immediates[LIVE_REG_COUNT];
 
 	/* Bitfield of which IPs are used as branch targets.
 	 *
@@ -66,6 +69,20 @@ struct vc4_shader_validation_state {
 	 * points and clearing the texturing/clamping state.
 	 */
 	unsigned long *branch_targets;
+
+	/* Set when entering a basic block, and cleared when the uniform
+	 * address update is found.  This is used to make sure that we don't
+	 * read uniforms when the address is undefined.
+	 */
+	bool needs_uniform_address_update;
+
+	/* Set when we find a backwards branch.  If the branch is backwards,
+	 * the taraget is probably doing an address reset to read uniforms,
+	 * and so we need to be sure that a uniforms address is present in the
+	 * stream, even if the shader didn't need to read uniforms in later
+	 * basic blocks.
+	 */
+	bool needs_uniform_address_for_loop;
 };
 
 static uint32_t
@@ -227,8 +244,14 @@ check_tmu_write(struct vc4_validated_shader_info *validated_shader,
 	/* Since direct uses a RADDR uniform reference, it will get counted in
 	 * check_instruction_reads()
 	 */
-	if (!is_direct)
+	if (!is_direct) {
+		if (validation_state->needs_uniform_address_update) {
+			DRM_ERROR("Texturing with undefined uniform address\n");
+			return false;
+		}
+
 		validated_shader->uniforms_size += 4;
+	}
 
 	if (submit) {
 		if (!record_texture_sample(validated_shader,
@@ -242,6 +265,98 @@ check_tmu_write(struct vc4_validated_shader_info *validated_shader,
 	return true;
 }
 
+static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
+{
+	uint32_t o = validated_shader->num_uniform_addr_offsets;
+	uint32_t num_uniforms = validated_shader->uniforms_size / 4;
+
+	validated_shader->uniform_addr_offsets =
+		krealloc(validated_shader->uniform_addr_offsets,
+			 (o + 1) *
+			 sizeof(*validated_shader->uniform_addr_offsets),
+			 GFP_KERNEL);
+	if (!validated_shader->uniform_addr_offsets)
+		return false;
+
+	validated_shader->uniform_addr_offsets[o] = num_uniforms;
+	validated_shader->num_uniform_addr_offsets++;
+
+	return true;
+}
+
+static bool
+validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
+			       struct vc4_shader_validation_state *validation_state,
+			       bool is_mul)
+{
+	uint64_t inst = validation_state->shader[validation_state->ip];
+	u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+	u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+	u32 add_lri = raddr_add_a_to_live_reg_index(inst);
+	/* We want our reset to be pointing at whatever uniform follows the
+	 * uniforms base address.
+	 */
+	u32 expected_offset = validated_shader->uniforms_size + 4;
+
+	/* We only support absolute uniform address changes, and we
+	 * require that they be in the current basic block before any
+	 * of its uniform reads.
+	 *
+	 * One could potentially emit more efficient QPU code, by
+	 * noticing that (say) an if statement does uniform control
+	 * flow for all threads and that the if reads the same number
+	 * of uniforms on each side.  However, this scheme is easy to
+	 * validate so it's all we allow for now.
+	 */
+
+	if (QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_NONE) {
+		DRM_ERROR("uniforms address change must be "
+			  "normal math\n");
+		return false;
+	}
+
+	if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
+		DRM_ERROR("Uniform address reset must be an ADD.\n");
+		return false;
+	}
+
+	if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
+		DRM_ERROR("Uniform address reset must be unconditional.\n");
+		return false;
+	}
+
+	if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
+	    !(inst & QPU_PM)) {
+		DRM_ERROR("No packing allowed on uniforms reset\n");
+		return false;
+	}
+
+	if (add_lri == -1) {
+		DRM_ERROR("First argument of uniform address write must be "
+			  "an immediate value.\n");
+		return false;
+	}
+
+	if (validation_state->live_immediates[add_lri] != expected_offset) {
+		DRM_ERROR("Resetting uniforms with offset %db instead of %db\n",
+			  validation_state->live_immediates[add_lri],
+			  expected_offset);
+		return false;
+	}
+
+	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+		DRM_ERROR("Second argument of uniform address write must be "
+			  "a uniform.\n");
+		return false;
+	}
+
+	validation_state->needs_uniform_address_update = false;
+	validation_state->needs_uniform_address_for_loop = false;
+	return require_uniform_address_uniform(validated_shader);
+}
+
 static bool
 check_reg_write(struct vc4_validated_shader_info *validated_shader,
 		struct vc4_shader_validation_state *validation_state,
@@ -251,14 +366,37 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader,
 	uint32_t waddr = (is_mul ?
 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+	bool ws = inst & QPU_WS;
+	bool is_b = is_mul ^ ws;
+	u32 lri = waddr_to_live_reg_index(waddr, is_b);
+
+	if (lri != -1) {
+		uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
+		uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);
+
+		if (sig == QPU_SIG_LOAD_IMM &&
+		    QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
+		    ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
+		     (!is_mul && cond_add == QPU_COND_ALWAYS))) {
+			validation_state->live_immediates[lri] =
+				QPU_GET_FIELD(inst, QPU_LOAD_IMM);
+		} else {
+			validation_state->live_immediates[lri] = ~0;
+		}
+	}
 
 	switch (waddr) {
 	case QPU_W_UNIFORMS_ADDRESS:
-		/* XXX: We'll probably need to support this for reladdr, but
-		 * it's definitely a security-related one.
-		 */
-		DRM_ERROR("uniforms address load unsupported\n");
-		return false;
+		if (is_b) {
+			DRM_ERROR("relative uniforms address change "
+				  "unsupported\n");
+			return false;
+		}
+
+		return validate_uniform_address_write(validated_shader,
+						      validation_state,
+						      is_mul);
 
 	case QPU_W_TLB_COLOR_MS:
 	case QPU_W_TLB_COLOR_ALL:
@@ -406,9 +544,35 @@ check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
 }
 
 static bool
-check_instruction_reads(uint64_t inst,
-			struct vc4_validated_shader_info *validated_shader)
+check_branch(uint64_t inst,
+	     struct vc4_validated_shader_info *validated_shader,
+	     struct vc4_shader_validation_state *validation_state,
+	     int ip)
+{
+	int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
+	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+
+	if ((int)branch_imm < 0)
+		validation_state->needs_uniform_address_for_loop = true;
+
+	/* We don't want to have to worry about validation of this, and
+	 * there's no need for it.
+	 */
+	if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
+		DRM_ERROR("branch instruction at %d wrote a register.\n",
+			  validation_state->ip);
+		return false;
+	}
+
+	return true;
+}
+
+static bool
+check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
+			struct vc4_shader_validation_state *validation_state)
 {
+	uint64_t inst = validation_state->shader[validation_state->ip];
 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
@@ -420,6 +584,12 @@ check_instruction_reads(uint64_t inst,
 		 * already be OOM.
 		 */
 		validated_shader->uniforms_size += 4;
+
+		if (validation_state->needs_uniform_address_update) {
+			DRM_ERROR("Uniform read with undefined uniform "
+				  "address\n");
+			return false;
+		}
 	}
 
 	return true;
@@ -516,6 +686,65 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
 	return true;
 }
 
+/* Resets any known state for the shader, used when we may be branched to from
+ * multiple locations in the program (or at shader start).
+ */
+static void
+reset_validation_state(struct vc4_shader_validation_state *validation_state)
+{
+	int i;
+
+	for (i = 0; i < 8; i++)
+		validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;
+
+	for (i = 0; i < LIVE_REG_COUNT; i++) {
+		validation_state->live_min_clamp_offsets[i] = ~0;
+		validation_state->live_max_clamp_regs[i] = false;
+		validation_state->live_immediates[i] = ~0;
+	}
+}
+
+static bool
+texturing_in_progress(struct vc4_shader_validation_state *validation_state)
+{
+	return (validation_state->tmu_write_count[0] != 0 ||
+		validation_state->tmu_write_count[1] != 0);
+}
+
+static bool
+vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
+{
+	uint32_t ip = validation_state->ip;
+
+	if (!test_bit(ip, validation_state->branch_targets))
+		return true;
+
+	if (texturing_in_progress(validation_state)) {
+		DRM_ERROR("Branch target landed during TMU setup\n");
+		return false;
+	}
+
+	/* Reset our live values tracking, since this instruction may have
+	 * multiple predecessors.
+	 *
+	 * One could potentially do analysis to determine that, for
+	 * example, all predecessors have a live max clamp in the same
+	 * register, but we don't bother with that.
+	 */
+	reset_validation_state(validation_state);
+
+	/* Since we've entered a basic block from potentially multiple
+	 * predecessors, we need the uniforms address to be updated before any
+	 * unforms are read.  We require that after any branch point, the next
+	 * uniform to be loaded is a uniform address offset.  That uniform's
+	 * offset will be marked by the uniform address register write
+	 * validation, or a one-off the end-of-program check.
+	 */
+	validation_state->needs_uniform_address_update = true;
+
+	return true;
+}
+
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 {
@@ -524,16 +753,12 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 	uint32_t ip;
 	struct vc4_validated_shader_info *validated_shader = NULL;
 	struct vc4_shader_validation_state validation_state;
-	int i;
 
 	memset(&validation_state, 0, sizeof(validation_state));
 	validation_state.shader = shader_obj->vaddr;
 	validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);
 
-	for (i = 0; i < 8; i++)
-		validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
-	for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
-		validation_state.live_min_clamp_offsets[i] = ~0;
+	reset_validation_state(&validation_state);
 
 	validation_state.branch_targets =
 		kcalloc(BITS_TO_LONGS(validation_state.max_ip),
@@ -554,6 +779,9 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 
 		validation_state.ip = ip;
 
+		if (!vc4_handle_branch_target(&validation_state))
+			goto fail;
+
 		switch (sig) {
 		case QPU_SIG_NONE:
 		case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -569,7 +797,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 				goto fail;
 			}
 
-			if (!check_instruction_reads(inst, validated_shader))
+			if (!check_instruction_reads(validated_shader,
+						     &validation_state))
 				goto fail;
 
 			if (sig == QPU_SIG_PROG_END) {
@@ -587,6 +816,11 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 			}
 			break;
 
+		case QPU_SIG_BRANCH:
+			if (!check_branch(inst, validated_shader,
+					  &validation_state, ip))
+				goto fail;
+			break;
 		default:
 			DRM_ERROR("Unsupported QPU signal %d at "
 				  "instruction %d\n", sig, ip);
@@ -607,6 +841,21 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 		goto fail;
 	}
 
+	/* If we did a backwards branch and we haven't emitted a uniforms
+	 * reset since then, we still need the uniforms stream to have the
+	 * uniforms address available so that the backwards branch can do its
+	 * uniforms reset.
+	 *
+	 * We could potentially prove that the backwards branch doesn't
+	 * contain any uses of uniforms until program exit, but that doesn't
+	 * seem to be worth the trouble.
+	 */
+	if (validation_state.needs_uniform_address_for_loop) {
+		if (!require_uniform_address_uniform(validated_shader))
+			goto fail;
+		validated_shader->uniforms_size += 4;
+	}
+
 	/* Again, no chance of integer overflow here because the worst case
 	 * scenario is 8 bytes of uniforms plus handles per 8-byte
 	 * instruction.

From a3c039184ae49cc7fed9ca5aeec4bdb70708d8e2 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Sat, 2 Jul 2016 14:14:27 -0700
Subject: [PATCH 13/17] drm/vc4: Add a getparam to signal support for branches.

Userspace needs to know if it can create shaders that do branching.
Otherwise, for backwards compatibility with old kernels it needs to
lower if statements to conditional assignments.

Signed-off-by: Eric Anholt <eric@anholt.net>
(cherry picked from commit 7363cee5b467c31dc3af2ac98df0634bb8bbc668)
---
 drivers/gpu/drm/vc4/vc4_drv.c | 3 +++
 include/uapi/drm/vc4_drm.h    | 1 +
 2 files changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 8edcf9a6fd1cbf..834fa9f031d103 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -97,6 +97,9 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
 		args->value = V3D_READ(V3D_IDENT2);
 		pm_runtime_put(&vc4->v3d->pdev->dev);
 		break;
+	case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
+		args->value = true;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", args->param);
 		return -EINVAL;
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index 1143e954048d1f..ad7edc3edf7ca1 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -285,6 +285,7 @@ struct drm_vc4_get_hang_state {
 #define DRM_VC4_PARAM_V3D_IDENT0		0
 #define DRM_VC4_PARAM_V3D_IDENT1		1
 #define DRM_VC4_PARAM_V3D_IDENT2		2
+#define DRM_VC4_PARAM_SUPPORTS_BRANCHES		3
 
 struct drm_vc4_get_param {
 	__u32 param;

From 07e3a19708fd94cbd461e6016f4e90ea73d13c11 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 2 Aug 2016 17:17:52 -0700
Subject: [PATCH 14/17] drm/vc4: Don't force new binner overflow allocation per
 draw.

This came from the initial bringup code, which always idled the GPU
and always reset the overflow.  That massively increases the size of
the working set when you're doing lots of small draws, though, as is
common on X desktops or piglit.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_gem.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index aa4517c294540a..d50de9c9c9ddb9 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -435,10 +435,6 @@ vc4_submit_next_bin_job(struct drm_device *dev)
 
 	vc4_flush_caches(dev);
 
-	/* Disable the binner's pre-loaded overflow memory address */
-	V3D_WRITE(V3D_BPOA, 0);
-	V3D_WRITE(V3D_BPOS, 0);
-
 	/* Either put the job in the binner if it uses the binner, or
 	 * immediately move it to the to-be-rendered queue.
 	 */

From 71cfcb02ef7153ec6c2fbc113cec886901afe937 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 19 Jul 2016 11:31:19 -0700
Subject: [PATCH 15/17] drm/vc4: Use drm_free_large() on handles to match its
 allocation.

If you managed to exceed the limit to switch to vmalloc, we'd use the
wrong free.

Signed-off-by: Eric Anholt <eric@anholt.net>
Fixes: d5b1a78a772f ("drm/vc4: Add support for drawing 3D frames.")
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/vc4/vc4_gem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index d50de9c9c9ddb9..f04c73f0687247 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -584,7 +584,7 @@ vc4_cl_lookup_bos(struct drm_device *dev,
 	spin_unlock(&file_priv->table_lock);
 
 fail:
-	kfree(handles);
+	drm_free_large(handles);
 	return 0;
 }
 

From 5474e11698b5336613ac9210bd13c7bbd5fe3593 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 25 Jul 2016 16:10:04 -0700
Subject: [PATCH 16/17] drm/vc4: Fix oops when userspace hands in a bad BO.

We'd end up NULL pointer dereferencing because we didn't take the
error path out in the parent.  Fixes igt vc4_lookup_fail test.

Signed-off-by: Eric Anholt <eric@anholt.net>
Fixes: d5b1a78a772f ("drm/vc4: Add support for drawing 3D frames.")
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/vc4/vc4_gem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index f04c73f0687247..8f021d422ed620 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -585,7 +585,7 @@ vc4_cl_lookup_bos(struct drm_device *dev,
 
 fail:
 	drm_free_large(handles);
-	return 0;
+	return ret;
 }
 
 static int

From eed354707164676dbb744b96f0ebaf0982012473 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 21 Jul 2016 13:39:11 -0700
Subject: [PATCH 17/17] drm/vc4: Fix overflow mem unreferencing when the binner
 runs dry.

Overflow memory handling is tricky: While it's still referenced by the
BPO registers, we want to keep it from being freed.  When we are
putting a new set of overflow memory in the registers, we need to
assign the old one to the last rendering job using it.

We were looking at "what's currently running in the binner", but since
the bin/render submission split, we may end up with the binner
completing and having no new job while the renderer is still
processing.  So, if we don't find a bin job at all, look at the
highest-seqno (last) render job to attach our overflow to.

Signed-off-by: Eric Anholt <eric@anholt.net>
Fixes: ca26d28bbaa3 ("drm/vc4: improve throughput by pipelining binning and rendering jobs")
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/vc4/vc4_drv.h | 9 +++++++++
 drivers/gpu/drm/vc4/vc4_irq.c | 4 +++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index f1f82735d3f8d7..312c8489bfb9cb 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -329,6 +329,15 @@ vc4_first_render_job(struct vc4_dev *vc4)
 				struct vc4_exec_info, head);
 }
 
+static inline struct vc4_exec_info *
+vc4_last_render_job(struct vc4_dev *vc4)
+{
+	if (list_empty(&vc4->render_job_list))
+		return NULL;
+	return list_last_entry(&vc4->render_job_list,
+			       struct vc4_exec_info, head);
+}
+
 /**
  * struct vc4_texture_sample_info - saves the offsets into the UBO for texture
  * setup parameters.
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index b0104a346a74eb..094bc6a475c177 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -83,8 +83,10 @@ vc4_overflow_mem_work(struct work_struct *work)
 
 		spin_lock_irqsave(&vc4->job_lock, irqflags);
 		current_exec = vc4_first_bin_job(vc4);
+		if (!current_exec)
+			current_exec = vc4_last_render_job(vc4);
 		if (current_exec) {
-			vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
+			vc4->overflow_mem->seqno = current_exec->seqno;
 			list_add_tail(&vc4->overflow_mem->unref_head,
 				      &current_exec->unref_list);
 			vc4->overflow_mem = NULL;