Use broadcasting to remove shared code

quantopian · Jul 19, 2016 · c9058e8 · c9058e8
1 parent 790e0d8
commit c9058e8
Showing 1 changed file with 22 additions and 27 deletions.
diff --git a/zipline/pipeline/factors/statistical.py b/zipline/pipeline/factors/statistical.py
@@ -1,4 +1,5 @@
 
+from numpy import broadcast_arrays
 from scipy.stats import (
     linregress,
     pearsonr,
@@ -73,15 +74,13 @@ class RollingPearson(_RollingCorrelation):
     instance of this class.
     """
     def compute(self, today, assets, out, base_data, target_data):
-        if target_data.shape[1] > 1:
-            # Both inputs are 2D, so compute sid-by-sid.
-            for i in range(len(out)):
-                out[i] = pearsonr(base_data[:, i], target_data[:, i])[0]
-        else:
-            # Second input is a slice, so always compute with its only column.
-            slice_data = target_data[:, 0]
-            for i in range(len(out)):
-                out[i] = pearsonr(base_data[:, i], slice_data)[0]
+        # If `target_data` is a Slice or single column of data, broadcast it
+        # out to the same shape as `base_data`, then compute column-wise. This
+        # is cheap because broadcasting makes no copy: the repeated columns of
+        # the result are views that all share the original column's memory.
+        target_data = broadcast_arrays(target_data, base_data)[0]
+        for i in range(len(out)):
+            out[i] = pearsonr(base_data[:, i], target_data[:, i])[0]
 
 
 class RollingSpearman(_RollingCorrelation):
@@ -119,15 +118,13 @@ class RollingSpearman(_RollingCorrelation):
     instance of this class.
     """
     def compute(self, today, assets, out, base_data, target_data):
-        if target_data.shape[1] > 1:
-            # Both inputs are 2D, so compute sid-by-sid.
-            for i in range(len(out)):
-                out[i] = spearmanr(base_data[:, i], target_data[:, i])[0]
-        else:
-            # Second input is a slice, so always compute with its only column.
-            slice_data = target_data[:, 0]
-            for i in range(len(out)):
-                out[i] = spearmanr(base_data[:, i], slice_data)[0]
+        # If `target_data` is a Slice or single column of data, broadcast it
+        # out to the same shape as `base_data`, then compute column-wise. This
+        # is cheap because broadcasting makes no copy: the repeated columns of
+        # the result are views that all share the original column's memory.
+        target_data = broadcast_arrays(target_data, base_data)[0]
+        for i in range(len(out)):
+            out[i] = spearmanr(base_data[:, i], target_data[:, i])[0]
 
 
 class RollingLinearRegression(CustomFactor, SingleInputMixin):
@@ -201,15 +198,13 @@ def regress(y, x):
             p_value[i] = regr_results[3]
             stderr[i] = regr_results[4]
 
-        if independent.shape[1] > 1:
-            # Both inputs are 2D, so compute sid-by-sid.
-            for i in range(len(out)):
-                regress(y=dependent[:, i], x=independent[:, i])
-        else:
-            # Second input is a slice, so always compute with its only column.
-            slice_data = independent[:, 0]
-            for i in range(len(out)):
-                regress(y=dependent[:, i], x=slice_data)
+        # If `independent` is a Slice or single column of data, broadcast it
+        # out to the same shape as `dependent`, then compute column-wise. This
+        # is cheap because broadcasting makes no copy: the repeated columns of
+        # the result are views that all share the original column's memory.
+        independent = broadcast_arrays(independent, dependent)[0]
+        for i in range(len(out)):
+            regress(y=dependent[:, i], x=independent[:, i])
 
 
 class RollingPearsonOfReturns(RollingPearson):