Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

CC: detect "no autovivification" and optimize aelemfast for it

Add blog post updates
  • Loading branch information...
commit cc90753d69000453856f4746fd885e058c30ff4b 1 parent 9b399bd
Reini Urban authored October 08, 2012
2  .gdbinit
... ...
@@ -1,6 +1,8 @@
1 1
 #directory /usr/src/perl/perl-5.10.1/perl-5.10.1
2 2
 #directory /usr/src/perl/perl-5.6.2
3 3
 
  4
+set breakpoint pending on
  5
+break XS_B__CC__autovivification
4 6
 break __asan_report_error
5 7
 
6 8
 define run10plc
45  C.xs
@@ -17,9 +17,14 @@ typedef struct magic  *B__MAGIC;
17 17
 #if PERL_VERSION >= 11
18 18
 typedef struct p5rx  *B__REGEXP;
19 19
 #endif
20  
-#if PERL_VERSION >= 15
21 20
 typedef COP  *B__COP;
22  
-#endif
  21
+
  22
+STATIC U32 a_hash = 0;
  23
+
  24
+typedef struct {
  25
+  U32 bits;
  26
+  IV  require_tag;
  27
+} a_hint_t;
23 28
 
24 29
 static int
25 30
 my_runops(pTHX)
@@ -132,6 +137,42 @@ COP_stashflags(o)
132 137
 
133 138
 #endif
134 139
 
  140
+MODULE = B__CC	PACKAGE = B::CC
  141
+
  142
+PROTOTYPES: DISABLE
  143
+
  144
+U32
  145
+_autovivification(cop)
  146
+	B::COP	cop
  147
+CODE:
  148
+    {
  149
+      SV *hint;
  150
+      IV h;
  151
+
  152
+      RETVAL = 1;
  153
+      if (PL_check[OP_PADSV] != MEMBER_TO_FPTR(Perl_ck_null)) {
  154
+	char *package = CopSTASHPV(cop);
  155
+#ifdef cop_hints_fetch_pvn
  156
+	hint = cop_hints_fetch_pvn(cop, "autovivification", strlen("autovivification"), a_hash, 0);
  157
+#elif PERL_VERSION > 9
  158
+	hint = Perl_refcounted_he_fetch(aTHX_ cop->cop_hints_hash,
  159
+					NULL, "autovivification", strlen("autovivification"), 0, a_hash);
  160
+#else
  161
+	SV **val = hv_fetch(GvHV(PL_hintgv), "autovivification", strlen("autovivification"), 0);
  162
+	if (!val)
  163
+	  return;
  164
+	hint = *val;
  165
+#endif
  166
+	if (!(hint && SvIOK(hint)))
  167
+	  return;
  168
+	h = SvIVX(hint);
  169
+	if (h & 4)  /* A_HINT_FETCH  4 */
  170
+	  RETVAL = 0;
  171
+      }
  172
+    }
  173
+OUTPUT:
  174
+  RETVAL
  175
+
135 176
 MODULE = B__C	PACKAGE = B::C
136 177
 
137 178
 PROTOTYPES: DISABLE
62  lib/B/CC.pm
@@ -362,6 +362,11 @@ sub CxTYPE_no_LOOP  {
362 362
     ? ( $_[0]->{type} < 4 or $_[0]->{type} > 7 )
363 363
     : $_[0]->{type} != 3
364 364
 }
  365
+if ($PERL510) {
  366
+  sub SVs_RMG {0x00800000}
  367
+} else {
  368
+  sub SVs_RMG {0x8000}
  369
+}
365 370
 
366 371
 # Could rewrite push_runtime() and output_runtime() to use a
367 372
 # temporary file if memory is at a premium.
@@ -841,6 +846,10 @@ sub reload_lexicals {
841 846
     $obj->[0] = $newval;
842 847
   }
843 848
 
  849
+  sub value {
  850
+    return $_[0]->[0];
  851
+  }
  852
+
844 853
   sub write_back {
845 854
     my $obj = shift;
846 855
     if ( !( $obj->[1] ) ) {
@@ -853,7 +862,8 @@ sub reload_lexicals {
853 862
 
854 863
 my $curcop = B::Shadow->new(
855 864
   sub {
856  
-    my $opsym = shift->save;
  865
+    my $op = shift;
  866
+    my $opsym = $op->save;
857 867
     runtime("PL_curcop = (COP*)$opsym;");
858 868
   }
859 869
 );
@@ -1407,7 +1417,7 @@ sub pp_nextstate {
1407 1417
   debug( sprintf( "%s:%d\n", $op->file, $op->line ) ) if $debug{lineno};
1408 1418
   debug( sprintf( "CopLABEL %s\n", $op->label ) ) if $op->label and $debug{cxstack};
1409 1419
   runtime("TAINT_NOT;") unless $omit_taint;
1410  
-  runtime("sp = PL_stack_base + cxstack[cxstack_ix].blk_oldsp;");
  1420
+  runtime("sp = PL_stack_base + cxstack[cxstack_ix].blk_oldsp;"); # TODO reset sp not needed always
1411 1421
   if ( $freetmps_each_bblock || $freetmps_each_loop ) {
1412 1422
     $need_freetmps = 1;
1413 1423
   }
@@ -1664,13 +1674,28 @@ sub pp_gvsv {
1664 1674
   return $op->next;
1665 1675
 }
1666 1676
 
  1677
+# check for faster fetch calls, returns 0 if "no autovivification" is in effect.
  1678
+sub autovivification {
  1679
+  if ($INC{'autovivification.pm'}) {
  1680
+    return _autovivification($curcop->[0]);
  1681
+  } else {
  1682
+    return 1;
  1683
+  }
  1684
+}
  1685
+
1667 1686
 # coverage: 16, issue44
1668 1687
 sub pp_aelemfast {
1669 1688
   my $op = shift;
1670  
-  my $av;
  1689
+  my ($av, $rmg);
1671 1690
   if ($op->flags & OPf_SPECIAL) {
1672 1691
     my $sv = $pad[ $op->targ ]->as_sv;
1673  
-    $av = $] > 5.01000 ? "MUTABLE_AV($sv)" : $sv;
  1692
+    my @c = comppadlist->ARRAY;
  1693
+    my @p = $c[1]->ARRAY;
  1694
+    my $lex = $p[ $op->targ ];
  1695
+    $rmg  = ($lex and ref $lex eq 'B::AV' and $lex->MAGICAL & SVs_RMG) ? 1 : 0;
  1696
+    # MUTABLE_AV is only needed to catch compiler const loss
  1697
+    # $av = $] > 5.01000 ? "MUTABLE_AV($sv)" : $sv;
  1698
+    $av = "(AV*)$sv";
1674 1699
   } else {
1675 1700
     my $gvsym;
1676 1701
     if ($ITHREADS) { #padop XXX if it's only a OP, no PADOP? t/CORE/op/ref.t test 36
@@ -1685,21 +1710,28 @@ sub pp_aelemfast {
1685 1710
       }
1686 1711
     }
1687 1712
     else { #svop
1688  
-      $gvsym = $op->gv->save;
  1713
+      my $gv = $op->gv;
  1714
+      $gvsym = $gv->save;
  1715
+      my $gvav = $gv->AV;
  1716
+      $rmg  = ($gvav and $gvav->MAGICAL & SVs_RMG) ? 1 : 0;
1689 1717
     }
1690 1718
     $av = "GvAV($gvsym)";
1691 1719
   }
1692 1720
   my $ix   = $op->private;
1693 1721
   my $lval = $op->flags & OPf_MOD;
1694  
-  write_back_stack();
1695  
-  runtime(
1696  
-    "{ AV* av = $av;",
1697  
-    "  SV** const svp = av_fetch(av, $ix, $lval);",
1698  
-    "  SV *sv = (svp ? *svp : &PL_sv_undef);",
1699  
-    !$lval ? "  if (SvRMAGICAL(av) && SvGMAGICAL(sv)) mg_get(sv);" : "",
1700  
-    "  PUSHs(sv);",
1701  
-    "}"
1702  
-  );
  1722
+  if (!$rmg and !autovivification()) {
  1723
+      runtime("PUSHs(AvARRAY($av)[$ix]);\t/* no autovivification */");
  1724
+  } else {
  1725
+    write_back_stack();
  1726
+    runtime(
  1727
+      "{ AV* av = $av;",
  1728
+      "  SV** const svp = av_fetch(av, $ix, $lval);",
  1729
+      "  SV *sv = (svp ? *svp : &PL_sv_undef);",
  1730
+      (!$lval and $rmg) ? "  if (SvRMAGICAL(av) && SvGMAGICAL(sv)) mg_get(sv);" : "",
  1731
+      "  PUSHs(sv);",
  1732
+      "}"
  1733
+    );
  1734
+  }
1703 1735
   return $op->next;
1704 1736
 }
1705 1737
 
@@ -3152,9 +3184,9 @@ OPTION:
3152 3184
       foreach my $ref ( values %optimise ) {
3153 3185
         $$ref = 0;
3154 3186
       }
  3187
+      $B::C::destruct = 0 unless $] < 5.008; # fast_destruct
3155 3188
       if ($arg >= 2) {
3156 3189
         $freetmps_each_loop = 1;
3157  
-        $B::C::destruct = 0 unless $] < 5.008; # fast_destruct
3158 3190
       }
3159 3191
       if ( $arg >= 1 ) {
3160 3192
         $type_attr = 1;
10  lib/B/Stackobj.pm
@@ -420,14 +420,16 @@ B<my vs our>: Note that only B<our> attributes are resolved at B<compile-time>,
420 420
 B<my> attributes are resolved at B<run-time>. So the compiler will only see
421 421
 type attributes for our variables.
422 422
 
423  
-See L<B::CC/load_pad> and L<Ctypes>.
  423
+See L<B::CC/load_pad> and L<types>.
424 424
 
425  
-TODO: To represent on this stack not only PADs,SV,IV,PV,NV,BOOL,Special
426  
-and a SV const, but also GV,CV,RV,AV,HV use B::Stackobj::Const.
  425
+TODO: Represent on this stack not only PADs,SV,IV,PV,NV,BOOL,Special
  426
+and a SV const, but also GV,CV,RV,AV,HV, esp. AELEM and HELEM.
  427
+Use B::Stackobj::Const.
427 428
 
428 429
 =head1 AUTHOR
429 430
 
430  
-Malcolm Beattie C<MICB at cpan.org> I<(retired)>
  431
+Malcolm Beattie C<MICB at cpan.org> I<(retired)>,
  432
+Reini Urban C<rurban at cpan.org>
431 433
 
432 434
 =cut
433 435
 
54  ramblings/blogs-optimizing-3.md
Source Rendered
@@ -126,18 +126,19 @@ Unrolled:
126 126
 
127 127
 So we went from **3.6s** down to **2.4s** and compiled to **1.3s**.
128 128
 
129  
-With N=50,000,000 we got **12m36.517s** uncompiled and **9m33.9213s**
130  
-compiled.  Close to jruby, even if the array accesses still goes
  129
+With N=50,000,000 we got **14m12.653s** uncompiled and **7m11.3597s**
  130
+compiled. Close to jruby, even if the array accesses still go
131 131
 through the `av_fetch` function, magic is checked and undefined indices
132 132
 are autovivified.
133 133
 
  134
+
134 135
 Generalization
135 136
 --------------
136 137
 
137 138
 The above macro code looks pretty unreadable, similar to lisp
138 139
 macros, with its mix of quoted and unquoted variables.  The compiler
139 140
 needs to detect unrollable loop code which will lead to more
140  
-constants and AELEMFAST ops. And we need to define a helper function
  141
+constants and AELEMFAST ops. And we had better define a helper function
141 142
 for easier generation of such unrolled loops.
142 143
 
143 144
     # unquote local vars
@@ -192,7 +193,7 @@ A naive optimization would check the index ranges beforehand, and access
192 193
 the array values directly. Something the type optimizer for arrays would
193 194
 do.
194 195
 
195  
-    my (num @xs[4], num @ys[4], num @zs[4]);
  196
+    my (num @xs[4],  num @ys[4],  num @zs[4]);
196 197
     my (num @vxs[4], num @vys[4], num @vzs[4]);
197 198
     my num @mass[4];
198 199
 
@@ -221,22 +222,22 @@ It should compile to:
221 222
 
222 223
 With the size declaration you can omit the `av_fetch()` call and undef
223 224
 check ("autovivification"), with the type `num` you do not need to get
224  
-to the `SvNVX` of the array element, the value is stored directly, and
  225
+to the `SvNV` of the array element, the value is stored directly, and
225 226
 the type also guarantees that there is no magic to be checked.  So
226  
-`AvARRAY(PL_curpad[6])[0]` returns a double.
  227
+`AvARRAY(PL_curpad[6])[0]` would return a double.
227 228
 
228 229
 And the stack handling (PUSH, PUSH, POP, POP) can also be optimized
229 230
 away, since the ops are inlined already.  That would get us close to
230 231
 an optimizing compiler as with Haskell, Lua, PyPy or LISP. Not close
231 232
 to Go or Java, as their languages are stricter.
232 233
 
233  
-I tried a simple B::CC AELEMFAST optimization together with "no autovificication"
  234
+I tried a simple B::CC AELEMFAST optimization together with "no autovivification"
234 235
 which does not yet eliminate superfluous PUSH/POP pairs but could be applied
235 236
 for typed arrays and leads to another 2x win.
236 237
 
237  
-2.80s down to 1.67s on a slower PC with N=50000.
  238
+2.80s down to 1.67s on a slower PC with N=50,000.
238 239
 
239  
-Compiled to:
  240
+Compiled to *(perlcc /2a)*:
240 241
 
241 242
     PUSHs(AvARRAY(PL_curpad[6])[0]);
242 243
     PUSHs(AvARRAY(PL_curpad[6])[1]);
@@ -244,8 +245,10 @@ Compiled to:
244 245
     d30_tmp = rnv0 * lnv0;
245 246
 
246 247
 Without superfluous PUSH/POP pairs I suspect another 2x win. But this
247  
-is not implemented yet.
248  
-It should look like:
  248
+is not implemented yet. With typed arrays maybe another 50% win, and we don't
  249
+need the no autovivification overhead.
  250
+
  251
+It should look like *(perlcc /2b)*:
249 252
 
250 253
     rnv0 = SvNV(AvARRAY(PL_curpad[6])[0]);
251 254
     lnv0 = SvNV(AvARRAY(PL_curpad[6])[1]);
@@ -253,3 +256,32 @@ It should look like:
253 256
 
254 257
 I'm just implementing the check for the 'no autovivification' pragma and
255 258
 the stack optimizations.
  259
+
  260
+Summary
  261
+-------
  262
+
  263
+[u64q nbody](http://shootout.alioth.debian.org/u64q/performance.php?test=nbody)
  264
+
  265
+Original numbers with N=50,000,000:
  266
+
  267
+    * Fortran       14.09s
  268
+    * C             20.72s
  269
+    * Go            32.11s
  270
+    * SBCL          42.75s
  271
+    * JRuby       8m
  272
+    * PHP        11m
  273
+    * Python 3   16m
  274
+    * Perl       23m
  275
+    * Ruby 1.9   26m
  276
+
  277
+My numbers with N=50,000,000:
  278
+
  279
+    * Perl       22m14s
  280
+    * Perl 1     21m48s         (inline sub advance, no ENTERSUB/LEAVESUB)
  281
+    * perlcc      9m52s
  282
+    * Perl 2    14m13s          (unrolled loop + AELEM => AELEMFAST)
  283
+    * perlcc 2   7m11s
  284
+    * perlcc 2a  4m52s          (no autovivification, 4.5x faster)
  285
+    * perlcc 2b  ? (~2m30)      (no autovivification + stack opt)
  286
+    * perlcc 2c  ? (~1m25s)     (typed arrays + stack opt)
  287
+
111  ramblings/blogs-optimizing-4.md
Source Rendered
... ...
@@ -0,0 +1,111 @@
  1
+nbody - More optimizations
  2
+--------------------------
  3
+
  4
+In the [first part](http://blogs.perl.org/users/rurban/2012/09/optimizing-compiler-benchmarks-part-1.html)
  5
+I showed some problems and possibilities of the B::C compiler and
  6
+B::CC optimizing compiler with a regexp example which was very hard to
  7
+optimize.
  8
+
  9
+In the [second part](http://blogs.perl.org/users/rurban/2012/10/optimizing-compiler-benchmarks-part-2.html)
  10
+I got 2 times faster run-times with the B::CC compiler with the
  11
+[nbody](http://shootout.alioth.debian.org/u32/performance.php?test=nbody) benchmark, which does a lot of arithmetic.
  12
+
  13
+In the [third part](http://blogs.perl.org/users/rurban/2012/10/optimizing-compiler-benchmarks-part-3.html)
  14
+I got 4.5 times faster run-times with perl-level AELEMFAST optimizations, and discussed optimising array accesses
  15
+via no autovivification or types.
  16
+
  17
+Optimising array accesses showed the need for autovivification detection in B::CC and better stack
  18
+handling for more ops and datatypes, esp. aelem and helem. 
  19
+
  20
+But first let's study some easier goals to accomplish. If we look at
  21
+the generated C source for a simple arithmetic function, like
  22
+`pp_sub_offset_momentum` we immediately detect more possibilities.
  23
+
  24
+    static
  25
+    CCPP(pp_sub_offset_momentum)
  26
+    {
  27
+    	SV *sv, *src, *dst, *left, *right;
  28
+    	NV rnv0, lnv0, d1_px, d2_py, d3_pz, d4_mass, d7_tmp, d10_tmp, d13_tmp, d15_tmp, d17_tmp, d19_tmp, d21_tmp, d23_tmp, d25_tmp, d27_tmp, d29_tmp, d31_tmp, d33_tmp, d35_tmp, d37_tmp, d40_tmp, d42_tmp, d44_tmp;
  29
+    	PERL_CONTEXT *cx;
  30
+    	MAGIC *mg;
  31
+    	I32 oldsave, gimme;
  32
+    	dSP;
  33
+      lab_2a41220:
  34
+    	TAINT_NOT;                 /* only needed once */
  35
+    	sp = PL_stack_base + cxstack[cxstack_ix].blk_oldsp; /* only needed once */
  36
+    	FREETMPS;                  /* only needed once */
  37
+    	SAVECLEARSV(PL_curpad[1]); /* not needed at all */
  38
+    	d1_px = 0.00;
  39
+      lab_2a41370:
  40
+    	TAINT_NOT;                 /* only needed once */
  41
+    	sp = PL_stack_base + cxstack[cxstack_ix].blk_oldsp; /* unneeded */
  42
+    	FREETMPS;                  /* only needed once */
  43
+    	SAVECLEARSV(PL_curpad[2]); /* not needed at all */
  44
+    	d2_py = 0.00;
  45
+      lab_2a50a00:
  46
+    	TAINT_NOT;                 /* only needed once */
  47
+    	sp = PL_stack_base + cxstack[cxstack_ix].blk_oldsp; /* unneeded */
  48
+    	FREETMPS;                  /* only needed once */
  49
+    	SAVECLEARSV(PL_curpad[3]); /* not needed at all */
  50
+    	d3_pz = 0.00;
  51
+      lab_2a50b30:
  52
+    	TAINT_NOT;                 /* only needed once */
  53
+    	sp = PL_stack_base + cxstack[cxstack_ix].blk_oldsp; /* unneeded */
  54
+    	FREETMPS;                  /* only needed once */
  55
+    	SAVECLEARSV(PL_curpad[4]); /* not needed at all */
  56
+      lab_2a50cc0:
  57
+    	TAINT_NOT;                 /* only needed once */
  58
+    	sp = PL_stack_base + cxstack[cxstack_ix].blk_oldsp; /* unneeded */
  59
+    	FREETMPS;                  /* only needed once */
  60
+    	PUSHs(AvARRAY(MUTABLE_AV(PL_curpad[5]))[0]);	/* no autovivification */
  61
+    	sv = POPs;
  62
+    	MAYBE_TAINT_SASSIGN_SRC(sv);    /* not needed */
  63
+    	SvSetMagicSV(PL_curpad[4], sv); /* i.e. PL_curpad[4] = sv; */
  64
+        ...
  65
+
  66
+We can study the expanded macros with:
  67
+
  68
+    cc_harness -DOPT -E -O2 -onbody.perl-2.perl-1.i nbody.perl-2.perl.c
  69
+
  70
+`TAINT_NOT` does `(PL_tainted = (0))`. It is needed only once, because nobody
  71
+changes `PL_tainted`. We can also ignore taint checks generally by setting `-fomit_taint`.
  72
+
  73
+    perl -MO=Concise,offset_momentum nbody.perl-2a.perl
  74
+
  75
+    main::offset_momentum:
  76
+    42 <1> leavesub[1 ref] K/REFC,1 ->(end)
  77
+    -     <@> lineseq KP ->42
  78
+    1        <;> nextstate(main 141 (eval 5):4) v ->2
  79
+    4        <2> sassign vKS/2 ->5
  80
+    2           <$> const(NV 0) s ->3
  81
+    3           <0> padsv[$px:141,145] sRM*/LVINTRO ->4
  82
+    ...
  83
+
  84
+`sp = PL_stack_base + cxstack[cxstack_ix].blk_oldsp;` is the 2nd part of the inlined code for 
  85
+`nextstate` and resets the stack pointer. As we keep track of the stack by ourselves we can
  86
+omit most of these resets in nextstate.
  87
+
  88
+`FREETMPS` is also part of `nextstate`, and calling it after each basic
  89
+block is optimized by -O1, and -O2 would free the temps after each
  90
+loop.  Whether FREETMPS is needed at all, i.e. whether locals are used in the
  91
+function at all, is not checked yet.
  92
+
  93
+`SAVECLEARSV(PL_curpad[1-4])` is part of `padsv /LVINTRO`, but here unneeded, since
  94
+it is in the context of sassign. So the value of the lexical does not need to be cleared
  95
+before it is set. And btw. the setter of the lexical is already optimized to a temporary.
  96
+
  97
+`MAYBE_TAINT_SASSIGN_SRC(sv)` is part of `sassign` and can be omitted with `-fomit_taint`,
  98
+and since we are at `TAINT_NOT` we can leave it out.
  99
+
  100
+`SvSetMagicSV(PL_curpad[4], sv)` is also part of the optimized `sassign` op, just not
  101
+yet optimized enough, since sv cannot have any magic. A type declaration for the `padsv`
  102
+would have used the faster equivalent `SvNV(PL_curpad[4]) = SvNV(sv);` put on the stack.
  103
+
  104
+We can easily test this out by NOP'ing these code sections and seeing the costs.
  105
+
  106
+With 4m53.073s, without 4m23.265s. 30 seconds or ~10% faster. This is now in the typical
  107
+range of p5p micro-optimizations and not considered high-priority for now.
  108
+
  109
+Let's rather check out more stack optimizations.
  110
+
  111
+*TBC...*

0 notes on commit cc90753

Please sign in to comment.
Something went wrong with that request. Please try again.