-
Notifications
You must be signed in to change notification settings - Fork 0
/
limitedbzip2.c
131 lines (105 loc) · 3.96 KB
/
limitedbzip2.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
// //////////////////////////////////////////////////////////
// limitedbzip2.c
// written by Stephan Brumme, 2021
// see https://create.stephan-brumme.com/length-limited-prefix-codes/
//
#include "limitedbzip2.h"
#include "moffat.h" // compute unlimited Huffman code lengths
#include <stdlib.h> // malloc/free/qsort
// the following code has shares many parts with the function moffat() in moffat.c
// (I avoid calling moffat() because it would keep sorting the same data again and again)
// helper struct for qsort()
struct KeyValue
{
unsigned int key;
unsigned int value;
};
// helper function for qsort()
static int compareKeyValue(const void* a, const void* b)
{
struct KeyValue* aa = (struct KeyValue*) a;
struct KeyValue* bb = (struct KeyValue*) b;
return aa->key - bb->key; // negative if a < b, zero if a == b, positive if a > b
}
/// adjust bit lengths based on the algorithm found in bzip2's sources
/** - histogram can be in any order and may contain zeros, the output is stored in a dedicated parameter
* @param maxLength maximum code length, e.g. 15 for DEFLATE or JPEG
* @param numCodes number of codes, equals the array size of histogram and codeLength
* @param histogram how often each code/symbol was found
* @param codeLength [out] computed code lengths
* @result actual maximum code length, 0 if error
*/
unsigned char limitedBzip2(unsigned char maxLength, unsigned int numCodes, const unsigned int histogram[], unsigned char codeLengths[])
{
// my allround variable for various loops
unsigned int i;
// count non-zero histogram values
unsigned int numNonZero = 0;
for (i = 0; i < numCodes; i++)
if (histogram[i] != 0)
numNonZero++;
// reject an empty alphabet because malloc(0) is undefined
if (numNonZero == 0)
return 0;
// initialize output
if (numNonZero < numCodes)
for (i = 0; i < numCodes; i++)
codeLengths[i] = 0;
// allocate a buffer for sorting the histogram
struct KeyValue* mapping = (struct KeyValue*) malloc(sizeof(struct KeyValue) * numNonZero);
// copy histogram to that buffer
unsigned int storeAt = 0;
for (i = 0; i < numCodes; i++)
{
// skip zeros
if (histogram[i] == 0)
continue;
mapping[storeAt].key = histogram[i];
mapping[storeAt].value = i;
storeAt++;
}
// now storeAt == numNonZero
// invoke C standard library's qsort
qsort(mapping, numNonZero, sizeof(struct KeyValue), compareKeyValue);
// extract ascendingly ordered histogram
unsigned int* sorted = (unsigned int*) malloc(sizeof(unsigned int) * numNonZero);
for (i = 0; i < numNonZero; i++)
sorted[i] = mapping[i].key;
// run Moffat algorithm ...
unsigned char result = moffatSortedInPlace(numNonZero, sorted);
// ... until a proper maximum code length is found
while (result > maxLength)
{
// more or less divide each weight by two while avoiding zero
for (i = 0; i < numNonZero; i++)
{
unsigned int weight = mapping[i].key;
// bzip2 "clears" the lowest 8 bits of the histogram
// to reach the length limit in less iterations
// but sacrifices lots of efficiency
// if you set EXTRA_SHIFT to 0 then the code may need more iterations
// but finds much better code lengths
//#define EXTRA_SHIFT 8
#define EXTRA_SHIFT 0
// sometimes dividing the weight by a bigger integer (e.g. 3)
// may lead to more efficient prefix codes
#define DIVIDE_BY 2
// the shifts do more harm than good in my opinion
weight >>= EXTRA_SHIFT;
// adding 1 avoids zero
weight = 1 + (weight / DIVIDE_BY);
weight <<= EXTRA_SHIFT;
// sorted was overwritten with code lengths
mapping[i].key = sorted[i] = weight;
}
// again: run Moffat algorithm
result = moffatSortedInPlace(numNonZero, sorted);
}
// restore original order
for (i = 0; i < numNonZero; i++)
codeLengths[mapping[i].value] = sorted[i];
// let it go ...
free(sorted);
free(mapping);
return result;
}