Permalink
Cannot retrieve contributors at this time
704 lines (574 sloc)
25.9 KB
| #include "ert.h" | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #include <math.h> | |
| #include "utils.h" | |
| #define ERT_FAVOR_CONT_AND_REP0_MATCHES (1) | |
| #define ERT_FAVOR_REP0_MATCHES (0) | |
| namespace ert | |
| { | |
| const uint32_t MAX_BLOCK_PIXELS = 12 * 12; | |
| const uint32_t MAX_BLOCK_SIZE_IN_BYTES = 256; | |
| const uint32_t MIN_MATCH_LEN = 3; | |
| const float LITERAL_BITS = 13.0f; | |
| const float MATCH_CONTINUE_BITS = 1.0f; | |
| const float MATCH_REP0_BITS = 4.0f; | |
| static inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } | |
| template<typename F> inline F lerp(F a, F b, F s) { return a + (b - a) * s; } | |
| static const uint8_t g_tdefl_small_dist_extra[512] = | |
| { | |
| 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, | |
| 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, | |
| 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, | |
| 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, | |
| 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, | |
| 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, | |
| 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, | |
| 7, 7, 7, 7, 7, 7, 7, 7 | |
| }; | |
| static const uint8_t g_tdefl_large_dist_extra[128] = | |
| { | |
| 0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, | |
| 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, | |
| 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 | |
| }; | |
| static inline uint32_t compute_match_cost_estimate(uint32_t dist, uint32_t match_len_in_bytes) | |
| { | |
| assert(match_len_in_bytes <= 258); | |
| uint32_t len_cost = 6; | |
| if (match_len_in_bytes >= 12) | |
| len_cost = 9; | |
| else if (match_len_in_bytes >= 8) | |
| len_cost = 8; | |
| else if (match_len_in_bytes >= 6) | |
| len_cost = 7; | |
| uint32_t dist_cost = 5; | |
| if (dist < 512) | |
| dist_cost += g_tdefl_small_dist_extra[dist & 511]; | |
| else | |
| { | |
| dist_cost += g_tdefl_large_dist_extra[std::min<uint32_t>(dist, 32767) >> 8]; | |
| while (dist >= 32768) | |
| { | |
| dist_cost++; | |
| dist >>= 1; | |
| } | |
| } | |
| return len_cost + dist_cost; | |
| } | |
| class tracked_stat | |
| { | |
| public: | |
| tracked_stat() { clear(); } | |
| void clear() { m_num = 0; m_total = 0; m_total2 = 0; } | |
| void update(uint32_t val) { m_num++; m_total += val; m_total2 += val * val; } | |
| tracked_stat& operator += (uint32_t val) { update(val); return *this; } | |
| uint32_t get_number_of_values() { return m_num; } | |
| uint64_t get_total() const { return m_total; } | |
| uint64_t get_total2() const { return m_total2; } | |
| float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }; | |
| float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; } | |
| float get_variance() const { float s = get_std_dev(); return s * s; } | |
| private: | |
| uint32_t m_num; | |
| uint64_t m_total; | |
| uint64_t m_total2; | |
| }; | |
| static inline float compute_block_max_std_dev(const color_rgba* pPixels, uint32_t block_width, uint32_t block_height, uint32_t num_comps) | |
| { | |
| tracked_stat comp_stats[4]; | |
| for (uint32_t y = 0; y < block_height; y++) | |
| { | |
| for (uint32_t x = 0; x < block_width; x++) | |
| { | |
| const color_rgba* pPixel = pPixels + x + y * block_width; | |
| for (uint32_t c = 0; c < num_comps; c++) | |
| comp_stats[c].update(pPixel->m_c[c]); | |
| } | |
| } | |
| float max_std_dev = 0.0f; | |
| for (uint32_t i = 0; i < num_comps; i++) | |
| max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev()); | |
| return max_std_dev; | |
| } | |
| static inline float compute_block_mse(const color_rgba* pPixelsA, const color_rgba* pPixelsB, uint32_t block_width, uint32_t block_height, uint32_t total_block_pixels, uint32_t num_comps, const uint32_t weights[4], float one_over_total_color_weight) | |
| { | |
| uint64_t total_err = 0; | |
| if ((block_width == 4) && (block_height == 4) && (num_comps == 4)) | |
| { | |
| if ((weights[0] == 1) && (weights[1] == 1) && (weights[2] == 1) && (weights[3] == 1)) | |
| { | |
| for (uint32_t i = 0; i < 16; i++) | |
| { | |
| const color_rgba* pA = pPixelsA + i; | |
| const color_rgba* pB = pPixelsB + i; | |
| const int dr = pA->m_c[0] - pB->m_c[0]; | |
| const int dg = pA->m_c[1] - pB->m_c[1]; | |
| const int db = pA->m_c[2] - pB->m_c[2]; | |
| const int da = pA->m_c[3] - pB->m_c[3]; | |
| total_err += dr * dr + dg * dg + db * db + da * da; | |
| } | |
| } | |
| else | |
| { | |
| for (uint32_t i = 0; i < 16; i++) | |
| { | |
| const color_rgba* pA = pPixelsA + i; | |
| const color_rgba* pB = pPixelsB + i; | |
| const int dr = pA->m_c[0] - pB->m_c[0]; | |
| const int dg = pA->m_c[1] - pB->m_c[1]; | |
| const int db = pA->m_c[2] - pB->m_c[2]; | |
| const int da = pA->m_c[3] - pB->m_c[3]; | |
| total_err += weights[0] * dr * dr + weights[1] * dg * dg + weights[2] * db * db + weights[3] * da * da; | |
| } | |
| } | |
| } | |
| else if ((block_width == 4) && (block_height == 4) && (num_comps == 3)) | |
| { | |
| for (uint32_t y = 0; y < 4; y++) | |
| { | |
| const uint32_t y_ofs = y * 4; | |
| for (uint32_t x = 0; x < 4; x++) | |
| { | |
| const color_rgba* pA = pPixelsA + x + y_ofs; | |
| const color_rgba* pB = pPixelsB + x + y_ofs; | |
| const int dr = pA->m_c[0] - pB->m_c[0]; | |
| const int dg = pA->m_c[1] - pB->m_c[1]; | |
| const int db = pA->m_c[2] - pB->m_c[2]; | |
| total_err += weights[0] * dr * dr + weights[1] * dg * dg + weights[2] * db * db; | |
| } | |
| } | |
| } | |
| else if ((block_width == 4) && (block_height == 4) && (num_comps == 2)) | |
| { | |
| for (uint32_t y = 0; y < 4; y++) | |
| { | |
| const uint32_t y_ofs = y * 4; | |
| for (uint32_t x = 0; x < 4; x++) | |
| { | |
| const color_rgba* pA = pPixelsA + x + y_ofs; | |
| const color_rgba* pB = pPixelsB + x + y_ofs; | |
| const int dr = pA->m_c[0] - pB->m_c[0]; | |
| const int dg = pA->m_c[1] - pB->m_c[1]; | |
| total_err += weights[0] * dr * dr + weights[1] * dg * dg; | |
| } | |
| } | |
| } | |
| else if ((block_width == 4) && (block_height == 4) && (num_comps == 1)) | |
| { | |
| for (uint32_t y = 0; y < 4; y++) | |
| { | |
| const uint32_t y_ofs = y * 4; | |
| for (uint32_t x = 0; x < 4; x++) | |
| { | |
| const color_rgba* pA = pPixelsA + x + y_ofs; | |
| const color_rgba* pB = pPixelsB + x + y_ofs; | |
| const int dr = pA->m_c[0] - pB->m_c[0]; | |
| total_err += weights[0] * dr * dr; | |
| } | |
| } | |
| } | |
| else | |
| { | |
| for (uint32_t y = 0; y < block_height; y++) | |
| { | |
| const uint32_t y_ofs = y * block_width; | |
| for (uint32_t x = 0; x < block_width; x++) | |
| { | |
| const color_rgba* pA = pPixelsA + x + y_ofs; | |
| const color_rgba* pB = pPixelsB + x + y_ofs; | |
| for (uint32_t c = 0; c < num_comps; c++) | |
| { | |
| const int d = pA->m_c[c] - pB->m_c[c]; | |
| total_err += weights[c] * d * d; | |
| } | |
| } | |
| } | |
| } | |
| return total_err * (one_over_total_color_weight / total_block_pixels); | |
| } | |
| uint32_t hash_hsieh(const uint8_t* pBuf, size_t len, uint32_t salt) | |
| { | |
| if (!pBuf || !len) | |
| return 0; | |
| uint32_t h = static_cast<uint32_t>(len + (salt << 16)); | |
| const uint32_t bytes_left = len & 3; | |
| len >>= 2; | |
| while (len--) | |
| { | |
| const uint16_t* pWords = reinterpret_cast<const uint16_t*>(pBuf); | |
| h += pWords[0]; | |
| const uint32_t t = (pWords[1] << 11) ^ h; | |
| h = (h << 16) ^ t; | |
| pBuf += sizeof(uint32_t); | |
| h += h >> 11; | |
| } | |
| switch (bytes_left) | |
| { | |
| case 1: | |
| h += *reinterpret_cast<const signed char*>(pBuf); | |
| h ^= h << 10; | |
| h += h >> 1; | |
| break; | |
| case 2: | |
| h += *reinterpret_cast<const uint16_t*>(pBuf); | |
| h ^= h << 11; | |
| h += h >> 17; | |
| break; | |
| case 3: | |
| h += *reinterpret_cast<const uint16_t*>(pBuf); | |
| h ^= h << 16; | |
| h ^= (static_cast<signed char>(pBuf[sizeof(uint16_t)])) << 18; | |
| h += h >> 11; | |
| break; | |
| default: | |
| break; | |
| } | |
| h ^= h << 3; | |
| h += h >> 5; | |
| h ^= h << 4; | |
| h += h >> 17; | |
| h ^= h << 25; | |
| h += h >> 6; | |
| return h; | |
| } | |
| // BC7 entropy reduction transform with Deflate/LZMA/LZHAM optimizations | |
| bool reduce_entropy(void* pBlocks, uint32_t num_blocks, | |
| uint32_t total_block_stride_in_bytes, uint32_t block_size_to_optimize_in_bytes, uint32_t block_width, uint32_t block_height, uint32_t num_comps, | |
| const color_rgba* pBlock_pixels, const reduce_entropy_params& params, uint32_t& total_modified, | |
| pUnpack_block_func pUnpack_block_func, void* pUnpack_block_func_user_data, | |
| std::vector<float>* pBlock_mse_scales) | |
| { | |
| assert(total_block_stride_in_bytes && block_size_to_optimize_in_bytes); | |
| assert(total_block_stride_in_bytes >= block_size_to_optimize_in_bytes); | |
| assert(num_comps >= 1 && num_comps <= 4); | |
| for (uint32_t i = num_comps; i < 4; i++) | |
| { | |
| assert(!params.m_color_weights[i]); | |
| if (params.m_color_weights[i]) | |
| return false; | |
| } | |
| const uint32_t total_color_weight = params.m_color_weights[0] + params.m_color_weights[1] + params.m_color_weights[2] + params.m_color_weights[3]; | |
| assert(total_color_weight); | |
| const float one_over_total_color_weight = 1.0f / total_color_weight; | |
| assert((block_size_to_optimize_in_bytes >= MIN_MATCH_LEN) && (block_size_to_optimize_in_bytes <= MAX_BLOCK_SIZE_IN_BYTES)); | |
| if ((block_size_to_optimize_in_bytes < MIN_MATCH_LEN) || (block_size_to_optimize_in_bytes > MAX_BLOCK_SIZE_IN_BYTES)) | |
| return false; | |
| uint8_t* pBlock_bytes = (uint8_t*)pBlocks; | |
| const uint32_t total_block_pixels = block_width * block_height; | |
| if (total_block_pixels > MAX_BLOCK_PIXELS) | |
| return false; | |
| const int total_blocks_to_check = std::max<uint32_t>(1U, params.m_lookback_window_size / total_block_stride_in_bytes); | |
| std::vector<uint32_t> len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1); | |
| std::vector<uint32_t> second_len_hist(MAX_BLOCK_SIZE_IN_BYTES + 1); | |
| uint32_t total_second_matches = 0; | |
| int prev_match_window_ofs_to_favor_cont = -1, prev_match_dist_to_favor = -1; | |
| uint32_t total_smooth_blocks = 0; | |
| const uint32_t HASH_SIZE = 8192; | |
| uint32_t hash[HASH_SIZE]; | |
| for (uint32_t block_index = 0; block_index < num_blocks; block_index++) | |
| { | |
| if ((block_index & 0xFF) == 0) | |
| memset(hash, 0, sizeof(hash)); | |
| uint8_t* pOrig_block = &pBlock_bytes[block_index * total_block_stride_in_bytes]; | |
| const color_rgba* pPixels = &pBlock_pixels[block_index * total_block_pixels]; | |
| color_rgba decoded_block[MAX_BLOCK_PIXELS]; | |
| if (!(*pUnpack_block_func)(pOrig_block, decoded_block, block_index, pUnpack_block_func_user_data)) | |
| return false; | |
| float cur_mse = compute_block_mse(pPixels, decoded_block, block_width, block_height, total_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight); | |
| if ((params.m_skip_zero_mse_blocks) && (cur_mse == 0.0f)) | |
| continue; | |
| const float max_std_dev = compute_block_max_std_dev(pPixels, block_width, block_height, num_comps); | |
| float yl = clampf(max_std_dev / params.m_max_smooth_block_std_dev, 0.0f, 1.0f); | |
| yl = yl * yl; | |
| float smooth_block_mse_scale = lerp(params.m_smooth_block_max_mse_scale, 1.0f, yl); | |
| if (pBlock_mse_scales) | |
| { | |
| if ((*pBlock_mse_scales)[block_index] > 0.0f) | |
| { | |
| smooth_block_mse_scale = (*pBlock_mse_scales)[block_index]; | |
| } | |
| } | |
| if (smooth_block_mse_scale > 1.0f) | |
| total_smooth_blocks++; | |
| float cur_bits = (LITERAL_BITS * block_size_to_optimize_in_bytes); | |
| float cur_t = cur_mse * smooth_block_mse_scale + cur_bits * params.m_lambda; | |
| int first_block_to_check = std::max<int>(0, block_index - total_blocks_to_check); | |
| int last_block_to_check = block_index - 1; | |
| uint8_t best_block[MAX_BLOCK_SIZE_IN_BYTES]; | |
| memcpy(best_block, pOrig_block, block_size_to_optimize_in_bytes); | |
| float best_t = cur_t; | |
| uint32_t best_match_len = 0, best_match_src_window_ofs = 0, best_match_dst_window_ofs = 0, best_match_src_block_ofs = 0, best_match_dst_block_ofs = 0; | |
| float best_match_bits = 0; | |
| // Don't let thresh_ms_err be 0 to let zero error blocks have slightly increased distortion | |
| const float thresh_ms_err = params.m_max_allowed_rms_increase_ratio * params.m_max_allowed_rms_increase_ratio * std::max(cur_mse, 1.0f); | |
| for (int prev_block_index = last_block_to_check; prev_block_index >= first_block_to_check; --prev_block_index) | |
| { | |
| const uint8_t* pPrev_blk = &pBlock_bytes[prev_block_index * total_block_stride_in_bytes]; | |
| for (uint32_t len = block_size_to_optimize_in_bytes; len >= MIN_MATCH_LEN; len--) | |
| { | |
| if (params.m_allow_relative_movement) | |
| { | |
| for (uint32_t src_ofs = 0; src_ofs <= (block_size_to_optimize_in_bytes - len); src_ofs++) | |
| { | |
| assert(len + src_ofs <= block_size_to_optimize_in_bytes); | |
| const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + src_ofs; | |
| for (uint32_t dst_ofs = 0; dst_ofs <= (block_size_to_optimize_in_bytes - len); dst_ofs++) | |
| { | |
| assert(len + dst_ofs <= block_size_to_optimize_in_bytes); | |
| const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + dst_ofs; | |
| const uint32_t match_dist = dst_match_window_ofs - src_match_window_ofs; | |
| float trial_match_bits, trial_total_bits; | |
| uint32_t hs = hash_hsieh(pPrev_blk + src_ofs, len, dst_ofs); | |
| #if ERT_FAVOR_CONT_AND_REP0_MATCHES | |
| // Continue a previous match (which would cross block boundaries) | |
| if (((int)src_match_window_ofs == prev_match_window_ofs_to_favor_cont) && (dst_ofs == 0)) | |
| { | |
| trial_match_bits = MATCH_CONTINUE_BITS; | |
| trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_CONTINUE_BITS; | |
| } | |
| // Exploit REP0 matches | |
| else if ((prev_match_dist_to_favor != -1) && (src_match_window_ofs == (dst_match_window_ofs - prev_match_dist_to_favor))) | |
| { | |
| trial_match_bits = MATCH_REP0_BITS; | |
| trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_REP0_BITS; | |
| } | |
| else | |
| { | |
| trial_match_bits = (float)compute_match_cost_estimate(match_dist, len); | |
| trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + trial_match_bits; | |
| uint32_t hash_check = hash[hs & (HASH_SIZE - 1)]; | |
| if ((hash_check & 0xFF) == (block_index & 0xFF)) | |
| { | |
| if ((hash_check >> 8) == (hs >> 8)) | |
| continue; | |
| } | |
| } | |
| #else | |
| uint32_t hash_check = hash[hs & (HASH_SIZE - 1)]; | |
| if ((hash_check & 0xFF) == (block_index & 0xFF)) | |
| { | |
| if ((hash_check >> 8) == (hs >> 8)) | |
| continue; | |
| } | |
| #endif | |
| hash[hs & (HASH_SIZE - 1)] = (hs & 0xFFFFFF00) | (block_index & 0xFF); | |
| const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda; | |
| uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES]; | |
| memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes); | |
| memcpy(trial_block + dst_ofs, pPrev_blk + src_ofs, len); | |
| color_rgba decoded_trial_block[MAX_BLOCK_PIXELS]; | |
| if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, pUnpack_block_func_user_data)) | |
| continue; | |
| float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, total_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight); | |
| if (trial_mse < thresh_ms_err) | |
| { | |
| float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda; | |
| if (t < best_t) | |
| { | |
| best_t = t; | |
| memcpy(best_block, trial_block, block_size_to_optimize_in_bytes); | |
| best_match_len = len; | |
| best_match_src_window_ofs = src_match_window_ofs; | |
| best_match_dst_window_ofs = dst_match_window_ofs; | |
| best_match_src_block_ofs = src_ofs; | |
| best_match_dst_block_ofs = dst_ofs; | |
| best_match_bits = trial_match_bits; | |
| } | |
| } | |
| } // dst_ofs | |
| } // src_ofs | |
| } | |
| else | |
| { | |
| const uint32_t match_dist = (block_index - prev_block_index) * total_block_stride_in_bytes; | |
| // Assume the block has 1 match and block_size_to_optimize_in_bytes-match_len literals. | |
| const float trial_match_bits = (float)compute_match_cost_estimate(match_dist, len); | |
| const float trial_total_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + trial_match_bits; | |
| const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda; | |
| for (uint32_t ofs = 0; ofs <= (block_size_to_optimize_in_bytes - len); ofs++) | |
| { | |
| assert(len + ofs <= block_size_to_optimize_in_bytes); | |
| const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + ofs; | |
| const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + ofs; | |
| float trial_match_bits_to_use = trial_match_bits; | |
| float trial_total_bits_times_lambda_to_use = trial_total_bits_times_lambda; | |
| uint32_t hs = hash_hsieh(pPrev_blk + ofs, len, ofs); | |
| #if ERT_FAVOR_CONT_AND_REP0_MATCHES | |
| // Continue a previous match (which would cross block boundaries) | |
| if (((int)src_match_window_ofs == prev_match_window_ofs_to_favor_cont) && (ofs == 0)) | |
| { | |
| float continue_match_trial_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_CONTINUE_BITS; | |
| trial_match_bits_to_use = MATCH_CONTINUE_BITS; | |
| trial_total_bits_times_lambda_to_use = continue_match_trial_bits * params.m_lambda; | |
| } | |
| // Exploit REP0 matches | |
| else if ((prev_match_dist_to_favor != -1) && (src_match_window_ofs == (dst_match_window_ofs - prev_match_dist_to_favor))) | |
| { | |
| float continue_match_trial_bits = (block_size_to_optimize_in_bytes - len) * LITERAL_BITS + MATCH_REP0_BITS; | |
| trial_match_bits_to_use = MATCH_REP0_BITS; | |
| trial_total_bits_times_lambda_to_use = continue_match_trial_bits * params.m_lambda; | |
| } | |
| else | |
| { | |
| uint32_t hash_check = hash[hs & (HASH_SIZE - 1)]; | |
| if ((hash_check & 0xFF) == (block_index & 0xFF)) | |
| { | |
| if ((hash_check >> 8) == (hs >> 8)) | |
| continue; | |
| } | |
| } | |
| #else | |
| uint32_t hash_check = hash[hs & (HASH_SIZE - 1)]; | |
| if ((hash_check & 0xFF) == (block_index & 0xFF)) | |
| { | |
| if ((hash_check >> 8) == (hs >> 8)) | |
| continue; | |
| } | |
| #endif | |
| hash[hs & (HASH_SIZE - 1)] = (hs & 0xFFFFFF00) | (block_index & 0xFF); | |
| uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES]; | |
| memcpy(trial_block, pOrig_block, block_size_to_optimize_in_bytes); | |
| memcpy(trial_block + ofs, pPrev_blk + ofs, len); | |
| color_rgba decoded_trial_block[MAX_BLOCK_PIXELS]; | |
| if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, pUnpack_block_func_user_data)) | |
| continue; | |
| float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, total_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight); | |
| if (trial_mse < thresh_ms_err) | |
| { | |
| float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda_to_use; | |
| if (t < best_t) | |
| { | |
| best_t = t; | |
| memcpy(best_block, trial_block, block_size_to_optimize_in_bytes); | |
| best_match_len = len; | |
| best_match_src_window_ofs = src_match_window_ofs; | |
| best_match_dst_window_ofs = dst_match_window_ofs; | |
| best_match_src_block_ofs = ofs; | |
| best_match_dst_block_ofs = ofs; | |
| best_match_bits = trial_match_bits_to_use; | |
| } | |
| } | |
| } // ofs | |
| } | |
| } // len | |
| } // prev_block_index | |
| if (best_t < cur_t) | |
| { | |
| uint32_t best_second_match_len = 0, best_second_match_src_window_ofs = 0, best_second_match_dst_window_ofs = 0, best_second_match_src_block_ofs = 0, best_second_match_dst_block_ofs = 0; | |
| // Try injecting a second match, being sure it does't overlap with the first. | |
| if ((params.m_try_two_matches) && (best_match_len <= (block_size_to_optimize_in_bytes - 3))) | |
| { | |
| uint8_t matched_flags[MAX_BLOCK_SIZE_IN_BYTES]; | |
| memset(matched_flags, 0, sizeof(matched_flags)); | |
| memset(matched_flags + best_match_dst_block_ofs, 1, best_match_len); | |
| uint8_t orig_best_block[MAX_BLOCK_SIZE_IN_BYTES]; | |
| memcpy(orig_best_block, best_block, block_size_to_optimize_in_bytes); | |
| for (int prev_block_index = last_block_to_check; prev_block_index >= first_block_to_check; --prev_block_index) | |
| { | |
| const uint8_t* pPrev_blk = &pBlock_bytes[prev_block_index * total_block_stride_in_bytes]; | |
| const uint32_t match_dist = (block_index - prev_block_index) * total_block_stride_in_bytes; | |
| for (uint32_t len = 3; len <= (block_size_to_optimize_in_bytes - best_match_len); len++) | |
| { | |
| const float trial_total_bits = (block_size_to_optimize_in_bytes - len - best_match_len) * LITERAL_BITS + compute_match_cost_estimate(match_dist, len) + best_match_bits; | |
| const float trial_total_bits_times_lambda = trial_total_bits * params.m_lambda; | |
| for (uint32_t ofs = 0; ofs <= (block_size_to_optimize_in_bytes - len); ofs++) | |
| { | |
| int i; | |
| for (i = 0; i < (int)len; i++) | |
| if (matched_flags[ofs + i]) | |
| break; | |
| if (i != (int)len) | |
| continue; | |
| assert(len + ofs <= block_size_to_optimize_in_bytes); | |
| const uint32_t dst_match_window_ofs = block_index * total_block_stride_in_bytes + ofs; | |
| const uint32_t src_match_window_ofs = prev_block_index * total_block_stride_in_bytes + ofs; | |
| uint8_t trial_block[MAX_BLOCK_SIZE_IN_BYTES]; | |
| memcpy(trial_block, orig_best_block, block_size_to_optimize_in_bytes); | |
| memcpy(trial_block + ofs, pPrev_blk + ofs, len); | |
| color_rgba decoded_trial_block[MAX_BLOCK_PIXELS]; | |
| if (!(*pUnpack_block_func)(trial_block, decoded_trial_block, block_index, pUnpack_block_func_user_data)) | |
| continue; | |
| float trial_mse = compute_block_mse(pPixels, decoded_trial_block, block_width, block_height, total_block_pixels, num_comps, params.m_color_weights, one_over_total_color_weight); | |
| if (trial_mse < thresh_ms_err) | |
| { | |
| float t = trial_mse * smooth_block_mse_scale + trial_total_bits_times_lambda; | |
| if (t < best_t) | |
| { | |
| best_t = t; | |
| memcpy(best_block, trial_block, block_size_to_optimize_in_bytes); | |
| best_second_match_len = len; | |
| best_second_match_src_window_ofs = src_match_window_ofs; | |
| best_second_match_dst_window_ofs = dst_match_window_ofs; | |
| best_second_match_src_block_ofs = ofs; | |
| best_second_match_dst_block_ofs = ofs; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| memcpy(pOrig_block, best_block, block_size_to_optimize_in_bytes); | |
| total_modified++; | |
| if ((best_second_match_len == 0) || (best_match_dst_window_ofs > best_second_match_dst_window_ofs)) | |
| { | |
| int best_match_dist = best_match_dst_window_ofs - best_match_src_window_ofs; | |
| assert(best_match_dist >= 1); | |
| (void)best_match_dist; | |
| if (block_size_to_optimize_in_bytes == total_block_stride_in_bytes) | |
| { | |
| // If the match goes all the way to the end of a block, we can try to continue it on the next encoded block. | |
| if ((best_match_dst_block_ofs + best_match_len) == total_block_stride_in_bytes) | |
| prev_match_window_ofs_to_favor_cont = best_match_src_window_ofs + best_match_len; | |
| else | |
| prev_match_window_ofs_to_favor_cont = -1; | |
| } | |
| #if ERT_FAVOR_REP0_MATCHES | |
| // Compute the window offset where a cheaper REP0 match would be available | |
| prev_match_dist_to_favor = best_match_dist; | |
| #endif | |
| } | |
| else | |
| { | |
| int best_match_dist = best_second_match_dst_window_ofs - best_second_match_src_window_ofs; | |
| assert(best_match_dist >= 1); | |
| (void)best_match_dist; | |
| if (block_size_to_optimize_in_bytes == total_block_stride_in_bytes) | |
| { | |
| // If the match goes all the way to the end of a block, we can try to continue it on the next encoded block. | |
| if ((best_second_match_dst_block_ofs + best_second_match_len) == total_block_stride_in_bytes) | |
| prev_match_window_ofs_to_favor_cont = best_second_match_src_window_ofs + best_second_match_len; | |
| else | |
| prev_match_window_ofs_to_favor_cont = -1; | |
| } | |
| #if ERT_FAVOR_REP0_MATCHES | |
| // Compute the window offset where a cheaper REP0 match would be available | |
| prev_match_dist_to_favor = best_match_dist; | |
| #endif | |
| } | |
| len_hist[best_match_len]++; | |
| if (best_second_match_len) | |
| { | |
| second_len_hist[best_second_match_len]++; | |
| total_second_matches++; | |
| } | |
| } | |
| else | |
| { | |
| prev_match_window_ofs_to_favor_cont = -1; | |
| } | |
| } // block_index | |
| if (params.m_debug_output) | |
| { | |
| printf("Total smooth blocks: %3.2f%%\n", total_smooth_blocks * 100.0f / num_blocks); | |
| printf("Match length histogram:\n"); | |
| for (uint32_t i = MIN_MATCH_LEN; i <= block_size_to_optimize_in_bytes; i++) | |
| printf("%u%c", len_hist[i], (i < block_size_to_optimize_in_bytes) ? ',' : '\n'); | |
| printf("Total second matches: %u %3.2f%%\n", total_second_matches, total_second_matches * 100.0f / num_blocks); | |
| printf("Secod match length histogram:\n"); | |
| for (uint32_t i = MIN_MATCH_LEN; i <= block_size_to_optimize_in_bytes; i++) | |
| printf("%u%c", second_len_hist[i], (i < block_size_to_optimize_in_bytes) ? ',' : '\n'); | |
| } | |
| return true; | |
| } | |
| } // namespace ert |