diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..969a94d
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,58 @@
+# cmake_minimum_required() has to come before project(): it establishes the
+# policy defaults that project() relies on. The <min>...<max> range keeps old
+# CMake releases working while opting in to the behaviour of newer ones.
+cmake_minimum_required(VERSION 2.8.12...3.29)
+project(bc7enc C CXX)
+
+# BUILD_X64=OFF adds -m32 on non-MSVC compilers to produce a 32-bit binary;
+# ON (the default) leaves the toolchain's native target alone.
+option(BUILD_X64 "build 64-bit" TRUE)
+
+message(STATUS "Initial BUILD_X64=${BUILD_X64}")
+message(STATUS "Initial CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+
+# Default to Release, but only for single-config generators: multi-config
+# generators (Visual Studio, Xcode) ignore CMAKE_BUILD_TYPE.
+if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+message(STATUS "${PROJECT_NAME} build type: ${CMAKE_BUILD_TYPE}")
+
+if(BUILD_X64)
+  message(STATUS "Building 64-bit")
+else()
+  message(STATUS "Building 32-bit")
+endif()
+
+# -m32 must reach both the compile and the link step, so it stays in the global
+# flag variables; each language appends to its own variable.
+if(NOT MSVC AND NOT BUILD_X64)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m32")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
+endif()
+
+add_executable(bc7enc
+  bc7enc.cpp
+  lodepng.cpp
+  bc7decomp.c
+  bc7enc16.c
+)
+
+# _DEBUG is defined for Debug configurations on every compiler; the generator
+# expression also works with multi-config generators.
+target_compile_definitions(bc7enc PRIVATE $<$<CONFIG:Debug>:_DEBUG>)
+
+if(NOT MSVC)
+  # -fno-strict-aliasing shouldn't be necessary, it's here because that is what
+  # MSVC uses by default and that's what has been tested the most.
+  # The options are attached to the target once, so they are not repeated on
+  # the command line through both the base and the per-config flag variables.
+  # Debug info (-g) is already part of CMake's default Debug flags for GCC/Clang.
+  target_compile_options(bc7enc PRIVATE -fno-strict-aliasing -Wall -Wextra)
+
+  # libm is a separate library on Unix-like toolchains.
+  target_link_libraries(bc7enc PRIVATE m)
+endif()
+
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6c93a9d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,76 @@
+bc7enc16.c/.h is available under 2 licenses -- choose whichever you prefer:
+
+ALTERNATIVE A for bc7enc16.c/.h - MIT License
+Copyright (c) 2018 Richard Geldreich, Jr.
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B for bc7enc16.c/.h - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+
+bc7decomp.c/.h: Copyright (c) 2015 Harm Hanemaaijer <fgenfb@yahoo.com>
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+------------------------------------------------------------------------------
+
+LodePNG version 20161127
+
+Copyright (c) 2005-2016 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+	 1. The origin of this software must not be misrepresented; you must not
+	 claim that you wrote the original software. If you use this software
+	 in a product, an acknowledgment in the product documentation would be
+	 appreciated but is not required.
+
+	 2. Altered source versions must be plainly marked as such, and must not be
+	 misrepresented as being the original software.
+
+	 3. This notice may not be removed or altered from any source
+	 distribution.
+	 
\ No newline at end of file
diff --git a/README b/README
new file mode 100644
index 0000000..d41f8a6
--- /dev/null
+++ b/README
@@ -0,0 +1,11 @@
+bc7enc16 - Fast, single source file BC7/BPTC GPU texture encoder with perceptual colorspace metric support
+
+<work in progress - only tested under Windows with VS2015 so far>
+
+Supports modes 1 and 6. This is a strong opaque texture encoder, with basic
+support for alpha channels (using mode 6). If alpha is highly correlated
+with RGB, or it's relatively simple (think straightforward masks where
+lots of blocks are either all-transparent or all-opaque), it should work great.
+For complex alpha channels more modes (such as 4, 5, or maybe 7) are necessary.
+
+
diff --git a/bc7decomp.c b/bc7decomp.c
new file mode 100644
index 0000000..0693332
--- /dev/null
+++ b/bc7decomp.c
@@ -0,0 +1,808 @@
+/*
+Copyright (c) 2015 Harm Hanemaaijer <fgenfb@yahoo.com>
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+// Modified by Rich Geldreich 4/26/18 - fixed bugs in detexBlock128ExtractBits() and FullyDecodeEndpoints(),
+// compared vs. DirectXTex's BC7 decoder for correctness.
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <memory.h>
+#include "bc7decomp.h"
+
+// A 128-bit compressed block held as two 64-bit words, plus a cursor giving the
+// position of the next bit to read. Used by the BPTC (BC7) decoder below.
+
+typedef struct {
+	uint64_t data0;	// Bits 0..63 of the 128-bit block.
+	uint64_t data1;	// Bits 64..127 of the 128-bit block.
+	int index;	// Position of the next unread bit; advanced by the extraction helpers.
+} detexBlock128;
+
+uint32_t detexBlock128ExtractBits(detexBlock128 *block, int nu_bits) {
+	// Reads nu_bits bits starting at block->index (bits 0..63 live in data0, bits
+	// 64..127 in data1) and returns them packed LSB-first: the first bit read ends
+	// up in bit 0 of the result. block->index is advanced by nu_bits.
+	uint64_t gathered = 0;
+	int out_pos = 0;
+	while (out_pos < nu_bits) {
+		const int cursor = block->index;
+		uint64_t source_bit;
+		// Bring the addressed bit down to bit 0 of a scratch word.
+		if (cursor < 64)
+			source_bit = (block->data0 >> cursor) & 1;
+		else
+			source_bit = (block->data1 >> (cursor - 64)) & 1;
+		// Then move it up to its slot in the result.
+		gathered |= source_bit << out_pos;
+		block->index = cursor + 1;
+		out_pos++;
+	}
+	// Note: nothing here detects the cursor running past bit 127 (block overflow).
+	// Only the low 32 bits of the gathered value are returned.
+	return (uint32_t)gathered;
+}
+
+static DETEX_INLINE_ONLY uint32_t detexPixel32GetR8(uint32_t pixel) {
+	return pixel & 0xFFu;	// Red: bits 0..7.
+}
+
+static DETEX_INLINE_ONLY uint32_t detexPixel32GetG8(uint32_t pixel) {
+	return (pixel >> 8) & 0xFFu;	// Green: bits 8..15.
+}
+
+static DETEX_INLINE_ONLY uint32_t detexPixel32GetB8(uint32_t pixel) {
+	return (pixel >> 16) & 0xFFu;	// Blue: bits 16..23.
+}
+
+static DETEX_INLINE_ONLY uint32_t detexPixel32GetA8(uint32_t pixel) {
+	return pixel >> 24;	// Alpha: bits 24..31 (nothing above them to mask off).
+}
+
+static DETEX_INLINE_ONLY uint32_t detexPack32R8(int r) {
+	return (uint32_t)r;	// Red occupies bits 0..7; the value is not masked.
+}
+
+static DETEX_INLINE_ONLY uint32_t detexPack32G8(int g) {
+	return ((uint32_t)g) << 8;	// Green occupies bits 8..15.
+}
+
+static DETEX_INLINE_ONLY uint32_t detexPack32B8(int b) {
+	return ((uint32_t)b) << 16;	// Blue occupies bits 16..23.
+}
+
+static DETEX_INLINE_ONLY uint32_t detexPack32A8(int a) {
+	return ((uint32_t)a) << 24;	// Alpha occupies bits 24..31.
+}
+
+// Combine four channel values into one pixel with R in the low byte and A in the high byte.
+static DETEX_INLINE_ONLY uint32_t detexPack32RGBA8(int r, int g, int b, int a) {
+	return detexPack32R8(r) | detexPack32G8(g) | detexPack32B8(b) | detexPack32A8(a);
+}
+
+uint32_t detexBlock128ExtractBits(detexBlock128 *block, int nu_bits);	// Re-declaration only; the function is already defined earlier in this file.
+
+/* Return bits bit0..bit1 (inclusive, bit 0 = LSB) of a 64-bit word, shifted down to bit 0. */
+static DETEX_INLINE_ONLY uint32_t detexGetBits64(uint64_t data, int bit0, int bit1) {
+	// All-ones mask covering bits 0..bit1. Shifting the full mask right avoids the
+	// out-of-range 64-bit left shift that (1 << (bit1 + 1)) would need when bit1 == 63.
+	const uint64_t keep_low = UINT64_MAX >> (63 - bit1);
+	// Drop everything above bit1, then discard the bits below bit0.
+	const uint64_t field = (data & keep_low) >> bit0;
+	// The field is truncated to 32 bits on return.
+	return (uint32_t)field;
+}
+
+const uint8_t detex_bptc_table_P2[64 * 16] = {	// Subset (0 or 1) of each pixel for the 2-subset modes; read as [partition_set_id * 16 + i].
+	0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,
+	0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,
+	0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,
+	0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,
+	0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,
+	0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,
+	0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1,
+	0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
+	0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,
+	0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,
+	0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,
+	0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,
+	0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,
+	0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,
+	0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,
+	0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
+	0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1,
+	0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,
+	0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,
+	0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,
+	0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,
+	0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,
+	0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,
+	0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,
+	0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,
+	0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,
+	0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,
+	0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,
+	0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,
+	0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0,
+	0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,
+	0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,
+	0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,
+	0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0,
+	0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0,
+	0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,
+	0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,
+	0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,
+	0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,
+	0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,
+	0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0,
+	0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0,
+	0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,
+	0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,
+	0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1,
+	0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1,
+	0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,
+	0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,
+	0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,
+	0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,
+	0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,
+	0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,
+	0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1,
+	0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0,
+	0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0,
+	0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,
+	0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1,
+	0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1,
+	0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,
+	0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1,
+	0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,
+	0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,
+	0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
+};
+
+const uint8_t detex_bptc_table_P3[64 * 16] = {	// Subset (0..2) of each pixel for the 3-subset modes; read as [partition_set_id * 16 + i].
+	0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2,
+	0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1,
+	0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1,
+	0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1,
+	0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2,
+	0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2,
+	0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1,
+	0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1,
+	0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,
+	0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+	0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2,
+	0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,
+	0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2,
+	0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2,
+	0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2,
+	0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0,
+	0,0,0,1,0,0,1,1,0,1,1,2,1,1,2,2,
+	0,1,1,1,0,0,1,1,2,0,0,1,2,2,0,0,
+	0,0,0,0,1,1,2,2,1,1,2,2,1,1,2,2,
+	0,0,2,2,0,0,2,2,0,0,2,2,1,1,1,1,
+	0,1,1,1,0,1,1,1,0,2,2,2,0,2,2,2,
+	0,0,0,1,0,0,0,1,2,2,2,1,2,2,2,1,
+	0,0,0,0,0,0,1,1,0,1,2,2,0,1,2,2,
+	0,0,0,0,1,1,0,0,2,2,1,0,2,2,1,0,
+	0,1,2,2,0,1,2,2,0,0,1,1,0,0,0,0,
+	0,0,1,2,0,0,1,2,1,1,2,2,2,2,2,2,
+	0,1,1,0,1,2,2,1,1,2,2,1,0,1,1,0,
+	0,0,0,0,0,1,1,0,1,2,2,1,1,2,2,1,
+	0,0,2,2,1,1,0,2,1,1,0,2,0,0,2,2,
+	0,1,1,0,0,1,1,0,2,0,0,2,2,2,2,2,
+	0,0,1,1,0,1,2,2,0,1,2,2,0,0,1,1,
+	0,0,0,0,2,0,0,0,2,2,1,1,2,2,2,1,
+	0,0,0,0,0,0,0,2,1,1,2,2,1,2,2,2,
+	0,2,2,2,0,0,2,2,0,0,1,2,0,0,1,1,
+	0,0,1,1,0,0,1,2,0,0,2,2,0,2,2,2,
+	0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,0,
+	0,0,0,0,1,1,1,1,2,2,2,2,0,0,0,0,
+	0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0,
+	0,1,2,0,2,0,1,2,1,2,0,1,0,1,2,0,
+	0,0,1,1,2,2,0,0,1,1,2,2,0,0,1,1,
+	0,0,1,1,1,1,2,2,2,2,0,0,0,0,1,1,
+	0,1,0,1,0,1,0,1,2,2,2,2,2,2,2,2,
+	0,0,0,0,0,0,0,0,2,1,2,1,2,1,2,1,
+	0,0,2,2,1,1,2,2,0,0,2,2,1,1,2,2,
+	0,0,2,2,0,0,1,1,0,0,2,2,0,0,1,1,
+	0,2,2,0,1,2,2,1,0,2,2,0,1,2,2,1,
+	0,1,0,1,2,2,2,2,2,2,2,2,0,1,0,1,
+	0,0,0,0,2,1,2,1,2,1,2,1,2,1,2,1,
+	0,1,0,1,0,1,0,1,0,1,0,1,2,2,2,2,
+	0,2,2,2,0,1,1,1,0,2,2,2,0,1,1,1,
+	0,0,0,2,1,1,1,2,0,0,0,2,1,1,1,2,
+	0,0,0,0,2,1,1,2,2,1,1,2,2,1,1,2,
+	0,2,2,2,0,1,1,1,0,1,1,1,0,2,2,2,
+	0,0,0,2,1,1,1,2,1,1,1,2,0,0,0,2,
+	0,1,1,0,0,1,1,0,0,1,1,0,2,2,2,2,
+	0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,2,
+	0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2,
+	0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2,
+	0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2,
+	0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2,
+	0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1,
+	0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2,
+	0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2,
+	0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0,
+};
+
+const uint8_t detex_bptc_table_anchor_index_second_subset[64] = {	// Anchor pixel of subset 1 when there are two subsets; indexed by partition_set_id.
+	15,15,15,15,15,15,15,15,
+	15,15,15,15,15,15,15,15,
+	15, 2, 8, 2, 2, 8, 8,15,
+	2, 8, 2, 2, 8, 8, 2, 2,
+	15,15, 6, 8, 2, 8,15,15,
+	2, 8, 2, 2, 2,15,15, 6,
+	6, 2, 6, 8,15,15, 2, 2,
+	15,15,15,15,15, 2, 2,15
+};
+
+const uint8_t detex_bptc_table_anchor_index_second_subset_of_three[64] = {	// Anchor pixel of subset 1 when there are three subsets; indexed by partition_set_id.
+	3, 3,15,15, 8, 3,15,15,
+	8, 8, 6, 6, 6, 5, 3, 3,
+	3, 3, 8,15, 3, 3, 6,10,
+	5, 8, 8, 6, 8, 5,15,15,
+	8,15, 3, 5, 6,10, 8,15,
+	15, 3,15, 5,15,15,15,15,
+	3,15, 5, 5, 5, 8, 5,10,
+	5,10, 8,13,15,12, 3, 3
+};
+
+const uint8_t detex_bptc_table_anchor_index_third_subset[64] = {	// Anchor pixel of subset 2 when there are three subsets; indexed by partition_set_id.
+	15, 8, 8, 3,15,15, 3, 8,
+	15,15,15,15,15,15,15, 8,
+	15, 8,15, 3,15, 8,15, 8,
+	3,15, 6,10,15,15,10, 8,
+	15, 3,15,10,10, 8, 9,10,
+	6,15, 8,15, 3, 6, 6, 8,
+	15, 3,15,15,15,15,15,15,
+	15,15,15,15, 3,15,15, 8
+};
+
+const uint16_t detex_bptc_table_aWeight2[4] = {	// Interpolation weights (out of 64) for 2-bit indices.
+	0, 21, 43, 64
+};
+
+const uint16_t detex_bptc_table_aWeight3[8] = {	// Interpolation weights (out of 64) for 3-bit indices.
+	0, 9, 18, 27, 37, 46, 55, 64
+};
+
+const uint16_t detex_bptc_table_aWeight4[16] = {	// Interpolation weights (out of 64) for 4-bit indices.
+	0, 4, 9, 13, 17, 21, 26, 30,
+	34, 38, 43, 47, 51, 55, 60, 64
+};
+
+
+
+// BPTC mode layout:
+//
+// Number of subsets = { 3, 2, 3, 2, 1, 1, 1, 2 };
+// Partition bits = { 4, 6, 6, 6, 0, 0, 0, 6 };
+// Rotation bits = { 0, 0, 0, 0, 2, 2, 0, 0 };
+// Mode 4 has one index selection bit.
+//
+//      #subsets color alpha before color   index after color	 index after	  After	     Index
+//                                                               alpha		  pbits	     bits (*)
+// Mode 0   3	  4	0    1 + 4 = 5			5 + 6 * 3 * 4 = 77	 77		  + 6 = 83   + 48 - 3 = 128
+// Mode 1   2	  6	0    2 + 6 = 8			8 + 4 * 3 * 6 = 80	 80		  + 2 = 82   + 48 - 2 = 128
+// Mode 2   3	  5	0    3 + 6 = 9			9 + 6 * 3 * 5 = 99	 99		  99	     + 32 - 3 = 128
+// Mode 3   2	  7	0    4 + 6 = 10	   10 + 4 * 3 * 7 = 94	 94		  + 4 = 98   + 32 - 2 = 128
+// Mode 4   1	  5	6    5 + 2 + 1 = 8	8 + 2 * 3 * 5 = 38	 38 + 2 * 6 = 50  50	     + 80 - 2 = 128
+// Mode 5   1	  7	8    6 + 2 = 8			8 + 2 * 3 * 7 = 50	 50 + 2 * 8 = 66  66	     + 64 - 2 = 128
+// Mode 6   1	  7	7    7					7 + 2 * 3 * 7 = 49	 49 + 2 * 7 = 63  + 2 = 65   + 64 - 1 = 128
+// Mode 7   2	  5	5    8 + 6 = 14     14 + 4 * 3 * 5 = 74	 74 + 4 * 5 = 94  + 4 = 98   + 32 - 2 = 128
+//
+// (*) For formats without alpha, the number of index bits is reduced by #subsets anchor bits.
+//     For formats with alpha, the number of index bits is reduced by 2 * #subsets by the anchor bits.
+
+
+static const uint8_t color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 };	// Stored bits per color endpoint component, per mode (P-bit excluded).
+
+// Note: precision includes P-bits!
+static const uint8_t color_precision_plus_pbit_table[8] = { 5, 7, 5, 8, 5, 7, 8, 6 };
+
+static DETEX_INLINE_ONLY uint8_t GetColorComponentPrecision(int mode) {
+	return color_precision_table[mode];	// mode is used as a direct table index; no range check.
+}
+
+static DETEX_INLINE_ONLY uint8_t GetColorComponentPrecisionPlusPbit(int mode) {
+	return color_precision_plus_pbit_table[mode];
+}
+
+static const int8_t alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 };	// Stored bits per alpha endpoint, per mode; 0 for the modes without alpha.
+
+// Note: precision includes P-bits!
+static const uint8_t alpha_precision_plus_pbit_table[8] = { 0, 0, 0, 0, 6, 8, 8, 6 };
+
+static DETEX_INLINE_ONLY uint8_t GetAlphaComponentPrecision(int mode) {
+	return alpha_precision_table[mode];	// The int8_t entry is converted to uint8_t; all entries are non-negative.
+}
+
+static DETEX_INLINE_ONLY uint8_t GetAlphaComponentPrecisionPlusPbit(int mode) {
+	return alpha_precision_plus_pbit_table[mode];
+}
+
+static const int8_t components_in_qword0_table[8] = { 2, -1, 1, 1, 3, 3, 3, 2 };	// Per mode: color components read entirely from data0 (-1 for mode 1, which is decoded by DecompressBlockBPTCMode1 instead).
+
+/* Extract the raw endpoint components of every subset into endpoint_array[subset * 8 + endpoint * 4 + component] and advance block->index past them. */
+static void ExtractEndpoints(int mode, int nu_subsets, detexBlock128 * DETEX_RESTRICT block,
+	uint8_t * DETEX_RESTRICT endpoint_array) {
+	// Optimized version avoiding the use of block_extract_bits().
+	int components_in_qword0 = components_in_qword0_table[mode];
+	uint64_t data = block->data0 >> block->index;	// Unread bits of data0, moved down to bit 0.
+	uint8_t precision = GetColorComponentPrecision(mode);
+	uint8_t mask = (1 << precision) - 1;	// Low `precision` bits set.
+	int total_bits_per_component = nu_subsets * 2 * precision;	// Two endpoints per subset.
+	for (int i = 0; i < components_in_qword0; i++)	// For each color component.
+		for (int j = 0; j < nu_subsets; j++)	// For each subset.
+			for (int k = 0; k < 2; k++) {	// For each endpoint.
+				endpoint_array[j * 8 + k * 4 + i] = data & mask;
+				data >>= precision;
+			}
+	block->index += components_in_qword0 * total_bits_per_component;
+	if (components_in_qword0 < 3) {
+		// Handle the color component that crosses the boundary between data0 and data1
+		data = block->data0 >> block->index;
+		data |= block->data1 << (64 - block->index);	// Append the low bits of data1 above what is left of data0.
+		int i = components_in_qword0;
+		for (int j = 0; j < nu_subsets; j++)	// For each subset.
+			for (int k = 0; k < 2; k++) {	// For each endpoint.
+				endpoint_array[j * 8 + k * 4 + i] = data & mask;
+				data >>= precision;
+			}
+		block->index += total_bits_per_component;
+	}
+	if (components_in_qword0 < 2) {
+		// Handle the color component that is wholly in data1.
+		data = block->data1 >> (block->index - 64);
+		int i = 2;
+		for (int j = 0; j < nu_subsets; j++)	// For each subset.
+			for (int k = 0; k < 2; k++) {	// For each endpoint.
+				endpoint_array[j * 8 + k * 4 + i] = data & mask;
+				data >>= precision;
+			}
+		block->index += total_bits_per_component;
+	}
+	// Alpha component.
+	if (GetAlphaComponentPrecision(mode) > 0) {
+		// For mode 7, the alpha data is wholly in data1.
+		// For modes 4 and 6, the alpha data is wholly in data0.
+		// For mode 5, the alpha data is in data0 and data1.
+		if (mode == 7)
+			data = block->data1 >> (block->index - 64);
+		else if (mode == 5)
+			data = (block->data0 >> block->index) | ((block->data1 & 0x3) << 14);	// The two lowest bits of data1 are placed at bits 14..15.
+		else
+			data = block->data0 >> block->index;
+		uint8_t alpha_precision = GetAlphaComponentPrecision(mode);
+		uint8_t mask = (1 << alpha_precision) - 1;	// Shadows the color mask declared above.
+		for (int j = 0; j < nu_subsets; j++)
+			for (int k = 0; k < 2; k++) {	// For each endpoint.
+				endpoint_array[j * 8 + k * 4 + 3] = data & mask;	// Component slot 3 holds alpha.
+				data >>= alpha_precision;
+			}
+		block->index += nu_subsets * 2 * alpha_precision;
+	}
+}
+
+static const uint8_t mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 };	// 1 when the mode stores P-bits.
+
+// Expand the raw endpoint components of all subsets to full 8-bit values, in place.
+// For modes with P-bits, one P-bit per endpoint is read at block->index and appended
+// as the new LSB of every component, and block->index moves past those bits. Each
+// component is then left-aligned in its byte and its top bits are replicated into
+// the vacated low bits. For modes 0..3 the alpha slots are finally set to 0xFF.
+static void FullyDecodeEndpoints(uint8_t * DETEX_RESTRICT endpoint_array, int nu_subsets,
+	int mode, detexBlock128 * DETEX_RESTRICT block) {
+	const int endpoint_count = nu_subsets * 2;
+	if (mode_has_p_bits[mode]) {
+		// Mode 1 (shared P-bits) handled elsewhere.
+		// Fetch the P-bits, one per endpoint, starting at the current bit position.
+		uint32_t pbits;
+		if (block->index >= 64) {
+			pbits = (uint32_t)(block->data1 >> (block->index - 64));
+		}
+		else {
+			pbits = (uint32_t)(block->data0 >> block->index);
+			// The run may straddle the two words; pull in the rest from data1.
+			if ((block->index + endpoint_count) > 64)
+				pbits |= (block->data1 << (64 - block->index));
+		}
+		for (int e = 0; e < endpoint_count; e++) {
+			uint8_t *component = &endpoint_array[e * 4];
+			const uint8_t pbit = pbits & 1;
+			// The same P-bit becomes the new LSB of R, G, B and A.
+			for (int c = 0; c < 4; c++)
+				component[c] = (uint8_t)((component[c] << 1) | pbit);
+			pbits >>= 1;
+		}
+		block->index += endpoint_count;
+	}
+	// Stored widths with the P-bit included.
+	const int color_prec = GetColorComponentPrecisionPlusPbit(mode);
+	const int alpha_prec = GetAlphaComponentPrecisionPlusPbit(mode);
+	for (int e = 0; e < endpoint_count; e++) {
+		uint8_t *component = &endpoint_array[e * 4];
+		for (int c = 0; c < 4; c++) {
+			const int width = (c == 3) ? alpha_prec : color_prec;
+			// Left-align so that the component's MSB sits in bit 7 ...
+			component[c] <<= (8 - width);
+			// ... then copy its top bits into the low bits the shift exposed.
+			component[c] |= (component[c] >> width);
+		}
+	}
+	if (mode <= 3) {
+		// These modes carry no alpha: overwrite whatever ended up in the alpha slots.
+		for (int e = 0; e < endpoint_count; e++)
+			endpoint_array[e * 4 + 3] = 0xFF;
+	}
+}
+
+static uint8_t Interpolate(uint8_t e0, uint8_t e1, uint8_t index, uint8_t indexprecision) {
+	// Blend e0 and e1 with the weight selected by index; the weight table depends
+	// on the index width (2 or 3 bits, anything else is treated as 4 bits).
+	const uint16_t *weights = detex_bptc_table_aWeight4;
+	if (indexprecision == 2)
+		weights = detex_bptc_table_aWeight2;
+	else if (indexprecision == 3)
+		weights = detex_bptc_table_aWeight3;
+	const uint16_t toward_e1 = weights[index];
+	// The two weights sum to 64; adding 32 rounds to nearest before the divide by 64.
+	return (uint8_t)(((64 - toward_e1) * (uint16_t)e0 + toward_e1 * (uint16_t)e1 + 32) >> 6);
+}
+
+static const uint8_t bptc_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 };	// Color index width in bits, per mode.
+
+static DETEX_INLINE_ONLY int GetColorIndexBitcount(int mode, int index_selection_bit) {
+	// If the index selection bit is set for mode 4, return 3, otherwise 2.
+	return bptc_color_index_bitcount[mode] + index_selection_bit;	// The selection bit is simply added to the table entry.
+}
+
+static const uint8_t bptc_alpha_index_bitcount[8] = { 3, 3, 2, 2, 3, 2, 4, 2 };	// Alpha index width in bits, per mode; read-only like the other lookup tables.
+
+static DETEX_INLINE_ONLY int GetAlphaIndexBitcount(int mode, int index_selection_bit) {
+	// If the index selection bit is set for mode 4, return 2, otherwise 3.
+	return bptc_alpha_index_bitcount[mode] - index_selection_bit;	// The selection bit is simply subtracted from the table entry.
+}
+
+static const uint8_t bptc_NS[8] = { 3, 2, 3, 2, 1, 1, 1, 2 };	// Number of subsets, per mode.
+
+static DETEX_INLINE_ONLY int GetNumberOfSubsets(int mode) {
+	return bptc_NS[mode];	// mode is used as a direct table index; no range check.
+}
+
+static const uint8_t PB[8] = { 4, 6, 6, 6, 0, 0, 0, 6 };	// Width of the partition field in bits, per mode.
+
+static DETEX_INLINE_ONLY int GetNumberOfPartitionBits(int mode) {
+	return PB[mode];
+}
+
+static const uint8_t RB[8] = { 0, 0, 0, 0, 2, 2, 0, 0 };	// Width of the rotation field in bits, per mode.
+
+static DETEX_INLINE_ONLY int GetNumberOfRotationBits(int mode) {
+	return RB[mode];
+}
+
+// Functions to extract parameters.
+
+static int ExtractMode(detexBlock128 *block) {
+	int mode = 0;	// Candidate mode = position of the lowest set bit among bits 0..7.
+	while (mode < 8 && ((block->data0 >> mode) & 1) == 0)
+		mode++;
+	if (mode == 8)
+		return -1;	// Illegal: none of the eight mode bits is set (index is left untouched).
+	block->index = mode + 1;	// Reading resumes right after the mode bit.
+	return mode;
+}
+
+static DETEX_INLINE_ONLY int ExtractPartitionSetID(detexBlock128 *block, int mode) {
+	return detexBlock128ExtractBits(block, GetNumberOfPartitionBits(mode));	// Reads the partition field at block->index and advances the cursor past it.
+}
+
+static DETEX_INLINE_ONLY int GetPartitionIndex(int nu_subsets, int partition_set_id, int i) {
+	// Each partition owns 16 consecutive table entries, one per pixel of the block.
+	const int entry = partition_set_id * 16 + i;
+	if (nu_subsets == 1)
+		return 0;	// A single subset: every pixel belongs to subset 0.
+	return (nu_subsets == 2) ? detex_bptc_table_P2[entry] : detex_bptc_table_P3[entry];
+}
+
+static DETEX_INLINE_ONLY int ExtractRotationBits(detexBlock128 *block, int mode) {
+	return detexBlock128ExtractBits(block, GetNumberOfRotationBits(mode));	// Reads the rotation field at block->index; a zero width reads nothing and yields 0.
+}
+
+static DETEX_INLINE_ONLY int GetAnchorIndex(int partition_set_id, int partition, int nu_subsets) {
+	if (partition == 0)
+		return 0;	// The first subset is always anchored at pixel 0.
+	// Pick the anchor table that matches this subset, then look up the partition's entry.
+	const uint8_t *anchors = (nu_subsets == 2) ? detex_bptc_table_anchor_index_second_subset
+		: (partition == 1) ? detex_bptc_table_anchor_index_second_subset_of_three
+		: detex_bptc_table_anchor_index_third_subset;
+	return anchors[partition_set_id];
+}
+
+static const uint8_t IB[8] = { 3, 3, 2, 2, 2, 2, 4, 2 };	// Primary index width in bits, per mode (same values as bptc_color_index_bitcount).
+static const uint8_t IB2[8] = { 0, 0, 0, 0, 3, 2, 0, 0 };	// Secondary index width in bits, per mode; 0 means there is no second index set.
+static const uint8_t mode_has_partition_bits[8] = { 1, 1, 1, 1, 0, 0, 0, 1 };	// 1 when the mode stores a partition id.
+
+/*
+ * Decompress a 128-bit 4x4 pixel texture block compressed using BPTC mode 1
+ * (two subsets, 6-bit RGB endpoints, one P-bit shared per subset, 3-bit indices).
+ * block:        the compressed block; only data0/data1 are read, index is not used.
+ * pixel_buffer: receives the 16 decoded pixels as one uint32_t each, with red in
+ *               the low byte and alpha forced to 0xFF.
+ * Always returns true.
+ */
+static bool DecompressBlockBPTCMode1(detexBlock128 * DETEX_RESTRICT block,
+	uint8_t * DETEX_RESTRICT pixel_buffer) {
+	const uint64_t low_word = block->data0;
+	uint64_t high_word = block->data1;
+	// Bits 0..1 are the mode marker; the partition id is the next six bits.
+	const int partition_set_id = detexGetBits64(low_word, 2, 7);
+
+	// Raw 6-bit endpoint components, laid out as rgb[(subset * 2 + endpoint) * 3 + component].
+	// Two subsets with two endpoints each.
+	uint8_t rgb[2 * 2 * 3];
+	rgb[0] = detexGetBits64(low_word, 8, 13);	// red, subset 0, endpoint 0
+	rgb[3] = detexGetBits64(low_word, 14, 19);	// red, subset 0, endpoint 1
+	rgb[6] = detexGetBits64(low_word, 20, 25);	// red, subset 1, endpoint 0
+	rgb[9] = detexGetBits64(low_word, 26, 31);	// red, subset 1, endpoint 1
+	rgb[1] = detexGetBits64(low_word, 32, 37);	// green, subset 0, endpoint 0
+	rgb[4] = detexGetBits64(low_word, 38, 43);	// green, subset 0, endpoint 1
+	rgb[7] = detexGetBits64(low_word, 44, 49);	// green, subset 1, endpoint 0
+	rgb[10] = detexGetBits64(low_word, 50, 55);	// green, subset 1, endpoint 1
+	rgb[2] = detexGetBits64(low_word, 56, 61);	// blue, subset 0, endpoint 0
+	// Blue of subset 0 / endpoint 1 straddles the two words: two bits from the low
+	// word followed by four bits from the high word.
+	rgb[5] = detexGetBits64(low_word, 62, 63) | (detexGetBits64(high_word, 0, 3) << 2);
+	rgb[8] = detexGetBits64(high_word, 4, 9);	// blue, subset 1, endpoint 0
+	rgb[11] = detexGetBits64(high_word, 10, 15);	// blue, subset 1, endpoint 1
+
+	// Each subset shares one P-bit between its two endpoints (RGB only in mode 1).
+	// It is pre-shifted to bit 1, where it slots in below the 6-bit value.
+	const uint8_t pbit_subset0 = detexGetBits64(high_word, 16, 16) << 1;
+	const uint8_t pbit_subset1 = detexGetBits64(high_word, 17, 17) << 1;
+	for (int n = 0; n < 2 * 2 * 3; n++) {
+		// Entries 0..5 belong to subset 0, entries 6..11 to subset 1.
+		const uint8_t pbit = (n < 6) ? pbit_subset0 : pbit_subset1;
+		// 6-bit value into bits 7..2, P-bit into bit 1 ...
+		uint8_t expanded = (uint8_t)(rgb[n] << 2) | pbit;
+		// ... and the component's MSB replicated into bit 0.
+		expanded |= expanded >> 7;
+		rgb[n] = expanded;
+	}
+
+	// Subset membership of every pixel for this partition (0 or 1).
+	uint8_t subset_of_pixel[16];
+	for (int p = 0; p < 16; p++)
+		subset_of_pixel[p] = detex_bptc_table_P2[partition_set_id * 16 + p];
+
+	// Pixel 0 anchors subset 0; the anchor of subset 1 comes from the table.
+	uint8_t anchor_pixel[2];
+	anchor_pixel[0] = 0;
+	anchor_pixel[1] = detex_bptc_table_anchor_index_second_subset[partition_set_id];
+
+	// The index bits start at bit 18 of the high word. Anchor pixels store one bit
+	// less because their top index bit is implicitly zero.
+	uint8_t weight_index[16];
+	high_word >>= 18;
+	for (int p = 0; p < 16; p++) {
+		const int width = (p == anchor_pixel[subset_of_pixel[p]]) ? 2 : 3;
+		weight_index[p] = high_word & ((1u << width) - 1);
+		high_word >>= width;
+	}
+
+	uint32_t *pixel32_buffer = (uint32_t *)pixel_buffer;
+	for (int p = 0; p < 16; p++) {
+		// First endpoint of this pixel's subset; the second one follows 3 entries later.
+		const uint8_t *first = &rgb[2 * subset_of_pixel[p] * 3];
+		const uint8_t *second = first + 3;
+		const uint8_t w = weight_index[p];
+		// All mode 1 indices are interpolated with the 3-bit weight table.
+		uint32_t output = detexPack32R8(Interpolate(first[0], second[0], w, 3));
+		output |= detexPack32G8(Interpolate(first[1], second[1], w, 3));
+		output |= detexPack32B8(Interpolate(first[2], second[2], w, 3));
+		output |= detexPack32A8(0xFF);	// Mode 1 has no alpha: always opaque.
+		pixel32_buffer[p] = output;
+	}
+	return true;
+}
+
+/* Decompress a 128-bit 4x4 pixel texture block compressed using the BPTC */
+/* (BC7) format. */
+/*
+ * Decompress one 128-bit BPTC (BC7) block into sixteen 32-bit pixels.
+ *
+ * bitstring    - the 16 bytes of the compressed block.
+ * mode_mask    - bit N must be set for a block stored in mode N to be accepted.
+ * flags        - DETEX_DECOMPRESS_FLAG_OPAQUE_ONLY rejects modes 4 and up,
+ *                DETEX_DECOMPRESS_FLAG_NON_OPAQUE_ONLY rejects modes below 4.
+ * pixel_buffer - receives 16 pixels of 4 bytes each; pixel i is written at byte offset 4 * i.
+ *
+ * Returns false when no mode can be extracted or the mode is filtered out by
+ * mode_mask/flags (nothing is written in that case). Mode 1 blocks return whatever
+ * DecompressBlockBPTCMode1() returns. Every other accepted block returns true.
+ */
+bool detexDecompressBlockBPTC(const uint8_t * DETEX_RESTRICT bitstring, uint32_t mode_mask,
+	uint32_t flags, uint8_t * DETEX_RESTRICT pixel_buffer) {
+	// Load the two 64-bit halves with memcpy: bitstring carries no alignment guarantee,
+	// so dereferencing it through a uint64_t pointer is not portable.
+	uint64_t word0, word1;
+	memcpy(&word0, &bitstring[0], sizeof(word0));
+	memcpy(&word1, &bitstring[8], sizeof(word1));
+	detexBlock128 block;
+	block.data0 = word0;
+	block.data1 = word1;
+	block.index = 0;
+	int mode = ExtractMode(&block);
+	if (mode == -1)
+		return false;
+	// Allow compression tied to specific modes (according to mode_mask).
+	if (!(mode_mask & ((int)1 << mode)))
+		return false;
+	if (mode >= 4 && (flags & DETEX_DECOMPRESS_FLAG_OPAQUE_ONLY))
+		return false;
+	if (mode < 4 && (flags & DETEX_DECOMPRESS_FLAG_NON_OPAQUE_ONLY))
+		return false;
+	// Mode 1 has its own dedicated decoder.
+	if (mode == 1)
+		return DecompressBlockBPTCMode1(&block, pixel_buffer);
+
+	int nu_subsets = 1;
+	int partition_set_id = 0;
+	if (mode_has_partition_bits[mode]) {
+		nu_subsets = GetNumberOfSubsets(mode);
+		partition_set_id = ExtractPartitionSetID(&block, mode);
+	}
+	int rotation = ExtractRotationBits(&block, mode);
+	int index_selection_bit = 0;
+	if (mode == 4)
+		index_selection_bit = detexBlock128ExtractBits(&block, 1);
+
+	int alpha_index_bitcount = GetAlphaIndexBitcount(mode, index_selection_bit);
+	int color_index_bitcount = GetColorIndexBitcount(mode, index_selection_bit);
+
+	uint8_t endpoint_array[3 * 2 * 4];	// Max. 3 subsets.
+	ExtractEndpoints(mode, nu_subsets, &block, endpoint_array);
+	FullyDecodeEndpoints(endpoint_array, nu_subsets, mode, &block);
+
+	uint8_t subset_index[16];
+	for (int i = 0; i < 16; i++)
+		// subset_index[i] is a number from 0 to 2, or 0 to 1, or 0 depending on the number of subsets.
+		subset_index[i] = GetPartitionIndex(nu_subsets, partition_set_id, i);
+	uint8_t anchor_index[4] = { 0, 0, 0, 0 };	// Only need max. 3 elements.
+	for (int i = 0; i < nu_subsets; i++)
+		anchor_index[i] = GetAnchorIndex(partition_set_id, i, nu_subsets);
+	uint8_t color_index[16];
+	uint8_t alpha_index[16];
+	memset(color_index, 0, sizeof(color_index));
+	memset(alpha_index, 0, sizeof(alpha_index));
+	// Extract primary index bits.
+	uint64_t data1;
+	if (block.index >= 64) {
+		// Because the index bits are all in the second 64-bit word, there is no need to use
+		// block_extract_bits().
+		// This implies the mode is not 4.
+		data1 = block.data1 >> (block.index - 64);
+		uint8_t mask1 = (1 << IB[mode]) - 1;
+		uint8_t mask2 = (1 << (IB[mode] - 1)) - 1;
+		for (int i = 0; i < 16; i++)
+			if (i == anchor_index[subset_index[i]]) {
+				// Highest bit is zero.
+				color_index[i] = data1 & mask2;
+				data1 >>= IB[mode] - 1;
+				alpha_index[i] = color_index[i];
+			}
+			else {
+				color_index[i] = data1 & mask1;
+				data1 >>= IB[mode];
+				alpha_index[i] = color_index[i];
+			}
+	}
+	else {	// Implies mode 4.
+		// Because the bits cross the 64-bit word boundary, we have to be careful.
+		// Block index is 50 at this point.
+		uint64_t data = block.data0 >> 50;
+		data |= block.data1 << 14;
+		for (int i = 0; i < 16; i++)
+			if (i == anchor_index[subset_index[i]]) {
+				// Highest bit is zero.
+				if (index_selection_bit) {	// Implies mode == 4.
+					alpha_index[i] = data & 0x1;
+					data >>= 1;
+				}
+				else {
+					color_index[i] = data & 0x1;
+					data >>= 1;
+				}
+			}
+			else {
+				if (index_selection_bit) {	// Implies mode == 4.
+					alpha_index[i] = data & 0x3;
+					data >>= 2;
+				}
+				else {
+					color_index[i] = data & 0x3;
+					data >>= 2;
+				}
+			}
+		// The loop above has finished; block index is 81 at this point, so the
+		// secondary indices start 17 bits into the second word.
+		data1 = block.data1 >> (81 - 64);
+	}
+	// Extract secondary index bits.
+	if (IB2[mode] > 0) {
+		uint8_t mask1 = (1 << IB2[mode]) - 1;
+		uint8_t mask2 = (1 << (IB2[mode] - 1)) - 1;
+		for (int i = 0; i < 16; i++)
+			if (i == anchor_index[subset_index[i]]) {
+				// Highest bit is zero.
+				if (index_selection_bit) {
+					color_index[i] = data1 & 0x3;
+					data1 >>= 2;
+				}
+				else {
+					alpha_index[i] = data1 & mask2;
+					data1 >>= IB2[mode] - 1;
+				}
+			}
+			else {
+				if (index_selection_bit) {
+					color_index[i] = data1 & 0x7;
+					data1 >>= 3;
+				}
+				else {
+					alpha_index[i] = data1 & mask1;
+					data1 >>= IB2[mode];
+				}
+			}
+	}
+
+	for (int i = 0; i < 16; i++) {
+		// Fetch the start/end endpoints of the subset this pixel belongs to.
+		uint8_t endpoint_start[4];
+		uint8_t endpoint_end[4];
+		for (int j = 0; j < 4; j++) {
+			endpoint_start[j] = endpoint_array[2 * subset_index[i] * 4 + j];
+			endpoint_end[j] = endpoint_array[(2 * subset_index[i] + 1) * 4 + j];
+		}
+
+		uint32_t output = 0;
+		output = detexPack32R8(Interpolate(endpoint_start[0], endpoint_end[0], color_index[i], color_index_bitcount));
+		output |= detexPack32G8(Interpolate(endpoint_start[1], endpoint_end[1], color_index[i], color_index_bitcount));
+		output |= detexPack32B8(Interpolate(endpoint_start[2], endpoint_end[2], color_index[i], color_index_bitcount));
+		output |= detexPack32A8(Interpolate(endpoint_start[3], endpoint_end[3], alpha_index[i], alpha_index_bitcount));
+
+		// A non-zero rotation swaps the alpha channel with R (1), G (2) or B (3).
+		if (rotation > 0) {
+			if (rotation == 1)
+				output = detexPack32RGBA8(detexPixel32GetA8(output), detexPixel32GetG8(output),
+					detexPixel32GetB8(output), detexPixel32GetR8(output));
+			else
+				if (rotation == 2)
+					output = detexPack32RGBA8(detexPixel32GetR8(output), detexPixel32GetA8(output),
+						detexPixel32GetB8(output), detexPixel32GetG8(output));
+				else // rotation == 3
+					output = detexPack32RGBA8(detexPixel32GetR8(output), detexPixel32GetG8(output),
+						detexPixel32GetA8(output), detexPixel32GetB8(output));
+		}
+		// Store with memcpy so pixel_buffer does not have to be 4-byte aligned.
+		memcpy(pixel_buffer + 4 * i, &output, sizeof(output));
+	}
+	return true;
+}
+
+/*
+ * Return the internal mode of the BPTC block.
+ *
+ * bitstring - the 16 bytes of the compressed block.
+ *
+ * The result is ExtractMode()'s int converted to uint32_t, so the -1 that
+ * detexDecompressBlockBPTC() treats as "invalid" comes back as 0xFFFFFFFF.
+ */
+uint32_t detexGetModeBPTC(const uint8_t *bitstring) {
+	// memcpy instead of a uint64_t pointer cast: bitstring may be unaligned.
+	uint64_t word0, word1;
+	memcpy(&word0, &bitstring[0], sizeof(word0));
+	memcpy(&word1, &bitstring[8], sizeof(word1));
+	detexBlock128 block;
+	block.data0 = word0;
+	block.data1 = word1;
+	block.index = 0;
+	int mode = ExtractMode(&block);
+	return mode;
+}
+
+/*
+ * Rewrite the mode prefix stored in the first byte of a BPTC block.
+ *
+ * The mode is encoded as `mode` zero bits followed by a single one bit, starting at
+ * the least significant bit of bitstring[0]:
+ *   mode 0 -> 1, mode 1 -> 01, ..., mode 7 -> 00000001.
+ * Bits above the marker bit are left as they were. flags and colors are not used.
+ */
+void detexSetModeBPTC(uint8_t *bitstring, uint32_t mode, uint32_t flags,
+	uint32_t *colors) {
+	(void)flags;
+	(void)colors;
+	const int marker = 1 << mode;
+	const int below_marker = marker - 1;
+	// Clear everything below the marker bit, then set the marker itself.
+	bitstring[0] = (uint8_t)((bitstring[0] & ~below_marker) | marker);
+}
+
diff --git a/bc7decomp.h b/bc7decomp.h
new file mode 100644
index 0000000..1ade732
--- /dev/null
+++ b/bc7decomp.h
@@ -0,0 +1,34 @@
+#pragma once
+
+/* Compiler-specific decorations: under MSVC they map to __forceinline and */
+/* __restrict; for every other compiler both expand to nothing. */
+#ifdef _MSC_VER
+#define DETEX_INLINE_ONLY __forceinline
+#define DETEX_RESTRICT __restrict
+#else
+#define DETEX_INLINE_ONLY
+#define DETEX_RESTRICT
+#endif
+
+/* Bit flags for the `flags` argument of detexDecompressBlockBPTC(); the values */
+/* are distinct bits, so they can be OR-ed together. */
+enum {
+	/* Function returns false (invalid block) when the compressed block */
+	/* is in a format not allowed to be generated by an encoder. */
+	DETEX_DECOMPRESS_FLAG_ENCODE = 0x1,
+	/* For compression formats that have opaque and non-opaque modes, */
+	/* return false (invalid block) when the compressed block is encoded */
+	/* using a non-opaque mode. */
+	DETEX_DECOMPRESS_FLAG_OPAQUE_ONLY = 0x2,
+	/* For compression formats that have opaque and non-opaque modes, */
+	/* return false (invalid block) when the compressed block is encoded */
+	/* using an opaque mode. */
+	DETEX_DECOMPRESS_FLAG_NON_OPAQUE_ONLY = 0x4,
+}; 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Decompress one 16-byte BPTC (BC7) block from `bitstring` into `pixel_buffer`, */
+/* which receives 16 pixels of 4 bytes each. Bit N of `mode_mask` must be set for */
+/* a block stored in mode N to be accepted; `flags` is a combination of the */
+/* DETEX_DECOMPRESS_FLAG_* values. Returns false when the block is rejected. */
+/* NOTE: this header includes nothing itself, so the including file has to make */
+/* bool, uint8_t and uint32_t available before this declaration. */
+bool detexDecompressBlockBPTC(const uint8_t * DETEX_RESTRICT bitstring, uint32_t mode_mask,
+	uint32_t flags, uint8_t * DETEX_RESTRICT pixel_buffer);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/bc7enc.cpp b/bc7enc.cpp
new file mode 100644
index 0000000..f912c56
--- /dev/null
+++ b/bc7enc.cpp
@@ -0,0 +1,634 @@
+// bc7enc.cpp - bc7enc16.c command line example/test app
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <algorithm>
+#include <assert.h>
+#include <time.h>
+
+#include "bc7enc16.h"
+#include "lodepng.h"
+#include "dds_defs.h"
+#include "bc7decomp.h"
+
+// Returns v limited to [l, h]: l when v < l, otherwise h when v > h, otherwise v itself.
+template <typename T> inline T clamp(T v, T l, T h)
+{
+	return (v < l) ? l : ((v > h) ? h : v);
+}
+// Absolute value of an int.
+inline int iabs(int i)
+{
+	return (i < 0) ? -i : i;
+}
+
+// Prints the command line help to stderr and returns EXIT_FAILURE, so callers can
+// simply `return print_usage();`.
+static int print_usage()
+{
+	fprintf(stderr, "bc7enc\n");
+	fprintf(stderr, "Reads PNG files (with or without alpha channels) and packs them to BC7/BPTC using modes 1 and 6.\n");
+	fprintf(stderr, "This tool works best with opaque images, or on images with relatively simple alpha channels.\n");
+	fprintf(stderr, "By default, a DX10 DDS file and a unpacked PNG file will be written to the source file's directory with the .dds/_unpacked.png/_unpacked_alpha.png suffixes.\n\n");
+	// The synopsis lists -pX (the option main() parses and the text below documents),
+	// not a second, non-existent "-aX" form.
+	fprintf(stderr, "Usage: bc7enc [-apng_filename] [-l] [-uX] [-pX] [-g] [-y] input_filename.png [compressed_output.dds] [unpacked_output.png]\n");
+	fprintf(stderr, "-apng_filename Load G channel of PNG file into alpha channel of source image\n");
+	fprintf(stderr, "-l Use linear colorspace metrics instead of perceptual\n");
+	fprintf(stderr, "-uX Higher quality levels, X ranges from [0,4], higher=slower\n");
+	fprintf(stderr, "-pX Scan X partitions in mode 1, X ranges from [0,64], use 0 to disable mode 1 entirely (faster)\n");
+	fprintf(stderr, "-g Don't write an unpacked output PNG file\n");
+	fprintf(stderr, "-y Flip source image along Y axis before packing\n");
+		
+	return EXIT_FAILURE;
+}
+
+// A pixel with four 8-bit channels; m_c[0..3] hold r, g, b and a in that order.
+struct color_quad_u8
+{
+	uint8_t m_c[4];
+
+	// Builds a pixel from four explicit channel values.
+	inline color_quad_u8(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+	{
+		m_c[0] = r;
+		m_c[1] = g;
+		m_c[2] = b;
+		m_c[3] = a;
+	}
+
+	// Builds a grey pixel (r = g = b = y) with alpha a. With both defaults this is
+	// also the default constructor, producing opaque black.
+	inline color_quad_u8(uint8_t y = 0, uint8_t a = 255)
+	{
+		m_c[0] = m_c[1] = m_c[2] = y;
+		m_c[3] = a;
+	}
+
+	// Overwrites all four channels and returns *this for chaining.
+	inline color_quad_u8 &set(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
+	{
+		m_c[0] = r;
+		m_c[1] = g;
+		m_c[2] = b;
+		m_c[3] = a;
+		return *this;
+	}
+
+	// Grey-level variant of set(): r, g and b all become y.
+	inline color_quad_u8 &set(uint8_t y, uint8_t a = 255)
+	{
+		return set(y, y, y, a);
+	}
+
+	// Channel access; the index is only range-checked by assert().
+	inline uint8_t &operator[] (uint32_t i)
+	{
+		assert(i < 4);
+		return m_c[i];
+	}
+
+	inline uint8_t operator[] (uint32_t i) const
+	{
+		assert(i < 4);
+		return m_c[i];
+	}
+
+	// Luma from r, g, b using REC709 weightings in 16.16 fixed point, rounded to nearest.
+	inline int get_luma() const
+	{
+		const unsigned int weighted = 13938U * m_c[0] + 46869U * m_c[1] + 4729U * m_c[2] + 32768U;
+		return weighted >> 16U;
+	}
+};
+typedef std::vector<color_quad_u8> color_quad_u8_vec;
+
+// A 2D image of color_quad_u8 pixels kept in one std::vector, row after row:
+// pixel (x, y) lives at index x + width * y.
+class image_u8
+{
+public:
+	// Creates an empty 0x0 image.
+	image_u8() :
+		m_width(0), m_height(0)
+	{
+	}
+
+	// Creates a width x height image of default-constructed pixels.
+	image_u8(uint32_t width, uint32_t height) :
+		m_pixels(width * height), m_width(width), m_height(height)
+	{
+	}
+
+	// Direct access to the pixel storage.
+	inline const color_quad_u8_vec &get_pixels() const { return m_pixels; }
+	inline color_quad_u8_vec &get_pixels() { return m_pixels; }
+
+	inline uint32_t width() const { return m_width; }
+	inline uint32_t height() const { return m_height; }
+	inline uint32_t total_pixels() const { return m_width * m_height; }
+
+	// Pixel access; coordinates are only range-checked by assert().
+	inline color_quad_u8 &operator()(uint32_t x, uint32_t y)
+	{
+		assert((x < m_width) && (y < m_height));
+		const uint32_t ofs = x + m_width * y;
+		return m_pixels[ofs];
+	}
+
+	inline const color_quad_u8 &operator()(uint32_t x, uint32_t y) const
+	{
+		assert((x < m_width) && (y < m_height));
+		const uint32_t ofs = x + m_width * y;
+		return m_pixels[ofs];
+	}
+
+	// Resets the image to 0x0 with no pixels.
+	image_u8& clear()
+	{
+		m_pixels.clear();
+		m_width = 0;
+		m_height = 0;
+		return *this;
+	}
+
+	// Discards the current contents and allocates width x height default pixels.
+	image_u8& init(uint32_t width, uint32_t height)
+	{
+		clear();
+
+		m_pixels.resize(width * height);
+		m_width = width;
+		m_height = height;
+		return *this;
+	}
+
+	// Sets every pixel to p.
+	image_u8& set_all(const color_quad_u8 &p)
+	{
+		const size_t total = m_pixels.size();
+		for (size_t idx = 0; idx < total; idx++)
+			m_pixels[idx] = p;
+		return *this;
+	}
+
+	// Resizes the image to new_width x new_height, keeping the overlapping top-left
+	// region. Newly exposed pixels are default-constructed. No-op if the size matches.
+	image_u8& crop(uint32_t new_width, uint32_t new_height)
+	{
+		const bool size_changes = (m_width != new_width) || (m_height != new_height);
+		if (size_changes)
+		{
+			image_u8 resized(new_width, new_height);
+
+			const uint32_t copy_w = std::min(m_width, new_width);
+			const uint32_t copy_h = std::min(m_height, new_height);
+
+			for (uint32_t row = 0; row < copy_h; row++)
+			{
+				for (uint32_t col = 0; col < copy_w; col++)
+					resized(col, row) = (*this)(col, row);
+			}
+
+			swap(resized);
+		}
+
+		return *this;
+	}
+
+	// Exchanges dimensions and pixel storage with other.
+	image_u8 &swap(image_u8 &other)
+	{
+		m_pixels.swap(other.m_pixels);
+		std::swap(m_width, other.m_width);
+		std::swap(m_height, other.m_height);
+		return *this;
+	}
+
+	// Copies block (bx, by) of size width x height out of the image into pPixels,
+	// stored row by row. The block must lie fully inside the image (assert-checked).
+	inline void get_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, color_quad_u8 *pPixels)
+	{
+		assert((bx * width + width) <= m_width);
+		assert((by * height + height) <= m_height);
+
+		const uint32_t first_x = bx * width;
+		const uint32_t first_y = by * height;
+		for (uint32_t row = 0; row < height; row++)
+		{
+			const color_quad_u8 *pSrc_row = &(*this)(first_x, first_y + row);
+			memcpy(pPixels + row * width, pSrc_row, width * sizeof(color_quad_u8));
+		}
+	}
+
+	// Inverse of get_block(): writes width x height pixels from pPixels into block (bx, by).
+	inline void set_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, const color_quad_u8 *pPixels)
+	{
+		assert((bx * width + width) <= m_width);
+		assert((by * height + height) <= m_height);
+
+		const uint32_t first_x = bx * width;
+		const uint32_t first_y = by * height;
+		for (uint32_t row = 0; row < height; row++)
+		{
+			color_quad_u8 *pDst_row = &(*this)(first_x, first_y + row);
+			memcpy(pDst_row, pPixels + row * width, width * sizeof(color_quad_u8));
+		}
+	}
+
+	// Reorders the channels of every pixel: new channel 0 is old channel r, new
+	// channel 1 is old channel g, and so on. Each selector must be in 0..3.
+	image_u8 &swizzle(uint32_t r, uint32_t g, uint32_t b, uint32_t a)
+	{
+		assert((r | g | b | a) <= 3);
+		for (uint32_t row = 0; row < m_height; row++)
+		{
+			for (uint32_t col = 0; col < m_width; col++)
+			{
+				color_quad_u8 &pixel = (*this)(col, row);
+				const color_quad_u8 before(pixel);
+				pixel.set(before[r], before[g], before[b], before[a]);
+			}
+		}
+
+		return *this;
+	}
+		
+private:
+	color_quad_u8_vec m_pixels;
+	uint32_t m_width, m_height;
+};
+
+// Loads the PNG at pFilename into img using lodepng::decode() with its default
+// output format. On failure a message is printed to stderr, img is left cleared
+// and false is returned.
+// NOTE(review): the copy assumes 4 bytes per decoded pixel, i.e. that lodepng's
+// default output is 32-bit RGBA - confirm against lodepng.h.
+static bool load_png(const char *pFilename, image_u8 &img)
+{
+	img.clear();
+
+	unsigned int width = 0, height = 0;
+	std::vector<unsigned char> decoded;
+	const unsigned int error = lodepng::decode(decoded, width, height, pFilename);
+	if (error != 0)
+	{
+		fprintf(stderr, "Failed loading PNG file %s\n", pFilename);
+		return false;
+	}
+
+	img.init(width, height);
+	color_quad_u8_vec &dst = img.get_pixels();
+	memcpy(&dst[0], &decoded[0], width * height * sizeof(uint32_t));
+	
+	return true;
+}
+
+// Writes img to pFilename with lodepng::encode(). With save_alpha the pixels are
+// written as 4-channel LCT_RGBA, otherwise the alpha channel is dropped and the
+// file is written as 3-channel LCT_RGB. Returns true when lodepng reports no error.
+static bool save_png(const char *pFilename, const image_u8 &img, bool save_alpha)
+{
+	const uint32_t w = img.width();
+	const uint32_t h = img.height();
+
+	std::vector<unsigned char> bytes;
+	if (!save_alpha)
+	{
+		// Repack to tightly packed r, g, b triplets.
+		bytes.resize(w * h * 3);
+		unsigned char *pOut = &bytes[0];
+		for (uint32_t row = 0; row < h; row++)
+		{
+			for (uint32_t col = 0; col < w; col++)
+			{
+				const color_quad_u8 &src = img(col, row);
+				pOut[0] = src[0];
+				pOut[1] = src[1];
+				pOut[2] = src[2];
+				pOut += 3;
+			}
+		}
+	}
+	else
+	{
+		// The in-memory pixel layout is copied out unchanged.
+		const size_t total_bytes = w * h * sizeof(color_quad_u8);
+		bytes.resize(total_bytes);
+		memcpy(&bytes[0], &img.get_pixels()[0], total_bytes);
+	}
+	
+	const unsigned int error = lodepng::encode(pFilename, bytes, w, h, save_alpha ? LCT_RGBA : LCT_RGB);
+	return error == 0;
+}
+
+// Error statistics between two images: maximum absolute error, mean error, mean
+// squared error, RMSE and PSNR (in dB, relative to a peak value of 255).
+class image_metrics
+{
+public:
+	double m_max, m_mean, m_mean_squared, m_root_mean_squared, m_peak_snr;
+
+	image_metrics()
+	{
+		clear();
+	}
+
+	// Resets every statistic to zero.
+	void clear()
+	{
+		m_max = 0.0;
+		m_mean = 0.0;
+		m_mean_squared = 0.0;
+		m_root_mean_squared = 0.0;
+		m_peak_snr = 0.0;
+	}
+
+	// Compares a and b over their common (minimum) width and height.
+	// With num_channels == 0 the luma of each pixel is compared; otherwise channels
+	// first_channel .. first_channel + num_channels - 1 are compared individually.
+	void compute(const image_u8 &a, const image_u8 &b, uint32_t first_channel, uint32_t num_channels)
+	{
+		assert((first_channel < 4U) && (first_channel + num_channels <= 4U));
+
+		const bool average_component_error = true;
+
+		const uint32_t width = std::min(a.width(), b.width());
+		const uint32_t height = std::min(a.height(), b.height());
+
+		// Histogram approach originally due to Charles Bloom: count how often each
+		// absolute error 0..255 occurs, then derive all statistics from the counts.
+		double hist[256];
+		for (uint32_t bucket = 0; bucket < 256; bucket++)
+			hist[bucket] = 0.0;
+
+		for (uint32_t row = 0; row < height; row++)
+		{
+			for (uint32_t col = 0; col < width; col++)
+			{
+				const color_quad_u8 &pa = a(col, row);
+				const color_quad_u8 &pb = b(col, row);
+
+				if (num_channels == 0)
+				{
+					hist[iabs(pa.get_luma() - pb.get_luma())] += 1.0;
+					continue;
+				}
+
+				for (uint32_t c = 0; c < num_channels; c++)
+				{
+					const uint32_t chan = first_channel + c;
+					hist[iabs(pa[chan] - pb[chan])] += 1.0;
+				}
+			}
+		}
+
+		m_max = 0;
+		double sum = 0.0, sum2 = 0.0;
+		for (uint32_t err = 0; err < 256; err++)
+		{
+			const double count = hist[err];
+			if (count == 0.0)
+				continue;
+
+			m_max = std::max<double>(m_max, err);
+
+			const double weighted = err * count;
+
+			sum += weighted;
+			sum2 += err * weighted;
+		}
+
+		// See http://richg42.blogspot.com/2016/09/how-to-compute-psnr-from-old-berkeley.html
+		double total_values = width * height;
+
+		// Average over the compared channels (luma mode counts as a single channel).
+		if (average_component_error)
+			total_values *= clamp<uint32_t>(num_channels, 1, 4);
+
+		m_mean = clamp<double>(sum / total_values, 0.0, 255.0);
+		m_mean_squared = clamp<double>(sum2 / total_values, 0.0, 255.0 * 255.0);
+
+		m_root_mean_squared = sqrt(m_mean_squared);
+
+		if (m_root_mean_squared == 0.0)
+			m_peak_snr = 1e+10f;	// identical images: report a very large PSNR
+		else
+			m_peak_snr = clamp<double>(20.0 * log10(255.0 / m_root_mean_squared), 0.0, 500.0);
+	}
+};
+
+// One compressed BC7 block: two 64-bit words, i.e. 16 bytes per 4x4 pixel block.
+struct bc7_block
+{
+	uint64_t m_vals[2];
+};
+
+// Container for the compressed blocks of a whole image.
+typedef std::vector<bc7_block> bc7_block_vec;
+
+// Writes a DX10-style DDS file holding BC7 data.
+//
+// pFilename - output path.
+// width/height - image size in pixels as stored in the DDS header.
+// pBlocks - compressed blocks; ((width + 3) & ~3) * ((height + 3) & ~3) bytes are
+//           read from it, i.e. 16 bytes per 4x4 block of the padded image.
+// srgb - selects DXGI_FORMAT_BC7_UNORM_SRGB instead of DXGI_FORMAT_BC7_UNORM.
+//
+// Returns false (after printing a message to stderr) when the file cannot be
+// created or any part of it cannot be written.
+static bool save_bc7_dds(const char *pFilename, uint32_t width, uint32_t height, const bc7_block *pBlocks, bool srgb)
+{
+	FILE *pFile = NULL;
+	// fopen_s only exists in Microsoft's C runtime; use plain fopen elsewhere.
+#ifdef _MSC_VER
+	fopen_s(&pFile, pFilename, "wb");
+#else
+	pFile = fopen(pFilename, "wb");
+#endif
+	if (!pFile)
+	{
+		fprintf(stderr, "Failed creating file %s!\n", pFilename);
+		return false;
+	}
+
+	// Track every write so a short write is reported instead of silently ignored.
+	bool ok = (fwrite("DDS ", 4, 1, pFile) == 1);
+
+	DDSURFACEDESC2 desc;
+	memset(&desc, 0, sizeof(desc));
+
+	desc.dwSize = sizeof(desc);
+	desc.dwFlags = DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT | DDSD_CAPS;
+
+	desc.dwWidth = width;
+	desc.dwHeight = height;
+
+	desc.ddsCaps.dwCaps = DDSCAPS_TEXTURE;
+	desc.ddpfPixelFormat.dwSize = sizeof(desc.ddpfPixelFormat);
+				
+	desc.ddpfPixelFormat.dwFlags |= DDPF_FOURCC;
+
+	desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', '1', '0');
+	desc.ddpfPixelFormat.dwRGBBitCount = 0;
+	
+	// BC7 stores 8 bits per pixel (16 bytes per 4x4 block).
+	const uint32_t pixel_format_bpp = 8;
+
+	// Total size in bytes of the block data for the image padded to multiples of 4.
+	desc.lPitch = (((desc.dwWidth + 3) & ~3) * ((desc.dwHeight + 3) & ~3) * pixel_format_bpp) >> 3;
+	desc.dwFlags |= DDSD_LINEARSIZE;
+
+	ok = ok && (fwrite(&desc, sizeof(desc), 1, pFile) == 1);
+		
+	DDS_HEADER_DXT10 hdr10;
+	memset(&hdr10, 0, sizeof(hdr10));
+
+	hdr10.dxgiFormat = srgb ? DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM;
+	hdr10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D;
+	hdr10.arraySize = 1;
+
+	ok = ok && (fwrite(&hdr10, sizeof(hdr10), 1, pFile) == 1);
+
+	// fwrite() returns 0 for a zero-sized element, so only check non-empty payloads.
+	if (desc.lPitch != 0)
+		ok = ok && (fwrite(pBlocks, desc.lPitch, 1, pFile) == 1);
+
+	// Always close the file, even after a failed write.
+	if (fclose(pFile) == EOF)
+		ok = false;
+
+	if (!ok)
+	{
+		fprintf(stderr, "Failed writing to DDS file %s!\n", pFilename);
+		return false;
+	}
+
+	return true;
+}
+
+// Removes the extension (the last '.' and everything after it) from the file name
+// part of s. Only a dot that comes after the last '/' or '\\' counts, so a dot in
+// a directory name ("dir.v2/file") no longer truncates the path. Strings without
+// such a dot are left unchanged.
+static void strip_extension(std::string &s)
+{
+	// Position of the last dot or path separator, whichever comes last.
+	const std::string::size_type pos = s.find_last_of("./\\");
+	if ((pos != std::string::npos) && (s[pos] == '.'))
+		s.resize(pos);
+}
+
+// Command line driver: parses the options, loads the source PNG (optionally
+// merging a separate alpha PNG and flipping it vertically), pads it to a multiple
+// of 4 in each dimension, compresses every 4x4 block to BC7, writes the DDS file
+// and - unless -g was given - unpacks the result again to report error metrics
+// and write the unpacked colour and alpha PNGs.
+// Returns EXIT_SUCCESS, or EXIT_FAILURE on bad arguments or any failed load/save.
+int main(int argc, char *argv[])
+{
+	if (argc < 2)
+		return print_usage();
+
+	std::string src_filename;
+	std::string src_alpha_filename;
+	std::string dds_output_filename;
+	std::string png_output_filename;
+	std::string png_alpha_output_filename;
+	int uber_level = 0;
+	int max_partitions_to_scan = BC7ENC16_MAX_PARTITIONS1;
+	bool perceptual = true;
+	bool no_output_png = false;
+	bool y_flip = false;
+	
+	// Arguments starting with '-' are options whose value (if any) follows the
+	// option letter directly; everything else is a positional filename.
+	for (int i = 1; i < argc; i++)
+	{
+		const char *pArg = argv[i];
+		if (pArg[0] == '-')
+		{
+			switch (pArg[1])
+			{
+				case 'y':
+				{
+					y_flip = true;
+					break;
+				}
+				case 'a':
+				{
+					src_alpha_filename = pArg + 2;
+					break;
+				}
+				case 'u':
+				{
+					uber_level = atoi(pArg + 2);
+					if ((uber_level < 0) || (uber_level > BC7ENC16_MAX_UBER_LEVEL))
+					{
+						fprintf(stderr, "Invalid argument: %s\n", pArg);
+						return EXIT_FAILURE;
+					}
+					break;
+
+				}
+				case 'g':
+				{
+					no_output_png = true;
+					break;
+				}
+				case 'l':
+				{
+					perceptual = false;
+					break;
+				}
+				case 'p':
+				{
+					max_partitions_to_scan = atoi(pArg + 2);
+					if ((max_partitions_to_scan < 0) || (max_partitions_to_scan > BC7ENC16_MAX_PARTITIONS1))
+					{
+						fprintf(stderr, "Invalid argument: %s\n", pArg);
+						return EXIT_FAILURE;
+					}
+					break;
+				}
+				default:
+				{
+					fprintf(stderr, "Invalid argument: %s\n", pArg);
+					return EXIT_FAILURE;
+				}
+			}
+		}
+		else
+		{
+			// Positional arguments fill, in order: source, DDS output, PNG output.
+			if (!src_filename.size())
+				src_filename = pArg;
+			else if (!dds_output_filename.size())
+				dds_output_filename = pArg;
+			else if (!png_output_filename.size())
+				png_output_filename = pArg;
+			else
+			{
+				fprintf(stderr, "Invalid argument: %s\n", pArg);
+				return EXIT_FAILURE;
+			}
+		}
+	}
+
+	if (!src_filename.size())
+	{
+		fprintf(stderr, "No source filename specified!\n");
+		return EXIT_FAILURE;
+	}
+
+	// Derive the output names that were not given from the source filename.
+	if (!dds_output_filename.size())
+	{
+		dds_output_filename = src_filename;
+		strip_extension(dds_output_filename);
+		dds_output_filename += ".dds";
+	}
+
+	if (!png_output_filename.size())
+	{
+		png_output_filename = src_filename;
+		strip_extension(png_output_filename);
+		png_output_filename += "_unpacked.png";
+	}
+
+	// The alpha PNG name is always derived from the (possibly defaulted) PNG output name.
+	png_alpha_output_filename = png_output_filename;
+	strip_extension(png_alpha_output_filename);
+	png_alpha_output_filename += "_unpacked_alpha.png";
+		
+	image_u8 source_image;
+	if (!load_png(src_filename.c_str(), source_image))
+		return EXIT_FAILURE;
+
+	printf("Source image: %s %ux%u\n", src_filename.c_str(), source_image.width(), source_image.height());
+
+	// -a: copy channel 1 (G) of the second image into the source alpha channel
+	// over the region both images have in common.
+	if (src_alpha_filename.size())
+	{
+		image_u8 source_alpha_image;
+		if (!load_png(src_alpha_filename.c_str(), source_alpha_image))
+			return EXIT_FAILURE;
+
+		printf("Source alpha image: %s %ux%u\n", src_alpha_filename.c_str(), source_alpha_image.width(), source_alpha_image.height());
+
+		const uint32_t w = std::min(source_alpha_image.width(), source_image.width());
+		const uint32_t h = std::min(source_alpha_image.height(), source_image.height());
+		
+		for (uint32_t y = 0; y < h; y++)
+			for (uint32_t x = 0; x < w; x++)
+				source_image(x, y)[3] = source_alpha_image(x, y)[1];
+	}
+				
+	// Remember the unpadded size; it is what gets written to the DDS header.
+	const uint32_t orig_width = source_image.width();
+	const uint32_t orig_height = source_image.height();
+
+	if (y_flip)
+	{
+		image_u8 temp;
+		temp.init(orig_width, orig_height);
+
+		for (uint32_t y = 0; y < orig_height; y++)
+			for (uint32_t x = 0; x < orig_width; x++)
+				temp(x, (orig_height - 1) - y) = source_image(x, y);
+
+		temp.swap(source_image);
+	}
+
+	// Round both dimensions up to a multiple of 4 so the image splits into whole 4x4 blocks.
+	source_image.crop((source_image.width() + 3) & ~3, (source_image.height() + 3) & ~3);
+		
+	const uint32_t blocks_x = source_image.width() / 4;
+	const uint32_t blocks_y = source_image.height() / 4;
+
+	bc7_block_vec packed_image(blocks_x * blocks_y);
+
+	bc7enc16_compress_block_params pack_params;
+	bc7enc16_compress_block_params_init(&pack_params);
+	if (!perceptual)
+		bc7enc16_compress_block_params_init_linear_weights(&pack_params);
+	pack_params.m_max_partitions_mode1 = max_partitions_to_scan;
+	pack_params.m_uber_level = uber_level;
+	
+	printf("Max mode 1 partitions: %u, uber level: %u, perceptual: %u\n", pack_params.m_max_partitions_mode1, pack_params.m_uber_level, perceptual);
+
+	bc7enc16_compress_block_init();
+
+	bool has_alpha = false;
+
+	// Compress block by block, timing the whole pass with clock().
+	clock_t start_t = clock();
+	for (uint32_t by = 0; by < blocks_y; by++)
+	{
+		for (uint32_t bx = 0; bx < blocks_x; bx++)
+		{
+			color_quad_u8 pixels[16];
+
+			source_image.get_block(bx, by, 4, 4, pixels);
+			
+			bc7_block *pBlock = &packed_image[bx + by * blocks_x];
+
+			// A true return value is taken to mean that the block contained alpha.
+			if (bc7enc16_compress_block(pBlock, pixels, &pack_params))
+				has_alpha = true;
+		}
+
+		// Progress indicator: one dot for every 64 block rows.
+		if ((by & 63) == 0)
+			printf(".");
+	}
+	
+	clock_t end_t = clock();
+	
+	printf("\nTotal time: %f secs\n", (double)(end_t - start_t) / CLOCKS_PER_SEC);
+		
+	if (has_alpha)
+		printf("Source image had an alpha channel.\n");
+	
+	// The perceptual flag is passed as save_bc7_dds()'s srgb parameter.
+	bool failed = false;
+	if (!save_bc7_dds(dds_output_filename.c_str(), orig_width, orig_height, &packed_image[0], perceptual))
+		failed = true;
+	else
+		printf("Wrote DDS file %s\n", dds_output_filename.c_str());
+
+	if ((!no_output_png) && (png_output_filename.size()))
+	{
+		// Unpack the compressed blocks again to measure and save what was produced.
+		image_u8 unpacked_image(source_image.width(), source_image.height());
+
+		for (uint32_t by = 0; by < blocks_y; by++)
+		{
+			for (uint32_t bx = 0; bx < blocks_x; bx++)
+			{
+				bc7_block *pBlock = &packed_image[bx + by * blocks_x];
+
+				// All modes allowed (mask UINT32_MAX), no flags; the return value is not checked.
+				color_quad_u8 unpacked_pixels[16];
+				detexDecompressBlockBPTC((const uint8_t *)pBlock, UINT32_MAX, 0, (uint8_t *)unpacked_pixels);
+
+				unpacked_image.set_block(bx, by, 4, 4, unpacked_pixels);
+			}
+		}
+
+		// num_channels == 0 selects the luma comparison in image_metrics::compute().
+		image_metrics y_metrics;
+		y_metrics.compute(source_image, unpacked_image, 0, 0);
+		printf("Luma  Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", y_metrics.m_max, y_metrics.m_root_mean_squared, y_metrics.m_peak_snr);
+
+		image_metrics rgb_metrics;
+		rgb_metrics.compute(source_image, unpacked_image, 0, 3);
+		printf("RGB   Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", rgb_metrics.m_max, rgb_metrics.m_root_mean_squared, rgb_metrics.m_peak_snr);
+
+		image_metrics rgba_metrics;
+		rgba_metrics.compute(source_image, unpacked_image, 0, 4);
+		printf("RGBA  Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", rgba_metrics.m_max, rgba_metrics.m_root_mean_squared, rgba_metrics.m_peak_snr);
+						
+		image_metrics a_metrics;
+		a_metrics.compute(source_image, unpacked_image, 3, 1);
+		printf("Alpha Max error: %3.0f RMSE: %f PSNR %03.02f dB\n", a_metrics.m_max, a_metrics.m_root_mean_squared, a_metrics.m_peak_snr);
+
+		// The colour PNG is written without its alpha channel.
+		if (!save_png(png_output_filename.c_str(), unpacked_image, false))
+			failed = true;
+		else
+			printf("Wrote PNG file %s\n", png_output_filename.c_str());
+
+		//if ((png_alpha_output_filename.size()) && (has_alpha))
+		if (png_alpha_output_filename.size())
+		{
+			// Write the unpacked alpha channel as an opaque greyscale image.
+			image_u8 unpacked_image_alpha(unpacked_image);
+			for (uint32_t y = 0; y < unpacked_image_alpha.height(); y++)
+				for (uint32_t x = 0; x < unpacked_image_alpha.width(); x++)
+					unpacked_image_alpha(x, y).set(unpacked_image_alpha(x, y)[3], 255);
+
+			if (!save_png(png_alpha_output_filename.c_str(), unpacked_image_alpha, false))
+				failed = true;
+			else
+				printf("Wrote PNG file %s\n", png_alpha_output_filename.c_str());
+		}
+	}
+		
+	return failed ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/bc7enc16.c b/bc7enc16.c
new file mode 100644
index 0000000..fce7e76
--- /dev/null
+++ b/bc7enc16.c
@@ -0,0 +1,1413 @@
+// File: bc7enc16.c - Richard Geldreich, Jr. 4/2018 - MIT license or public domain (see end of file)
+#include "bc7enc16.h"
+#include <math.h>
+#include <memory.h>
+#include <assert.h>
+
+// Scalar helpers: clamping, min/max for the few types the encoder needs, and squaring.
+
+// Limits value to the inclusive range [low, high].
+static inline int32_t clampi(int32_t value, int32_t low, int32_t high)
+{
+	return (value < low) ? low : ((value > high) ? high : value);
+}
+
+// Float flavour of clampi(). A NaN value fails both comparisons and is returned unchanged.
+static inline float clampf(float value, float low, float high)
+{
+	return (value < low) ? low : ((value > high) ? high : value);
+}
+
+// Clamps value to [0, 1].
+static inline float saturate(float value)
+{
+	return clampf(value, 0.0f, 1.0f);
+}
+
+// Minimum of two values; b is returned unless a is strictly smaller.
+static inline uint8_t minimumub(uint8_t a, uint8_t b) { if (a < b) return a; return b; }
+static inline uint32_t minimumu(uint32_t a, uint32_t b) { if (a < b) return a; return b; }
+static inline float minimumf(float a, float b) { if (a < b) return a; return b; }
+
+// Maximum of two values; b is returned unless a is strictly larger.
+static inline uint8_t maximumub(uint8_t a, uint8_t b) { if (a > b) return a; return b; }
+static inline uint32_t maximumu(uint32_t a, uint32_t b) { if (a > b) return a; return b; }
+static inline float maximumf(float a, float b) { if (a > b) return a; return b; }
+
+// x squared.
+static inline int squarei(int i) { const int sq = i * i; return sq; }
+static inline float squaref(float i) { const float sq = i * i; return sq; }
+
+// An 8-bit-per-channel color; the setters below store r, g, b, a in m_c[0..3] in that order.
+typedef struct { uint8_t m_c[4]; } color_quad_u8;
+// A 4-component float vector.
+typedef struct { float m_c[4]; } vec4F;
+
+// Stores (r, g, b, a) into *pRes after clamping each value to [0, 255]. Returns pRes.
+static inline color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a)
+{
+	const int32_t src[4] = { r, g, b, a };
+	for (uint32_t i = 0; i < 4; i++)
+		pRes->m_c[i] = (uint8_t)clampi(src[i], 0, 255);
+	return pRes;
+}
+
+// Stores (r, g, b, a) into *pRes without clamping; asserts that every value already fits in 8 bits. Returns pRes.
+static inline color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a)
+{
+	assert((uint32_t)(r | g | b | a) <= 255);
+	pRes->m_c[0] = (uint8_t)r;
+	pRes->m_c[1] = (uint8_t)g;
+	pRes->m_c[2] = (uint8_t)b;
+	pRes->m_c[3] = (uint8_t)a;
+	return pRes;
+}
+
+// Nonzero when the two colors differ in any of the four channels.
+static inline bc7enc16_bool color_quad_u8_notequals(const color_quad_u8 *pLHS, const color_quad_u8 *pRHS)
+{
+	return !((pLHS->m_c[0] == pRHS->m_c[0]) && (pLHS->m_c[1] == pRHS->m_c[1]) && (pLHS->m_c[2] == pRHS->m_c[2]) && (pLHS->m_c[3] == pRHS->m_c[3]));
+}
+
+// Sets all four components of *pV to x. Returns pV.
+static inline vec4F *vec4F_set_scalar(vec4F *pV, float x)
+{
+	for (uint32_t i = 0; i < 4; i++)
+		pV->m_c[i] = x;
+	return pV;
+}
+
+// Sets *pV to (x, y, z, w). Returns pV.
+static inline vec4F *vec4F_set(vec4F *pV, float x, float y, float z, float w)
+{
+	pV->m_c[0] = x;
+	pV->m_c[1] = y;
+	pV->m_c[2] = z;
+	pV->m_c[3] = w;
+	return pV;
+}
+
+// Clamps every component of *pV to [0, 1] in place. Returns pV.
+static inline vec4F *vec4F_saturate_in_place(vec4F *pV)
+{
+	for (uint32_t i = 0; i < 4; i++)
+		pV->m_c[i] = saturate(pV->m_c[i]);
+	return pV;
+}
+
+// Returns a copy of *pV with every component clamped to [0, 1].
+static inline vec4F vec4F_saturate(const vec4F *pV)
+{
+	vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = saturate(pV->m_c[i]);
+	return res;
+}
+
+// Converts an 8-bit color to floats; values stay in the 0..255 range (no normalization).
+static inline vec4F vec4F_from_color(const color_quad_u8 *pC)
+{
+	vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = pC->m_c[i];
+	return res;
+}
+
+// Component-wise sum.
+static inline vec4F vec4F_add(const vec4F *pLHS, const vec4F *pRHS)
+{
+	vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = pLHS->m_c[i] + pRHS->m_c[i];
+	return res;
+}
+
+// Component-wise difference (LHS - RHS).
+static inline vec4F vec4F_sub(const vec4F *pLHS, const vec4F *pRHS)
+{
+	vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = pLHS->m_c[i] - pRHS->m_c[i];
+	return res;
+}
+
+// 4-component dot product.
+static inline float vec4F_dot(const vec4F *pLHS, const vec4F *pRHS)
+{
+	return pLHS->m_c[0] * pRHS->m_c[0] + pLHS->m_c[1] * pRHS->m_c[1] + pLHS->m_c[2] * pRHS->m_c[2] + pLHS->m_c[3] * pRHS->m_c[3];
+}
+
+// Returns *pLHS scaled by s.
+static inline vec4F vec4F_mul(const vec4F *pLHS, float s)
+{
+	vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = pLHS->m_c[i] * s;
+	return res;
+}
+
+// Scales *pV to unit length in place; a vector whose squared length is exactly zero is left untouched. Returns pV.
+static inline vec4F *vec4F_normalize_in_place(vec4F *pV)
+{
+	const float len_sq = pV->m_c[0] * pV->m_c[0] + pV->m_c[1] * pV->m_c[1] + pV->m_c[2] * pV->m_c[2] + pV->m_c[3] * pV->m_c[3];
+	if (len_sq == 0.0f)
+		return pV;
+
+	const float inv_len = 1.0f / sqrtf(len_sq);
+	for (uint32_t i = 0; i < 4; i++)
+		pV->m_c[i] *= inv_len;
+	return pV;
+}
+
+// Various BC7 tables
+static const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 };
+static const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
+// Precomputed weight constants used during the least squares fit. For each entry in g_bc7_weights3[]/g_bc7_weights4[], with w = weight / 64.0f: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
+static const float g_bc7_weights3x[8 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f, 0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f,
+	0.079102f, 0.718750f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
+static const float g_bc7_weights4x[16 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f, 0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f,
+	0.451416f, 0.328125f, 0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f,
+	0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
+static const uint8_t g_bc7_partition1[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
+static const uint8_t g_bc7_partition2[64 * 16] =
+{
+	0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,		0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,		0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,		0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,		0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,		0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,		0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,
+	0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,		0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,		0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,		0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,		0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
+	0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1,		0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,		0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,		0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,		0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,		0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0,		0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,		0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,
+	0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,		0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,		0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,		0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,		0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0,		0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,		0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0,		0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,
+	0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,		0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,		0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0,		0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0,		0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,		0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,		0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,		0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,
+	0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0,		0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0,		0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0,		0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,		0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,		0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1,		0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1,		0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,
+	0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,		0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,		0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,		0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,		0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,		0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1,		0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0,		0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0,
+	0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1,		0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1,		0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1,		0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1,		0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1,		0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0,		0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,		0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1
+};
+static const uint8_t g_bc7_table_anchor_index_second_subset[64] = {	15,15,15,15,15,15,15,15,		15,15,15,15,15,15,15,15,		15, 2, 8, 2, 2, 8, 8,15,		2, 8, 2, 2, 8, 8, 2, 2,		15,15, 6, 8, 2, 8,15,15,		2, 8, 2, 2, 2,15,15, 6,		6, 2, 6, 8,15,15, 2, 2,		15,15,15,15,15, 2, 2,15 };
+// Per-mode tables, each holding 8 entries and indexed by mode number.
+// NOTE(review): the per-table descriptions below are inferred from the table names; confirm against the BC7 format specification.
+// Number of subsets in each mode.
+static const uint8_t g_bc7_num_subsets[8] = { 3, 2, 3, 2, 1, 1, 1, 2 };
+// Number of bits used to store the partition index in each mode.
+static const uint8_t g_bc7_partition_bits[8] = { 4, 6, 6, 6, 0, 0, 0, 6 };
+// Base number of bits per color index in each mode.
+static const uint8_t g_bc7_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 };
+// Returns the base color index bit count of `mode` plus `index_selection_bit`.
+static int get_bc7_color_index_size(int mode, int index_selection_bit) { return g_bc7_color_index_bitcount[mode] + index_selection_bit; }
+// 1 if the mode has p-bits, else 0.
+static const uint8_t g_bc7_mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 };
+// 1 if both endpoints of a subset share one p-bit, else 0.
+static const uint8_t g_bc7_mode_has_shared_p_bits[8] = { 0, 1, 0, 0, 0, 0, 0, 0 };
+// Bits per color component of an endpoint (presumably excluding the p-bit; verify).
+static const uint8_t g_bc7_color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 };
+// Bits per alpha component of an endpoint; 0 for modes without alpha.
+static const int8_t g_bc7_alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 };
+
+// One entry of the single-color lookup table: the 6-bit endpoint pair (m_lo, m_hi) that best reproduces
+// a given 8-bit component value, and the squared error (m_error) that remains.
+typedef struct { uint16_t m_error; uint8_t m_lo; uint8_t m_hi; } endpoint_err;
+
+// Filled in by bc7enc16_compress_block_init().
+static endpoint_err g_bc7_mode_1_optimal_endpoints[256][2]; // [c][pbit]
+// Selector index that the single-color table is built for (an index into g_bc7_weights3[]).
+static const uint32_t BC7ENC16_MODE_1_OPTIMAL_INDEX = 2;
+
+// Initialize the lookup table used for optimal single color compression in mode 1. Must be called before encoding.
+//
+// For every 8-bit component value c and every p-bit value, all 64 x 64 combinations of 6-bit low/high
+// endpoints are tried. Each endpoint gets the p-bit appended as its LSB and is expanded to 8 bits; the
+// pair is then interpolated with the weight of selector BC7ENC16_MODE_1_OPTIMAL_INDEX, and the pair with
+// the smallest squared difference from c is stored. Ties keep the first pair found (lowest l, then lowest h).
+void bc7enc16_compress_block_init(void)
+{
+	// Interpolation weights (out of 64) for the fixed selector; invariant across all loops.
+	const uint32_t weight_hi = g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX];
+	const uint32_t weight_lo = 64 - weight_hi;
+
+	for (int c = 0; c < 256; c++)
+	{
+		for (uint32_t lp = 0; lp < 2; lp++)
+		{
+			endpoint_err best;
+			// The largest possible squared error is 255 * 255 = 65025, so UINT16_MAX is always beaten by the first candidate.
+			best.m_error = (uint16_t)UINT16_MAX;
+			best.m_lo = 0;
+			best.m_hi = 0;
+
+			for (uint32_t l = 0; l < 64; l++)
+			{
+				// 6 endpoint bits + p-bit = 7 bits; shift to the top of a byte and replicate the MSB into bit 0.
+				uint32_t low = ((l << 1) | lp) << 1;
+				low |= (low >> 7);
+
+				for (uint32_t h = 0; h < 64; h++)
+				{
+					uint32_t high = ((h << 1) | lp) << 1;
+					high |= (high >> 7);
+
+					// Interpolate with rounding, then measure the squared error against the target.
+					const int k = (low * weight_lo + high * weight_hi + 32) >> 6;
+					const int err = (k - c) * (k - c);
+					if (err < best.m_error)
+					{
+						best.m_error = (uint16_t)err;
+						best.m_lo = (uint8_t)l;
+						best.m_hi = (uint8_t)h;
+					}
+				} // h
+			} // l
+
+			g_bc7_mode_1_optimal_endpoints[c][lp] = best;
+		} // lp
+	} // c
+}
+
+// Least squares fit of one endpoint pair to N RGBA pixels, given each pixel's current selector.
+//
+// N:                 number of pixels (entries in pSelectors and pColors).
+// pSelectors:        selector index of each pixel.
+// pSelector_weights: per-selector vector read as { [0], [1], [2], [3] }; [0..2] are summed into the 2x2
+//                    normal matrix and [3] is the weight w applied to the pixel's color.
+// pXl, pXh:          receive the solution, in the same 0..255 scale as pColors.
+//
+// Normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
+// If the determinant is exactly zero the inverse is left at zero, so both outputs come back as all zeros.
+// NOTE(review): as written, *pXl receives the component of the solution paired with the sum of w * color and
+// *pXh the one paired with the sum of (1 - w) * color; confirm that callers do not depend on which is which.
+static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_quad_u8 *pColors)
+{
+	float z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+	float weighted_sum[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; // per channel: sum of w * color
+	float plain_sum[4] = { 0.0f, 0.0f, 0.0f, 0.0f };    // per channel: sum of color
+
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const vec4F *pW = &pSelector_weights[pSelectors[i]];
+		z00 += pW->m_c[0];
+		z10 += pW->m_c[1];
+		z11 += pW->m_c[2];
+
+		const float w = pW->m_c[3];
+		for (uint32_t c = 0; c < 4; c++)
+		{
+			weighted_sum[c] += w * pColors[i].m_c[c];
+			plain_sum[c] += pColors[i].m_c[c];
+		}
+	}
+
+	// The matrix is symmetric.
+	const float z01 = z10;
+
+	float det = z00 * z11 - z01 * z10;
+	if (det != 0.0f)
+		det = 1.0f / det;
+
+	// Inverse of the 2x2 matrix (adjugate scaled by 1 / det).
+	const float iz00 = z11 * det;
+	const float iz01 = -z01 * det;
+	const float iz10 = -z10 * det;
+	const float iz11 = z00 * det;
+
+	for (uint32_t c = 0; c < 4; c++)
+	{
+		const float q00 = weighted_sum[c];
+		const float q10 = plain_sum[c] - q00; // sum of (1 - w) * color
+		pXl->m_c[c] = iz00 * q00 + iz01 * q10;
+		pXh->m_c[c] = iz10 * q00 + iz11 * q10;
+	}
+}
+
+// RGB-only least squares fit of one endpoint pair to N pixels, given each pixel's current selector.
+//
+// N:                 number of pixels (entries in pSelectors and pColors).
+// pSelectors:        selector index of each pixel.
+// pSelector_weights: per-selector vector; [0..2] are summed into the 2x2 normal matrix and [3] is the
+//                    weight w applied to the pixel's color.
+// pXl, pXh:          receive the solution in the same 0..255 scale as pColors; their alpha is forced to 255.
+//
+// If the determinant is exactly zero the inverse is left at zero, so the RGB outputs come back as zeros.
+// NOTE(review): as written, *pXl receives the component of the solution paired with the sum of w * color and
+// *pXh the one paired with the sum of (1 - w) * color; confirm that callers do not depend on which is which.
+static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSelectors, const vec4F *pSelector_weights, vec4F *pXl, vec4F *pXh, const color_quad_u8 *pColors)
+{
+	float z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+	float weighted_sum[3] = { 0.0f, 0.0f, 0.0f }; // per channel: sum of w * color
+	float plain_sum[3] = { 0.0f, 0.0f, 0.0f };    // per channel: sum of color
+
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const vec4F *pW = &pSelector_weights[pSelectors[i]];
+		z00 += pW->m_c[0];
+		z10 += pW->m_c[1];
+		z11 += pW->m_c[2];
+
+		const float w = pW->m_c[3];
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			weighted_sum[c] += w * pColors[i].m_c[c];
+			plain_sum[c] += pColors[i].m_c[c];
+		}
+	}
+
+	// The matrix is symmetric.
+	const float z01 = z10;
+
+	float det = z00 * z11 - z01 * z10;
+	if (det != 0.0f)
+		det = 1.0f / det;
+
+	// Inverse of the 2x2 matrix (adjugate scaled by 1 / det).
+	const float iz00 = z11 * det;
+	const float iz01 = -z01 * det;
+	const float iz10 = -z10 * det;
+	const float iz11 = z00 * det;
+
+	for (uint32_t c = 0; c < 3; c++)
+	{
+		const float q00 = weighted_sum[c];
+		const float q10 = plain_sum[c] - q00; // sum of (1 - w) * color
+		pXl->m_c[c] = iz00 * q00 + iz01 * q10;
+		pXh->m_c[c] = iz10 * q00 + iz11 * q10;
+	}
+
+	// Alpha is not fitted here.
+	pXl->m_c[3] = 255.0f;
+	pXh->m_c[3] = 255.0f;
+}
+
+// Read-only inputs describing one group of pixels that is to be fitted with a single endpoint pair.
+typedef struct 
+{
+	uint32_t m_num_pixels; // number of entries in m_pPixels
+	const color_quad_u8 *m_pPixels; // the source pixels
+	uint32_t m_num_selector_weights; // number of palette entries (selector values) per endpoint pair
+	const uint32_t *m_pSelector_weights; // interpolation weight of each selector, out of 64
+	const vec4F *m_pSelector_weightsx; // per-selector constants handed to the least squares solvers
+	uint32_t m_comp_bits; // bits per endpoint component, not counting the p-bit
+	uint32_t m_weights[4]; // per-channel multipliers applied to the squared differences in the error metric
+	bc7enc16_bool m_has_alpha; // when set, the alpha channel takes part in interpolation and error
+	bc7enc16_bool m_has_pbits; // when set, a p-bit is appended below each endpoint component
+	bc7enc16_bool m_endpoints_share_pbit; // when set, one p-bit (pbits[0]) is used for both endpoints
+	bc7enc16_bool m_perceptual; // when set, RGB error is measured on luma/chroma differences instead of raw RGB
+} color_cell_compressor_params;
+
+// Best solution found so far for one group of pixels, plus scratch space used while searching.
+typedef struct 
+{
+	uint64_t m_best_overall_err; // total error of the stored solution; UINT64_MAX means "none yet"
+	color_quad_u8 m_low_endpoint; // quantized low endpoint, without the p-bit
+	color_quad_u8 m_high_endpoint; // quantized high endpoint, without the p-bit
+	uint32_t m_pbits[2]; // p-bit of the low ([0]) and high ([1]) endpoint
+	uint8_t *m_pSelectors; // caller-provided buffer (at least m_num_pixels entries) holding the best selectors
+	uint8_t *m_pSelectors_temp; // caller-provided scratch buffer of the same size, overwritten by every evaluation
+} color_cell_compressor_results;
+
+// Expands all four quantized components of *pC to 8 bits.
+// Each component is assumed to hold m_comp_bits bits, plus one more when the mode has p-bits. The value is
+// shifted to the top of the byte and its upper bits are replicated into the vacated low bits.
+static inline color_quad_u8 scale_color(const color_quad_u8 *pC, const color_cell_compressor_params *pParams)
+{
+	uint32_t total_bits = pParams->m_comp_bits;
+	if (pParams->m_has_pbits)
+		total_bits++;
+	assert((total_bits >= 4) && (total_bits <= 8));
+
+	color_quad_u8 scaled;
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		const uint32_t shifted = pC->m_c[comp] << (8 - total_bits);
+		const uint32_t expanded = shifted | (shifted >> total_bits);
+		assert(expanded <= 255);
+		scaled.m_c[comp] = (uint8_t)(expanded);
+	}
+
+	return scaled;
+}
+
+// Weighted squared distance between the RGB channels of two colors (alpha is ignored).
+//
+// perceptual: when set, the colors are first converted to a luma term (109 * R + 366 * G + 37 * B, i.e. scaled
+//             by 512) and two chroma terms (512 * R - luma, 512 * B - luma); the differences are shifted right
+//             by 8 and weights[0..2] are applied to luma, red chroma and blue chroma respectively.
+//             Otherwise weights[0..2] are applied to the plain R, G, B differences.
+// weights:    per-term multipliers; only [0..2] are read here.
+//
+// Each product is formed in 64 bits so that large weights cannot wrap before being added into the result.
+static inline uint64_t compute_color_distance_rgb(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc16_bool perceptual, const uint32_t weights[4])
+{
+	int dr, dg, db;
+
+	if (perceptual)
+	{
+		const int l1 = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37;
+		const int cr1 = ((int)pE1->m_c[0] << 9) - l1;
+		const int cb1 = ((int)pE1->m_c[2] << 9) - l1;
+		const int l2 = pE2->m_c[0] * 109 + pE2->m_c[1] * 366 + pE2->m_c[2] * 37;
+		const int cr2 = ((int)pE2->m_c[0] << 9) - l2;
+		const int cb2 = ((int)pE2->m_c[2] << 9) - l2;
+		// dr/dg/db hold the luma, red-chroma and blue-chroma differences in this branch.
+		dr = (l1 - l2) >> 8;
+		dg = (cr1 - cr2) >> 8;
+		db = (cb1 - cb2) >> 8;
+	}
+	else
+	{
+		dr = (int)pE1->m_c[0] - (int)pE2->m_c[0];
+		dg = (int)pE1->m_c[1] - (int)pE2->m_c[1];
+		db = (int)pE1->m_c[2] - (int)pE2->m_c[2];
+	}
+
+	return (uint64_t)weights[0] * (uint32_t)(dr * dr) + (uint64_t)weights[1] * (uint32_t)(dg * dg) + (uint64_t)weights[2] * (uint32_t)(db * db);
+}
+
+// Same as compute_color_distance_rgb(), plus weights[3] times the squared alpha difference.
+// The alpha term is always the plain difference, whether or not `perceptual` is set.
+static inline uint64_t compute_color_distance_rgba(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc16_bool perceptual, const uint32_t weights[4])
+{
+	int da = (int)pE1->m_c[3] - (int)pE2->m_c[3];
+	return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + ((uint64_t)weights[3] * (uint32_t)(da * da));
+}
+
+// Encodes a group of pixels that all share one RGB color by looking up the best endpoints per channel
+// in g_bc7_mode_1_optimal_endpoints (see bc7enc16_compress_block_init()).
+//
+// pParams:    supplies the pixels, pixel count and error metric settings.
+// pResults:   receives the endpoints (alpha set to 0), the p-bits ([0] = chosen p-bit, [1] = 0) and the total error.
+// r, g, b:    the shared color; each must be in 0..255 because it indexes the lookup table.
+// pSelectors: receives BC7ENC16_MODE_1_OPTIMAL_INDEX for each of the m_num_pixels pixels.
+// Returns the total error of the decoded color against all pixels, measured with compute_color_distance_rgb().
+static uint64_t pack_mode1_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors)
+{
+	// Pick the p-bit whose table entries give the smallest summed error over the three channels.
+	// UINT32_MAX matches the type of best_err and comes from <stdint.h>, like the other limits used in this file.
+	uint32_t best_err = UINT32_MAX;
+	uint32_t best_p = 0;
+
+	for (uint32_t p = 0; p < 2; p++)
+	{
+		uint32_t err = g_bc7_mode_1_optimal_endpoints[r][p].m_error + g_bc7_mode_1_optimal_endpoints[g][p].m_error + g_bc7_mode_1_optimal_endpoints[b][p].m_error;
+		if (err < best_err)
+		{
+			best_err = err;
+			best_p = p;
+		}
+	}
+
+	const endpoint_err *pEr = &g_bc7_mode_1_optimal_endpoints[r][best_p];
+	const endpoint_err *pEg = &g_bc7_mode_1_optimal_endpoints[g][best_p];
+	const endpoint_err *pEb = &g_bc7_mode_1_optimal_endpoints[b][best_p];
+
+	color_quad_u8_set(&pResults->m_low_endpoint, pEr->m_lo, pEg->m_lo, pEb->m_lo, 0);
+	color_quad_u8_set(&pResults->m_high_endpoint, pEr->m_hi, pEg->m_hi, pEb->m_hi, 0);
+	pResults->m_pbits[0] = best_p;
+	pResults->m_pbits[1] = 0;
+
+	memset(pSelectors, BC7ENC16_MODE_1_OPTIMAL_INDEX, pParams->m_num_pixels);
+
+	// Reconstruct the color these endpoints decode to, using the same expansion and interpolation
+	// as the table builder, so the reported error reflects the configured metric and weights.
+	const uint32_t weight_hi = g_bc7_weights3[BC7ENC16_MODE_1_OPTIMAL_INDEX];
+	const uint32_t weight_lo = 64 - weight_hi;
+
+	color_quad_u8 decoded;
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		uint32_t low = ((pResults->m_low_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1;
+		low |= (low >> 7);
+
+		uint32_t high = ((pResults->m_high_endpoint.m_c[i] << 1) | pResults->m_pbits[0]) << 1;
+		high |= (high >> 7);
+
+		decoded.m_c[i] = (uint8_t)((low * weight_lo + high * weight_hi + 32) >> 6);
+	}
+	decoded.m_c[3] = 255;
+
+	uint64_t total_err = 0;
+	for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		total_err += compute_color_distance_rgb(&decoded, &pParams->m_pPixels[i], pParams->m_perceptual, pParams->m_weights);
+
+	pResults->m_best_overall_err = total_err;
+
+	return total_err;
+}
+
+// Scores one candidate endpoint pair and keeps it if it beats the best solution found so far.
+//
+// pLow/pHigh: candidate endpoints at the mode's endpoint precision, without the p-bit.
+// pbits:      p-bit of the low ([0]) and high ([1]) endpoint. They affect quantization only when
+//             pParams->m_has_pbits is set, but both are copied into pResults when the candidate wins.
+// pParams:    pixels, selector weights and error metric settings.
+// pResults:   m_pSelectors_temp is always overwritten with this candidate's selectors; the best-so-far
+//             fields (error, endpoints, p-bits, m_pSelectors) change only if the candidate is strictly better.
+// Returns the candidate's total error over all pixels.
+static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 *pHigh, const uint32_t pbits[2], const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
+{
+	color_quad_u8 quantMinColor = *pLow;
+	color_quad_u8 quantMaxColor = *pHigh;
+
+	if (pParams->m_has_pbits)
+	{
+		uint32_t minPBit, maxPBit;
+
+		if (pParams->m_endpoints_share_pbit)
+			maxPBit = minPBit = pbits[0];
+		else
+		{
+			minPBit = pbits[0];
+			maxPBit = pbits[1];
+		}
+
+		// Append the p-bit as the least significant bit of every component.
+		quantMinColor.m_c[0] = (uint8_t)((pLow->m_c[0] << 1) | minPBit);
+		quantMinColor.m_c[1] = (uint8_t)((pLow->m_c[1] << 1) | minPBit);
+		quantMinColor.m_c[2] = (uint8_t)((pLow->m_c[2] << 1) | minPBit);
+		quantMinColor.m_c[3] = (uint8_t)((pLow->m_c[3] << 1) | minPBit);
+
+		quantMaxColor.m_c[0] = (uint8_t)((pHigh->m_c[0] << 1) | maxPBit);
+		quantMaxColor.m_c[1] = (uint8_t)((pHigh->m_c[1] << 1) | maxPBit);
+		quantMaxColor.m_c[2] = (uint8_t)((pHigh->m_c[2] << 1) | maxPBit);
+		quantMaxColor.m_c[3] = (uint8_t)((pHigh->m_c[3] << 1) | maxPBit);
+	}
+
+	// Expand the quantized endpoints to full 8-bit colors.
+	color_quad_u8 actualMinColor = scale_color(&quantMinColor, pParams);
+	color_quad_u8 actualMaxColor = scale_color(&quantMaxColor, pParams);
+
+	const uint32_t N = pParams->m_num_selector_weights;
+
+	// Build the N-entry palette: entry 0 is the low endpoint, entry N - 1 the high endpoint, and the
+	// entries in between are interpolated with the selector weights (out of 64, rounded).
+	color_quad_u8 weightedColors[16];
+	weightedColors[0] = actualMinColor;
+	weightedColors[N - 1] = actualMaxColor;
+
+	// Without alpha only R, G, B of the interior entries are written; the RGB distance function never reads their alpha.
+	const uint32_t nc = pParams->m_has_alpha ? 4 : 3;
+	for (uint32_t i = 1; i < (N - 1); i++)
+		for (uint32_t j = 0; j < nc; j++)
+			weightedColors[i].m_c[j] = (uint8_t)((actualMinColor.m_c[j] * (64 - pParams->m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams->m_pSelector_weights[i] + 32) >> 6);
+
+	// Low endpoint and low-to-high direction, used by the projection-based selector search below.
+	const int lr = actualMinColor.m_c[0];
+	const int lg = actualMinColor.m_c[1];
+	const int lb = actualMinColor.m_c[2];
+	const int dr = actualMaxColor.m_c[0] - lr;
+	const int dg = actualMaxColor.m_c[1] - lg;
+	const int db = actualMaxColor.m_c[2] - lb;
+	
+	uint64_t total_err = 0;
+	
+	if (!pParams->m_perceptual)
+	{
+		// Non-perceptual metric: project each pixel onto the endpoint line to estimate its selector,
+		// then choose the better of that palette entry and the one just below it.
+		if (pParams->m_has_alpha)
+		{
+			const int la = actualMinColor.m_c[3];
+			const int da = actualMaxColor.m_c[3] - la;
+
+			// N / |direction|^2; the small constant keeps the division defined when both endpoints are equal.
+			const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f);
+
+			for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+			{
+				const color_quad_u8 *pC = &pParams->m_pPixels[i];
+				int r = pC->m_c[0];
+				int g = pC->m_c[1];
+				int b = pC->m_c[2];
+				int a = pC->m_c[3];
+
+				int best_sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f);
+				// Clamped to [1, N - 1] so that best_sel - 1 is always a valid palette index.
+				best_sel = clampi(best_sel, 1, N - 1);
+
+				uint64_t err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC16_FALSE, pParams->m_weights);
+				uint64_t err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC16_FALSE, pParams->m_weights);
+
+				// Step down only when the lower entry is strictly better; ties keep the higher selector.
+				if (err1 > err0)
+				{
+					err1 = err0;
+					--best_sel;
+				}
+				total_err += err1;
+
+				pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
+			}
+		}
+		else
+		{
+			// Same search as above, restricted to R, G, B.
+			const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f);
+
+			for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+			{
+				const color_quad_u8 *pC = &pParams->m_pPixels[i];
+				int r = pC->m_c[0];
+				int g = pC->m_c[1];
+				int b = pC->m_c[2];
+
+				int sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f);
+				sel = clampi(sel, 1, N - 1);
+
+				uint64_t err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC16_FALSE, pParams->m_weights);
+				uint64_t err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC16_FALSE, pParams->m_weights);
+
+				int best_sel = sel;
+				uint64_t best_err = err1;
+				if (err0 < best_err)
+				{
+					best_err = err0;
+					best_sel = sel - 1;
+				}
+
+				total_err += best_err;
+
+				pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
+			}
+		}
+	}
+	else
+	{
+		// Perceptual metric: try every palette entry for every pixel. Ties keep the lowest selector.
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint64_t best_err = UINT64_MAX;
+			uint32_t best_sel = 0;
+
+			if (pParams->m_has_alpha)
+			{
+				for (uint32_t j = 0; j < N; j++)
+				{
+					uint64_t err = compute_color_distance_rgba(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC16_TRUE, pParams->m_weights);
+					if (err < best_err)
+					{
+						best_err = err;
+						best_sel = j;
+					}
+				}
+			}
+			else
+			{
+				for (uint32_t j = 0; j < N; j++)
+				{
+					uint64_t err = compute_color_distance_rgb(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC16_TRUE, pParams->m_weights);
+					if (err < best_err)
+					{
+						best_err = err;
+						best_sel = j;
+					}
+				}
+			}
+
+			total_err += best_err;
+
+			pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
+		}
+	}
+
+	// Commit the candidate only if it is strictly better than the best solution so far.
+	if (total_err < pResults->m_best_overall_err)
+	{
+		pResults->m_best_overall_err = total_err;
+
+		pResults->m_low_endpoint = *pLow;
+		pResults->m_high_endpoint = *pHigh;
+
+		pResults->m_pbits[0] = pbits[0];
+		pResults->m_pbits[1] = pbits[1];
+
+		memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+	}
+				
+	return total_err;
+}
+
+// Mode 1 only: separates quantized endpoints that collapsed onto the same value.
+//
+// For each of R, G, B: if the two trial endpoints quantized to the same value although the unquantized
+// endpoints (*pXl, *pXh) differ, one of them is nudged by one step so the pair spans at least one
+// quantization level. Values in the upper half of the range move the min endpoint down; values in the
+// lower half move the max endpoint up.
+//
+// mode:            BC7 mode; anything other than 1 leaves the endpoints untouched.
+// pTrialMinColor,
+// pTrialMaxColor:  quantized endpoints, modified in place.
+// pXl, pXh:        the unquantized endpoints the trial colors were derived from.
+// iscale:          largest quantized component value.
+static void fixDegenerateEndpoints(uint32_t mode, color_quad_u8 *pTrialMinColor, color_quad_u8 *pTrialMaxColor, const vec4F *pXl, const vec4F *pXh, uint32_t iscale)
+{
+	if (mode != 1)
+		return;
+
+	// fix degenerate case where the input collapses to a single colorspace voxel, and we lose all freedom (test with grayscale ramps)
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		if (pTrialMinColor->m_c[i] != pTrialMaxColor->m_c[i])
+			continue;
+
+		// Nothing to separate if the unquantized endpoints are identical as well.
+		if (!(fabsf(pXl->m_c[i] - pXh->m_c[i]) > 0.0f))
+			continue;
+
+		if (pTrialMinColor->m_c[i] > (iscale >> 1))
+		{
+			// Upper half of the range: the value is necessarily above zero, so it can always be decremented.
+			pTrialMinColor->m_c[i]--;
+		}
+		else if (pTrialMaxColor->m_c[i] < iscale)
+		{
+			// Lower half: the shared value is at most iscale / 2, so there is room to increment
+			// (this test only fails when iscale is 0, in which case nothing can be done).
+			pTrialMaxColor->m_c[i]++;
+		}
+	}
+}
+
+// Quantizes a pair of floating point endpoints to the mode's precision and evaluates the result.
+//
+// mode:     BC7 mode; only forwarded to fixDegenerateEndpoints().
+// xl, xh:   candidate endpoints with components in the 0..1 range (clamped to [0, 1] on entry).
+// pParams:  precision, p-bit configuration, pixels and error metric settings.
+// pResults: best solution so far; updated through evaluate_solution() when the quantized candidate is better.
+// Returns pResults->m_best_overall_err after the (possible) evaluation.
+//
+// The candidate is not re-evaluated when a solution already exists and the quantized endpoints (and, for
+// modes with p-bits, the p-bits) are identical to the stored ones.
+// NOTE(review): best_pbits, bestMinColor and bestMaxColor are assigned only inside the "err < best" tests.
+// If an error term were NaN they would be read uninitialized - confirm the inputs can never be NaN.
+static uint64_t find_optimal_solution(uint32_t mode, vec4F xl, vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
+{
+	vec4F_saturate_in_place(&xl); vec4F_saturate_in_place(&xh);
+
+	if (pParams->m_has_pbits)
+	{
+		// Largest value representable with the endpoint bits plus the p-bit.
+		const int iscalep = (1 << (pParams->m_comp_bits + 1)) - 1;
+		const float scalep = (float)iscalep;
+
+		// All four components are quantized below, but only this many contribute to the p-bit error.
+		const int32_t totalComps = pParams->m_has_alpha ? 4 : 3;
+
+		uint32_t best_pbits[2];
+		color_quad_u8 bestMinColor, bestMaxColor;
+
+		if (!pParams->m_endpoints_share_pbit)
+		{
+			// Each endpoint picks its own p-bit: the one whose quantized color is closest to the unquantized endpoint.
+			float best_err0 = 1e+9;
+			float best_err1 = 1e+9;
+
+			for (int p = 0; p < 2; p++)
+			{
+				color_quad_u8 xMinColor, xMaxColor;
+
+				// Notes: The pbit controls which quantization intervals are selected.
+				// total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc.
+				// pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value
+				// rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5)
+				// rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5)
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					// Result includes the p-bit as its LSB: even values in [0, iscalep - 1] for p = 0, odd values in [1, iscalep] for p = 1.
+					xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
+					xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
+				}
+
+				color_quad_u8 scaledLow = scale_color(&xMinColor, pParams);
+				color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams);
+
+				// Squared error against the unquantized endpoints, measured in the 0..255 scale.
+				float err0 = 0, err1 = 0;
+				for (int i = 0; i < totalComps; i++)
+				{
+					err0 += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f);
+					err1 += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f);
+				}
+
+				if (err0 < best_err0)
+				{
+					best_err0 = err0;
+					best_pbits[0] = p;
+
+					// Drop the p-bit again; evaluate_solution() re-appends it from best_pbits.
+					bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1;
+					bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1;
+					bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1;
+					bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1;
+				}
+
+				if (err1 < best_err1)
+				{
+					best_err1 = err1;
+					best_pbits[1] = p;
+
+					bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1;
+					bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1;
+					bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1;
+					bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1;
+				}
+			}
+		}
+		else
+		{
+			// Endpoints share pbits
+			// One p-bit is chosen for both endpoints by minimizing their combined error.
+			float best_err = 1e+9;
+
+			for (int p = 0; p < 2; p++)
+			{
+				color_quad_u8 xMinColor, xMaxColor;
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
+					xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
+				}
+
+				color_quad_u8 scaledLow = scale_color(&xMinColor, pParams);
+				color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams);
+
+				// Here the error is measured in the 0..1 scale (the other branch uses 0..255); each branch only compares its own values.
+				float err = 0;
+				for (int i = 0; i < totalComps; i++)
+					err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]);
+
+				if (err < best_err)
+				{
+					best_err = err;
+					best_pbits[0] = p;
+					best_pbits[1] = p;
+					for (uint32_t j = 0; j < 4; j++)
+					{
+						bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1;
+						bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1;
+					}
+				}
+			}
+		}
+						
+		// iscalep >> 1 is the largest endpoint value once the p-bit has been removed.
+		fixDegenerateEndpoints(mode, &bestMinColor, &bestMaxColor, &xl, &xh, iscalep >> 1);
+
+		if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&bestMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&bestMaxColor, &pResults->m_high_endpoint) || (best_pbits[0] != pResults->m_pbits[0]) || (best_pbits[1] != pResults->m_pbits[1]))
+			evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits, pParams, pResults);
+	}
+	else
+	{
+		// No p-bits: round each component directly to the nearest representable level.
+		const int iscale = (1 << pParams->m_comp_bits) - 1;
+		const float scale = (float)iscale;
+
+		color_quad_u8 trialMinColor, trialMaxColor;
+		color_quad_u8_set_clamped(&trialMinColor, (int)(xl.m_c[0] * scale + .5f), (int)(xl.m_c[1] * scale + .5f), (int)(xl.m_c[2] * scale + .5f), (int)(xl.m_c[3] * scale + .5f));
+		color_quad_u8_set_clamped(&trialMaxColor, (int)(xh.m_c[0] * scale + .5f), (int)(xh.m_c[1] * scale + .5f), (int)(xh.m_c[2] * scale + .5f), (int)(xh.m_c[3] * scale + .5f));
+
+		fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, iscale);
+
+		// The stored p-bits are passed through unchanged; evaluate_solution() does not use them for quantization in this case.
+		// NOTE(review): on the first call pResults->m_pbits may not have been initialized yet - confirm with the caller.
+		if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
+			evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
+	}
+
+	return pResults->m_best_overall_err;
+}
+
+static uint64_t color_cell_compression(uint32_t mode, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, const bc7enc16_compress_block_params *pComp_params)
+{
+	assert((mode == 6) || (!pParams->m_has_alpha));
+
+	pResults->m_best_overall_err = UINT64_MAX;
+
+	// If the partition's colors are all the same in mode 1, then just pack them as a single color.
+	if (mode == 1)
+	{
+		const uint32_t cr = pParams->m_pPixels[0].m_c[0], cg = pParams->m_pPixels[0].m_c[1], cb = pParams->m_pPixels[0].m_c[2];
+
+		bc7enc16_bool allSame = BC7ENC16_TRUE;
+		for (uint32_t i = 1; i < pParams->m_num_pixels; i++)
+		{
+			if ((cr != pParams->m_pPixels[i].m_c[0]) || (cg != pParams->m_pPixels[i].m_c[1]) || (cb != pParams->m_pPixels[i].m_c[2]))
+			{
+				allSame = BC7ENC16_FALSE;
+				break;
+			}
+		}
+
+		if (allSame)
+			return pack_mode1_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors);
+	}
+
+	// Compute partition's mean color and principal axis.
+	vec4F meanColor, axis;
+	vec4F_set_scalar(&meanColor, 0.0f);
+
+	for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+	{
+		vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);
+		meanColor = vec4F_add(&meanColor, &color);
+	}
+				
+	vec4F meanColorScaled = vec4F_mul(&meanColor, 1.0f / (float)(pParams->m_num_pixels));
+
+	meanColor = vec4F_mul(&meanColor, 1.0f / (float)(pParams->m_num_pixels * 255.0f));
+	vec4F_saturate_in_place(&meanColor);
+
+	if (pParams->m_has_alpha)
+	{
+		// Use incremental PCA for RGBA PCA, because it's simple.
+		vec4F_set_scalar(&axis, 0.0f);
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);
+			color = vec4F_sub(&color, &meanColorScaled);
+			vec4F a = vec4F_mul(&color, color.m_c[0]);
+			vec4F b = vec4F_mul(&color, color.m_c[1]);
+			vec4F c = vec4F_mul(&color, color.m_c[2]);
+			vec4F d = vec4F_mul(&color, color.m_c[3]);
+			vec4F n = i ? axis : color;
+			vec4F_normalize_in_place(&n);
+			axis.m_c[0] += vec4F_dot(&a, &n);
+			axis.m_c[1] += vec4F_dot(&b, &n);
+			axis.m_c[2] += vec4F_dot(&c, &n);
+			axis.m_c[3] += vec4F_dot(&d, &n);
+		}
+		vec4F_normalize_in_place(&axis);
+	}
+	else
+	{
+		// Use covar technique for RGB PCA, because it doesn't require per-pixel normalization.
+		float cov[6] = { 0, 0, 0, 0, 0, 0 };
+
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			const color_quad_u8 *pV = &pParams->m_pPixels[i];
+			float r = pV->m_c[0] - meanColorScaled.m_c[0];
+			float g = pV->m_c[1] - meanColorScaled.m_c[1];
+			float b = pV->m_c[2] - meanColorScaled.m_c[2];
+			cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b;
+		}
+
+		float vfr = .9f, vfg = 1.0f, vfb = .7f;
+		for (uint32_t iter = 0; iter < 3; iter++)
+		{
+			float r = vfr*cov[0] + vfg*cov[1] + vfb*cov[2];
+			float g = vfr*cov[1] + vfg*cov[3] + vfb*cov[4];
+			float b = vfr*cov[2] + vfg*cov[4] + vfb*cov[5];
+
+			float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
+			if (m > 1e-10f)
+			{
+				m = 1.0f / m;
+				r *= m; g *= m;	b *= m;
+			}
+
+			vfr = r; vfg = g; vfb = b;
+		}
+
+		float len = vfr*vfr + vfg*vfg + vfb*vfb;
+		if (len < 1e-10f)
+			vec4F_set_scalar(&axis, 0.0f);
+		else
+		{
+			len = 1.0f / sqrtf(len);
+			vfr *= len; vfg *= len; vfb *= len;
+			vec4F_set(&axis, vfr, vfg, vfb, 0);
+		}
+	}
+				
+	if (vec4F_dot(&axis, &axis) < .5f)
+	{
+		if (pParams->m_perceptual)
+			vec4F_set(&axis, .213f, .715f, .072f, pParams->m_has_alpha ? .715f : 0);
+		else
+			vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams->m_has_alpha ? 1.0f : 0);
+		vec4F_normalize_in_place(&axis);
+	}
+
+	float l = 1e+9f, h = -1e+9f;
+
+	for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+	{
+		vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);
+
+		vec4F q = vec4F_sub(&color, &meanColorScaled);
+		float d = vec4F_dot(&q, &axis);
+
+		l = minimumf(l, d);
+		h = maximumf(h, d);
+	}
+
+	l *= (1.0f / 255.0f);
+	h *= (1.0f / 255.0f);
+
+	vec4F b0 = vec4F_mul(&axis, l);
+	vec4F b1 = vec4F_mul(&axis, h);
+	vec4F c0 = vec4F_add(&meanColor, &b0);
+	vec4F c1 = vec4F_add(&meanColor, &b1);
+	vec4F minColor = vec4F_saturate(&c0);
+	vec4F maxColor = vec4F_saturate(&c1);
+				
+	vec4F whiteVec;
+	vec4F_set_scalar(&whiteVec, 1.0f);
+	if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec))
+	{
+		vec4F temp = minColor;
+		minColor = maxColor;
+		maxColor = temp;
+	}
+	// First find a solution using the block's PCA.
+	if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults))
+		return 0;
+	
+	if (pComp_params->m_try_least_squares)
+	{
+		// Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors.
+		vec4F xl, xh;
+		vec4F_set_scalar(&xl, 0.0f);
+		vec4F_set_scalar(&xh, 0.0f);
+		if (pParams->m_has_alpha)
+			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+		else
+			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+		xl = vec4F_mul(&xl, (1.0f / 255.0f));
+		xh = vec4F_mul(&xh, (1.0f / 255.0f));
+
+		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+			return 0;
+	}
+	
+	if (pComp_params->m_uber_level > 0)
+	{
+		// In uber level 1, try varying the selectors a little, somewhat like cluster fit would. First try incrementing the minimum selectors,
+		// then try decrementing the maximum selectors, then try both.
+		uint8_t selectors_temp[16], selectors_temp1[16];
+		memcpy(selectors_temp, pResults->m_pSelectors, pParams->m_num_pixels);
+
+		const int max_selector = pParams->m_num_selector_weights - 1;
+
+		uint32_t min_sel = 16;
+		uint32_t max_sel = 0;
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint32_t sel = selectors_temp[i];
+			min_sel = minimumu(min_sel, sel);
+			max_sel = maximumu(max_sel, sel);
+		}
+
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint32_t sel = selectors_temp[i];
+			if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1)))
+				sel++;
+			selectors_temp1[i] = (uint8_t)sel;
+		}
+
+		vec4F xl, xh;
+		vec4F_set_scalar(&xl, 0.0f);
+		vec4F_set_scalar(&xh, 0.0f);
+		if (pParams->m_has_alpha)
+			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+		else
+			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+		xl = vec4F_mul(&xl, (1.0f / 255.0f));
+		xh = vec4F_mul(&xh, (1.0f / 255.0f));
+
+		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+			return 0;
+
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint32_t sel = selectors_temp[i];
+			if ((sel == max_sel) && (sel > 0))
+				sel--;
+			selectors_temp1[i] = (uint8_t)sel;
+		}
+
+		if (pParams->m_has_alpha)
+			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+		else
+			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+		xl = vec4F_mul(&xl, (1.0f / 255.0f));
+		xh = vec4F_mul(&xh, (1.0f / 255.0f));
+
+		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+			return 0;
+
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint32_t sel = selectors_temp[i];
+			if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1)))
+				sel++;
+			else if ((sel == max_sel) && (sel > 0))
+				sel--;
+			selectors_temp1[i] = (uint8_t)sel;
+		}
+
+		if (pParams->m_has_alpha)
+			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+		else
+			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+		xl = vec4F_mul(&xl, (1.0f / 255.0f));
+		xh = vec4F_mul(&xh, (1.0f / 255.0f));
+
+		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+			return 0;
+
+		// In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another.
+		const uint32_t uber_err_thresh = (pParams->m_num_pixels * 56) >> 4;
+		if ((pComp_params->m_uber_level >= 2) && (pResults->m_best_overall_err > uber_err_thresh))
+		{
+			const int Q = (pComp_params->m_uber_level >= 4) ? (pComp_params->m_uber_level - 2) : 1;
+			for (int ly = -Q; ly <= 1; ly++)
+			{
+				for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++)
+				{
+					if ((ly == 0) && (hy == max_selector))
+						continue;
+
+					for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+						selectors_temp1[i] = (uint8_t)clampf(floorf((float)max_selector * ((float)selectors_temp[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_selector);
+
+					//vec4F xl, xh;
+					vec4F_set_scalar(&xl, 0.0f);
+					vec4F_set_scalar(&xh, 0.0f);
+					if (pParams->m_has_alpha)
+						compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+					else
+						compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+					xl = vec4F_mul(&xl, (1.0f / 255.0f));
+					xh = vec4F_mul(&xh, (1.0f / 255.0f));
+
+					if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+						return 0;
+				}
+			}
+		}
+	}
+
+	if (mode == 1)
+	{
+		// Try encoding the partition as a single color by using the optimal single-color tables to encode the block to its mean.
+		color_cell_compressor_results avg_results = *pResults;
+		const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f);
+		uint64_t avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp);
+		if (avg_err < pResults->m_best_overall_err)
+		{
+			*pResults = avg_results;
+			memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+			pResults->m_best_overall_err = avg_err;
+		}
+	}
+				
+	return pResults->m_best_overall_err;
+}
+// Quick RGB-only error estimate for coding num_pixels pixels with 8 colors interpolated across their RGB bounding box. Returns the weighted squared error, or a partial sum once it exceeds best_err_so_far.
+static uint64_t color_cell_compression_est(uint32_t num_pixels, const color_quad_u8 *pPixels, bc7enc16_bool perceptual, uint32_t pweights[4], uint64_t best_err_so_far)
+{
+	// Find RGB bounds as an approximation of the block's principal axis
+	uint32_t lr = 255, lg = 255, lb = 255;
+	uint32_t hr = 0, hg = 0, hb = 0;
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		const color_quad_u8 *pC = &pPixels[i];
+		if (pC->m_c[0] < lr) lr = pC->m_c[0];
+		if (pC->m_c[1] < lg) lg = pC->m_c[1];
+		if (pC->m_c[2] < lb) lb = pC->m_c[2];
+		if (pC->m_c[0] > hr) hr = pC->m_c[0];
+		if (pC->m_c[1] > hg) hg = pC->m_c[1];
+		if (pC->m_c[2] > hb) hb = pC->m_c[2];
+	}
+		
+	color_quad_u8 lowColor; color_quad_u8_set(&lowColor, lr, lg, lb, 0);
+	color_quad_u8 highColor; color_quad_u8_set(&highColor, hr, hg, hb, 0);
+
+	// Place endpoints at bbox diagonals and compute interpolated colors (weights from g_bc7_weights3 are applied on a 0..64 scale, with rounding)
+	const uint32_t N = 8;
+	color_quad_u8 weightedColors[8];
+
+	weightedColors[0] = lowColor;
+	weightedColors[N - 1] = highColor;
+	for (uint32_t i = 1; i < (N - 1); i++)
+	{
+		weightedColors[i].m_c[0] = (uint8_t)((lowColor.m_c[0] * (64 - g_bc7_weights3[i]) + highColor.m_c[0] * g_bc7_weights3[i] + 32) >> 6);
+		weightedColors[i].m_c[1] = (uint8_t)((lowColor.m_c[1] * (64 - g_bc7_weights3[i]) + highColor.m_c[1] * g_bc7_weights3[i] + 32) >> 6);
+		weightedColors[i].m_c[2] = (uint8_t)((lowColor.m_c[2] * (64 - g_bc7_weights3[i]) + highColor.m_c[2] * g_bc7_weights3[i] + 32) >> 6);
+	}
+
+	// Compute dots and thresholds: each palette color is projected onto the bbox diagonal (ar, ag, ab); thresh[i] is the rounded midpoint between neighbouring projections
+	const int ar = highColor.m_c[0] - lowColor.m_c[0];
+	const int ag = highColor.m_c[1] - lowColor.m_c[1];
+	const int ab = highColor.m_c[2] - lowColor.m_c[2];
+
+	int dots[8];
+	for (uint32_t i = 0; i < N; i++)
+		dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab;
+
+	int thresh[8 - 1];
+	for (uint32_t i = 0; i < (N - 1); i++)
+		thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1;
+
+	uint64_t total_err = 0;
+	if (perceptual)
+	{
+		// Transform block's interpolated colors to YCbCr (fixed point: the luma weights 109 + 366 + 37 sum to 512, so chroma is (channel << 9) - luma)
+		int l1[8], cr1[8], cb1[8];
+		for (int j = 0; j < 8; j++)
+		{
+			const color_quad_u8 *pE1 = &weightedColors[j];
+			l1[j] = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37;
+			cr1[j] = ((int)pE1->m_c[0] << 9) - l1[j];
+			cb1[j] = ((int)pE1->m_c[2] << 9) - l1[j];
+		}
+
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			const color_quad_u8 *pC = &pPixels[i];
+
+			int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2];
+
+			// Find approximate selector: the highest index whose lower threshold the pixel's projection reaches (0 if none)
+			uint32_t s = 0;
+			if (d >= thresh[6])
+				s = 7;
+			else if (d >= thresh[5])
+				s = 6;
+			else if (d >= thresh[4])
+				s = 5;
+			else if (d >= thresh[3])
+				s = 4;
+			else if (d >= thresh[2])
+				s = 3;
+			else if (d >= thresh[1])
+				s = 2;
+			else if (d >= thresh[0])
+				s = 1;
+
+			// Compute error: same luma/chroma transform applied to the source pixel, differences reduced by 8 bits before squaring
+			const int l2 = pC->m_c[0] * 109 + pC->m_c[1] * 366 + pC->m_c[2] * 37;
+			const int cr2 = ((int)pC->m_c[0] << 9) - l2;
+			const int cb2 = ((int)pC->m_c[2] << 9) - l2;
+
+			const int dl = (l1[s] - l2) >> 8;
+			const int dcr = (cr1[s] - cr2) >> 8;
+			const int dcb = (cb1[s] - cb2) >> 8;
+
+			int ie = (pweights[0] * dl * dl) + (pweights[1] * dcr * dcr) + (pweights[2] * dcb * dcb); // NOTE(review): evaluated in unsigned arithmetic (pweights is uint32_t) and then narrowed to int; presumably the weights are small enough that this never exceeds INT_MAX - confirm
+
+			total_err += ie;
+			if (total_err > best_err_so_far) // early out: the caller only needs to know this candidate is already worse
+				break;
+		}
+	}
+	else
+	{
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			const color_quad_u8 *pC = &pPixels[i];
+
+			int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2];
+
+			// Find approximate selector (same threshold search as in the perceptual branch)
+			uint32_t s = 0;
+			if (d >= thresh[6])
+				s = 7;
+			else if (d >= thresh[5])
+				s = 6;
+			else if (d >= thresh[4])
+				s = 5;
+			else if (d >= thresh[3])
+				s = 4;
+			else if (d >= thresh[2])
+				s = 3;
+			else if (d >= thresh[1])
+				s = 2;
+			else if (d >= thresh[0])
+				s = 1;
+
+			// Compute error: weighted squared RGB distance to the chosen palette color
+			const color_quad_u8 *pE1 = &weightedColors[s];
+
+			int dr = (int)pE1->m_c[0] - (int)pC->m_c[0];
+			int dg = (int)pE1->m_c[1] - (int)pC->m_c[1];
+			int db = (int)pE1->m_c[2] - (int)pC->m_c[2];
+
+			total_err += pweights[0] * (dr * dr) + pweights[1] * (dg * dg) + pweights[2] * (db * db);
+			if (total_err > best_err_so_far) // early out, as above
+				break;
+		}
+	}
+
+	return total_err;
+}
+
+// Picks the partition pattern mode 1 should try: every candidate is scored with a fast approximate error and the index of the lowest-scoring one is returned.
+static uint32_t estimate_partition(const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, uint32_t pweights[4])
+{
+	// Honour the caller's limit, but never look past BC7ENC16_MAX_PARTITIONS1 patterns.
+	const uint32_t num_candidates = minimumu(pComp_params->m_max_partitions_mode1, BC7ENC16_MAX_PARTITIONS1);
+	if (num_candidates <= 1)
+		return 0;
+
+	uint32_t winner = 0;
+	uint64_t lowest_err = UINT64_MAX;
+	uint32_t cand = 0;
+	while ((cand < num_candidates) && (lowest_err > 0))
+	{
+		// Split the 16 pixels into the two subsets described by this pattern.
+		const uint8_t *pPattern = &g_bc7_partition2[cand * 16];
+		color_quad_u8 split[2][16];
+		uint32_t split_count[2] = { 0, 0 };
+		for (uint32_t px = 0; px < 16; px++)
+			split[pPattern[px]][split_count[pPattern[px]]++] = pPixels[px];
+
+		// Sum both subset estimates; the second is skipped once the first alone is no better than the best so far.
+		uint64_t cand_err = color_cell_compression_est(split_count[0], &split[0][0], pComp_params->m_perceptual, pweights, lowest_err);
+		if (cand_err < lowest_err)
+			cand_err += color_cell_compression_est(split_count[1], &split[1][0], pComp_params->m_perceptual, pweights, lowest_err);
+		if (cand_err < lowest_err)
+		{
+			lowest_err = cand_err;
+			winner = cand;
+		}
+		cand++;
+	}
+	return winner;
+}
+// Appends the low num_bits of val to pBytes at bit position *pCur_ofs (least significant bit first) and advances *pCur_ofs. Bits are OR'd in, so the destination must start out zeroed.
+static void set_block_bits(uint8_t *pBytes, uint32_t val, uint32_t num_bits, uint32_t *pCur_ofs)
+{
+	assert((num_bits <= 32) && (val < (1ULL << num_bits)));
+	for (uint32_t left = num_bits; left != 0; )
+	{
+		const uint32_t shift = *pCur_ofs & 7, take = minimumu(8 - shift, left); // take = how many bits still fit in the current byte
+		pBytes[*pCur_ofs >> 3] |= (uint8_t)(val << shift);
+		val >>= take;
+		left -= take;
+		*pCur_ofs += take;
+	}
+	assert(*pCur_ofs <= 128);
+}
+
+typedef struct // Final per-block encoding decisions, consumed by encode_bc7_block()
+{
+	uint32_t m_mode; // BC7 mode index; the handlers in this file set 6 or 1
+	uint32_t m_partition; // partition pattern index (the handlers set 0 for mode 6)
+	uint8_t m_selectors[16]; // one selector per pixel, in block pixel order
+	color_quad_u8 m_low[2]; // low endpoint of each subset
+	color_quad_u8 m_high[2]; // high endpoint of each subset
+	uint32_t m_pbits[2][2]; // [subset][0] pairs with m_low, [subset][1] with m_high; only [subset][0] is used when the mode shares its p-bit
+} bc7_optimization_results;
+// Serializes pResults into the BC7ENC16_BLOCK_SIZE-byte block at pBlock: mode, partition (only for modes with partition bits), endpoints, p-bits, then selectors.
+static void encode_bc7_block(void *pBlock, const bc7_optimization_results *pResults)
+{
+	const uint32_t best_mode = pResults->m_mode;
+	const uint32_t total_subsets = g_bc7_num_subsets[best_mode];
+	const uint32_t total_partitions = 1 << g_bc7_partition_bits[best_mode];
+	const uint8_t *pPartition = (total_subsets == 2) ? &g_bc7_partition2[pResults->m_partition * 16] : &g_bc7_partition1[0];
+	// Work on local copies: selectors, endpoints and p-bits may be flipped below, and pResults is const.
+	uint8_t color_selectors[16];
+	memcpy(color_selectors, pResults->m_selectors, 16);
+
+	color_quad_u8 low[2], high[2];
+	memcpy(low, pResults->m_low, sizeof(low));
+	memcpy(high, pResults->m_high, sizeof(high));
+
+	uint32_t pbits[2][2];
+	memcpy(pbits, pResults->m_pbits, sizeof(pbits));
+
+	int anchor[2] = { -1, -1 };
+	// Each subset's anchor pixel is stored with one selector bit fewer, so its top bit must be 0: if it is set, invert that subset's selectors and swap its endpoints.
+	for (uint32_t k = 0; k < total_subsets; k++)
+	{
+		const uint32_t anchor_index = k ? g_bc7_table_anchor_index_second_subset[pResults->m_partition] : 0;
+		anchor[k] = anchor_index;
+
+		const uint32_t color_index_bits = get_bc7_color_index_size(best_mode, 0);
+		const uint32_t num_color_indices = 1 << color_index_bits;
+
+		if (color_selectors[anchor_index] & (num_color_indices >> 1))
+		{
+			for (uint32_t i = 0; i < 16; i++)
+				if (pPartition[i] == k)
+					color_selectors[i] = (uint8_t)((num_color_indices - 1) - color_selectors[i]);
+
+			color_quad_u8 tmp = low[k];
+			low[k] = high[k];
+			high[k] = tmp;
+			// Per-endpoint p-bits have to follow their endpoints; a shared p-bit is unaffected by the swap.
+			if (!g_bc7_mode_has_shared_p_bits[best_mode])
+			{
+				uint32_t t = pbits[k][0];
+				pbits[k][0] = pbits[k][1];
+				pbits[k][1] = t;
+			}
+		}
+	}
+
+	uint8_t *pBlock_bytes = (uint8_t *)(pBlock);
+	memset(pBlock_bytes, 0, BC7ENC16_BLOCK_SIZE); // set_block_bits() ORs bits in, so start from zero
+
+	uint32_t cur_bit_ofs = 0;
+	set_block_bits(pBlock_bytes, 1 << best_mode, best_mode + 1, &cur_bit_ofs); // mode field: best_mode zero bits followed by a single one bit
+
+	if (total_partitions > 1)
+		set_block_bits(pBlock_bytes, pResults->m_partition, 6, &cur_bit_ofs);
+	// Endpoints are written channel by channel: for each channel, low then high of every subset. Alpha is only written for modes >= 4.
+	const uint32_t total_comps = (best_mode >= 4) ? 4 : 3;
+	for (uint32_t comp = 0; comp < total_comps; comp++)
+	{
+		for (uint32_t subset = 0; subset < total_subsets; subset++)
+		{
+			set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs);
+			set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs);
+		}
+	}
+	// P-bits: one per subset when the mode shares them, otherwise one per endpoint.
+	for (uint32_t subset = 0; subset < total_subsets; subset++)
+	{
+		set_block_bits(pBlock_bytes, pbits[subset][0], 1, &cur_bit_ofs);
+		if (!g_bc7_mode_has_shared_p_bits[best_mode])
+			set_block_bits(pBlock_bytes, pbits[subset][1], 1, &cur_bit_ofs);
+	}
+	// Selectors in pixel order; anchor pixels use one bit fewer (anchor[1] stays -1 when there is a single subset).
+	for (int idx = 0; idx < 16; idx++)
+	{
+		uint32_t n = get_bc7_color_index_size(best_mode, 0);
+		if ((idx == anchor[0]) || (idx == anchor[1]))
+			n--;
+		set_block_bits(pBlock_bytes, color_selectors[idx], n, &cur_bit_ofs);
+	}
+
+	assert(cur_bit_ofs == 128); // every mode must fill the block exactly
+}
+// Encodes a block that contains alpha: always BC7 mode 6, fitting all four channels of the 16 pixels as a single subset, then writes the result to pBlock.
+static void handle_alpha_block(void *pBlock, const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, color_cell_compressor_params *pParams)
+{
+	// pParams arrives from bc7enc16_compress_block() with only m_weights assigned, so every other field the compressor may read is set here.
+	pParams->m_pSelector_weights = g_bc7_weights4;
+	pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights4x;
+	pParams->m_num_selector_weights = 16;
+	pParams->m_comp_bits = 7;
+	pParams->m_has_pbits = BC7ENC16_TRUE;
+	pParams->m_endpoints_share_pbit = BC7ENC16_FALSE; // was left indeterminate on this path; same value handle_opaque_block() uses for mode 6
+	pParams->m_has_alpha = BC7ENC16_TRUE;
+	pParams->m_perceptual = pComp_params->m_perceptual;
+	pParams->m_num_pixels = 16;
+	pParams->m_pPixels = pPixels;
+
+	// The compressor writes its final selectors straight into opt_results; m_pSelectors_temp gets a separate scratch buffer.
+	bc7_optimization_results opt_results;
+	color_cell_compressor_results results6;
+	uint8_t selectors_temp[16];
+	results6.m_pSelectors = opt_results.m_selectors;
+	results6.m_pSelectors_temp = selectors_temp;
+	color_cell_compression(6, pParams, &results6, pComp_params);
+
+	opt_results.m_mode = 6;
+	opt_results.m_partition = 0;
+	opt_results.m_low[0] = results6.m_low_endpoint;
+	opt_results.m_high[0] = results6.m_high_endpoint;
+	opt_results.m_pbits[0][0] = results6.m_pbits[0];
+	opt_results.m_pbits[0][1] = results6.m_pbits[1];
+
+	encode_bc7_block(pBlock, &opt_results);
+}
+// Encodes a fully opaque block: mode 6 is always tried; if it was not exact and mode 1 is enabled, mode 1 is tried with an estimated partition, and the lower-error result is written to pBlock.
+static void handle_opaque_block(void *pBlock, const color_quad_u8 *pPixels, const bc7enc16_compress_block_params *pComp_params, color_cell_compressor_params *pParams)
+{
+	bc7_optimization_results opt_results;
+	color_cell_compressor_results results6;
+	uint8_t selectors_temp[16];
+
+	// Mode 6: a single subset covering all 16 pixels, alpha not fitted.
+	pParams->m_pPixels = pPixels;
+	pParams->m_num_pixels = 16;
+	pParams->m_perceptual = pComp_params->m_perceptual;
+	pParams->m_has_alpha = BC7ENC16_FALSE;
+	pParams->m_has_pbits = BC7ENC16_TRUE;
+	pParams->m_endpoints_share_pbit = BC7ENC16_FALSE;
+	pParams->m_comp_bits = 7;
+	pParams->m_num_selector_weights = 16;
+	pParams->m_pSelector_weights = g_bc7_weights4;
+	pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights4x;
+
+	results6.m_pSelectors = opt_results.m_selectors;
+	results6.m_pSelectors_temp = selectors_temp;
+	uint64_t best_err = color_cell_compression(6, pParams, &results6, pComp_params);
+
+	// Record the mode 6 result; it stands unless mode 1 beats it below.
+	opt_results.m_mode = 6;
+	opt_results.m_partition = 0;
+	opt_results.m_low[0] = results6.m_low_endpoint;
+	opt_results.m_high[0] = results6.m_high_endpoint;
+	opt_results.m_pbits[0][0] = results6.m_pbits[0];
+	opt_results.m_pbits[0][1] = results6.m_pbits[1];
+
+	// Mode 1: only attempted when mode 6 left some error and the caller allows partitions.
+	if ((best_err > 0) && (pComp_params->m_max_partitions_mode1 > 0))
+	{
+		const uint32_t part = estimate_partition(pPixels, pComp_params, pParams->m_weights);
+		const uint8_t *pPattern = &g_bc7_partition2[part * 16];
+
+		// Reconfigure the compressor for mode 1: 8 selector weights, 6-bit components, one p-bit shared by both endpoints.
+		pParams->m_has_pbits = BC7ENC16_TRUE;
+		pParams->m_endpoints_share_pbit = BC7ENC16_TRUE;
+		pParams->m_comp_bits = 6;
+		pParams->m_num_selector_weights = 8;
+		pParams->m_pSelector_weights = g_bc7_weights3;
+		pParams->m_pSelector_weightsx = (const vec4F *)g_bc7_weights3x;
+
+		// Gather each subset's pixels contiguously, remembering which block position each one came from.
+		color_quad_u8 px[2][16];
+		uint8_t src_index[2][16];
+		uint32_t count[2] = { 0, 0 };
+
+		for (uint32_t idx = 0; idx < 16; idx++)
+		{
+			const uint32_t s = pPattern[idx];
+			const uint32_t slot = count[s]++;
+			px[s][slot] = pPixels[idx];
+			src_index[s][slot] = (uint8_t)idx;
+		}
+
+		// Compress both subsets, stopping as soon as the running error exceeds what mode 6 achieved.
+		uint8_t sels[2][16];
+		color_cell_compressor_results res[2];
+		uint64_t trial_err = 0;
+
+		for (uint32_t s = 0; s < 2; s++)
+		{
+			pParams->m_pPixels = &px[s][0];
+			pParams->m_num_pixels = count[s];
+			res[s].m_pSelectors = &sels[s][0];
+			res[s].m_pSelectors_temp = selectors_temp;
+
+			const uint64_t err = color_cell_compression(1, pParams, &res[s], pComp_params);
+			trial_err += err;
+			if (trial_err > best_err)
+				break;
+		}
+
+		if (trial_err < best_err)
+		{
+			// Mode 1 wins: take its endpoints and scatter the per-subset selectors back into block order.
+			best_err = trial_err;
+			opt_results.m_mode = 1;
+			opt_results.m_partition = part;
+			for (uint32_t s = 0; s < 2; s++)
+			{
+				opt_results.m_low[s] = res[s].m_low_endpoint;
+				opt_results.m_high[s] = res[s].m_high_endpoint;
+				opt_results.m_pbits[s][0] = res[s].m_pbits[0];
+
+				for (uint32_t i = 0; i < count[s]; i++)
+					opt_results.m_selectors[src_index[s][i]] = sels[s][i];
+			}
+		}
+	}
+
+	encode_bc7_block(pBlock, &opt_results);
+}
+// Public entry point: compresses the 16 RGBA pixels at pPixelsRGBA into one BC7 block at pBlock. Returns BC7ENC16_TRUE if any pixel had alpha < 255 (alpha path), otherwise BC7ENC16_FALSE (opaque path).
+bc7enc16_bool bc7enc16_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc16_compress_block_params *pComp_params)
+{
+	// Presumably trips when bc7enc16_compress_block_init() has not yet filled the mode 1 single-color table - TODO confirm against the init code.
+	assert(g_bc7_mode_1_optimal_endpoints[255][0].m_hi != 0);
+	const color_quad_u8 *pPixels = (const color_quad_u8 *)(pPixelsRGBA);
+	color_cell_compressor_params params;
+
+	if (!pComp_params->m_perceptual)
+		memcpy(params.m_weights, pComp_params->m_weights, sizeof(params.m_weights));
+	else
+	{
+		// Perceptual mode: all weights are scaled by 4, the two chroma weights additionally by squared factors from https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
+		const float pr_weight = (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f));
+		const float pb_weight = (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f));
+		params.m_weights[0] = (int)(pComp_params->m_weights[0] * 4.0f);
+		params.m_weights[1] = (int)(pComp_params->m_weights[1] * 4.0f * pr_weight);
+		params.m_weights[2] = (int)(pComp_params->m_weights[2] * 4.0f * pb_weight);
+		params.m_weights[3] = pComp_params->m_weights[3] * 4;
+	}
+
+	// A single non-opaque pixel sends the whole block down the alpha path; the scan stops at the first one found.
+	bc7enc16_bool has_alpha = BC7ENC16_FALSE;
+	for (uint32_t i = 0; (i < 16) && !has_alpha; i++)
+		has_alpha = (pPixels[i].m_c[3] < 255);
+
+	if (has_alpha)
+		handle_alpha_block(pBlock, pPixels, pComp_params, &params);
+	else
+		handle_opaque_block(pBlock, pPixels, pComp_params, &params);
+	return has_alpha;
+}
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright(c) 2018 Richard Geldreich, Jr.
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files(the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions :
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain(www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non - commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain.We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors.We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/bc7enc16.h b/bc7enc16.h
new file mode 100644
index 0000000..6383c69
--- /dev/null
+++ b/bc7enc16.h
@@ -0,0 +1,62 @@
+// File: bc7enc16.h - Richard Geldreich, Jr. - MIT license or public domain (see end of bc7enc16.c)
+#pragma once
+#include <stdlib.h>
+#include <stdint.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BC7ENC16_BLOCK_SIZE (16) // size in bytes of one encoded block
+#define BC7ENC16_MAX_PARTITIONS1 (64) // upper bound applied to m_max_partitions_mode1
+#define BC7ENC16_MAX_UBER_LEVEL (4) // presumably the highest useful m_uber_level
+
+typedef uint8_t bc7enc16_bool;
+#define BC7ENC16_TRUE (1)
+#define BC7ENC16_FALSE (0)
+
+typedef struct
+{
+	uint32_t m_max_partitions_mode1; // how many mode 1 partition patterns to evaluate (clamped to BC7ENC16_MAX_PARTITIONS1); 0 disables mode 1
+	uint32_t m_weights[4]; // error weights: [0..2] apply to R/G/B, or to luma/Cr/Cb when m_perceptual is set; [3] is presumably alpha
+	uint32_t m_uber_level; // 0 = no extra endpoint search; higher values try progressively more refinements
+	bc7enc16_bool m_perceptual; // measure error in a luma/chroma space instead of plain RGB
+	bc7enc16_bool m_try_least_squares; // refine the first solution with a least-squares endpoint pass
+} bc7enc16_compress_block_params;
+// Selects plain RGBA error with equal weights. Only touches m_perceptual and m_weights. (static inline so the header also links cleanly from C.)
+static inline void bc7enc16_compress_block_params_init_linear_weights(bc7enc16_compress_block_params *p)
+{
+	p->m_perceptual = BC7ENC16_FALSE;
+	p->m_weights[0] = 1;
+	p->m_weights[1] = 1;
+	p->m_weights[2] = 1;
+	p->m_weights[3] = 1;
+}
+// Selects perceptual error with the default 128/64/16/32 weights. Only touches m_perceptual and m_weights.
+static inline void bc7enc16_compress_block_params_init_perceptual_weights(bc7enc16_compress_block_params *p)
+{
+	p->m_perceptual = BC7ENC16_TRUE;
+	p->m_weights[0] = 128;
+	p->m_weights[1] = 64;
+	p->m_weights[2] = 16;
+	p->m_weights[3] = 32;
+}
+// Fills every field with the defaults: all mode 1 partitions, least squares on, uber level 0, perceptual weights.
+static inline void bc7enc16_compress_block_params_init(bc7enc16_compress_block_params *p)
+{
+	p->m_max_partitions_mode1 = BC7ENC16_MAX_PARTITIONS1;
+	p->m_try_least_squares = BC7ENC16_TRUE;
+	p->m_uber_level = 0;
+	bc7enc16_compress_block_params_init_perceptual_weights(p);
+}
+
+// bc7enc16_compress_block_init() MUST be called before calling bc7enc16_compress_block() (or you'll get artifacts).
+void bc7enc16_compress_block_init(void);
+
+// Packs a single block of 16 RGBA pixels (one 4x4 block, R first in memory) to 128-bit BC7 block pBlock, using either mode 1 and/or 6.
+// Alpha blocks will always use mode 6, and by default opaque blocks will use either modes 1 or 6.
+// Returns BC7ENC16_TRUE if the block had any pixels with alpha < 255, otherwise it returns BC7ENC16_FALSE. (This is not an error code - a block is always encoded.)
+bc7enc16_bool bc7enc16_compress_block(void *pBlock, const void *pPixelsRGBA, const bc7enc16_compress_block_params *pComp_params);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/dds_defs.h b/dds_defs.h
new file mode 100644
index 0000000..0c6d164
--- /dev/null
+++ b/dds_defs.h
@@ -0,0 +1,286 @@
+// File: dds_defs.h
+// DX9 .DDS file header definitions.
+#pragma once
+// Packs four character codes into one 32-bit value with 'a' in the lowest byte (the same byte order as cDDSFileSignature, "DDS " == 0x20534444).
+#define PIXEL_FMT_FOURCC(a, b, c, d) ((a) | ((b) << 8U) | ((c) << 16U) | ((d) << 24U))
+
+enum pixel_format
+{
+	PIXEL_FMT_INVALID = 0,
+
+	PIXEL_FMT_DXT1 = PIXEL_FMT_FOURCC('D', 'X', 'T', '1'),
+	PIXEL_FMT_DXT2 = PIXEL_FMT_FOURCC('D', 'X', 'T', '2'),
+	PIXEL_FMT_DXT3 = PIXEL_FMT_FOURCC('D', 'X', 'T', '3'),
+	PIXEL_FMT_DXT4 = PIXEL_FMT_FOURCC('D', 'X', 'T', '4'),
+	PIXEL_FMT_DXT5 = PIXEL_FMT_FOURCC('D', 'X', 'T', '5'),
+	PIXEL_FMT_3DC = PIXEL_FMT_FOURCC('A', 'T', 'I', '2'), // DXN_YX
+	PIXEL_FMT_DXN = PIXEL_FMT_FOURCC('A', '2', 'X', 'Y'), // DXN_XY
+	PIXEL_FMT_DXT5A = PIXEL_FMT_FOURCC('A', 'T', 'I', '1'), // ATI1N, http://developer.amd.com/media/gpu_assets/Radeon_X1x00_Programming_Guide.pdf
+
+	// Non-standard formats (some of these are supported by ATI's Compressonator)
+	PIXEL_FMT_DXT5_CCxY = PIXEL_FMT_FOURCC('C', 'C', 'x', 'Y'),
+	PIXEL_FMT_DXT5_xGxR = PIXEL_FMT_FOURCC('x', 'G', 'x', 'R'),
+	PIXEL_FMT_DXT5_xGBR = PIXEL_FMT_FOURCC('x', 'G', 'B', 'R'),
+	PIXEL_FMT_DXT5_AGBR = PIXEL_FMT_FOURCC('A', 'G', 'B', 'R'),
+
+	PIXEL_FMT_DXT1A = PIXEL_FMT_FOURCC('D', 'X', '1', 'A'),
+	PIXEL_FMT_ETC1 = PIXEL_FMT_FOURCC('E', 'T', 'C', '1'),
+
+	PIXEL_FMT_R8G8B8 = PIXEL_FMT_FOURCC('R', 'G', 'B', 'x'),
+	PIXEL_FMT_L8 = PIXEL_FMT_FOURCC('L', 'x', 'x', 'x'),
+	PIXEL_FMT_A8 = PIXEL_FMT_FOURCC('x', 'x', 'x', 'A'),
+	PIXEL_FMT_A8L8 = PIXEL_FMT_FOURCC('L', 'x', 'x', 'A'),
+	PIXEL_FMT_A8R8G8B8 = PIXEL_FMT_FOURCC('R', 'G', 'B', 'A')
+};
+
+const uint32_t cDDSMaxImageDimensions = 8192U;
+
+// Total size of header is sizeof(uint32)+cDDSSizeofDDSurfaceDesc2;
+const uint32_t cDDSSizeofDDSurfaceDesc2 = 124;
+
+// "DDS "
+const uint32_t cDDSFileSignature = 0x20534444;
+
+struct DDCOLORKEY
+{
+	uint32_t dwUnused0;
+	uint32_t dwUnused1;
+};
+
+struct DDPIXELFORMAT
+{
+	uint32_t dwSize;
+	uint32_t dwFlags;
+	uint32_t dwFourCC;
+	uint32_t dwRGBBitCount;     // ATI compressonator will place a FOURCC code here for swizzled/cooked DXTn formats
+	uint32_t dwRBitMask;
+	uint32_t dwGBitMask;
+	uint32_t dwBBitMask;
+	uint32_t dwRGBAlphaBitMask;
+};
+
+struct DDSCAPS2
+{
+	uint32_t dwCaps;
+	uint32_t dwCaps2;
+	uint32_t dwCaps3;
+	uint32_t dwCaps4;
+};
+// DDS surface description. Its members add up to 124 bytes when laid out without padding, which is the value of cDDSSizeofDDSurfaceDesc2.
+struct DDSURFACEDESC2
+{
+	uint32_t dwSize;
+	uint32_t dwFlags;
+	uint32_t dwHeight;
+	uint32_t dwWidth;
+	union // the two members share the same 4 bytes
+	{
+		int32_t lPitch;
+		uint32_t dwLinearSize;
+	};
+	uint32_t dwBackBufferCount;
+	uint32_t dwMipMapCount;
+	uint32_t dwAlphaBitDepth;
+	uint32_t dwUnused0;
+	uint32_t lpSurface;
+	DDCOLORKEY unused0; // four 8-byte placeholders
+	DDCOLORKEY unused1;
+	DDCOLORKEY unused2;
+	DDCOLORKEY unused3;
+	DDPIXELFORMAT ddpfPixelFormat;
+	DDSCAPS2 ddsCaps;
+	uint32_t dwUnused1;
+};
+
+const uint32_t DDSD_CAPS = 0x00000001;
+const uint32_t DDSD_HEIGHT = 0x00000002;
+const uint32_t DDSD_WIDTH = 0x00000004;
+const uint32_t DDSD_PITCH = 0x00000008;
+
+const uint32_t DDSD_BACKBUFFERCOUNT = 0x00000020;
+const uint32_t DDSD_ZBUFFERBITDEPTH = 0x00000040;
+const uint32_t DDSD_ALPHABITDEPTH = 0x00000080;
+
+const uint32_t DDSD_LPSURFACE = 0x00000800;
+
+const uint32_t DDSD_PIXELFORMAT = 0x00001000;
+const uint32_t DDSD_CKDESTOVERLAY = 0x00002000;
+const uint32_t DDSD_CKDESTBLT = 0x00004000;
+const uint32_t DDSD_CKSRCOVERLAY = 0x00008000;
+
+const uint32_t DDSD_CKSRCBLT = 0x00010000;
+const uint32_t DDSD_MIPMAPCOUNT = 0x00020000;
+const uint32_t DDSD_REFRESHRATE = 0x00040000;
+const uint32_t DDSD_LINEARSIZE = 0x00080000;
+
+const uint32_t DDSD_TEXTURESTAGE = 0x00100000;
+const uint32_t DDSD_FVF = 0x00200000;
+const uint32_t DDSD_SRCVBHANDLE = 0x00400000;
+const uint32_t DDSD_DEPTH = 0x00800000;
+
+const uint32_t DDSD_ALL = 0x00fff9ee;
+
+const uint32_t DDPF_ALPHAPIXELS = 0x00000001;
+const uint32_t DDPF_ALPHA = 0x00000002;
+const uint32_t DDPF_FOURCC = 0x00000004;
+const uint32_t DDPF_PALETTEINDEXED8 = 0x00000020;
+const uint32_t DDPF_RGB = 0x00000040;
+const uint32_t DDPF_LUMINANCE = 0x00020000;
+
+const uint32_t DDSCAPS_COMPLEX = 0x00000008;
+const uint32_t DDSCAPS_TEXTURE = 0x00001000;
+const uint32_t DDSCAPS_MIPMAP = 0x00400000;
+
+const uint32_t DDSCAPS2_CUBEMAP = 0x00000200;
+const uint32_t DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400;
+const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800;
+
+const uint32_t DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000;
+const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000;
+const uint32_t DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000;
+const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000;
+
+const uint32_t DDSCAPS2_VOLUME = 0x00200000;
+
+typedef enum DXGI_FORMAT 
+{
+	DXGI_FORMAT_UNKNOWN = 0,
+	DXGI_FORMAT_R32G32B32A32_TYPELESS = 1,
+	DXGI_FORMAT_R32G32B32A32_FLOAT = 2,
+	DXGI_FORMAT_R32G32B32A32_UINT = 3,
+	DXGI_FORMAT_R32G32B32A32_SINT = 4,
+	DXGI_FORMAT_R32G32B32_TYPELESS = 5,
+	DXGI_FORMAT_R32G32B32_FLOAT = 6,
+	DXGI_FORMAT_R32G32B32_UINT = 7,
+	DXGI_FORMAT_R32G32B32_SINT = 8,
+	DXGI_FORMAT_R16G16B16A16_TYPELESS = 9,
+	DXGI_FORMAT_R16G16B16A16_FLOAT = 10,
+	DXGI_FORMAT_R16G16B16A16_UNORM = 11,
+	DXGI_FORMAT_R16G16B16A16_UINT = 12,
+	DXGI_FORMAT_R16G16B16A16_SNORM = 13,
+	DXGI_FORMAT_R16G16B16A16_SINT = 14,
+	DXGI_FORMAT_R32G32_TYPELESS = 15,
+	DXGI_FORMAT_R32G32_FLOAT = 16,
+	DXGI_FORMAT_R32G32_UINT = 17,
+	DXGI_FORMAT_R32G32_SINT = 18,
+	DXGI_FORMAT_R32G8X24_TYPELESS = 19,
+	DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20,
+	DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21,
+	DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22,
+	DXGI_FORMAT_R10G10B10A2_TYPELESS = 23,
+	DXGI_FORMAT_R10G10B10A2_UNORM = 24,
+	DXGI_FORMAT_R10G10B10A2_UINT = 25,
+	DXGI_FORMAT_R11G11B10_FLOAT = 26,
+	DXGI_FORMAT_R8G8B8A8_TYPELESS = 27,
+	DXGI_FORMAT_R8G8B8A8_UNORM = 28,
+	DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29,
+	DXGI_FORMAT_R8G8B8A8_UINT = 30,
+	DXGI_FORMAT_R8G8B8A8_SNORM = 31,
+	DXGI_FORMAT_R8G8B8A8_SINT = 32,
+	DXGI_FORMAT_R16G16_TYPELESS = 33,
+	DXGI_FORMAT_R16G16_FLOAT = 34,
+	DXGI_FORMAT_R16G16_UNORM = 35,
+	DXGI_FORMAT_R16G16_UINT = 36,
+	DXGI_FORMAT_R16G16_SNORM = 37,
+	DXGI_FORMAT_R16G16_SINT = 38,
+	DXGI_FORMAT_R32_TYPELESS = 39,
+	DXGI_FORMAT_D32_FLOAT = 40,
+	DXGI_FORMAT_R32_FLOAT = 41,
+	DXGI_FORMAT_R32_UINT = 42,
+	DXGI_FORMAT_R32_SINT = 43,
+	DXGI_FORMAT_R24G8_TYPELESS = 44,
+	DXGI_FORMAT_D24_UNORM_S8_UINT = 45,
+	DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46,
+	DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47,
+	DXGI_FORMAT_R8G8_TYPELESS = 48,
+	DXGI_FORMAT_R8G8_UNORM = 49,
+	DXGI_FORMAT_R8G8_UINT = 50,
+	DXGI_FORMAT_R8G8_SNORM = 51,
+	DXGI_FORMAT_R8G8_SINT = 52,
+	DXGI_FORMAT_R16_TYPELESS = 53,
+	DXGI_FORMAT_R16_FLOAT = 54,
+	DXGI_FORMAT_D16_UNORM = 55,
+	DXGI_FORMAT_R16_UNORM = 56,
+	DXGI_FORMAT_R16_UINT = 57,
+	DXGI_FORMAT_R16_SNORM = 58,
+	DXGI_FORMAT_R16_SINT = 59,
+	DXGI_FORMAT_R8_TYPELESS = 60,
+	DXGI_FORMAT_R8_UNORM = 61,
+	DXGI_FORMAT_R8_UINT = 62,
+	DXGI_FORMAT_R8_SNORM = 63,
+	DXGI_FORMAT_R8_SINT = 64,
+	DXGI_FORMAT_A8_UNORM = 65,
+	DXGI_FORMAT_R1_UNORM = 66,
+	DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67,
+	DXGI_FORMAT_R8G8_B8G8_UNORM = 68,
+	DXGI_FORMAT_G8R8_G8B8_UNORM = 69,
+	DXGI_FORMAT_BC1_TYPELESS = 70,
+	DXGI_FORMAT_BC1_UNORM = 71,
+	DXGI_FORMAT_BC1_UNORM_SRGB = 72,
+	DXGI_FORMAT_BC2_TYPELESS = 73,
+	DXGI_FORMAT_BC2_UNORM = 74,
+	DXGI_FORMAT_BC2_UNORM_SRGB = 75,
+	DXGI_FORMAT_BC3_TYPELESS = 76,
+	DXGI_FORMAT_BC3_UNORM = 77,
+	DXGI_FORMAT_BC3_UNORM_SRGB = 78,
+	DXGI_FORMAT_BC4_TYPELESS = 79,
+	DXGI_FORMAT_BC4_UNORM = 80,
+	DXGI_FORMAT_BC4_SNORM = 81,
+	DXGI_FORMAT_BC5_TYPELESS = 82,
+	DXGI_FORMAT_BC5_UNORM = 83,
+	DXGI_FORMAT_BC5_SNORM = 84,
+	DXGI_FORMAT_B5G6R5_UNORM = 85,
+	DXGI_FORMAT_B5G5R5A1_UNORM = 86,
+	DXGI_FORMAT_B8G8R8A8_UNORM = 87,
+	DXGI_FORMAT_B8G8R8X8_UNORM = 88,
+	DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM = 89,
+	DXGI_FORMAT_B8G8R8A8_TYPELESS = 90,
+	DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91,
+	DXGI_FORMAT_B8G8R8X8_TYPELESS = 92,
+	DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93,
+	DXGI_FORMAT_BC6H_TYPELESS = 94,
+	DXGI_FORMAT_BC6H_UF16 = 95,
+	DXGI_FORMAT_BC6H_SF16 = 96,
+	DXGI_FORMAT_BC7_TYPELESS = 97,
+	DXGI_FORMAT_BC7_UNORM = 98,
+	DXGI_FORMAT_BC7_UNORM_SRGB = 99,
+	DXGI_FORMAT_AYUV = 100,
+	DXGI_FORMAT_Y410 = 101,
+	DXGI_FORMAT_Y416 = 102,
+	DXGI_FORMAT_NV12 = 103,
+	DXGI_FORMAT_P010 = 104,
+	DXGI_FORMAT_P016 = 105,
+	DXGI_FORMAT_420_OPAQUE = 106,
+	DXGI_FORMAT_YUY2 = 107,
+	DXGI_FORMAT_Y210 = 108,
+	DXGI_FORMAT_Y216 = 109,
+	DXGI_FORMAT_NV11 = 110,
+	DXGI_FORMAT_AI44 = 111,
+	DXGI_FORMAT_IA44 = 112,
+	DXGI_FORMAT_P8 = 113,
+	DXGI_FORMAT_A8P8 = 114,
+	DXGI_FORMAT_B4G4R4A4_UNORM = 115,
+	DXGI_FORMAT_P208 = 130,
+	DXGI_FORMAT_V208 = 131,
+	DXGI_FORMAT_V408 = 132,
+	DXGI_FORMAT_FORCE_UINT = 0xffffffff
+} DXGI_FORMAT;
+
+enum D3D10_RESOURCE_DIMENSION 
+{
+	D3D10_RESOURCE_DIMENSION_UNKNOWN = 0,
+	D3D10_RESOURCE_DIMENSION_BUFFER = 1,
+	D3D10_RESOURCE_DIMENSION_TEXTURE1D = 2,
+	D3D10_RESOURCE_DIMENSION_TEXTURE2D = 3,
+	D3D10_RESOURCE_DIMENSION_TEXTURE3D = 4
+};
+
+struct DDS_HEADER_DXT10
+{
+	DXGI_FORMAT              dxgiFormat;
+	D3D10_RESOURCE_DIMENSION resourceDimension;
+	uint32_t                 miscFlag;
+	uint32_t                 arraySize;
+	uint32_t                 miscFlags2;
+};
+
diff --git a/lodepng.cpp b/lodepng.cpp
new file mode 100644
index 0000000..742766f
--- /dev/null
+++ b/lodepng.cpp
@@ -0,0 +1,6232 @@
+/*
+LodePNG version 20161127
+
+Copyright (c) 2005-2016 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+	 1. The origin of this software must not be misrepresented; you must not
+	 claim that you wrote the original software. If you use this software
+	 in a product, an acknowledgment in the product documentation would be
+	 appreciated but is not required.
+
+	 2. Altered source versions must be plainly marked as such, and must not be
+	 misrepresented as being the original software.
+
+	 3. This notice may not be removed or altered from any source
+	 distribution.
+*/
+
+/*
+The manual and changelog are in the header file "lodepng.h"
+Rename this file to lodepng.cpp to use it for C++, or to lodepng.c to use it for C.
+*/
+
+#include "lodepng.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1310) /*Visual Studio: A few warning types are not desired here.*/
+#pragma warning( disable : 4244 ) /*implicit conversions: not warned by gcc -Wall -Wextra and requires too much casts*/
+#pragma warning( disable : 4996 ) /*VS does not like fopen, but fopen_s is not standard C so unusable here*/
+#endif /*_MSC_VER */
+
+const char* LODEPNG_VERSION_STRING = "20161127";
+
+/*
+This source file is built up in the following large parts. The code sections
+with the "LODEPNG_COMPILE_" #defines divide this up further in an intermixed way.
+-Tools for C and common code for PNG and Zlib
+-C Code for Zlib (huffman, deflate, ...)
+-C Code for PNG (file format chunks, adam7, PNG filters, color conversions, ...)
+-The C++ wrapper around all of the above
+*/
+
+/*The malloc, realloc and free functions defined here with "lodepng_" in front
+of the name, so that you can easily change them to others related to your
+platform if needed. Everything else in the code calls these. Pass
+-DLODEPNG_NO_COMPILE_ALLOCATORS to the compiler, or comment out
+#define LODEPNG_COMPILE_ALLOCATORS in the header, to disable the ones here and
+define them in your own project's source files without needing to change
+lodepng source code. Don't forget to remove "static" if you copypaste them
+from here.*/
+
+#ifdef LODEPNG_COMPILE_ALLOCATORS
+/*Default allocator hooks: each one forwards to the matching C library function.*/
+static void* lodepng_malloc(size_t size) {
+	void* block = malloc(size);
+	return block;
+}
+
+static void* lodepng_realloc(void* ptr, size_t new_size) {
+	void* block = realloc(ptr, new_size);
+	return block;
+}
+
+static void lodepng_free(void* ptr) {
+	free(ptr);
+}
+#else /*LODEPNG_COMPILE_ALLOCATORS: the hooks are defined in another source file, only declare them*/
+void* lodepng_malloc(size_t size);
+void* lodepng_realloc(void* ptr, size_t new_size);
+void lodepng_free(void* ptr);
+#endif /*LODEPNG_COMPILE_ALLOCATORS*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // Tools for C, and common code for PNG and Zlib.                       // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Often in case of an error a value is assigned to a variable and then it breaks
+out of a loop (to go to the cleanup phase of a function). This macro does that.
+It expands to a plain brace block holding the break, so it has to be used directly
+inside the loop (or switch) that should be left.
+Example: if(!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83);
+*/
+#define CERROR_BREAK(errorvar, code)\
+{\
+  errorvar = code;\
+  break;\
+}
+
+/*version of CERROR_BREAK that assumes the common case where the error variable is named "error"*/
+#define ERROR_BREAK(code) CERROR_BREAK(error, code)
+
+/*Set error var to the error code, and return it. Note that "code" is expanded twice, so it should have no side effects.*/
+#define CERROR_RETURN_ERROR(errorvar, code)\
+{\
+  errorvar = code;\
+  return code;\
+}
+
+/*Try the code, if it returns error, also return the error. The result is kept in a local "error" that only exists inside the macro's block.*/
+#define CERROR_TRY_RETURN(call)\
+{\
+  unsigned error = call;\
+  if(error) return error;\
+}
+
+/*Set error var to the error code, and return from the void function.*/
+#define CERROR_RETURN(errorvar, code)\
+{\
+  errorvar = code;\
+  return;\
+}
+
+/*
+About uivector, ucvector and string:
+-All of them wrap dynamic arrays or text strings in a similar way.
+-LodePNG was originally written in C++. The vectors replace the std::vectors that were used in the C++ version.
+-The string tools are made to avoid problems with compilers that declare things like strncat as deprecated.
+-They're not used in the interface, only internally in this file as static functions.
+-As with many other structs in this file, the init and cleanup functions serve as ctor and dtor.
+*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*dynamic vector of unsigned ints; set up with uivector_init, released with uivector_cleanup*/
+typedef struct uivector
+{
+	unsigned* data; /*element buffer obtained from lodepng_realloc, NULL while nothing is allocated*/
+	size_t size; /*size in number of elements (unsigned values, not bytes)*/
+	size_t allocsize; /*allocated size in bytes*/
+} uivector;
+
+static void uivector_cleanup(void* p) { /*p must point to a uivector; frees its buffer and empties it*/
+	uivector* vec = (uivector*)p;
+	vec->allocsize = vec->size = 0;
+	lodepng_free(vec->data);
+	vec->data = NULL;
+}
+
+/*Ensures at least allocsize bytes are allocated (with slack for modest growth).
+Returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_reserve(uivector* p, size_t allocsize) {
+	size_t grown;
+	void* moved;
+
+	if (allocsize <= p->allocsize) return 1; /*already large enough*/
+
+	if (allocsize > p->allocsize * 2) grown = allocsize;
+	else grown = allocsize * 3 / 2;
+	moved = lodepng_realloc(p->data, grown);
+	if (!moved) return 0; /*error: not enough memory*/
+	p->allocsize = grown;
+	p->data = (unsigned*)moved;
+	return 1;
+}
+
+/*Sets the element count to size, growing the buffer if needed. Returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_resize(uivector* p, size_t size) {
+	size_t bytes = size * sizeof(unsigned);
+	if (uivector_reserve(p, bytes) == 0) return 0;
+	p->size = size;
+	return 1; /*success*/
+}
+
+/*Resizes to size elements and sets every element beyond the old size to value. Returns 1 if success, 0 if failure.*/
+static unsigned uivector_resizev(uivector* p, size_t size, unsigned value) {
+	size_t pos = p->size; /*first index that is new after the resize*/
+	if (!uivector_resize(p, size)) return 0;
+	for (; pos < size; ++pos)
+		p->data[pos] = value;
+	return 1;
+}
+
+/*Puts p in the empty state: no buffer, zero size, zero allocated bytes.*/
+static void uivector_init(uivector* p) {
+	p->allocsize = p->size = 0;
+	p->data = NULL;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends c at the end. Returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_push_back(uivector* p, unsigned c) {
+	size_t slot = p->size; /*index the new element will occupy*/
+	if (!uivector_resize(p, slot + 1)) return 0;
+	p->data[slot] = c;
+	return 1;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*dynamic vector of unsigned chars (bytes)*/
+typedef struct ucvector
+{
+	unsigned char* data; /*byte buffer: grown through lodepng_realloc, or an existing buffer adopted by ucvector_init_buffer*/
+	size_t size; /*used size in bytes*/
+	size_t allocsize; /*allocated size in bytes*/
+} ucvector;
+
+/*Ensures at least allocsize bytes are allocated (with slack for modest growth).
+Returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_reserve(ucvector* p, size_t allocsize) {
+	size_t grown;
+	void* moved;
+
+	if (allocsize <= p->allocsize) return 1; /*already large enough*/
+
+	if (allocsize > p->allocsize * 2) grown = allocsize;
+	else grown = allocsize * 3 / 2;
+	moved = lodepng_realloc(p->data, grown);
+	if (!moved) return 0; /*error: not enough memory*/
+	p->allocsize = grown;
+	p->data = (unsigned char*)moved;
+	return 1;
+}
+
+/*Sets the used size to size bytes, growing the buffer if needed. Returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_resize(ucvector* p, size_t size) {
+	size_t bytes = size * sizeof(unsigned char);
+	if (ucvector_reserve(p, bytes) == 0) return 0;
+	p->size = size;
+	return 1; /*success*/
+}
+
+#ifdef LODEPNG_COMPILE_PNG
+
+static void ucvector_cleanup(void* p) { /*p must point to a ucvector; frees its buffer and empties it*/
+	ucvector* vec = (ucvector*)p;
+	vec->allocsize = vec->size = 0;
+	lodepng_free(vec->data);
+	vec->data = NULL;
+}
+
+/*Puts p in the empty state: no buffer, zero size, zero allocated bytes.*/
+static void ucvector_init(ucvector* p) {
+	p->allocsize = p->size = 0;
+	p->data = NULL;
+}
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*Wraps an existing buffer&size pair in p; you can convert from vector to buffer&size and vice versa.
+If you use init_buffer to take over a buffer and size, it is not needed to use cleanup*/
+static void ucvector_init_buffer(ucvector* p, unsigned char* buffer, size_t size) {
+	p->size = size;
+	p->allocsize = size;
+	p->data = buffer;
+}
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#if (defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)
+/*Appends the byte c at the end. Returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_push_back(ucvector* p, unsigned char c) {
+	size_t slot = p->size; /*index the new byte will occupy*/
+	if (!ucvector_resize(p, slot + 1)) return 0;
+	p->data[slot] = c;
+	return 1;
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+Reallocates *out so it can hold size characters plus the terminator, and writes
+the terminator at index size. Returns 1 if success, 0 if failure ==> nothing done
+*/
+static unsigned string_resize(char** out, size_t size) {
+	char* grown = (char*)lodepng_realloc(*out, size + 1);
+	if (!grown) return 0;
+	grown[size] = 0; /*null termination char*/
+	*out = grown;
+	return 1;
+}
+
+/*Initializes *out as an empty string (just the terminator). The result of
+string_resize is not checked: if that allocation fails, *out stays NULL.*/
+static void string_init(char** out) {
+	*out = NULL;
+	(void)string_resize(out, 0);
+}
+
+/*Frees the string that *out points to and resets *out to NULL.*/
+static void string_cleanup(char** out) {
+	char* text = *out;
+	*out = NULL;
+	lodepng_free(text);
+}
+
+/*Replaces the contents of *out with a copy of the null terminated string in.
+If the reallocation fails, *out is left as it was.*/
+static void string_set(char** out, const char* in) {
+	size_t len = strlen(in);
+	size_t pos;
+	if (!string_resize(out, len)) return;
+
+	/*string_resize already wrote the terminator at index len*/
+	for (pos = 0; pos != len; ++pos)
+		(*out)[pos] = in[pos];
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Reads a big-endian 32-bit value from buffer[0..3]. Each byte is widened to unsigned before shifting: shifting the promoted int left by 24 is undefined when bit 7 is set.*/
+unsigned lodepng_read32bitInt(const unsigned char* buffer) {
+	return ((unsigned)buffer[0] << 24) | ((unsigned)buffer[1] << 16) | ((unsigned)buffer[2] << 8) | (unsigned)buffer[3];
+}
+
+#if defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)
+/*Stores value as 4 big-endian bytes; buffer must have at least 4 allocated bytes available*/
+static void lodepng_set32bitInt(unsigned char* buffer, unsigned value) {
+	unsigned k;
+	for (k = 0; k != 4; ++k) {
+		/*byte 0 receives the most significant 8 bits, byte 3 the least significant*/
+		buffer[k] = (unsigned char)((value >> (24 - 8 * k)) & 0xff);
+	}
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends value as 4 big-endian bytes. If the vector can't grow, nothing is written (todo: give error if resize failed).*/
+static void lodepng_add32bitInt(ucvector* buffer, unsigned value) {
+	if (!ucvector_resize(buffer, buffer->size + 4)) return; /*size is unchanged on failure, so writing at size - 4 would be out of bounds*/
+	lodepng_set32bitInt(&buffer->data[buffer->size - 4], value);
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / File IO                                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DISK
+
+/*
+Determines the size of the file in bytes by seeking to its end. Returns a
+negative value on error. This should be pure C compatible, so no fstat.
+*/
+static long lodepng_filesize(const char* filename) {
+	long result = -1;
+	FILE* fp = fopen(filename, "rb");
+	if (fp == NULL) return result;
+
+	/*when the seek fails, result keeps its error value*/
+	if (fseek(fp, 0, SEEK_END) == 0) {
+		result = ftell(fp);
+		/* It may give LONG_MAX as directory size, this is invalid for us. */
+		if (result == LONG_MAX) {
+			result = -1;
+		}
+	}
+
+	fclose(fp);
+	return result;
+}
+
+/*
+Reads exactly size bytes of the file into out, which must already have that
+much room. Returns 0 on success, or error code 78 if the file can't be opened
+or yields a different number of bytes than requested.
+*/
+static unsigned lodepng_buffer_file(unsigned char* out, size_t size, const char* filename) {
+	size_t got;
+	FILE* fp = fopen(filename, "rb");
+	if (fp == NULL) return 78;
+
+	got = fread(out, 1, size, fp);
+	fclose(fp);
+	return (got == size) ? 0 : 78;
+}
+
+/*Loads the whole file into a newly allocated buffer stored in *out, with its size in *outsize.
+Returns 0 on success, 78 if the file can't be sized or read, 83 if the allocation fails.*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename) {
+	size_t bytes;
+	long measured = lodepng_filesize(filename);
+	if (measured < 0) return 78;
+	*outsize = bytes = (size_t)measured;
+	*out = (unsigned char*)lodepng_malloc(bytes);
+	if (bytes != 0 && *out == NULL) return 83; /*the above malloc failed*/
+	return lodepng_buffer_file(*out, bytes, filename);
+}
+
+/*Writes buffer to the file, replacing its contents (no append). Returns 0 on success, 79 if the file can't be opened or not everything could be written.*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename) {
+	size_t written;
+	int closefail;
+	FILE* file = fopen(filename, "wb");
+	if (!file) return 79;
+	written = fwrite(buffer, 1, buffersize, file);
+	closefail = fclose(file); /*buffered data is flushed here, so a write error may only show up at close*/
+	return (written != buffersize || closefail != 0) ? 79 : 0;
+}
+
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of common code and tools. Begin of Zlib related code.            // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends one bit to the stream and advances *bitpointer; bitpointer and bitstream are expanded more than once. TODO: this ignores potential out of memory errors*/
+#define addBitToStream(/*size_t**/ bitpointer, /*ucvector**/ bitstream, /*unsigned char*/ bit)\
+{\
+  /*add a new byte at the end*/\
+  if(((*bitpointer) & 7) == 0) ucvector_push_back(bitstream, (unsigned char)0);\
+  /*earlier bit of huffman code is in a lesser significant bit of an earlier byte*/\
+  (bitstream->data[bitstream->size - 1]) |= (bit << ((*bitpointer) & 0x7));\
+  ++(*bitpointer);\
+}
+
+/*Writes the nbits lowest bits of value to the stream, starting with the least significant one.*/
+static void addBitsToStream(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits) {
+	size_t done;
+	for (done = 0; done != nbits; ++done) addBitToStream(bitpointer, bitstream, (unsigned char)((value >> done) & 1));
+}
+
+/*Writes the nbits lowest bits of value to the stream, starting with the most significant of them.*/
+static void addBitsToStreamReversed(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits) {
+	size_t left;
+	for (left = nbits; left != 0; --left) addBitToStream(bitpointer, bitstream, (unsigned char)((value >> (left - 1)) & 1));
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+#define READBIT(bitpointer, bitstream) ((bitstream[bitpointer >> 3] >> (bitpointer & 0x7)) & (unsigned char)1) /*bit number bitpointer of bitstream, bit 0 being the least significant bit of byte 0; the arguments are expanded without parentheses*/
+
+/*Returns the bit at position *bitpointer of bitstream and moves *bitpointer to the next bit.*/
+static unsigned char readBitFromStream(size_t* bitpointer, const unsigned char* bitstream) {
+	size_t pos = *bitpointer;
+	*bitpointer = pos + 1;
+	return (unsigned char)(READBIT(pos, bitstream));
+}
+
+/*Reads nbits bits starting at *bitpointer, least significant bit first, and advances *bitpointer past them.*/
+static unsigned readBitsFromStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits) {
+	unsigned value = 0;
+	unsigned shift;
+	for (shift = 0; shift != nbits; ++shift) {
+		value |= ((unsigned)READBIT(*bitpointer, bitstream)) << shift;
+		++(*bitpointer);
+	}
+	return value;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflate - Huffman                                                      / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#define FIRST_LENGTH_CODE_INDEX 257 /*first symbol of the literal/length alphabet that is a length code*/
+#define LAST_LENGTH_CODE_INDEX 285 /*last symbol of the literal/length alphabet that is a length code*/
+/*256 literals, the end code, some length codes, and 2 unused codes*/
+#define NUM_DEFLATE_CODE_SYMBOLS 288
+/*the distance codes have their own symbols, 30 used, 2 unused*/
+#define NUM_DISTANCE_SYMBOLS 32
+/*the code length codes. 0-15: code lengths, 16: copy previous 3-6 times, 17: 3-10 zeros, 18: 11-138 zeros*/
+#define NUM_CODE_LENGTH_CODES 19
+
+/*the base lengths represented by codes 257-285 (entry 0 belongs to code 257)*/
+static const unsigned LENGTHBASE[29]
+= { 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59,
+	67, 83, 99, 115, 131, 163, 195, 227, 258 };
+
+/*the number of extra bits used by codes 257-285 (their value is added to the base length)*/
+static const unsigned LENGTHEXTRA[29]
+= { 0, 0, 0, 0, 0, 0, 0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+	 4,  4,  4,   4,   5,   5,   5,   5,   0 };
+
+/*the base backwards distances (the bits of distance codes appear after length codes and use their own huffman tree)*/
+static const unsigned DISTANCEBASE[30]
+= { 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
+	769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577 };
+
+/*the number of extra bits of backwards distances (their value is added to the base distance)*/
+static const unsigned DISTANCEEXTRA[30]
+= { 0, 0, 0, 0, 1, 1, 2,  2,  3,  3,  4,  4,  5,  5,   6,   6,   7,   7,   8,
+	  8,    9,    9,   10,   10,   11,   11,   12,    12,    13,    13 };
+
+/*the order in which "code length alphabet code lengths" are stored, out of this
+the huffman tree of the dynamic huffman tree lengths is generated*/
+static const unsigned CLCL_ORDER[NUM_CODE_LENGTH_CODES]
+= { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Huffman tree struct, containing multiple representations of the tree
+*/
+typedef struct HuffmanTree
+{
+	unsigned* tree2d; /*decoder table with 2 * numcodes entries, built by HuffmanTree_make2DTree*/
+	unsigned* tree1d; /*the code of each symbol, numcodes entries*/
+	unsigned* lengths; /*the lengths of the codes of the 1d-tree*/
+	unsigned maxbitlen; /*maximum number of bits a single code can get*/
+	unsigned numcodes; /*number of symbols in the alphabet = number of codes*/
+} HuffmanTree;
+
+/*function used for debug purposes to draw the tree in ascii art with C++*/
+/*
+static void HuffmanTree_draw(HuffmanTree* tree)
+{
+  std::cout << "tree. length: " << tree->numcodes << " maxbitlen: " << tree->maxbitlen << std::endl;
+  for(size_t i = 0; i != tree->tree1d.size; ++i)
+  {
+	 if(tree->lengths.data[i])
+		std::cout << i << " " << tree->tree1d.data[i] << " " << tree->lengths.data[i] << std::endl;
+  }
+  std::cout << std::endl;
+}*/
+
+/*Clears the three array pointers of the tree; maxbitlen and numcodes are left unset.*/
+static void HuffmanTree_init(HuffmanTree* tree) {
+	tree->lengths = 0;
+	tree->tree1d = 0;
+	tree->tree2d = 0;
+}
+
+/*Frees the three arrays of the tree; the pointers themselves are not reset.*/
+static void HuffmanTree_cleanup(HuffmanTree* tree) {
+	lodepng_free(tree->lengths);
+	lodepng_free(tree->tree1d);
+	lodepng_free(tree->tree2d);
+}
+
+/*Builds tree2d, the tree representation used by the decoder, from tree1d and lengths. return value is error: 0 = ok, 83 = alloc fail, 55 = the codes don't fit in the table*/
+static unsigned HuffmanTree_make2DTree(HuffmanTree* tree)
+{
+	unsigned nodefilled = 0; /*up to which node it is filled*/
+	unsigned treepos = 0; /*position in the tree (1 of the numcodes columns)*/
+	unsigned n, i; /*n: symbol being inserted, i: bit position within its code*/
+
+	tree->tree2d = (unsigned*)lodepng_malloc(tree->numcodes * 2 * sizeof(unsigned));
+	if (!tree->tree2d) return 83; /*alloc fail*/
+
+	/*
+	convert tree1d[] to tree2d[][]. In the 2D array, a value of 32767 means
+	uninited, a value >= numcodes is an address to another bit, a value < numcodes
+	is a code. The 2 rows are the 2 possible bit values (0 or 1), there are as
+	many columns as codes - 1.
+	A good huffman tree has N * 2 - 1 nodes, of which N - 1 are internal nodes.
+	Here, the internal nodes are stored (what their 0 and 1 option point to).
+	There is only memory for such good tree currently, if there are more nodes
+	(due to too long length codes), error 55 will happen
+	*/
+	for (n = 0; n < tree->numcodes * 2; ++n)
+	{
+		tree->tree2d[n] = 32767; /*32767 here means the tree2d isn't filled there yet*/
+	}
+
+	for (n = 0; n < tree->numcodes; ++n) /*the codes*/
+	{
+		for (i = 0; i != tree->lengths[n]; ++i) /*the bits for this code*/
+		{
+			unsigned char bit = (unsigned char)((tree->tree1d[n] >> (tree->lengths[n] - i - 1)) & 1); /*bits are taken from the most significant end of the code first*/
+			/*oversubscribed, see comment in lodepng_error_text*/
+			if (treepos > 2147483647 || treepos + 2 > tree->numcodes) return 55;
+			if (tree->tree2d[2 * treepos + bit] == 32767) /*not yet filled in*/
+			{
+				if (i + 1 == tree->lengths[n]) /*last bit*/
+				{
+					tree->tree2d[2 * treepos + bit] = n; /*put the current code in it*/
+					treepos = 0;
+				}
+				else
+				{
+					/*put address of the next step in here, first that address has to be found of course
+					(it's just nodefilled + 1)...*/
+					++nodefilled;
+					/*addresses encoded with numcodes added to it*/
+					tree->tree2d[2 * treepos + bit] = nodefilled + tree->numcodes;
+					treepos = nodefilled;
+				}
+			}
+			else treepos = tree->tree2d[2 * treepos + bit] - tree->numcodes; /*slot is already filled: continue from the value stored there, minus the numcodes offset*/
+		}
+	}
+
+	for (n = 0; n < tree->numcodes * 2; ++n)
+	{
+		if (tree->tree2d[n] == 32767) tree->tree2d[n] = 0; /*remove possible remaining 32767's*/
+	}
+
+	return 0;
+}
+
+/*
+Second step for the ...makeFromLengths and ...makeFromFrequencies functions:
+assigns the code of every symbol (tree1d) from the code lengths and then builds
+the decoder table. numcodes, lengths and maxbitlen must already be filled in
+correctly. return value is error.
+*/
+static unsigned HuffmanTree_makeFromLengths2(HuffmanTree* tree) {
+	uivector perlength; /*how many symbols use each code length*/
+	uivector nextcode; /*next code to hand out for each code length*/
+	unsigned error = 0;
+	unsigned len, sym;
+	size_t slots = tree->maxbitlen + 1;
+
+	uivector_init(&perlength);
+	uivector_init(&nextcode);
+
+	tree->tree1d = (unsigned*)lodepng_malloc(tree->numcodes * sizeof(unsigned));
+	if (tree->tree1d == 0) error = 83; /*alloc fail*/
+
+	if (!uivector_resizev(&perlength, slots, 0)) error = 83; /*alloc fail*/
+	else if (!uivector_resizev(&nextcode, slots, 0)) error = 83; /*alloc fail*/
+
+	if (error == 0) {
+		/*step 1: count number of instances of each code length*/
+		for (sym = 0; sym != tree->numcodes; ++sym) {
+			++perlength.data[tree->lengths[sym]];
+		}
+		/*step 2: generate the nextcode values*/
+		for (len = 1; len <= tree->maxbitlen; ++len) {
+			nextcode.data[len] = (nextcode.data[len - 1] + perlength.data[len - 1]) << 1;
+		}
+		/*step 3: generate all the codes*/
+		for (sym = 0; sym != tree->numcodes; ++sym) {
+			unsigned symlen = tree->lengths[sym];
+			if (symlen != 0) tree->tree1d[sym] = nextcode.data[symlen]++;
+		}
+	}
+
+	uivector_cleanup(&perlength);
+	uivector_cleanup(&nextcode);
+
+	if (error) return error;
+	return HuffmanTree_make2DTree(tree);
+}
+
+/*
+Builds the tree from numcodes code lengths (as stored in the PNG file), as defined
+by Deflate. The lengths are copied into the tree. maxbitlen is the maximum bits
+that a code in the tree can have. return value is error.
+*/
+static unsigned HuffmanTree_makeFromLengths(HuffmanTree* tree, const unsigned* bitlen,
+	size_t numcodes, unsigned maxbitlen) {
+	unsigned* copy = (unsigned*)lodepng_malloc(numcodes * sizeof(unsigned));
+	unsigned k;
+	tree->lengths = copy;
+	if (copy == 0) return 83; /*alloc fail*/
+	for (k = 0; k != numcodes; ++k) copy[k] = bitlen[k];
+	tree->maxbitlen = maxbitlen;
+	tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+	return HuffmanTree_makeFromLengths2(tree);
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*BPM: Boundary Package Merge, see "A Fast and Space-Economical Algorithm for Length-Limited Coding",
+Jyrki Katajainen, Alistair Moffat, Andrew Turpin, 1995.*/
+
+/*chain node for boundary package merge; nodes live in the memory pool of BPMLists*/
+typedef struct BPMNode
+{
+	int weight; /*the sum of all weights in this chain*/
+	unsigned index; /*index of this leaf node (called "count" in the paper)*/
+	struct BPMNode* tail; /*the next nodes in this chain (null if last)*/
+	int in_use; /*mark flag, set and cleared by the garbage collection in bpmnode_create*/
+} BPMNode;
+
+/*lists of chains, together with the node pool they are allocated from*/
+typedef struct BPMLists
+{
+	/*memory pool*/
+	unsigned memsize; /*number of nodes in memory (and of slots in freelist)*/
+	BPMNode* memory; /*the nodes themselves*/
+	unsigned numfree; /*number of valid entries in freelist*/
+	unsigned nextfree; /*index in freelist of the next node to hand out*/
+	BPMNode** freelist; /*pointers to the nodes that may be handed out*/
+	/*two heads of lookahead chains per list*/
+	unsigned listsize; /*number of lists, i.e. entries in chains0 and chains1*/
+	BPMNode** chains0;
+	BPMNode** chains1;
+} BPMLists;
+
+/*Takes a chain node from the memory pool in lists and fills in weight, index and tail.
+When the free list is used up, nodes no longer reachable from a chain head are reclaimed first.*/
+static BPMNode* bpmnode_create(BPMLists* lists, int weight, unsigned index, BPMNode* tail) {
+	BPMNode* fresh;
+
+	/*memory full, so garbage collect*/
+	if (lists->nextfree >= lists->numfree) {
+		unsigned k;
+		BPMNode* cur;
+		/*mark phase: clear every flag, then flag what the chains still reach*/
+		for (k = 0; k != lists->memsize; ++k) lists->memory[k].in_use = 0;
+		for (k = 0; k != lists->listsize; ++k) {
+			for (cur = lists->chains0[k]; cur != 0; cur = cur->tail) cur->in_use = 1;
+			for (cur = lists->chains1[k]; cur != 0; cur = cur->tail) cur->in_use = 1;
+		}
+		/*sweep phase: rebuild the free list from the unflagged nodes*/
+		lists->numfree = 0;
+		for (k = 0; k != lists->memsize; ++k) {
+			if (lists->memory[k].in_use) continue;
+			lists->freelist[lists->numfree] = &lists->memory[k];
+			++lists->numfree;
+		}
+		lists->nextfree = 0;
+	}
+
+	fresh = lists->freelist[lists->nextfree];
+	++lists->nextfree;
+	fresh->weight = weight;
+	fresh->index = index;
+	fresh->tail = tail;
+	return fresh;
+}
+
+/*sort the leaves by weight, keeping equal weights in their original order: mergesort,
+or an in-place insertion sort when the scratch buffer can't be allocated*/
+static void bpmnode_sort(BPMNode* leaves, size_t num) {
+	BPMNode* mem = (BPMNode*)lodepng_malloc(sizeof(*leaves) * num);
+	size_t width, counter = 0, p, k;
+	for (p = 1; !mem && p < num; p++) { /*alloc fail: the merge passes would write through NULL, so sort without scratch memory*/
+		BPMNode key = leaves[p];
+		for (k = p; k > 0 && leaves[k - 1].weight > key.weight; k--) leaves[k] = leaves[k - 1];
+		leaves[k] = key;
+	}
+	for (width = 1; mem && width < num; width *= 2, counter++) {
+		BPMNode* a = (counter & 1) ? mem : leaves; /*source of this pass*/
+		BPMNode* b = (counter & 1) ? leaves : mem; /*destination of this pass*/
+		for (p = 0; p < num; p += 2 * width) {
+			size_t q = (p + width > num) ? num : (p + width); /*end of the left run*/
+			size_t r = (p + 2 * width > num) ? num : (p + 2 * width); /*end of the right run*/
+			size_t i = p, j = q;
+			for (k = p; k < r; k++) {
+				if (i < q && (j >= r || a[i].weight <= a[j].weight)) b[k] = a[i++];
+				else b[k] = a[j++];
+			}
+		}
+	}
+	if (counter & 1) memcpy(leaves, mem, sizeof(*leaves) * num); /*the last pass ended in the scratch buffer*/
+	lodepng_free(mem);
+}
+
+/*Boundary Package Merge step, numpresent is the amount of leaves, c is the list to extend, and num is the caller's chain counter (only used to skip the recursion at the end).*/
+static void boundaryPM(BPMLists* lists, BPMNode* leaves, size_t numpresent, int c, int num)
+{
+	unsigned lastindex = lists->chains1[c]->index; /*index stored in the current head of list c*/
+
+	if (c == 0)
+	{
+		if (lastindex >= numpresent) return; /*no leaf left to add*/
+		lists->chains0[c] = lists->chains1[c];
+		lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, 0);
+	}
+	else
+	{
+		/*sum of the weights of the head nodes of the previous lookahead chains.*/
+		int sum = lists->chains0[c - 1]->weight + lists->chains1[c - 1]->weight;
+		lists->chains0[c] = lists->chains1[c];
+		if (lastindex < numpresent && sum > leaves[lastindex].weight)
+		{
+			/*the next leaf is lighter than the package: add the leaf*/
+			lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, lists->chains1[c]->tail);
+			return;
+		}
+		lists->chains1[c] = bpmnode_create(lists, sum, lastindex, lists->chains1[c - 1]); /*otherwise add the package, chained to the previous list*/
+		/*in the end we are only interested in the chain of the last list, so no
+		need to recurse if we're at the last one (this gives measurable speedup)*/
+		if (num + 1 < (int)(2 * numpresent - 2))
+		{
+			boundaryPM(lists, leaves, numpresent, c - 1, num);
+			boundaryPM(lists, leaves, numpresent, c - 1, num);
+		}
+	}
+}
+
+/*
+Computes Huffman code lengths, limited to maxbitlen bits, for numcodes symbols
+with the given frequencies (Boundary Package Merge). lengths must have room for
+numcodes entries. Symbols with frequency 0 get length 0, except that a filler
+symbol is given length 1 when fewer than two symbols are present.
+Returns 0 if ok, 80 (numcodes is 0 or maxbitlen is too small for all symbols) or 83 (alloc fail).
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+	size_t numcodes, unsigned maxbitlen) {
+	unsigned error = 0;
+	unsigned i;
+	size_t numpresent = 0; /*number of symbols with non-zero frequency*/
+	BPMNode* leaves; /*the symbols, only those with > 0 frequency*/
+
+	if (numcodes == 0) return 80; /*error: a tree of 0 symbols is not supposed to be made*/
+	if ((1u << maxbitlen) < numcodes) return 80; /*error: represent all symbols*/
+
+	leaves = (BPMNode*)lodepng_malloc(numcodes * sizeof(*leaves));
+	if (!leaves) return 83; /*alloc fail*/
+
+	for (i = 0; i != numcodes; ++i) {
+		if (frequencies[i] > 0) {
+			leaves[numpresent].weight = (int)frequencies[i];
+			leaves[numpresent].index = i;
+			++numpresent;
+		}
+	}
+
+	for (i = 0; i != numcodes; ++i) lengths[i] = 0;
+
+	/*ensure at least two present symbols. There should be at least one symbol
+	according to RFC 1951 section 3.2.7. Some decoders incorrectly require two. To
+	make these work as well ensure there are at least two symbols. The
+	Package-Merge code below also doesn't work correctly if there's only one
+	symbol, it'd give it the theoretical 0 bits but in practice zlib wants 1 bit.
+	With numcodes == 1 there is no second entry in lengths, so no filler is added.*/
+	if (numpresent == 0) {
+		lengths[0] = 1; /*note that for RFC 1951 section 3.2.7, only lengths[0] = 1 is needed*/
+		if (numcodes > 1) lengths[1] = 1;
+	} else if (numpresent == 1) {
+		lengths[leaves[0].index] = 1;
+		if (numcodes > 1) lengths[leaves[0].index == 0 ? 1 : 0] = 1;
+	} else {
+		BPMLists lists;
+		BPMNode* node;
+
+		bpmnode_sort(leaves, numpresent);
+
+		lists.listsize = maxbitlen;
+		lists.memsize = 2 * maxbitlen * (maxbitlen + 1);
+		lists.nextfree = 0;
+		lists.numfree = lists.memsize;
+		lists.memory = (BPMNode*)lodepng_malloc(lists.memsize * sizeof(*lists.memory));
+		lists.freelist = (BPMNode**)lodepng_malloc(lists.memsize * sizeof(BPMNode*));
+		lists.chains0 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+		lists.chains1 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+		if (!lists.memory || !lists.freelist || !lists.chains0 || !lists.chains1) error = 83; /*alloc fail*/
+
+		if (!error) {
+			for (i = 0; i != lists.memsize; ++i) lists.freelist[i] = &lists.memory[i];
+
+			/*the two lightest leaves become memory[0] and memory[1], the initial heads of every list*/
+			bpmnode_create(&lists, leaves[0].weight, 1, 0);
+			bpmnode_create(&lists, leaves[1].weight, 2, 0);
+
+			for (i = 0; i != lists.listsize; ++i) {
+				lists.chains0[i] = &lists.memory[0];
+				lists.chains1[i] = &lists.memory[1];
+			}
+
+			/*each boundaryPM call adds one chain to the last list, and we need 2 * numpresent - 2 chains.*/
+			for (i = 2; i != 2 * numpresent - 2; ++i) boundaryPM(&lists, leaves, numpresent, (int)maxbitlen - 1, (int)i);
+
+			/*every node in the final chain adds one bit to the first node->index leaves*/
+			for (node = lists.chains1[maxbitlen - 1]; node; node = node->tail) {
+				for (i = 0; i != node->index; ++i) ++lengths[leaves[i].index];
+			}
+		}
+
+		lodepng_free(lists.memory);
+		lodepng_free(lists.freelist);
+		lodepng_free(lists.chains0);
+		lodepng_free(lists.chains1);
+	}
+
+	lodepng_free(leaves);
+	return error;
+}
+
+/*Create the Huffman tree given the symbol frequencies. Trailing zero-frequency symbols are dropped
+(at least mincodes are kept). Returns 0 on success, otherwise an error code (83: alloc fail).*/
+static unsigned HuffmanTree_makeFromFrequencies(HuffmanTree* tree, const unsigned* frequencies,
+	size_t mincodes, size_t numcodes, unsigned maxbitlen) {
+	unsigned error = 0;
+	unsigned* lengths;
+	while (numcodes > mincodes && !frequencies[numcodes - 1]) --numcodes; /*trim zeroes; count checked first so numcodes - 1 never underflows*/
+	lengths = (unsigned*)lodepng_realloc(tree->lengths, numcodes * sizeof(unsigned)); /*temporary: on failure the old array stays in tree->lengths for HuffmanTree_cleanup*/
+	if (!lengths) return 83; /*alloc fail*/
+	memset(lengths, 0, numcodes * sizeof(unsigned)); /*initialize all lengths to 0*/
+	tree->lengths = lengths;
+	tree->maxbitlen = maxbitlen;
+	tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+	error = lodepng_huffman_code_lengths(tree->lengths, frequencies, numcodes, maxbitlen);
+	if (!error) error = HuffmanTree_makeFromLengths2(tree);
+	return error;
+}
+
+/*Returns the code stored in tree1d for the symbol with the given index.*/
+static unsigned HuffmanTree_getCode(const HuffmanTree* tree, unsigned index) {
+	return *(tree->tree1d + index);
+}
+
+/*Returns the code length in bits stored for the symbol with the given index.*/
+static unsigned HuffmanTree_getLength(const HuffmanTree* tree, unsigned index) {
+	return *(tree->lengths + index);
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*Builds the literal/length tree of a deflate block with fixed tree, as per the deflate specification. Returns 0 or an error code.*/
+static unsigned generateFixedLitLenTree(HuffmanTree* tree) {
+	unsigned status, sym;
+	unsigned* bitlen = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+	if (bitlen == 0) return 83; /*alloc fail*/
+
+	/*288 possible codes: 0-255=literals, 256=endcode, 257-285=lengthcodes, 286-287=unused*/
+	for (sym = 0; sym != NUM_DEFLATE_CODE_SYMBOLS; ++sym) {
+		if (sym < 144) bitlen[sym] = 8;
+		else if (sym < 256) bitlen[sym] = 9;
+		else if (sym < 280) bitlen[sym] = 7;
+		else bitlen[sym] = 8;
+	}
+
+	status = HuffmanTree_makeFromLengths(tree, bitlen, NUM_DEFLATE_CODE_SYMBOLS, 15);
+	lodepng_free(bitlen);
+	return status;
+}
+
+/*Builds the distance tree of a deflate block with fixed tree, as specified in the
+deflate specification. Returns 0 on success, otherwise an error code.*/
+static unsigned generateFixedDistanceTree(HuffmanTree* tree) {
+	unsigned status;
+	unsigned sym = 0;
+	unsigned* bitlen = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+	if (bitlen == 0) return 83; /*alloc fail*/
+
+	/*there are 32 distance codes, but 30-31 are unused; every code is 5 bits long*/
+	while (sym != NUM_DISTANCE_SYMBOLS) bitlen[sym++] = 5;
+	status = HuffmanTree_makeFromLengths(tree, bitlen, NUM_DISTANCE_SYMBOLS, 15);
+	lodepng_free(bitlen);
+	return status;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Decodes one symbol by walking codetree->tree2d bit by bit, starting at bit *bp of
+in; *bp is advanced past the bits that were consumed.
+returns the code, or (unsigned)(-1) if error happened
+inbitlength is the length of the complete buffer, in bits (so its byte length times 8)
+*/
+static unsigned huffmanDecodeSymbol(const unsigned char* in, size_t* bp,
+	const HuffmanTree* codetree, size_t inbitlength) {
+	const unsigned numcodes = codetree->numcodes;
+	unsigned node = 0; /*current position in tree2d*/
+	while (*bp < inbitlength) {
+		/*
+		decode the symbol from the tree. The "readBitFromStream" code is inlined in
+		the expression below because this is the biggest bottleneck while decoding
+		*/
+		unsigned entry = codetree->tree2d[(node << 1) + READBIT(*bp, in)];
+		++(*bp);
+		if (entry < numcodes) return entry; /*the symbol is decoded, return it*/
+		node = entry - numcodes; /*symbol not yet decoded, instead move tree position*/
+		if (node >= numcodes) return (unsigned)(-1); /*error: it appeared outside the codetree*/
+	}
+	return (unsigned)(-1); /*error: end of input memory reached without endcode*/
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Inflator (Decompressor)                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Get the trees of a deflated block with fixed tree, as specified in the deflate specification.
+tree_ll receives the literal/length tree and tree_d the distance tree.
+Returns 0 on success, or the error code of the generator that failed (for example 83 when
+out of memory). The distance tree is not built when the literal/length tree already failed.
+The result used to be discarded (the function returned void); callers that ignore the
+return value keep compiling unchanged.
+*/
+static unsigned getTreeInflateFixed(HuffmanTree* tree_ll, HuffmanTree* tree_d)
+{
+	unsigned error = generateFixedLitLenTree(tree_ll);
+	if (error) return error;
+	return generateFixedDistanceTree(tree_d);
+}
+
+/*
+Get the trees of a deflated block with dynamic tree; the tree itself is also Huffman
+compressed with a known tree.
+
+Reads, starting at bit position *bp of in (inlength is the buffer size in bytes):
+- HLIT, HDIST and HCLEN (5 + 5 + 4 bits),
+- HCLEN 3-bit code length code lengths, from which tree_cl is built,
+- HLIT + HDIST code lengths decoded with tree_cl, where symbols 16, 17 and 18 are repeat codes.
+tree_ll and tree_d are then built from the decoded lengths. *bp is advanced past all bits read.
+
+Returns 0 on success. Error codes set here: 49/50 (bit pointer would pass the input),
+83 (alloc fail), 54 (repeat-previous with no previous length), 13/14/15 (a repeat runs past
+HLIT + HDIST), 10/11 (huffmanDecodeSymbol failed), 16 (unexisting code), 64 (end code 256
+has length 0); errors from HuffmanTree_makeFromLengths are passed through.
+*/
+static unsigned getTreeInflateDynamic(HuffmanTree* tree_ll, HuffmanTree* tree_d,
+	const unsigned char* in, size_t* bp, size_t inlength)
+{
+	/*make sure that length values that aren't filled in will be 0, or a wrong tree will be generated*/
+	unsigned error = 0;
+	unsigned n, HLIT, HDIST, HCLEN, i;
+	size_t inbitlength = inlength * 8;
+
+	/*see comments in deflateDynamic for explanation of the context and these variables, it is analogous*/
+	unsigned* bitlen_ll = 0; /*lit,len code lengths*/
+	unsigned* bitlen_d = 0; /*dist code lengths*/
+	/*code length code lengths ("clcl"), the bit lengths of the huffman tree used to compress bitlen_ll and bitlen_d*/
+	unsigned* bitlen_cl = 0;
+	HuffmanTree tree_cl; /*the code tree for code length codes (the huffman tree for compressed huffman trees)*/
+
+	/*the three header fields below take 5 + 5 + 4 = 14 bits*/
+	if ((*bp) + 14 > (inlength << 3)) return 49; /*error: the bit pointer is or will go past the memory*/
+
+	/*number of literal/length codes + 257. Unlike the spec, the value 257 is added to it here already*/
+	HLIT = readBitsFromStream(bp, in, 5) + 257;
+	/*number of distance codes. Unlike the spec, the value 1 is added to it here already*/
+	HDIST = readBitsFromStream(bp, in, 5) + 1;
+	/*number of code length codes. Unlike the spec, the value 4 is added to it here already*/
+	HCLEN = readBitsFromStream(bp, in, 4) + 4;
+
+	/*each of the HCLEN code length code lengths takes 3 bits*/
+	if ((*bp) + HCLEN * 3 > (inlength << 3)) return 50; /*error: the bit pointer is or will go past the memory*/
+
+	HuffmanTree_init(&tree_cl);
+
+	/*this loop runs at most once (it ends with a break); it only exists so that errors can
+	break out to the cleanup code below*/
+	while (!error)
+	{
+		/*read the code length codes out of 3 * (amount of code length codes) bits*/
+
+		bitlen_cl = (unsigned*)lodepng_malloc(NUM_CODE_LENGTH_CODES * sizeof(unsigned));
+		if (!bitlen_cl) ERROR_BREAK(83 /*alloc fail*/);
+
+		/*the lengths are stored in the permuted order given by CLCL_ORDER*/
+		for (i = 0; i != NUM_CODE_LENGTH_CODES; ++i)
+		{
+			if (i < HCLEN) bitlen_cl[CLCL_ORDER[i]] = readBitsFromStream(bp, in, 3);
+			else bitlen_cl[CLCL_ORDER[i]] = 0; /*if not, it must stay 0*/
+		}
+
+		error = HuffmanTree_makeFromLengths(&tree_cl, bitlen_cl, NUM_CODE_LENGTH_CODES, 7);
+		if (error) break;
+
+		/*now we can use this tree to read the lengths for the tree that this function will return*/
+		bitlen_ll = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+		bitlen_d = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+		if (!bitlen_ll || !bitlen_d) ERROR_BREAK(83 /*alloc fail*/);
+		for (i = 0; i != NUM_DEFLATE_CODE_SYMBOLS; ++i) bitlen_ll[i] = 0;
+		for (i = 0; i != NUM_DISTANCE_SYMBOLS; ++i) bitlen_d[i] = 0;
+
+		/*i is the current symbol we're reading in the part that contains the code lengths of lit/len and dist codes.
+		Indices below HLIT go to bitlen_ll, the following HDIST indices go to bitlen_d.*/
+		i = 0;
+		while (i < HLIT + HDIST)
+		{
+			unsigned code = huffmanDecodeSymbol(in, bp, &tree_cl, inbitlength);
+			if (code <= 15) /*a length code*/
+			{
+				if (i < HLIT) bitlen_ll[i] = code;
+				else bitlen_d[i - HLIT] = code;
+				++i;
+			}
+			else if (code == 16) /*repeat previous*/
+			{
+				unsigned replength = 3; /*read in the 2 bits that indicate repeat length (3-6)*/
+				unsigned value; /*set value to the previous code*/
+
+				if (i == 0) ERROR_BREAK(54); /*can't repeat previous if i is 0*/
+
+				if ((*bp + 2) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+				replength += readBitsFromStream(bp, in, 2);
+
+				/*the previous length is the last lit/len length when i is at most HLIT, else the previous dist length*/
+				if (i < HLIT + 1) value = bitlen_ll[i - 1];
+				else value = bitlen_d[i - HLIT - 1];
+				/*repeat this value in the next lengths*/
+				for (n = 0; n < replength; ++n)
+				{
+					/*this break only leaves the for loop; the error is picked up after the enclosing while*/
+					if (i >= HLIT + HDIST) ERROR_BREAK(13); /*error: i is larger than the amount of codes*/
+					if (i < HLIT) bitlen_ll[i] = value;
+					else bitlen_d[i - HLIT] = value;
+					++i;
+				}
+			}
+			else if (code == 17) /*repeat "0" 3-10 times*/
+			{
+				unsigned replength = 3; /*read in the bits that indicate repeat length*/
+				if ((*bp + 3) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+				replength += readBitsFromStream(bp, in, 3);
+
+				/*repeat this value in the next lengths*/
+				for (n = 0; n < replength; ++n)
+				{
+					if (i >= HLIT + HDIST) ERROR_BREAK(14); /*error: i is larger than the amount of codes*/
+
+					if (i < HLIT) bitlen_ll[i] = 0;
+					else bitlen_d[i - HLIT] = 0;
+					++i;
+				}
+			}
+			else if (code == 18) /*repeat "0" 11-138 times*/
+			{
+				unsigned replength = 11; /*read in the bits that indicate repeat length*/
+				if ((*bp + 7) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+				replength += readBitsFromStream(bp, in, 7);
+
+				/*repeat this value in the next lengths*/
+				for (n = 0; n < replength; ++n)
+				{
+					if (i >= HLIT + HDIST) ERROR_BREAK(15); /*error: i is larger than the amount of codes*/
+
+					if (i < HLIT) bitlen_ll[i] = 0;
+					else bitlen_d[i - HLIT] = 0;
+					++i;
+				}
+			}
+			else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+			{
+				if (code == (unsigned)(-1))
+				{
+					/*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+					(10=no endcode, 11=wrong jump outside of tree)*/
+					error = (*bp) > inbitlength ? 10 : 11;
+				}
+				else error = 16; /*unexisting code, this can never happen*/
+				break;
+			}
+		}
+		if (error) break;
+
+		if (bitlen_ll[256] == 0) ERROR_BREAK(64); /*the length of the end code 256 must be larger than 0*/
+
+		/*now we've finally got HLIT and HDIST, so generate the code trees, and the function is done*/
+		error = HuffmanTree_makeFromLengths(tree_ll, bitlen_ll, NUM_DEFLATE_CODE_SYMBOLS, 15);
+		if (error) break;
+		error = HuffmanTree_makeFromLengths(tree_d, bitlen_d, NUM_DISTANCE_SYMBOLS, 15);
+
+		break; /*end of error-while*/
+	}
+
+	/*cleanup runs on success and on error; pointers that were never allocated are still 0 here*/
+	lodepng_free(bitlen_cl);
+	lodepng_free(bitlen_ll);
+	lodepng_free(bitlen_d);
+	HuffmanTree_cleanup(&tree_cl);
+
+	return error;
+}
+
+/*
+Inflate a block with dynamic or fixed Huffman tree.
+
+out/pos: output vector and the byte position in it where decoded data is written; *pos is
+advanced and out is grown as needed.
+in/bp/inlength: input buffer, current bit position in it, and its size in bytes.
+btype: 1 builds the fixed trees, 2 reads the dynamic trees from the input.
+
+Returns 0 when the end code (256) was reached, otherwise an error code: 83 (alloc fail),
+51 (extra bits would pass the input), 52 (distance reaches before the start of the output),
+18 (invalid distance code), 10/11 (huffmanDecodeSymbol failed), or an error from
+getTreeInflateDynamic.
+*/
+static unsigned inflateHuffmanBlock(ucvector* out, const unsigned char* in, size_t* bp,
+	size_t* pos, size_t inlength, unsigned btype)
+{
+	unsigned error = 0;
+	HuffmanTree tree_ll; /*the huffman tree for literal and length codes*/
+	HuffmanTree tree_d; /*the huffman tree for distance codes*/
+	size_t inbitlength = inlength * 8;
+
+	HuffmanTree_init(&tree_ll);
+	HuffmanTree_init(&tree_d);
+
+	/*NOTE: no error code is taken from the fixed tree setup here*/
+	if (btype == 1) getTreeInflateFixed(&tree_ll, &tree_d);
+	else if (btype == 2) error = getTreeInflateDynamic(&tree_ll, &tree_d, in, bp, inlength);
+
+	while (!error) /*decode all symbols until end reached, breaks at end code*/
+	{
+		/*code_ll is literal, length or end code*/
+		unsigned code_ll = huffmanDecodeSymbol(in, bp, &tree_ll, inbitlength);
+		if (code_ll <= 255) /*literal symbol*/
+		{
+			/*ucvector_push_back would do the same, but for some reason the two lines below run 10% faster*/
+			if (!ucvector_resize(out, (*pos) + 1)) ERROR_BREAK(83 /*alloc fail*/);
+			out->data[*pos] = (unsigned char)code_ll;
+			++(*pos);
+		}
+		else if (code_ll >= FIRST_LENGTH_CODE_INDEX && code_ll <= LAST_LENGTH_CODE_INDEX) /*length code*/
+		{
+			unsigned code_d, distance;
+			unsigned numextrabits_l, numextrabits_d; /*extra bits for length and distance*/
+			size_t start, forward, backward, length;
+
+			/*part 1: get length base*/
+			length = LENGTHBASE[code_ll - FIRST_LENGTH_CODE_INDEX];
+
+			/*part 2: get extra bits and add the value of that to length*/
+			numextrabits_l = LENGTHEXTRA[code_ll - FIRST_LENGTH_CODE_INDEX];
+			if ((*bp + numextrabits_l) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+			length += readBitsFromStream(bp, in, numextrabits_l);
+
+			/*part 3: get distance code*/
+			code_d = huffmanDecodeSymbol(in, bp, &tree_d, inbitlength);
+			if (code_d > 29)
+			{
+				/*It is the distance symbol that just failed to decode, so code_d must be tested.
+				Testing code_ll here (as before) could never be true, because code_ll is a valid
+				length code in this branch, and every decode failure was reported as error 18.*/
+				if (code_d == (unsigned)(-1)) /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+				{
+					/*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+					(10=no endcode, 11=wrong jump outside of tree)*/
+					error = (*bp) > inbitlength ? 10 : 11;
+				}
+				else error = 18; /*error: invalid distance code (30-31 are never used)*/
+				break;
+			}
+			distance = DISTANCEBASE[code_d];
+
+			/*part 4: get extra bits from distance*/
+			numextrabits_d = DISTANCEEXTRA[code_d];
+			if ((*bp + numextrabits_d) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+			distance += readBitsFromStream(bp, in, numextrabits_d);
+
+			/*part 5: fill in all the out[n] values based on the length and dist*/
+			start = (*pos);
+			if (distance > start) ERROR_BREAK(52); /*too long backward distance*/
+			backward = start - distance;
+
+			if (!ucvector_resize(out, (*pos) + length)) ERROR_BREAK(83 /*alloc fail*/);
+			if (distance < length) {
+				/*source and destination overlap: copy byte by byte so that bytes written
+				earlier in this copy are read again later in it*/
+				for (forward = 0; forward < length; ++forward)
+				{
+					out->data[(*pos)++] = out->data[backward++];
+				}
+			}
+			else {
+				/*no overlap, a plain block copy is enough*/
+				memcpy(out->data + *pos, out->data + backward, length);
+				*pos += length;
+			}
+		}
+		else if (code_ll == 256)
+		{
+			break; /*end code, break the loop*/
+		}
+		else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+		{
+			/*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+			(10=no endcode, 11=wrong jump outside of tree)*/
+			error = ((*bp) > inbitlength) ? 10 : 11;
+			break;
+		}
+	}
+
+	HuffmanTree_cleanup(&tree_ll);
+	HuffmanTree_cleanup(&tree_d);
+
+	return error;
+}
+
+/*
+Inflate a stored (non compressed) deflate block.
+
+*bp is first advanced to the next byte boundary, then LEN and NLEN (2 bytes each, low byte
+first) are read, followed by LEN literal bytes that are appended to out at *pos.
+On success *pos is advanced by LEN and *bp points just past the copied data.
+inlength is the size of in, in bytes.
+
+Returns 0 on success, 52 when the 4 header bytes do not fit in the input, 21 when NLEN is
+not the one's complement of LEN, 23 when LEN bytes are not available in the input, and
+83 on allocation failure.
+*/
+static unsigned inflateNoCompression(ucvector* out, const unsigned char* in, size_t* bp, size_t* pos, size_t inlength)
+{
+	size_t p;
+	unsigned LEN, NLEN, n, error = 0;
+
+	/*go to first boundary of byte*/
+	while (((*bp) & 0x7) != 0) ++(*bp);
+	p = (*bp) / 8; /*byte position*/
+
+	/*read LEN (2 bytes) and NLEN (2 bytes). Bytes p..p+3 are read, so p + 4 == inlength is
+	still in range: an empty stored block may legitimately be the very last thing in the input.*/
+	if (p + 4 > inlength) return 52; /*error, bit pointer will jump past memory*/
+	LEN = in[p] + 256u * in[p + 1]; p += 2;
+	NLEN = in[p] + 256u * in[p + 1]; p += 2;
+
+	/*check if 16-bit NLEN is really the one's complement of LEN*/
+	if (LEN + NLEN != 65535) return 21; /*error: NLEN is not one's complement of LEN*/
+
+	/*validate the source range before growing the output, so a truncated input does not
+	leave out enlarged with bytes that were never written*/
+	if (p + LEN > inlength) return 23; /*error: reading outside of in buffer*/
+
+	if (!ucvector_resize(out, (*pos) + LEN)) return 83; /*alloc fail*/
+
+	/*read the literal data: LEN bytes are now stored in the out buffer*/
+	for (n = 0; n < LEN; ++n) out->data[(*pos)++] = in[p++];
+
+	(*bp) = p * 8;
+
+	return error;
+}
+
+/*
+Inflate the deflate stream in[0..insize) into out, block by block, until a block with the
+BFINAL bit set has been processed. settings is not used here.
+Returns 0 on success, 52 when a block header would read past the input, 20 for the invalid
+block type 3, or the error code of the block decoder.
+*/
+static unsigned lodepng_inflatev(ucvector* out,
+	const unsigned char* in, size_t insize,
+	const LodePNGDecompressSettings* settings)
+{
+	/*bit pointer in the "in" data, current byte is bp >> 3, current bit is bp & 0x7 (from lsb to msb of the byte)*/
+	size_t bitpos = 0;
+	size_t outpos = 0; /*byte position in the out buffer*/
+	unsigned is_final = 0;
+	unsigned status = 0;
+
+	(void)settings;
+
+	do
+	{
+		unsigned block_type;
+
+		/*the block header is 3 bits: BFINAL followed by the two BTYPE bits (low bit first)*/
+		if (bitpos + 2 >= insize * 8) return 52; /*error, bit pointer will jump past memory*/
+		is_final = readBitFromStream(&bitpos, in);
+		block_type = 1u * readBitFromStream(&bitpos, in);
+		block_type += 2u * readBitFromStream(&bitpos, in);
+
+		switch (block_type)
+		{
+		case 3:
+			return 20; /*error: invalid BTYPE*/
+		case 0: /*no compression*/
+			status = inflateNoCompression(out, in, &bitpos, &outpos, insize);
+			break;
+		default: /*compression, BTYPE 01 or 10*/
+			status = inflateHuffmanBlock(out, in, &bitpos, &outpos, insize, block_type);
+			break;
+		}
+
+		if (status) return status;
+	} while (!is_final);
+
+	return status;
+}
+
+/*
+Inflate in[0..insize) using the built-in inflator.
+*out and *outsize are handed to a ucvector as its initial buffer; after the call they are
+set to the vector's data pointer and size, whether or not an error occurred.
+Returns the error code of lodepng_inflatev (0 on success).
+*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+	const unsigned char* in, size_t insize,
+	const LodePNGDecompressSettings* settings)
+{
+	ucvector buffer;
+	unsigned result;
+
+	ucvector_init_buffer(&buffer, *out, *outsize);
+	result = lodepng_inflatev(&buffer, in, insize, settings);
+
+	/*hand the (possibly reallocated) buffer back to the caller*/
+	*outsize = buffer.size;
+	*out = buffer.data;
+	return result;
+}
+
+/*
+Inflate through settings->custom_inflate when that callback is set, otherwise through
+lodepng_inflate. All arguments are forwarded unchanged and the callee's result is returned.
+*/
+static unsigned inflate(unsigned char** out, size_t* outsize,
+	const unsigned char* in, size_t insize,
+	const LodePNGDecompressSettings* settings)
+{
+	if (!settings->custom_inflate)
+	{
+		return lodepng_inflate(out, outsize, in, insize, settings);
+	}
+	return settings->custom_inflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflator (Compressor)                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static const size_t MAX_SUPPORTED_DEFLATE_LENGTH = 258;
+
+/*
+Append a Huffman code to the compressed stream at bit position *bp.
+code holds the code bits and bitlen is the size in bits of the code; the work is done by
+addBitsToStreamReversed.
+*/
+static void addHuffmanSymbol(size_t* bp, ucvector* compressed, unsigned code, unsigned bitlen)
+{
+	addBitsToStreamReversed(bp, compressed, code, bitlen);
+}
+
+/*
+Search the index in the array that has the largest value smaller than or equal to the
+given value. The given array must be sorted in ascending order and must not be empty.
+Index 0 is returned when no element at index 1 or higher is smaller than or equal to value,
+so a value below every entry also yields index 0.
+*/
+static size_t searchCodeIndex(const unsigned* array, size_t array_size, size_t value)
+{
+	/*binary search (only small gain over linear) for the first index >= 1 whose entry is
+	not below value. TODO: use CPU log2 instruction for getting symbols instead*/
+	size_t lo = 1;
+	size_t hi = array_size - 1;
+
+	while (lo <= hi)
+	{
+		size_t middle = (lo + hi) >> 1;
+		if (array[middle] < value) lo = middle + 1;
+		else hi = middle - 1;
+	}
+
+	/*lo itself is the answer only when it is inside the array and does not exceed value;
+	otherwise the entry just before it is*/
+	if (lo < array_size && array[lo] <= value) return lo;
+	return lo - 1;
+}
+
+/*
+Append one LZ77 length/distance pair to values as four consecutive integers:
+the length symbol, the extra length value, the distance code and the extra distance value.
+*/
+static void addLengthDistance(uivector* values, size_t length, size_t distance)
+{
+	/*values in encoded vector are those used by deflate:
+	0-255: literal bytes
+	256: end
+	257-285: length/distance pair (length code, followed by extra length bits, distance code, extra distance bits)
+	286-287: invalid*/
+
+	unsigned entry[4];
+	unsigned k;
+	unsigned len_index = (unsigned)searchCodeIndex(LENGTHBASE, 29, length);
+	unsigned dist_index = (unsigned)searchCodeIndex(DISTANCEBASE, 30, distance);
+
+	entry[0] = len_index + FIRST_LENGTH_CODE_INDEX;
+	entry[1] = (unsigned)(length - LENGTHBASE[len_index]); /*what remains above the length base*/
+	entry[2] = dist_index;
+	entry[3] = (unsigned)(distance - DISTANCEBASE[dist_index]); /*what remains above the distance base*/
+
+	for (k = 0; k != 4; ++k)
+	{
+		uivector_push_back(values, entry[k]);
+	}
+}
+
+/*3 bytes of data get encoded into two bytes. The hash cannot use more than 3
+bytes as input because 3 is the minimum match length for deflate*/
+static const unsigned HASH_NUM_VALUES = 65536;
+static const unsigned HASH_BIT_MASK = 65535; /*HASH_NUM_VALUES - 1, but C90 does not like that as initializer*/
+
+/*
+Lookup tables used by the LZ77 encoder to find earlier occurrences of the current data.
+All six arrays are allocated by hash_init and released by hash_cleanup. "Circular pos" is
+the position in the sliding window, i.e. pos & (windowsize - 1).
+*/
+typedef struct Hash
+{
+	int* head; /*hash value to head circular pos - can be outdated if went around window; -1 when unused*/
+	/*circular pos to prev circular pos*/
+	unsigned short* chain; /*an entry equal to its own index indicates uninitialized*/
+	int* val; /*circular pos to hash value; -1 when unused*/
+
+	/*TODO: do this not only for zeros but for any repeated byte. However for PNG
+	it's always going to be the zeros that dominate, so not important for PNG*/
+	int* headz; /*similar to head, but for chainz; indexed by the length of the zeros streak*/
+	unsigned short* chainz; /*those with same amount of zeros*/
+	unsigned short* zeros; /*length of zeros streak, used as a second hash chain*/
+} Hash;
+
+/*
+Allocate and initialize all tables of hash for a sliding window of windowsize entries.
+
+head and headz entries start at -1 and val entries at -1 (unused); chain and chainz entries
+start equal to their own index (uninitialized). The zeros table is allocated but not
+initialized here.
+
+Returns 0 on success or 83 when any allocation fails. On failure every table that was
+obtained is released again and all six pointers are reset to 0, so nothing leaks when the
+caller simply returns, and a later hash_cleanup only sees null pointers.
+*/
+static unsigned hash_init(Hash* hash, unsigned windowsize)
+{
+	unsigned i;
+	hash->head = (int*)lodepng_malloc(sizeof(int) * HASH_NUM_VALUES);
+	hash->val = (int*)lodepng_malloc(sizeof(int) * windowsize);
+	hash->chain = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+	hash->zeros = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+	hash->headz = (int*)lodepng_malloc(sizeof(int) * (MAX_SUPPORTED_DEFLATE_LENGTH + 1));
+	hash->chainz = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+	if (!hash->head || !hash->chain || !hash->val || !hash->headz || !hash->chainz || !hash->zeros)
+	{
+		/*undo the partial allocation; freeing a null pointer is already relied upon elsewhere in this file*/
+		lodepng_free(hash->head); hash->head = 0;
+		lodepng_free(hash->val); hash->val = 0;
+		lodepng_free(hash->chain); hash->chain = 0;
+		lodepng_free(hash->zeros); hash->zeros = 0;
+		lodepng_free(hash->headz); hash->headz = 0;
+		lodepng_free(hash->chainz); hash->chainz = 0;
+		return 83; /*alloc fail*/
+	}
+
+	/*initialize hash table*/
+	for (i = 0; i != HASH_NUM_VALUES; ++i) hash->head[i] = -1;
+	for (i = 0; i != windowsize; ++i) hash->val[i] = -1;
+	for (i = 0; i != windowsize; ++i) hash->chain[i] = i; /*same value as index indicates uninitialized*/
+
+	for (i = 0; i <= MAX_SUPPORTED_DEFLATE_LENGTH; ++i) hash->headz[i] = -1;
+	for (i = 0; i != windowsize; ++i) hash->chainz[i] = i; /*same value as index indicates uninitialized*/
+
+	return 0;
+}
+
+/*Release the six tables owned by hash. The pointers themselves are left unchanged.*/
+static void hash_cleanup(Hash* hash)
+{
+	/*tables of the zeros streak chain*/
+	lodepng_free(hash->chainz);
+	lodepng_free(hash->headz);
+	lodepng_free(hash->zeros);
+
+	/*tables of the regular hash chain*/
+	lodepng_free(hash->chain);
+	lodepng_free(hash->val);
+	lodepng_free(hash->head);
+}
+
+
+
+/*
+Compute the hash of the bytes at data[pos], masked with HASH_BIT_MASK.
+When at least 3 bytes are available they are combined with shifts of 0, 4 and 8 bits.
+Near the end of the data only the remaining bytes are used, shifted by 8 bits per byte.
+Returns 0 when pos is at or beyond size.
+*/
+static unsigned getHash(const unsigned char* data, size_t size, size_t pos)
+{
+	unsigned hash = 0;
+	size_t remaining, k;
+
+	if (pos + 2 < size)
+	{
+		/*A simple shift and xor hash is used. Since the data of PNGs is dominated
+		by zeroes due to the filters, a better hash does not have a significant
+		effect on speed in traversing the chain, and causes more time spend on
+		calculating the hash.*/
+		hash = (unsigned)(data[pos + 0] << 0u) ^ (unsigned)(data[pos + 1] << 4u) ^ (unsigned)(data[pos + 2] << 8u);
+		return hash & HASH_BIT_MASK;
+	}
+
+	if (pos >= size) return 0;
+
+	/*fewer than 3 bytes left: fold in whatever remains*/
+	remaining = size - pos;
+	k = 0;
+	while (k != remaining)
+	{
+		hash ^= (unsigned)(data[pos + k] << (k * 8u));
+		++k;
+	}
+	return hash & HASH_BIT_MASK;
+}
+
+/*
+Count the consecutive zero bytes that start at data[pos]. The count stops at the first
+non-zero byte, at the end of the data (size), or at MAX_SUPPORTED_DEFLATE_LENGTH, whichever
+comes first. Returns 0 when pos is at or beyond size.
+
+The limit is computed from sizes instead of from a pointer start + MAX_SUPPORTED_DEFLATE_LENGTH,
+because forming that pointer near the end of the buffer points outside the array, which is
+undefined behaviour in C even when it is never dereferenced.
+*/
+static unsigned countZeros(const unsigned char* data, size_t size, size_t pos)
+{
+	size_t limit, count = 0;
+
+	if (pos >= size) return 0;
+
+	limit = size - pos;
+	if (limit > MAX_SUPPORTED_DEFLATE_LENGTH) limit = MAX_SUPPORTED_DEFLATE_LENGTH;
+
+	while (count != limit && data[pos + count] == 0) ++count;
+
+	/*the count fits a 32-bit number (max value is MAX_SUPPORTED_DEFLATE_LENGTH)*/
+	return (unsigned)count;
+}
+
+/*
+Register window position wpos (wpos = pos & (windowsize - 1)) in both hash chains.
+The position is linked to the previous head stored for hashval and becomes the new head;
+the same is done in the zeros chain for numzeros. A chain entry is only overwritten when a
+previous head exists (head value other than -1).
+*/
+static void updateHashChain(Hash* hash, size_t wpos, unsigned hashval, unsigned short numzeros)
+{
+	int previous;
+
+	/*regular chain, keyed by the hash value*/
+	hash->val[wpos] = (int)hashval;
+	previous = hash->head[hashval];
+	if (previous != -1) hash->chain[wpos] = (unsigned short)previous;
+	hash->head[hashval] = (int)wpos;
+
+	/*second chain, keyed by the length of the zeros streak*/
+	hash->zeros[wpos] = numzeros;
+	previous = hash->headz[numzeros];
+	if (previous != -1) hash->chainz[wpos] = (unsigned short)previous;
+	hash->headz[numzeros] = (int)wpos;
+}
+
+/*
+LZ77-encode the data. Return value is error code. The input are raw bytes, the output
+is in the form of unsigned integers with codes representing for example literal bytes, or
+length/distance pairs.
+It uses a hash table technique to let it encode faster. When doing LZ77 encoding, a
+sliding window (of windowsize) is used, and all past bytes in that window can be used as
+the "dictionary". A brute force search through all possible distances would be slow, and
+this hash technique is one out of several ways to speed this up.
+
+out: receives the literals and the length/distance groups written by addLengthDistance.
+hash: tables that are read and updated for every encoded position.
+in, inpos, insize: bytes in[inpos..insize) are encoded; matches may refer to bytes before inpos.
+windowsize: must be a non-zero power of two, at most 32768 (else error 60 or 90).
+minmatch: matches shorter than this are written as a literal instead.
+nicematch: the search stops once a match of at least this length is found; it is clamped
+to MAX_SUPPORTED_DEFLATE_LENGTH.
+lazymatching: when non-zero, a found match may be postponed by one byte to see whether the
+next position gives a longer one.
+
+Other error codes set here: 81 (lazy state at position 0), 83 (alloc fail), 86 (offset
+larger than the window).
+*/
+static unsigned encodeLZ77(uivector* out, Hash* hash,
+	const unsigned char* in, size_t inpos, size_t insize, unsigned windowsize,
+	unsigned minmatch, unsigned nicematch, unsigned lazymatching)
+{
+	size_t pos;
+	unsigned i, error = 0;
+	/*for large window lengths, assume the user wants no compression loss. Otherwise, max hash chain length speedup.*/
+	unsigned maxchainlength = windowsize >= 8192 ? windowsize : windowsize / 8;
+	unsigned maxlazymatch = windowsize >= 8192 ? MAX_SUPPORTED_DEFLATE_LENGTH : 64;
+
+	unsigned usezeros = 1; /*not sure if setting it to false for windowsize < 8192 is better or worse*/
+	unsigned numzeros = 0; /*length of the zeros streak at the current position, 0 when not tracked*/
+
+	unsigned offset; /*the offset represents the distance in LZ77 terminology*/
+	unsigned length;
+	unsigned lazy = 0; /*non-zero while a match found at the previous position is being held back*/
+	unsigned lazylength = 0, lazyoffset = 0; /*the held back match*/
+	unsigned hashval;
+	unsigned current_offset, current_length;
+	unsigned prev_offset;
+	const unsigned char *lastptr, *foreptr, *backptr;
+	unsigned hashpos;
+
+	if (windowsize == 0 || windowsize > 32768) return 60; /*error: windowsize smaller/larger than allowed*/
+	if ((windowsize & (windowsize - 1)) != 0) return 90; /*error: must be power of two*/
+
+	if (nicematch > MAX_SUPPORTED_DEFLATE_LENGTH) nicematch = MAX_SUPPORTED_DEFLATE_LENGTH;
+
+	for (pos = inpos; pos < insize; ++pos)
+	{
+		size_t wpos = pos & (windowsize - 1); /*position for in 'circular' hash buffers*/
+		unsigned chainlength = 0;
+
+		hashval = getHash(in, insize, pos);
+
+		/*keep numzeros up to date: count from scratch when no streak is known, otherwise
+		shorten the known streak by one unless it still extends one byte further*/
+		if (usezeros && hashval == 0)
+		{
+			if (numzeros == 0) numzeros = countZeros(in, insize, pos);
+			else if (pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+		}
+		else
+		{
+			numzeros = 0;
+		}
+
+		updateHashChain(hash, wpos, hashval, numzeros);
+
+		/*the length and offset found for the current position*/
+		length = 0;
+		offset = 0;
+
+		hashpos = hash->chain[wpos];
+
+		/*one past the last byte a match starting at pos may cover*/
+		lastptr = &in[insize < pos + MAX_SUPPORTED_DEFLATE_LENGTH ? insize : pos + MAX_SUPPORTED_DEFLATE_LENGTH];
+
+		/*search for the longest string*/
+		prev_offset = 0;
+		for (;;)
+		{
+			if (chainlength++ >= maxchainlength) break;
+			/*distance from wpos back to hashpos inside the circular window*/
+			current_offset = hashpos <= wpos ? wpos - hashpos : wpos - hashpos + windowsize;
+
+			if (current_offset < prev_offset) break; /*stop when went completely around the circular buffer*/
+			prev_offset = current_offset;
+			if (current_offset > 0)
+			{
+				/*test the next characters*/
+				foreptr = &in[pos];
+				backptr = &in[pos - current_offset];
+
+				/*common case in PNGs is lots of zeros. Quickly skip over them as a speedup*/
+				if (numzeros >= 3)
+				{
+					unsigned skip = hash->zeros[hashpos];
+					if (skip > numzeros) skip = numzeros;
+					backptr += skip;
+					foreptr += skip;
+				}
+
+				while (foreptr != lastptr && *backptr == *foreptr) /*maximum supported length by deflate is max length*/
+				{
+					++backptr;
+					++foreptr;
+				}
+				current_length = (unsigned)(foreptr - &in[pos]);
+
+				if (current_length > length)
+				{
+					length = current_length; /*the longest length*/
+					offset = current_offset; /*the offset that is related to this longest length*/
+					/*jump out once a length of max length is found (speed gain). This also jumps
+					out if length is MAX_SUPPORTED_DEFLATE_LENGTH*/
+					if (current_length >= nicematch) break;
+				}
+			}
+
+			/*an entry that refers to itself marks the end of the chain*/
+			if (hashpos == hash->chain[hashpos]) break;
+
+			/*follow the zeros chain once the best match is longer than the zeros streak,
+			otherwise follow the regular hash chain*/
+			if (numzeros >= 3 && length > numzeros)
+			{
+				hashpos = hash->chainz[hashpos];
+				if (hash->zeros[hashpos] != numzeros) break;
+			}
+			else
+			{
+				hashpos = hash->chain[hashpos];
+				/*outdated hash value, happens if particular value was not encountered in whole last window*/
+				if (hash->val[hashpos] != (int)hashval) break;
+			}
+		}
+
+		if (lazymatching)
+		{
+			/*hold this match back and first look at what the next position gives*/
+			if (!lazy && length >= 3 && length <= maxlazymatch && length < MAX_SUPPORTED_DEFLATE_LENGTH)
+			{
+				lazy = 1;
+				lazylength = length;
+				lazyoffset = offset;
+				continue; /*try the next byte*/
+			}
+			if (lazy)
+			{
+				lazy = 0;
+				if (pos == 0) ERROR_BREAK(81);
+				if (length > lazylength + 1)
+				{
+					/*push the previous character as literal*/
+					if (!uivector_push_back(out, in[pos - 1])) ERROR_BREAK(83 /*alloc fail*/);
+				}
+				else
+				{
+					/*the held back match wins: step back one byte and encode it from there*/
+					length = lazylength;
+					offset = lazyoffset;
+					hash->head[hashval] = -1; /*the same hashchain update will be done, this ensures no wrong alteration*/
+					hash->headz[numzeros] = -1; /*idem*/
+					--pos;
+				}
+			}
+		}
+		if (length >= 3 && offset > windowsize) ERROR_BREAK(86 /*too big (or overflown negative) offset*/);
+
+		/*encode it as length/distance pair or literal value*/
+		if (length < 3) /*only lengths of 3 or higher are supported as length/distance pair*/
+		{
+			if (!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+		}
+		else if (length < minmatch || (length == 3 && offset > 4096))
+		{
+			/*compensate for the fact that longer offsets have more extra bits, a
+			length of only 3 may be not worth it then*/
+			if (!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+		}
+		else
+		{
+			addLengthDistance(out, length, offset);
+			/*the remaining bytes covered by the match are not searched, but they are still
+			entered into the hash chains*/
+			for (i = 1; i < length; ++i)
+			{
+				++pos;
+				wpos = pos & (windowsize - 1);
+				hashval = getHash(in, insize, pos);
+				if (usezeros && hashval == 0)
+				{
+					if (numzeros == 0) numzeros = countZeros(in, insize, pos);
+					else if (pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+				}
+				else
+				{
+					numzeros = 0;
+				}
+				updateHashChain(hash, wpos, hashval, numzeros);
+			}
+		}
+	} /*end of the loop through each character of input*/
+
+	return error;
+}
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*
+Append data[0..datasize) to out as a sequence of non compressed deflate blocks of at most
+65535 bytes each; the last block has BFINAL set. Nothing is written when datasize is 0.
+
+Returns 0 on success or 83 when out cannot be grown.
+
+The read position is kept in a size_t and the block length is derived from the size_t
+remainder, so inputs larger than what fits in an unsigned are split correctly. Each block
+is reserved in out with one checked resize instead of unchecked per-byte pushes.
+*/
+static unsigned deflateNoCompression(ucvector* out, const unsigned char* data, size_t datasize)
+{
+	/*non compressed deflate block data: 1 bit BFINAL,2 bits BTYPE,(5 bits): it jumps to start of next byte,
+	2 bytes LEN, 2 bytes NLEN, LEN bytes literal DATA*/
+
+	size_t i, numdeflateblocks = (datasize + 65534) / 65535;
+	size_t datapos = 0;
+	for (i = 0; i != numdeflateblocks; ++i)
+	{
+		unsigned BFINAL, BTYPE, LEN, NLEN;
+		unsigned char firstbyte;
+		size_t remaining = datasize - datapos;
+		size_t outpos = out->size; /*where this block starts in out*/
+
+		BFINAL = (i == numdeflateblocks - 1);
+		BTYPE = 0;
+
+		firstbyte = (unsigned char)(BFINAL + ((BTYPE & 1) << 1) + ((BTYPE & 2) << 1));
+
+		LEN = 65535;
+		if (remaining < 65535) LEN = (unsigned)remaining;
+		NLEN = 65535 - LEN;
+
+		/*1 header byte, 2 bytes LEN, 2 bytes NLEN, then LEN bytes of data*/
+		if (!ucvector_resize(out, outpos + 5 + LEN)) return 83; /*alloc fail*/
+
+		out->data[outpos++] = firstbyte;
+		out->data[outpos++] = (unsigned char)(LEN & 255);
+		out->data[outpos++] = (unsigned char)(LEN >> 8);
+		out->data[outpos++] = (unsigned char)(NLEN & 255);
+		out->data[outpos++] = (unsigned char)(NLEN >> 8);
+
+		/*Decompressed data*/
+		memcpy(out->data + outpos, data + datapos, LEN);
+		datapos += LEN;
+	}
+
+	return 0;
+}
+
+/*
+Write the lz77-encoded data, which has lit, len and dist codes, to the compressed stream
+using huffman trees.
+bp, out: bit position in, and storage of, the compressed stream.
+lz77_encoded: literals as single values; length codes are followed by three more values
+(extra length bits, distance code, extra distance bits).
+tree_ll: the tree for lit and len codes.
+tree_d: the tree for distance codes.
+*/
+static void writeLZ77data(size_t* bp, ucvector* out, const uivector* lz77_encoded,
+	const HuffmanTree* tree_ll, const HuffmanTree* tree_d)
+{
+	size_t idx = 0;
+	while (idx != lz77_encoded->size)
+	{
+		const unsigned* group = lz77_encoded->data + idx;
+		unsigned symbol = group[0];
+
+		addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_ll, symbol), HuffmanTree_getLength(tree_ll, symbol));
+
+		if (symbol <= 256)
+		{
+			/*literal or end code: a single value*/
+			++idx;
+			continue;
+		}
+
+		/*for a length code, 3 more things have to be added*/
+		{
+			unsigned len_extra_value = group[1];
+			unsigned dist_code = group[2];
+			unsigned dist_extra_value = group[3];
+			unsigned len_extra_count = LENGTHEXTRA[symbol - FIRST_LENGTH_CODE_INDEX];
+			unsigned dist_extra_count = DISTANCEEXTRA[dist_code];
+
+			addBitsToStream(bp, out, len_extra_value, len_extra_count);
+			addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_d, dist_code),
+				HuffmanTree_getLength(tree_d, dist_code));
+			addBitsToStream(bp, out, dist_extra_value, dist_extra_count);
+		}
+		idx += 4;
+	}
+}
+
+/*Deflate for a block of type "dynamic", that is, with freely, optimally, created huffman trees*/
+static unsigned deflateDynamic(ucvector* out, size_t* bp, Hash* hash,
+	const unsigned char* data, size_t datapos, size_t dataend,
+	const LodePNGCompressSettings* settings, unsigned final)
+{
+	unsigned error = 0;
+
+	/*
+	A block is compressed as follows: The PNG data is lz77 encoded, resulting in
+	literal bytes and length/distance pairs. This is then huffman compressed with
+	two huffman trees. One huffman tree is used for the lit and len values ("ll"),
+	another huffman tree is used for the dist values ("d"). These two trees are
+	stored using their code lengths, and to compress even more these code lengths
+	are also run-length encoded and huffman compressed. This gives a huffman tree
+	of code lengths "cl". The code lenghts used to describe this third tree are
+	the code length code lengths ("clcl").
+	*/
+
+	/*The lz77 encoded data, represented with integers since there will also be length and distance codes in it*/
+	uivector lz77_encoded;
+	HuffmanTree tree_ll; /*tree for lit,len values*/
+	HuffmanTree tree_d; /*tree for distance codes*/
+	HuffmanTree tree_cl; /*tree for encoding the code lengths representing tree_ll and tree_d*/
+	uivector frequencies_ll; /*frequency of lit,len codes*/
+	uivector frequencies_d; /*frequency of dist codes*/
+	uivector frequencies_cl; /*frequency of code length codes*/
+	uivector bitlen_lld; /*lit,len,dist code lenghts (int bits), literally (without repeat codes).*/
+	uivector bitlen_lld_e; /*bitlen_lld encoded with repeat codes (this is a rudemtary run length compression)*/
+	/*bitlen_cl is the code length code lengths ("clcl"). The bit lengths of codes to represent tree_cl
+	(these are written as is in the file, it would be crazy to compress these using yet another huffman
+	tree that needs to be represented by yet another set of code lengths)*/
+	uivector bitlen_cl;
+	size_t datasize = dataend - datapos;
+
+	/*
+	Due to the huffman compression of huffman tree representations ("two levels"), there are some anologies:
+	bitlen_lld is to tree_cl what data is to tree_ll and tree_d.
+	bitlen_lld_e is to bitlen_lld what lz77_encoded is to data.
+	bitlen_cl is to bitlen_lld_e what bitlen_lld is to lz77_encoded.
+	*/
+
+	unsigned BFINAL = final;
+	size_t numcodes_ll, numcodes_d, i;
+	unsigned HLIT, HDIST, HCLEN;
+
+	uivector_init(&lz77_encoded);
+	HuffmanTree_init(&tree_ll);
+	HuffmanTree_init(&tree_d);
+	HuffmanTree_init(&tree_cl);
+	uivector_init(&frequencies_ll);
+	uivector_init(&frequencies_d);
+	uivector_init(&frequencies_cl);
+	uivector_init(&bitlen_lld);
+	uivector_init(&bitlen_lld_e);
+	uivector_init(&bitlen_cl);
+
+	/*This while loop never loops due to a break at the end, it is here to
+	allow breaking out of it to the cleanup phase on error conditions.*/
+	while (!error)
+	{
+		if (settings->use_lz77)
+		{
+			error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+				settings->minmatch, settings->nicematch, settings->lazymatching);
+			if (error) break;
+		}
+		else
+		{
+			if (!uivector_resize(&lz77_encoded, datasize)) ERROR_BREAK(83 /*alloc fail*/);
+			for (i = datapos; i < dataend; ++i) lz77_encoded.data[i - datapos] = data[i]; /*no LZ77, but still will be Huffman compressed*/
+		}
+
+		if (!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83 /*alloc fail*/);
+		if (!uivector_resizev(&frequencies_d, 30, 0)) ERROR_BREAK(83 /*alloc fail*/);
+
+		/*Count the frequencies of lit, len and dist codes*/
+		for (i = 0; i != lz77_encoded.size; ++i)
+		{
+			unsigned symbol = lz77_encoded.data[i];
+			++frequencies_ll.data[symbol];
+			if (symbol > 256)
+			{
+				unsigned dist = lz77_encoded.data[i + 2];
+				++frequencies_d.data[dist];
+				i += 3;
+			}
+		}
+		frequencies_ll.data[256] = 1; /*there will be exactly 1 end code, at the end of the block*/
+
+		/*Make both huffman trees, one for the lit and len codes, one for the dist codes*/
+		error = HuffmanTree_makeFromFrequencies(&tree_ll, frequencies_ll.data, 257, frequencies_ll.size, 15);
+		if (error) break;
+		/*2, not 1, is chosen for mincodes: some buggy PNG decoders require at least 2 symbols in the dist tree*/
+		error = HuffmanTree_makeFromFrequencies(&tree_d, frequencies_d.data, 2, frequencies_d.size, 15);
+		if (error) break;
+
+		numcodes_ll = tree_ll.numcodes; if (numcodes_ll > 286) numcodes_ll = 286;
+		numcodes_d = tree_d.numcodes; if (numcodes_d > 30) numcodes_d = 30;
+		/*store the code lengths of both generated trees in bitlen_lld*/
+		for (i = 0; i != numcodes_ll; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_ll, (unsigned)i));
+		for (i = 0; i != numcodes_d; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_d, (unsigned)i));
+
+		/*run-length compress bitlen_ldd into bitlen_lld_e by using repeat codes 16 (copy length 3-6 times),
+		17 (3-10 zeroes), 18 (11-138 zeroes)*/
+		for (i = 0; i != (unsigned)bitlen_lld.size; ++i)
+		{
+			unsigned j = 0; /*amount of repititions*/
+			while (i + j + 1 < (unsigned)bitlen_lld.size && bitlen_lld.data[i + j + 1] == bitlen_lld.data[i]) ++j;
+
+			if (bitlen_lld.data[i] == 0 && j >= 2) /*repeat code for zeroes*/
+			{
+				++j; /*include the first zero*/
+				if (j <= 10) /*repeat code 17 supports max 10 zeroes*/
+				{
+					uivector_push_back(&bitlen_lld_e, 17);
+					uivector_push_back(&bitlen_lld_e, j - 3);
+				}
+				else /*repeat code 18 supports max 138 zeroes*/
+				{
+					if (j > 138) j = 138;
+					uivector_push_back(&bitlen_lld_e, 18);
+					uivector_push_back(&bitlen_lld_e, j - 11);
+				}
+				i += (j - 1);
+			}
+			else if (j >= 3) /*repeat code for value other than zero*/
+			{
+				size_t k;
+				unsigned num = j / 6, rest = j % 6;
+				uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+				for (k = 0; k < num; ++k)
+				{
+					uivector_push_back(&bitlen_lld_e, 16);
+					uivector_push_back(&bitlen_lld_e, 6 - 3);
+				}
+				if (rest >= 3)
+				{
+					uivector_push_back(&bitlen_lld_e, 16);
+					uivector_push_back(&bitlen_lld_e, rest - 3);
+				}
+				else j -= rest;
+				i += j;
+			}
+			else /*too short to benefit from repeat code*/
+			{
+				uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+			}
+		}
+
+		/*generate tree_cl, the huffmantree of huffmantrees*/
+
+		if (!uivector_resizev(&frequencies_cl, NUM_CODE_LENGTH_CODES, 0)) ERROR_BREAK(83 /*alloc fail*/);
+		for (i = 0; i != bitlen_lld_e.size; ++i)
+		{
+			++frequencies_cl.data[bitlen_lld_e.data[i]];
+			/*after a repeat code come the bits that specify the number of repetitions,
+			those don't need to be in the frequencies_cl calculation*/
+			if (bitlen_lld_e.data[i] >= 16) ++i;
+		}
+
+		error = HuffmanTree_makeFromFrequencies(&tree_cl, frequencies_cl.data,
+			frequencies_cl.size, frequencies_cl.size, 7);
+		if (error) break;
+
+		if (!uivector_resize(&bitlen_cl, tree_cl.numcodes)) ERROR_BREAK(83 /*alloc fail*/);
+		for (i = 0; i != tree_cl.numcodes; ++i)
+		{
+			/*lenghts of code length tree is in the order as specified by deflate*/
+			bitlen_cl.data[i] = HuffmanTree_getLength(&tree_cl, CLCL_ORDER[i]);
+		}
+		while (bitlen_cl.data[bitlen_cl.size - 1] == 0 && bitlen_cl.size > 4)
+		{
+			/*remove zeros at the end, but minimum size must be 4*/
+			if (!uivector_resize(&bitlen_cl, bitlen_cl.size - 1)) ERROR_BREAK(83 /*alloc fail*/);
+		}
+		if (error) break;
+
+		/*
+		Write everything into the output
+
+		After the BFINAL and BTYPE, the dynamic block consists out of the following:
+		- 5 bits HLIT, 5 bits HDIST, 4 bits HCLEN
+		- (HCLEN+4)*3 bits code lengths of code length alphabet
+		- HLIT + 257 code lenghts of lit/length alphabet (encoded using the code length
+		  alphabet, + possible repetition codes 16, 17, 18)
+		- HDIST + 1 code lengths of distance alphabet (encoded using the code length
+		  alphabet, + possible repetition codes 16, 17, 18)
+		- compressed data
+		- 256 (end code)
+		*/
+
+		/*Write block type*/
+		addBitToStream(bp, out, BFINAL);
+		addBitToStream(bp, out, 0); /*first bit of BTYPE "dynamic"*/
+		addBitToStream(bp, out, 1); /*second bit of BTYPE "dynamic"*/
+
+		/*write the HLIT, HDIST and HCLEN values*/
+		HLIT = (unsigned)(numcodes_ll - 257);
+		HDIST = (unsigned)(numcodes_d - 1);
+		HCLEN = (unsigned)bitlen_cl.size - 4;
+		/*trim zeroes for HCLEN. HLIT and HDIST were already trimmed at tree creation*/
+		while (!bitlen_cl.data[HCLEN + 4 - 1] && HCLEN > 0) --HCLEN;
+		addBitsToStream(bp, out, HLIT, 5);
+		addBitsToStream(bp, out, HDIST, 5);
+		addBitsToStream(bp, out, HCLEN, 4);
+
+		/*write the code lenghts of the code length alphabet*/
+		for (i = 0; i != HCLEN + 4; ++i) addBitsToStream(bp, out, bitlen_cl.data[i], 3);
+
+		/*write the lenghts of the lit/len AND the dist alphabet*/
+		for (i = 0; i != bitlen_lld_e.size; ++i)
+		{
+			addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_cl, bitlen_lld_e.data[i]),
+				HuffmanTree_getLength(&tree_cl, bitlen_lld_e.data[i]));
+			/*extra bits of repeat codes*/
+			if (bitlen_lld_e.data[i] == 16) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 2);
+			else if (bitlen_lld_e.data[i] == 17) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 3);
+			else if (bitlen_lld_e.data[i] == 18) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 7);
+		}
+
+		/*write the compressed data symbols*/
+		writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+		/*error: the length of the end code 256 must be larger than 0*/
+		if (HuffmanTree_getLength(&tree_ll, 256) == 0) ERROR_BREAK(64);
+
+		/*write the end code*/
+		addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+
+		break; /*end of error-while*/
+	}
+
+	/*cleanup*/
+	uivector_cleanup(&lz77_encoded);
+	HuffmanTree_cleanup(&tree_ll);
+	HuffmanTree_cleanup(&tree_d);
+	HuffmanTree_cleanup(&tree_cl);
+	uivector_cleanup(&frequencies_ll);
+	uivector_cleanup(&frequencies_d);
+	uivector_cleanup(&frequencies_cl);
+	uivector_cleanup(&bitlen_lld_e);
+	uivector_cleanup(&bitlen_lld);
+	uivector_cleanup(&bitlen_cl);
+
+	return error;
+}
+
+/*
+Appends one deflate block that uses the fixed (predefined) Huffman trees to out,
+starting at bit position *bp.
+
+out, bp: output byte vector and its bit pointer, both advanced by this call
+hash: scratch state handed through to encodeLZ77
+data, datapos, dataend: the half-open byte range data[datapos..dataend) to encode
+settings: use_lz77 selects LZ77 matching; windowsize, minmatch, nicematch and
+  lazymatching are forwarded to encodeLZ77
+final: value written as the BFINAL bit of the block
+
+Returns 0 on success or the error code reported by encodeLZ77.
+*/
+static unsigned deflateFixed(ucvector* out, size_t* bp, Hash* hash,
+	const unsigned char* data,
+	size_t datapos, size_t dataend,
+	const LodePNGCompressSettings* settings, unsigned final)
+{
+	HuffmanTree tree_ll; /*literal and length codes*/
+	HuffmanTree tree_d; /*distance codes*/
+	unsigned error = 0;
+
+	HuffmanTree_init(&tree_ll);
+	HuffmanTree_init(&tree_d);
+	generateFixedLitLenTree(&tree_ll);
+	generateFixedDistanceTree(&tree_d);
+
+	/*block header: BFINAL followed by the two BTYPE bits*/
+	addBitToStream(bp, out, final);
+	addBitToStream(bp, out, 1);
+	addBitToStream(bp, out, 0);
+
+	if (!settings->use_lz77)
+	{
+		/*plain literals: no LZ77 matching, every byte is still Huffman coded*/
+		size_t pos;
+		for (pos = datapos; pos < dataend; ++pos)
+		{
+			addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, data[pos]), HuffmanTree_getLength(&tree_ll, data[pos]));
+		}
+	}
+	else
+	{
+		uivector lz77_encoded;
+		uivector_init(&lz77_encoded);
+		error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+			settings->minmatch, settings->nicematch, settings->lazymatching);
+		if (!error) writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+		uivector_cleanup(&lz77_encoded);
+	}
+
+	/*symbol 256 terminates the block; it is only written when encoding succeeded*/
+	if (!error) addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+
+	HuffmanTree_cleanup(&tree_ll);
+	HuffmanTree_cleanup(&tree_d);
+
+	return error;
+}
+
+/*
+Deflates in[0..insize) into out, splitting the input into one or more deflate blocks.
+
+settings->btype selects the block kind:
+  0: everything is handed to deflateNoCompression
+  1: a single deflateFixed block covering the whole input
+  2: deflateDynamic blocks of insize / 8 + 8 bytes, clamped to 65536..262144
+
+Returns 0 on success, 61 when btype is above 2, or the first error reported by
+hash_init or by one of the block encoders.
+*/
+static unsigned lodepng_deflatev(ucvector* out, const unsigned char* in, size_t insize,
+	const LodePNGCompressSettings* settings)
+{
+	unsigned error = 0;
+	size_t i, blocksize, numdeflateblocks;
+	size_t bp = 0; /*the bit pointer*/
+	Hash hash;
+
+	if (settings->btype > 2) return 61;
+	else if (settings->btype == 0) return deflateNoCompression(out, in, insize);
+	else if (settings->btype == 1) blocksize = insize;
+	else /*if(settings->btype == 2)*/
+	{
+		/*on PNGs, deflate blocks of 65-262k seem to give most dense encoding*/
+		blocksize = insize / 8 + 8;
+		if (blocksize < 65536) blocksize = 65536;
+		if (blocksize > 262144) blocksize = 262144;
+	}
+
+	/*blocksize is 0 when btype is 1 and the input is empty. Dividing by it is
+	undefined, so that case goes straight to the single (empty) block that an
+	empty input always produces.*/
+	if (blocksize == 0) numdeflateblocks = 1;
+	else
+	{
+		numdeflateblocks = (insize + blocksize - 1) / blocksize;
+		if (numdeflateblocks == 0) numdeflateblocks = 1;
+	}
+
+	error = hash_init(&hash, settings->windowsize);
+	if (error) return error;
+
+	for (i = 0; i != numdeflateblocks && !error; ++i)
+	{
+		/*only the last block carries the BFINAL flag*/
+		unsigned final = (i == numdeflateblocks - 1);
+		size_t start = i * blocksize;
+		size_t end = start + blocksize;
+		if (end > insize) end = insize;
+
+		if (settings->btype == 1) error = deflateFixed(out, &bp, &hash, in, start, end, settings, final);
+		else if (settings->btype == 2) error = deflateDynamic(out, &bp, &hash, in, start, end, settings, final);
+	}
+
+	hash_cleanup(&hash);
+
+	return error;
+}
+
+/*
+Public deflate entry point. Wraps the caller's buffer (*out, *outsize) in a
+ucvector, lets lodepng_deflatev append the compressed stream to it, and writes
+the resulting pointer and size back, also when an error is returned.
+Returns the error code of lodepng_deflatev (0 = success).
+*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+	const unsigned char* in, size_t insize,
+	const LodePNGCompressSettings* settings)
+{
+	ucvector buffer;
+	unsigned result;
+
+	ucvector_init_buffer(&buffer, *out, *outsize);
+	result = lodepng_deflatev(&buffer, in, insize, settings);
+
+	/*hand the buffer back to the caller*/
+	*outsize = buffer.size;
+	*out = buffer.data;
+	return result;
+}
+
+/*
+Dispatches to the user supplied settings->custom_deflate when one is set,
+otherwise to the built-in lodepng_deflate. Arguments and return value are
+passed through unchanged.
+*/
+static unsigned deflate(unsigned char** out, size_t* outsize,
+	const unsigned char* in, size_t insize,
+	const LodePNGCompressSettings* settings)
+{
+	if (!settings->custom_deflate)
+	{
+		return lodepng_deflate(out, outsize, in, insize, settings);
+	}
+	return settings->custom_deflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Adler32                                                                  */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Continues an Adler-32 checksum.
+
+adler: running checksum so far; its low 16 bits are the first sum and its high
+  16 bits the second sum
+data, len: the bytes data[0..len-1] to fold in
+
+Returns the updated checksum, packed the same way as the adler argument.
+*/
+static unsigned update_adler32(unsigned adler, const unsigned char* data, unsigned len)
+{
+	unsigned lo = adler & 0xffff;
+	unsigned hi = (adler >> 16) & 0xffff;
+
+	while (len != 0)
+	{
+		/*the sums are reduced modulo 65521 only once per run of at most 5550
+		bytes, which is few enough additions that they cannot overflow*/
+		unsigned run = (len < 5550) ? len : 5550;
+		unsigned k;
+		for (k = 0; k != run; ++k)
+		{
+			lo += data[k];
+			hi += lo;
+		}
+		data += run;
+		len -= run;
+		lo %= 65521;
+		hi %= 65521;
+	}
+
+	return (hi << 16) | lo;
+}
+
+/*Computes the Adler-32 checksum of data[0..len-1] by running update_adler32
+from the initial checksum value 1.*/
+static unsigned adler32(const unsigned char* data, unsigned len)
+{
+	const unsigned initial = 1u;
+	return update_adler32(initial, data, len);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Zlib                                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Decompresses a zlib stream: validates the 2 byte header, inflates the payload
+and, unless settings->ignore_adler32 is set, verifies the trailing ADLER32.
+
+out, outsize: receive the decompressed buffer and its size (filled in by inflate)
+in, insize: the complete zlib stream
+settings: decompression settings, forwarded to inflate
+
+Returns 0 on success, otherwise:
+  53: the stream is too short to hold the header, or too short to hold the
+      4 checksum bytes when the checksum has to be verified
+  24: the FCHECK test on the header failed
+  25: compression method is not 8 or CINFO is above 7
+  26: a preset dictionary is requested
+  58: the ADLER32 checksum does not match
+  or any error reported by inflate.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+	size_t insize, const LodePNGDecompressSettings* settings)
+{
+	unsigned error = 0;
+	unsigned CM, CINFO, FDICT;
+
+	if (insize < 2) return 53; /*error, size of zlib data too small*/
+	/*read information from zlib header*/
+	if ((in[0] * 256 + in[1]) % 31 != 0)
+	{
+		/*error: 256 * in[0] + in[1] must be a multiple of 31, the FCHECK value is supposed to be made that way*/
+		return 24;
+	}
+
+	CM = in[0] & 15;
+	CINFO = (in[0] >> 4) & 15;
+	/*FCHECK = in[1] & 31;*/ /*FCHECK is already tested above*/
+	FDICT = (in[1] >> 5) & 1;
+	/*FLEVEL = (in[1] >> 6) & 3;*/ /*FLEVEL is not used here*/
+
+	if (CM != 8 || CINFO > 7)
+	{
+		/*error: only compression method 8: inflate with sliding window of 32k is supported by the PNG spec*/
+		return 25;
+	}
+	if (FDICT != 0)
+	{
+		/*error: the specification of PNG says about the zlib stream:
+		  "The additional flags shall not specify a preset dictionary."*/
+		return 26;
+	}
+
+	error = inflate(out, outsize, in + 2, insize - 2, settings);
+	if (error) return error;
+
+	if (!settings->ignore_adler32)
+	{
+		unsigned ADLER32, checksum;
+		/*the checksum occupies the last 4 bytes, after the 2 header bytes. Without
+		this guard, insize - 4 would wrap around for insize < 4 and would point
+		into the header itself for insize < 6.*/
+		if (insize < 6) return 53; /*error, size of zlib data too small*/
+		ADLER32 = lodepng_read32bitInt(&in[insize - 4]);
+		checksum = adler32(*out, (unsigned)(*outsize));
+		if (checksum != ADLER32) return 58; /*error, adler checksum not correct, data must be corrupted*/
+	}
+
+	return 0; /*no error*/
+}
+
+/*
+Decompresses zlib data with the user supplied settings->custom_zlib when one is
+set, otherwise with the built-in lodepng_zlib_decompress. Arguments and return
+value are passed through unchanged.
+*/
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+	size_t insize, const LodePNGDecompressSettings* settings)
+{
+	if (!settings->custom_zlib)
+	{
+		return lodepng_zlib_decompress(out, outsize, in, insize, settings);
+	}
+	return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*
+Compresses in[0..insize) into a zlib stream that is appended to the buffer
+(*out, *outsize): 2 header bytes (CMF, FLG), the deflate data, and the 4 byte
+ADLER32 of the uncompressed input.
+
+*out must be NULL with *outsize 0, or point to a buffer owned by the lodepng
+allocator; an arbitrary pointer will crash.
+
+Returns 0 on success or the error code reported by deflate. On error only the
+two header bytes have been appended. *out and *outsize are updated either way.
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+	size_t insize, const LodePNGCompressSettings* settings)
+{
+	ucvector outv;
+	size_t i;
+	unsigned error;
+	unsigned char* deflatedata = 0;
+	size_t deflatesize = 0;
+
+	/*zlib data: 1 byte CMF (CM+CINFO), 1 byte FLG, deflate data, 4 byte ADLER32 checksum of the Decompressed data*/
+	unsigned CMF = 120; /*0b01111000: CM 8, CINFO 7. With CINFO 7, any window size up to 32768 can be used.*/
+	unsigned FLEVEL = 0;
+	unsigned FDICT = 0;
+	unsigned CMFFLG = 256 * CMF + FDICT * 32 + FLEVEL * 64;
+	/*FCHECK pads CMFFLG up to the next multiple of 31*/
+	unsigned FCHECK = 31 - CMFFLG % 31;
+	CMFFLG += FCHECK;
+
+	/*ucvector-controlled version of the output buffer, for dynamic array*/
+	ucvector_init_buffer(&outv, *out, *outsize);
+
+	ucvector_push_back(&outv, (unsigned char)(CMFFLG >> 8));
+	ucvector_push_back(&outv, (unsigned char)(CMFFLG & 255));
+
+	error = deflate(&deflatedata, &deflatesize, in, insize, settings);
+
+	if (!error)
+	{
+		unsigned ADLER32 = adler32(in, (unsigned)insize);
+		for (i = 0; i != deflatesize; ++i) ucvector_push_back(&outv, deflatedata[i]);
+		lodepng_add32bitInt(&outv, ADLER32);
+	}
+
+	/*release the intermediate buffer on every path: deflate may already have
+	allocated it when it reports an error, and it used to leak in that case.
+	deflatedata is still 0 when nothing was allocated.
+	NOTE(review): assumes a custom_deflate does not free its own output when it fails.*/
+	lodepng_free(deflatedata);
+
+	*out = outv.data;
+	*outsize = outv.size;
+
+	return error;
+}
+
+/*
+Compresses to zlib format with the user supplied settings->custom_zlib when one
+is set, otherwise with the built-in lodepng_zlib_compress. Arguments and return
+value are passed through unchanged.
+*/
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+	size_t insize, const LodePNGCompressSettings* settings)
+{
+	if (!settings->custom_zlib)
+	{
+		return lodepng_zlib_compress(out, outsize, in, insize, settings);
+	}
+	return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#else /*no LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Variant used when the built-in zlib code is compiled out: decompression is
+only possible through settings->custom_zlib; error 87 is returned when none is set.*/
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+	size_t insize, const LodePNGDecompressSettings* settings)
+{
+	if (settings->custom_zlib)
+	{
+		return settings->custom_zlib(out, outsize, in, insize, settings);
+	}
+	return 87; /*no custom zlib function provided */
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Variant used when the built-in zlib code is compiled out: compression is
+only possible through settings->custom_zlib; error 87 is returned when none is set.*/
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+	size_t insize, const LodePNGCompressSettings* settings)
+{
+	if (settings->custom_zlib)
+	{
+		return settings->custom_zlib(out, outsize, in, insize, settings);
+	}
+	return 87; /*no custom zlib function provided */
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*this is a good tradeoff between speed and compression ratio*/
+#define DEFAULT_WINDOWSIZE 2048
+
+/*
+Fills *settings with the default compression parameters: dynamic Huffman blocks
+(btype 2), LZ77 enabled with a window of DEFAULT_WINDOWSIZE, minmatch 3,
+nicematch 128, lazy matching on, and no custom callbacks or context.
+*/
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings)
+{
+	/*no user supplied replacements*/
+	settings->custom_context = 0;
+	settings->custom_deflate = 0;
+	settings->custom_zlib = 0;
+
+	/*LZ77 matcher parameters*/
+	settings->use_lz77 = 1;
+	settings->windowsize = DEFAULT_WINDOWSIZE;
+	settings->minmatch = 3;
+	settings->nicematch = 128;
+	settings->lazymatching = 1;
+
+	/*2 = dynamic Huffman tree, i.e. a tree built for the data rather than the predefined one*/
+	settings->btype = 2;
+}
+
+/*Constant settings object holding the same nine values that
+lodepng_compress_settings_init assigns. NOTE(review): this is a positional
+initializer, so it relies on the field order of LodePNGCompressSettings, which
+is declared outside this part of the file - keep the two in sync.*/
+const LodePNGCompressSettings lodepng_default_compress_settings = { 2, 1, DEFAULT_WINDOWSIZE, 3, 128, 1, 0, 0, 0 };
+
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Fills *settings with the default decompression parameters: the ADLER32 checksum
+is verified (ignore_adler32 = 0) and no custom callbacks or context are set.
+*/
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings)
+{
+	/*no user supplied replacements*/
+	settings->custom_context = 0;
+	settings->custom_inflate = 0;
+	settings->custom_zlib = 0;
+
+	/*0 = do check the checksum*/
+	settings->ignore_adler32 = 0;
+}
+
+/*Constant settings object with every field zero, the same values that
+lodepng_decompress_settings_init assigns.*/
+const LodePNGDecompressSettings lodepng_default_decompress_settings = { 0, 0, 0, 0 };
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of Zlib related code. Begin of PNG related code.                 // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / CRC32                                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+
+#ifndef LODEPNG_NO_COMPILE_CRC
+/* CRC polynomial: 0xedb88320 */
+/* 256 entry lookup table used by lodepng_crc32: it is indexed with the low byte
+of (running crc XOR input byte), so one table read processes a whole input byte. */
+static unsigned lodepng_crc32_table[256] = {
+			  0u, 1996959894u, 3993919788u, 2567524794u,  124634137u, 1886057615u, 3915621685u, 2657392035u,
+	249268274u, 2044508324u, 3772115230u, 2547177864u,  162941995u, 2125561021u, 3887607047u, 2428444049u,
+	498536548u, 1789927666u, 4089016648u, 2227061214u,  450548861u, 1843258603u, 4107580753u, 2211677639u,
+	325883990u, 1684777152u, 4251122042u, 2321926636u,  335633487u, 1661365465u, 4195302755u, 2366115317u,
+	997073096u, 1281953886u, 3579855332u, 2724688242u, 1006888145u, 1258607687u, 3524101629u, 2768942443u,
+	901097722u, 1119000684u, 3686517206u, 2898065728u,  853044451u, 1172266101u, 3705015759u, 2882616665u,
+	651767980u, 1373503546u, 3369554304u, 3218104598u,  565507253u, 1454621731u, 3485111705u, 3099436303u,
+	671266974u, 1594198024u, 3322730930u, 2970347812u,  795835527u, 1483230225u, 3244367275u, 3060149565u,
+  1994146192u,   31158534u, 2563907772u, 4023717930u, 1907459465u,  112637215u, 2680153253u, 3904427059u,
+  2013776290u,  251722036u, 2517215374u, 3775830040u, 2137656763u,  141376813u, 2439277719u, 3865271297u,
+  1802195444u,  476864866u, 2238001368u, 4066508878u, 1812370925u,  453092731u, 2181625025u, 4111451223u,
+  1706088902u,  314042704u, 2344532202u, 4240017532u, 1658658271u,  366619977u, 2362670323u, 4224994405u,
+  1303535960u,  984961486u, 2747007092u, 3569037538u, 1256170817u, 1037604311u, 2765210733u, 3554079995u,
+  1131014506u,  879679996u, 2909243462u, 3663771856u, 1141124467u,  855842277u, 2852801631u, 3708648649u,
+  1342533948u,  654459306u, 3188396048u, 3373015174u, 1466479909u,  544179635u, 3110523913u, 3462522015u,
+  1591671054u,  702138776u, 2966460450u, 3352799412u, 1504918807u,  783551873u, 3082640443u, 3233442989u,
+  3988292384u, 2596254646u,   62317068u, 1957810842u, 3939845945u, 2647816111u,   81470997u, 1943803523u,
+  3814918930u, 2489596804u,  225274430u, 2053790376u, 3826175755u, 2466906013u,  167816743u, 2097651377u,
+  4027552580u, 2265490386u,  503444072u, 1762050814u, 4150417245u, 2154129355u,  426522225u, 1852507879u,
+  4275313526u, 2312317920u,  282753626u, 1742555852u, 4189708143u, 2394877945u,  397917763u, 1622183637u,
+  3604390888u, 2714866558u,  953729732u, 1340076626u, 3518719985u, 2797360999u, 1068828381u, 1219638859u,
+  3624741850u, 2936675148u,  906185462u, 1090812512u, 3747672003u, 2825379669u,  829329135u, 1181335161u,
+  3412177804u, 3160834842u,  628085408u, 1382605366u, 3423369109u, 3138078467u,  570562233u, 1426400815u,
+  3317316542u, 2998733608u,  733239954u, 1555261956u, 3268935591u, 3050360625u,  752459403u, 1541320221u,
+  2607071920u, 3965973030u, 1969922972u,   40735498u, 2617837225u, 3943577151u, 1913087877u,   83908371u,
+  2512341634u, 3803740692u, 2075208622u,  213261112u, 2463272603u, 3855990285u, 2094854071u,  198958881u,
+  2262029012u, 4057260610u, 1759359992u,  534414190u, 2176718541u, 4139329115u, 1873836001u,  414664567u,
+  2282248934u, 4279200368u, 1711684554u,  285281116u, 2405801727u, 4167216745u, 1634467795u,  376229701u,
+  2685067896u, 3608007406u, 1308918612u,  956543938u, 2808555105u, 3495958263u, 1231636301u, 1047427035u,
+  2932959818u, 3654703836u, 1088359270u,  936918000u, 2847714899u, 3736837829u, 1202900863u,  817233897u,
+  3183342108u, 3401237130u, 1404277552u,  615818150u, 3134207493u, 3453421203u, 1423857449u,  601450431u,
+  3009837614u, 3294710456u, 1567103746u,  711928724u, 3020668471u, 3272380065u, 1510334235u,  755167117u
+};
+
+/*Returns the CRC of the bytes data[0..length-1], computed one byte at a time
+through lodepng_crc32_table. The running value starts at all ones and is
+inverted again before it is returned.*/
+unsigned lodepng_crc32(const unsigned char* data, size_t length)
+{
+	unsigned crc = 0xffffffffu;
+	size_t pos = 0;
+	while (pos != length)
+	{
+		const unsigned slot = (crc ^ data[pos]) & 0xff;
+		crc = lodepng_crc32_table[slot] ^ (crc >> 8);
+		++pos;
+	}
+	return crc ^ 0xffffffffu;
+}
+#else /* !LODEPNG_NO_COMPILE_CRC */
+unsigned lodepng_crc32(const unsigned char* data, size_t length);
+#endif /* !LODEPNG_NO_COMPILE_CRC */
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Reading and writing single bits and bytes from/to stream for LodePNG   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Reads the bit at position *bitpointer from bitstream, where bit 0 of the
+stream is the most significant bit of byte 0, and advances *bitpointer by one.
+Returns the bit as 0 or 1.*/
+static unsigned char readBitFromReversedStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+	const size_t pos = *bitpointer;
+	const unsigned shift = 7u - (unsigned)(pos & 0x7);
+	++(*bitpointer);
+	return (unsigned char)((bitstream[pos >> 3] >> shift) & 1);
+}
+
+/*Reads nbits consecutive bits starting at *bitpointer with
+readBitFromReversedStream and returns them as one number in which the first
+bit read is the most significant. *bitpointer is advanced by nbits.*/
+static unsigned readBitsFromReversedStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+	unsigned value = 0;
+	size_t remaining;
+	for (remaining = nbits; remaining != 0; --remaining)
+	{
+		value = (value << 1) | (unsigned)readBitFromReversedStream(bitpointer, bitstream);
+	}
+	return value;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Writes one bit at position *bitpointer of bitstream (bit 0 of the stream is
+the most significant bit of byte 0) and advances *bitpointer by one. The value
+is OR-ed in, so the target bit must currently be 0; a zero bit is a no-op apart
+from advancing the pointer.*/
+static void setBitOfReversedStream0(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+	const size_t pos = *bitpointer;
+	++(*bitpointer);
+	if (bit == 0) return;
+	bitstream[pos >> 3] |= (bit << (7 - (pos & 0x7)));
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/*Writes one bit at position *bitpointer of bitstream (bit 0 of the stream is
+the most significant bit of byte 0) and advances *bitpointer by one. Unlike
+setBitOfReversedStream0 the target bit may hold any value beforehand: it is
+cleared when bit is 0 and set otherwise.*/
+static void setBitOfReversedStream(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+	unsigned char* target = &bitstream[(*bitpointer) >> 3];
+	const unsigned char mask = (unsigned char)(1 << (7 - ((*bitpointer) & 0x7)));
+
+	if (bit != 0) *target |= mask;
+	else *target &= (unsigned char)(~mask);
+
+	++(*bitpointer);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG chunks                                                             / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Returns the value stored in the first 4 bytes of the chunk, decoded with
+lodepng_read32bitInt: the length of the chunk's data part.*/
+unsigned lodepng_chunk_length(const unsigned char* chunk)
+{
+	const unsigned char* length_field = chunk;
+	return lodepng_read32bitInt(length_field);
+}
+
+/*Copies the 4 type letters of the chunk (bytes 4..7) into type and appends a
+terminating 0, so type must have room for 5 chars.*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk)
+{
+	const unsigned char* name = chunk + 4;
+	type[0] = (char)name[0];
+	type[1] = (char)name[1];
+	type[2] = (char)name[2];
+	type[3] = (char)name[3];
+	type[4] = 0; /*null termination char*/
+}
+
+/*Returns 1 when type is exactly 4 characters long and equals the 4 type
+letters of the chunk (bytes 4..7), 0 otherwise.*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type)
+{
+	unsigned k;
+	if (strlen(type) != 4) return 0;
+	for (k = 0; k != 4; ++k)
+	{
+		if (chunk[4 + k] != type[k]) return 0;
+	}
+	return 1;
+}
+
+/*Returns 1 when bit 5 (value 32) of the first type letter, chunk[4], is set
+and 0 otherwise.*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk)
+{
+	return (unsigned char)((chunk[4] >> 5) & 1);
+}
+
+/*Returns 1 when bit 5 (value 32) of the byte chunk[6] is set and 0 otherwise.*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk)
+{
+	return (unsigned char)((chunk[6] >> 5) & 1);
+}
+
+/*Returns 1 when bit 5 (value 32) of the last type letter, chunk[7], is set
+and 0 otherwise.*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk)
+{
+	return (unsigned char)((chunk[7] >> 5) & 1);
+}
+
+/*Returns a pointer to the data part of the chunk, which starts 8 bytes in,
+right after the length and type fields.*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk)
+{
+	return chunk + 8;
+}
+
+/*Const variant of lodepng_chunk_data: returns a pointer to the data part of
+the chunk, which starts 8 bytes in, right after the length and type fields.*/
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk)
+{
+	return chunk + 8;
+}
+
+/*Compares the CRC stored at the end of the chunk with the CRC computed over
+the 4 type letters and the data (the length field is not covered).
+Returns 0 when they match and 1 when they differ.*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk)
+{
+	const unsigned datalen = lodepng_chunk_length(chunk);
+	const unsigned stored = lodepng_read32bitInt(&chunk[datalen + 8]);
+	const unsigned computed = lodepng_crc32(&chunk[4], datalen + 4);
+	return (stored != computed) ? 1u : 0u;
+}
+
+/*Computes the CRC over the 4 type letters and the data of the chunk and stores
+it in the 4 bytes directly after the data. The chunk's length field must
+already be filled in, since it determines how much data is covered.*/
+void lodepng_chunk_generate_crc(unsigned char* chunk)
+{
+	const unsigned datalen = lodepng_chunk_length(chunk);
+	unsigned char* crc_field = chunk + 8 + datalen;
+	lodepng_set32bitInt(crc_field, lodepng_crc32(&chunk[4], datalen + 4));
+}
+
+/*
+Returns a pointer to the position right after this chunk, i.e. where the next
+chunk starts: the data length read from the chunk plus the 12 bytes taken by
+the length, type and CRC fields.
+
+The sum is formed in size_t. Done in unsigned, a data length of 0xFFFFFFF4 or
+more wraps around to a total of 0..11, so the returned pointer would hardly
+move and a loop walking the chunks might never leave the buffer. Where size_t
+is no wider than unsigned the wrap can still occur. The function cannot report
+an error, so the caller must check the result against the end of its buffer
+before using it.
+*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk)
+{
+	size_t total_chunk_length = (size_t)lodepng_chunk_length(chunk) + 12;
+	return &chunk[total_chunk_length];
+}
+
+/*
+Const variant of lodepng_chunk_next: returns a pointer to the position right
+after this chunk, i.e. the data length read from the chunk plus the 12 bytes
+taken by the length, type and CRC fields.
+
+The sum is formed in size_t. Done in unsigned, a data length of 0xFFFFFFF4 or
+more wraps around to a total of 0..11, so the returned pointer would hardly
+move and a loop walking the chunks might never leave the buffer. Where size_t
+is no wider than unsigned the wrap can still occur. The function cannot report
+an error, so the caller must check the result against the end of its buffer
+before using it.
+*/
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk)
+{
+	size_t total_chunk_length = (size_t)lodepng_chunk_length(chunk) + 12;
+	return &chunk[total_chunk_length];
+}
+
+/*
+Appends a copy of the given complete chunk (length field, type, data and CRC)
+to the buffer (*out, *outlength), growing the buffer with lodepng_realloc.
+
+Returns 0 on success, 77 when the new size does not fit in size_t, and 83 when
+the reallocation fails. On failure *out and *outlength are left unchanged.
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk)
+{
+	size_t i;
+	/*widen before adding the 12 bytes of length, type and CRC: in unsigned
+	arithmetic the sum wraps for huge lengths, and the wrapped value would slip
+	through the overflow test below*/
+	size_t total_chunk_length = (size_t)lodepng_chunk_length(chunk) + 12;
+	unsigned char *chunk_start, *new_buffer;
+	size_t new_length;
+
+	/*a total below 12 can only be the result of wrap-around (size_t no wider than unsigned)*/
+	if (total_chunk_length < 12) return 77; /*integer overflow happened*/
+	new_length = (*outlength) + total_chunk_length;
+	if (new_length < total_chunk_length || new_length < (*outlength)) return 77; /*integer overflow happened*/
+
+	new_buffer = (unsigned char*)lodepng_realloc(*out, new_length);
+	if (!new_buffer) return 83; /*alloc fail*/
+	(*out) = new_buffer;
+	(*outlength) = new_length;
+	chunk_start = &(*out)[new_length - total_chunk_length];
+
+	for (i = 0; i != total_chunk_length; ++i) chunk_start[i] = chunk[i];
+
+	return 0;
+}
+
+/*
+Builds a new chunk at the end of the buffer (*out, *outlength), growing the
+buffer with lodepng_realloc.
+
+length: number of data bytes
+type: the chunk name; its first 4 characters are used
+data: the length bytes of chunk data (not read when length is 0)
+
+The chunk is written as: 4 byte length, 4 byte type, the data, and the 4 byte
+CRC generated by lodepng_chunk_generate_crc.
+
+Returns 0 on success, 77 when the new size does not fit in size_t, and 83 when
+the reallocation fails. On failure *out and *outlength are left unchanged.
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+	const char* type, const unsigned char* data)
+{
+	unsigned i;
+	unsigned char *chunk, *new_buffer;
+	/*size of the whole chunk, formed in size_t: "length + 12" in unsigned
+	arithmetic wraps for huge lengths, which made the overflow test compare
+	against a tiny number and let an undersized allocation through*/
+	size_t total_chunk_length = (size_t)length + 12;
+	size_t new_length;
+
+	/*a total below 12 can only be the result of wrap-around (size_t no wider than unsigned)*/
+	if (total_chunk_length < 12) return 77; /*integer overflow happened*/
+	new_length = (*outlength) + total_chunk_length;
+	if (new_length < total_chunk_length || new_length < (*outlength)) return 77; /*integer overflow happened*/
+
+	new_buffer = (unsigned char*)lodepng_realloc(*out, new_length);
+	if (!new_buffer) return 83; /*alloc fail*/
+	(*out) = new_buffer;
+	(*outlength) = new_length;
+	chunk = &(*out)[new_length - total_chunk_length];
+
+	/*1: length*/
+	lodepng_set32bitInt(chunk, (unsigned)length);
+
+	/*2: chunk name (4 letters)*/
+	chunk[4] = (unsigned char)type[0];
+	chunk[5] = (unsigned char)type[1];
+	chunk[6] = (unsigned char)type[2];
+	chunk[7] = (unsigned char)type[3];
+
+	/*3: the data*/
+	for (i = 0; i != length; ++i) chunk[8 + i] = data[i];
+
+	/*4: CRC (of the chunkname characters and the data)*/
+	lodepng_chunk_generate_crc(chunk);
+
+	return 0;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Color types and such                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Checks whether the bit depth bd is allowed for the given color type.
+Returns a LodePNG error code: 0 when the combination is allowed, 37 when the
+bit depth is not allowed for this color type, 31 when the color type itself is
+not one of 0, 2, 3, 4, 6.
+*/
+static unsigned checkColorValidity(LodePNGColorType colortype, unsigned bd) /*bd = bitdepth*/
+{
+	const unsigned is_8_or_16 = (bd == 8 || bd == 16);
+	const unsigned is_1_2_4_or_8 = (bd == 1 || bd == 2 || bd == 4 || bd == 8);
+	unsigned allowed;
+
+	switch (colortype)
+	{
+	case 0: allowed = (is_1_2_4_or_8 || bd == 16); break; /*grey*/
+	case 3: allowed = is_1_2_4_or_8; break; /*palette*/
+	case 2: /*RGB*/
+	case 4: /*grey + alpha*/
+	case 6: allowed = is_8_or_16; break; /*RGBA*/
+	default: return 31;
+	}
+
+	return allowed ? 0 : 37;
+}
+
+/*Returns the number of channels per pixel for a color type: 1 for grey (0) and
+palette (3), 2 for grey + alpha (4), 3 for RGB (2), 4 for RGBA (6), and 0 for
+any value that is not an existing color type.*/
+static unsigned getNumColorChannels(LodePNGColorType colortype)
+{
+	/*indexed by the numeric color type; the unused values 1 and 5 map to 0*/
+	static const unsigned channels[7] = { 1, 0, 3, 1, 2, 0, 4 };
+	const unsigned index = (unsigned)colortype;
+
+	if (index >= 7) return 0; /*unexisting color type*/
+	return channels[index];
+}
+
+/*Returns the number of bits per pixel: the channel count of the color type
+multiplied by the number of bits per channel.*/
+static unsigned lodepng_get_bpp_lct(LodePNGColorType colortype, unsigned bitdepth)
+{
+	const unsigned channels = getNumColorChannels(colortype);
+	return channels * bitdepth;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Initializes *info to the default color mode: 8 bit RGBA, no color key and an
+empty palette.*/
+void lodepng_color_mode_init(LodePNGColorMode* info)
+{
+	/*pixel format*/
+	info->colortype = LCT_RGBA;
+	info->bitdepth = 8;
+
+	/*no palette*/
+	info->palette = 0;
+	info->palettesize = 0;
+
+	/*no color key*/
+	info->key_defined = 0;
+	info->key_r = 0;
+	info->key_g = 0;
+	info->key_b = 0;
+}
+
+/*Releases what the color mode owns. The palette is its only resource, so this
+is the same as clearing the palette with lodepng_palette_clear.*/
+void lodepng_color_mode_cleanup(LodePNGColorMode* info)
+{
+	lodepng_palette_clear(info);
+}
+
+/*
+Makes *dest a deep copy of *source. dest must be an initialized color mode: its
+old palette is released first. All fields are copied, and when source has a
+palette, dest gets its own 1024 byte palette buffer holding the same entries.
+
+Returns 0 on success or 83 when the palette allocation fails. After a failure
+dest has no palette and a palette size of 0.
+*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source)
+{
+	size_t i;
+	lodepng_color_mode_cleanup(dest);
+	*dest = *source;
+	if (source->palette)
+	{
+		/*the struct copy above made dest->palette alias source's buffer; replace it by an own one*/
+		dest->palette = (unsigned char*)lodepng_malloc(1024);
+		if (!dest->palette && source->palettesize)
+		{
+			/*do not leave a non-zero size next to a null palette: code that walks
+			palettesize entries would dereference the null pointer*/
+			dest->palettesize = 0;
+			return 83; /*alloc fail*/
+		}
+		for (i = 0; i != source->palettesize * 4; ++i) dest->palette[i] = source->palette[i];
+	}
+	return 0;
+}
+
+/*
+Returns 1 when the two color modes are equal and 0 otherwise. Compared are the
+color type, the bit depth, whether a color key is defined (and, if so, its r, g
+and b values), the palette size and every byte of the palette entries.
+*/
+static int lodepng_color_mode_equal(const LodePNGColorMode* a, const LodePNGColorMode* b)
+{
+	size_t i;
+
+	if (a->colortype != b->colortype
+		|| a->bitdepth != b->bitdepth
+		|| a->key_defined != b->key_defined) return 0;
+
+	/*the key values only matter when a key is defined*/
+	if (a->key_defined
+		&& (a->key_r != b->key_r || a->key_g != b->key_g || a->key_b != b->key_b)) return 0;
+
+	/*palettes are always compared, also when one of them is empty: different
+	sizes mean different modes*/
+	if (a->palettesize != b->palettesize) return 0;
+	for (i = 0; i != a->palettesize * 4; ++i)
+	{
+		if (a->palette[i] != b->palette[i]) return 0;
+	}
+
+	return 1;
+}
+
+/*Frees the palette buffer, if there is one, and resets the color mode to
+"no palette": null pointer and size 0.*/
+void lodepng_palette_clear(LodePNGColorMode* info)
+{
+	if (info->palette != 0)
+	{
+		lodepng_free(info->palette);
+		info->palette = 0;
+	}
+	info->palettesize = 0;
+}
+
+/*
+Appends the color (r, g, b, a) to the palette of *info.
+
+The palette buffer is allocated on the first call, with its full and final size
+of 1024 bytes: room for 256 colors of 4 bytes each. It is never grown, so at
+most 256 colors can be stored.
+
+Returns 0 on success, 83 when the allocation fails, and 38 when the palette
+already holds 256 colors (before this check existed, the 257th color was
+written past the end of the buffer).
+NOTE(review): 38 is chosen as lodepng's "palette too big" code; its error text
+is defined outside this part of the file - confirm.
+*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+	unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+	unsigned char* data;
+
+	/*the buffer below has room for exactly 256 entries*/
+	if (info->palettesize >= 256) return 38;
+
+	if (!info->palette) /*allocate palette if empty*/
+	{
+		/*room for 256 colors with 4 bytes each*/
+		data = (unsigned char*)lodepng_realloc(info->palette, 1024);
+		if (!data) return 83; /*alloc fail*/
+		else info->palette = data;
+	}
+	info->palette[4 * info->palettesize + 0] = r;
+	info->palette[4 * info->palettesize + 1] = g;
+	info->palette[4 * info->palettesize + 2] = b;
+	info->palette[4 * info->palettesize + 3] = a;
+	++info->palettesize;
+	return 0;
+}
+
+/*Returns the number of bits per pixel of the color mode, derived from its
+color type and bit depth by lodepng_get_bpp_lct.*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info)
+{
+	const unsigned depth = info->bitdepth;
+	return lodepng_get_bpp_lct(info->colortype, depth);
+}
+
+/*Returns the number of channels per pixel of the color mode, as given by
+getNumColorChannels for its color type.*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info)
+{
+	const unsigned channels = getNumColorChannels(info->colortype);
+	return channels;
+}
+
+/*Returns 1 when the color type is LCT_GREY or LCT_GREY_ALPHA, 0 otherwise.*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info)
+{
+	if (info->colortype == LCT_GREY) return 1;
+	if (info->colortype == LCT_GREY_ALPHA) return 1;
+	return 0;
+}
+
+/*Returns 1 when bit 2 (value 4) of the color type is set, which is the case
+for the types 4 and 6, and 0 otherwise.*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info)
+{
+	const unsigned type_bits = (unsigned)info->colortype;
+	return (type_bits >> 2) & 1u; /*4 or 6*/
+}
+
+/*Returns 1 when the color type is LCT_PALETTE, 0 otherwise.*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info)
+{
+	return (info->colortype == LCT_PALETTE) ? 1u : 0u;
+}
+
+/*Returns 1 when at least one of the palettesize palette entries has an alpha
+byte (the 4th byte of the entry) below 255, and 0 otherwise, including when
+the palette is empty.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info)
+{
+	size_t entry = 0;
+	while (entry != info->palettesize)
+	{
+		const unsigned char alpha = info->palette[entry * 4 + 3];
+		if (alpha != 255) return 1;
+		++entry;
+	}
+	return 0;
+}
+
+/*Returns 1 when pixels of this color mode can be non-opaque, 0 otherwise. That
+is the case when a color key is defined, when the color type has an alpha
+channel, or when the palette holds an entry with alpha below 255. The checks
+are made in that order and stop at the first one that holds.*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info)
+{
+	if (info->key_defined) return 1;
+	if (lodepng_is_alpha_type(info)) return 1;
+	if (lodepng_has_palette_alpha(info)) return 1;
+	return 0;
+}
+
+/*
+Returns the number of bytes needed for a raw image of w x h pixels in the given
+color mode, with the pixels tightly packed (no padding at the end of a row) and
+the total rounded up to a whole byte.
+
+The pixel count is formed in size_t. As "w * h" in unsigned arithmetic it wrapped
+around for images of 2^32 pixels or more before being widened, giving a size
+that is far too small. Where size_t is no wider than unsigned, the result is
+still only valid if roughly w * h < 268435455.
+*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+	size_t bpp = lodepng_get_bpp(color);
+	size_t n = (size_t)w * (size_t)h;
+	/*split in whole groups of 8 pixels and the rest, so that n * bpp is never formed as a whole*/
+	return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+/*
+Returns the number of bytes needed for a raw image of w x h pixels with the
+given color type and bit depth, with the pixels tightly packed (no padding at
+the end of a row) and the total rounded up to a whole byte.
+
+The pixel count is formed in size_t. As "w * h" in unsigned arithmetic it wrapped
+around for images of 2^32 pixels or more before being widened, giving a size
+that is far too small. Where size_t is no wider than unsigned, the result is
+still only valid if roughly w * h < 268435455.
+*/
+size_t lodepng_get_raw_size_lct(unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+	size_t bpp = lodepng_get_bpp_lct(colortype, bitdepth);
+	size_t n = (size_t)w * (size_t)h;
+	/*split in whole groups of 8 pixels and the rest, so that n * bpp is never formed as a whole*/
+	return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_DECODER
+/*Byte size of h scanlines of w pixels when every scanline is padded up to a
+whole number of bytes, as in an IDAT chunk (the lodepng output buffer is not
+padded per line).*/
+static size_t lodepng_get_raw_size_idat(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+	const size_t bits_per_pixel = lodepng_get_bpp(color);
+	/*bytes taken by the complete groups of 8 pixels in one line*/
+	const size_t whole_group_bytes = (w / 8) * bits_per_pixel;
+	/*bytes taken by the remaining 0..7 pixels, rounded up*/
+	const size_t remainder_bytes = ((w & 7) * bits_per_pixel + 7) / 8;
+	return h * (whole_group_bytes + remainder_bytes);
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*Marks all three unknown-chunk slots of info as empty (null data, zero size).*/
+static void LodePNGUnknownChunks_init(LodePNGInfo* info)
+{
+	unsigned slot = 0;
+	while (slot < 3)
+	{
+		info->unknown_chunks_data[slot] = 0;
+		info->unknown_chunks_size[slot] = 0;
+		++slot;
+	}
+}
+
+/*Frees the data buffer of each of the three unknown-chunk slots.
+The pointers and sizes stored in info are left as they were.*/
+static void LodePNGUnknownChunks_cleanup(LodePNGInfo* info)
+{
+	unsigned slot = 0;
+	while (slot < 3)
+	{
+		lodepng_free(info->unknown_chunks_data[slot]);
+		++slot;
+	}
+}
+
+/*
+Replaces the three unknown-chunk buffers of dest with copies of those in src.
+Returns 0 on success or 83 when an allocation fails.
+dest's previous buffers are freed first. On failure dest only holds the slots
+copied so far plus empty slots, so it can still be cleaned up safely.
+*/
+static unsigned LodePNGUnknownChunks_copy(LodePNGInfo* dest, const LodePNGInfo* src)
+{
+	unsigned i;
+
+	LodePNGUnknownChunks_cleanup(dest);
+	/*forget the buffers that were just freed so an early return below cannot
+	leave dangling pointers behind*/
+	for (i = 0; i != 3; ++i)
+	{
+		dest->unknown_chunks_data[i] = 0;
+		dest->unknown_chunks_size[i] = 0;
+	}
+
+	for (i = 0; i != 3; ++i)
+	{
+		size_t j;
+		size_t size = src->unknown_chunks_size[i];
+		unsigned char* data;
+		if (!size) continue; /*nothing to copy, slot stays empty*/
+		data = (unsigned char*)lodepng_malloc(size);
+		if (!data) return 83; /*alloc fail*/
+		for (j = 0; j < size; ++j)
+		{
+			data[j] = src->unknown_chunks_data[i][j];
+		}
+		/*publish the slot only once it is complete*/
+		dest->unknown_chunks_data[i] = data;
+		dest->unknown_chunks_size[i] = size;
+	}
+
+	return 0;
+}
+
+/******************************************************************************/
+
+/*Puts the text fields of info into the empty state: no arrays, zero entries.*/
+static void LodePNGText_init(LodePNGInfo* info)
+{
+	info->text_strings = NULL;
+	info->text_keys = NULL;
+	info->text_num = 0;
+}
+
+/*Releases every key/string pair with string_cleanup and then frees the two
+arrays. text_num and the array pointers stored in info are not modified.*/
+static void LodePNGText_cleanup(LodePNGInfo* info)
+{
+	size_t entry = 0;
+	while (entry != info->text_num)
+	{
+		string_cleanup(&info->text_keys[entry]);
+		string_cleanup(&info->text_strings[entry]);
+		++entry;
+	}
+	lodepng_free(info->text_keys);
+	lodepng_free(info->text_strings);
+}
+
+/*Resets the text fields of dest to empty (without freeing what they pointed
+to) and then appends every key/string pair of source with lodepng_add_text.
+Returns 0 on success; an error from lodepng_add_text is passed on by
+CERROR_TRY_RETURN.*/
+static unsigned LodePNGText_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+	size_t entry;
+	dest->text_num = 0;
+	dest->text_strings = 0;
+	dest->text_keys = 0;
+	entry = 0;
+	while (entry != source->text_num)
+	{
+		CERROR_TRY_RETURN(lodepng_add_text(dest, source->text_keys[entry], source->text_strings[entry]));
+		++entry;
+	}
+	return 0;
+}
+
+/*Removes all text entries from info and leaves it in the empty state, so that
+texts can be added again or the info can be cleaned up afterwards.*/
+void lodepng_clear_text(LodePNGInfo* info)
+{
+	LodePNGText_cleanup(info);
+	/*cleanup frees the arrays but keeps the stale pointers and count; reset
+	them so later add/cleanup calls do not touch freed memory*/
+	LodePNGText_init(info);
+}
+
+/*
+Appends one key/string pair to the text arrays of info.
+Both arrays are grown by one slot, then the new slots are filled through
+string_init and string_set with key and str.
+Returns 0 on success or 83 when growing either array fails; in that case
+text_num is unchanged and the existing entries stay valid.
+*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str)
+{
+	char** new_keys = (char**)(lodepng_realloc(info->text_keys, sizeof(char*) * (info->text_num + 1)));
+	char** new_strings = (char**)(lodepng_realloc(info->text_strings, sizeof(char*) * (info->text_num + 1)));
+
+	/*A successful realloc may have moved the array and invalidated the old
+	pointer, while a failed one leaves the old array alive. Store every
+	array that was grown so info never points at freed memory and nothing
+	is leaked, even when only one of the two calls succeeded.*/
+	if (new_keys) info->text_keys = new_keys;
+	if (new_strings) info->text_strings = new_strings;
+
+	if (!new_keys || !new_strings) return 83; /*alloc fail*/
+
+	++info->text_num;
+
+	string_init(&info->text_keys[info->text_num - 1]);
+	string_set(&info->text_keys[info->text_num - 1], key);
+
+	string_init(&info->text_strings[info->text_num - 1]);
+	string_set(&info->text_strings[info->text_num - 1], str);
+
+	return 0;
+}
+
+/******************************************************************************/
+
+/*Puts the international-text fields of info into the empty state: no arrays,
+zero entries.*/
+static void LodePNGIText_init(LodePNGInfo* info)
+{
+	info->itext_strings = NULL;
+	info->itext_transkeys = NULL;
+	info->itext_langtags = NULL;
+	info->itext_keys = NULL;
+	info->itext_num = 0;
+}
+
+/*Releases the four strings of every international-text entry with
+string_cleanup and then frees the four arrays. itext_num and the array
+pointers stored in info are not modified.*/
+static void LodePNGIText_cleanup(LodePNGInfo* info)
+{
+	size_t entry = 0;
+	while (entry != info->itext_num)
+	{
+		string_cleanup(&info->itext_keys[entry]);
+		string_cleanup(&info->itext_langtags[entry]);
+		string_cleanup(&info->itext_transkeys[entry]);
+		string_cleanup(&info->itext_strings[entry]);
+		++entry;
+	}
+	lodepng_free(info->itext_keys);
+	lodepng_free(info->itext_langtags);
+	lodepng_free(info->itext_transkeys);
+	lodepng_free(info->itext_strings);
+}
+
+/*Resets the international-text fields of dest to empty (without freeing what
+they pointed to) and then appends every entry of source with
+lodepng_add_itext. Returns 0 on success; an error from lodepng_add_itext is
+passed on by CERROR_TRY_RETURN.*/
+static unsigned LodePNGIText_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+	size_t entry;
+	dest->itext_num = 0;
+	dest->itext_strings = 0;
+	dest->itext_transkeys = 0;
+	dest->itext_langtags = 0;
+	dest->itext_keys = 0;
+	entry = 0;
+	while (entry != source->itext_num)
+	{
+		CERROR_TRY_RETURN(lodepng_add_itext(dest, source->itext_keys[entry], source->itext_langtags[entry],
+			source->itext_transkeys[entry], source->itext_strings[entry]));
+		++entry;
+	}
+	return 0;
+}
+
+/*Removes all international-text entries from info and leaves it in the empty
+state, so that entries can be added again or the info can be cleaned up
+afterwards.*/
+void lodepng_clear_itext(LodePNGInfo* info)
+{
+	LodePNGIText_cleanup(info);
+	/*cleanup frees the arrays but keeps the stale pointers and count; reset
+	them so later add/cleanup calls do not touch freed memory*/
+	LodePNGIText_init(info);
+}
+
+/*
+Appends one international-text entry (key, language tag, translated key and
+string) to the itext arrays of info.
+All four arrays are grown by one slot, then the new slots are filled through
+string_init and string_set.
+Returns 0 on success or 83 when growing any array fails; in that case
+itext_num is unchanged and the existing entries stay valid.
+*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+	const char* transkey, const char* str)
+{
+	char** new_keys = (char**)(lodepng_realloc(info->itext_keys, sizeof(char*) * (info->itext_num + 1)));
+	char** new_langtags = (char**)(lodepng_realloc(info->itext_langtags, sizeof(char*) * (info->itext_num + 1)));
+	char** new_transkeys = (char**)(lodepng_realloc(info->itext_transkeys, sizeof(char*) * (info->itext_num + 1)));
+	char** new_strings = (char**)(lodepng_realloc(info->itext_strings, sizeof(char*) * (info->itext_num + 1)));
+
+	/*A successful realloc may have moved the array and invalidated the old
+	pointer, while a failed one leaves the old array alive. Store every
+	array that was grown so info never points at freed memory and nothing
+	is leaked, even when only some of the calls succeeded.*/
+	if (new_keys) info->itext_keys = new_keys;
+	if (new_langtags) info->itext_langtags = new_langtags;
+	if (new_transkeys) info->itext_transkeys = new_transkeys;
+	if (new_strings) info->itext_strings = new_strings;
+
+	if (!new_keys || !new_langtags || !new_transkeys || !new_strings) return 83; /*alloc fail*/
+
+	++info->itext_num;
+
+	string_init(&info->itext_keys[info->itext_num - 1]);
+	string_set(&info->itext_keys[info->itext_num - 1], key);
+
+	string_init(&info->itext_langtags[info->itext_num - 1]);
+	string_set(&info->itext_langtags[info->itext_num - 1], langtag);
+
+	string_init(&info->itext_transkeys[info->itext_num - 1]);
+	string_set(&info->itext_transkeys[info->itext_num - 1], transkey);
+
+	string_init(&info->itext_strings[info->itext_num - 1]);
+	string_set(&info->itext_strings[info->itext_num - 1], str);
+
+	return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Initializes info: zero interlace/compression/filter methods and an
+initialized color mode. With ancillary chunk support compiled in, it also
+clears the background, time and phys flags and empties the text,
+international text and unknown-chunk storage.*/
+void lodepng_info_init(LodePNGInfo* info)
+{
+	info->filter_method = 0;
+	info->compression_method = 0;
+	info->interlace_method = 0;
+	lodepng_color_mode_init(&info->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	info->phys_defined = 0;
+	info->time_defined = 0;
+
+	info->background_defined = 0;
+	info->background_b = 0;
+	info->background_g = 0;
+	info->background_r = 0;
+
+	LodePNGUnknownChunks_init(info);
+	LodePNGIText_init(info);
+	LodePNGText_init(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+/*Releases everything info owns: the text, international text and
+unknown-chunk storage (when ancillary chunk support is compiled in) and the
+color mode.*/
+void lodepng_info_cleanup(LodePNGInfo* info)
+{
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	LodePNGUnknownChunks_cleanup(info);
+	LodePNGIText_cleanup(info);
+	LodePNGText_cleanup(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+	lodepng_color_mode_cleanup(&info->color);
+}
+
+/*
+Makes dest a deep copy of source. dest must already be initialized, because
+its current contents are cleaned up first.
+Returns 0 on success, or the error code of the first sub-copy that failed.
+Even on failure dest only owns memory of its own, so it can be passed to
+lodepng_info_cleanup without affecting source.
+*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source)
+{
+	lodepng_info_cleanup(dest);
+	*dest = *source;
+
+	/*The struct assignment made every owning pointer in dest alias source's
+	memory. Reset all of them before anything below can fail; otherwise an
+	early error return would leave dest holding source's buffers and a later
+	cleanup of dest would free them a second time.*/
+	lodepng_color_mode_init(&dest->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	LodePNGText_init(dest);
+	LodePNGIText_init(dest);
+	LodePNGUnknownChunks_init(dest);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+	CERROR_TRY_RETURN(lodepng_color_mode_copy(&dest->color, &source->color));
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	CERROR_TRY_RETURN(LodePNGText_copy(dest, source));
+	CERROR_TRY_RETURN(LodePNGIText_copy(dest, source));
+	CERROR_TRY_RETURN(LodePNGUnknownChunks_copy(dest, source));
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+	return 0;
+}
+
+/*Exchanges the contents of a and b with plain struct assignments; owned
+pointers simply change hands, nothing is allocated or freed.*/
+void lodepng_info_swap(LodePNGInfo* a, LodePNGInfo* b)
+{
+	const LodePNGInfo saved_b = *b;
+	*b = *a;
+	*a = saved_b;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Writes one bit group into a packed byte array.
+out: byte array receiving the bits
+index: position of the bit group, counted in groups from the start of out
+bits: size of a bit group, expected to be 1, 2 or 4
+in: value to store; only its lowest `bits` bits are used
+The first group of a byte overwrites the whole byte, later groups are OR-ed
+in, so the groups of one byte must be written starting with the first one.
+*/
+static void addColorBits(unsigned char* out, size_t index, unsigned bits, unsigned in)
+{
+	unsigned last_slot; /*number of groups per byte minus one: 8 / bits - 1*/
+	unsigned slot; /*which group inside the byte, 0 = most significant*/
+	unsigned char* target = &out[index * bits / 8];
+
+	switch (bits)
+	{
+	case 1: last_slot = 7; break;
+	case 2: last_slot = 3; break;
+	default: last_slot = 1; break;
+	}
+	slot = index & last_slot;
+
+	/*drop any higher bits of the value, then move it to its place in the byte*/
+	in = (in & ((1u << bits) - 1u)) << (bits * (last_slot - slot));
+
+	if (slot != 0) *target |= in;
+	else *target = in;
+}
+
+typedef struct ColorTree ColorTree;
+
+/*
+One node of a color tree.
+This is the data structure used to count the number of unique colors and to get a palette
+index for a color. It's like an octree, but because the alpha channel is used too, each
+node has 16 instead of 8 children.
+The lookup and insert functions descend one level per bit of the 8-bit channels; at each
+level the child slot is formed from one bit of each of r, g, b and a.
+*/
+struct ColorTree
+{
+	ColorTree* children[16]; /*up to 16 pointers to ColorTree of next level; 0 where there is no child*/
+	int index; /*the payload. Only has a meaningful value if this is in the last level; color_tree_init sets it to -1*/
+};
+
+/*Makes tree an empty node: no children and index -1.*/
+static void color_tree_init(ColorTree* tree)
+{
+	int slot = 0;
+	tree->index = -1;
+	while (slot < 16)
+	{
+		tree->children[slot] = 0;
+		++slot;
+	}
+}
+
+/*Recursively frees every node below tree. The node tree itself is not freed
+and its child pointers are left unchanged.*/
+static void color_tree_cleanup(ColorTree* tree)
+{
+	int slot = 0;
+	while (slot != 16)
+	{
+		ColorTree* child = tree->children[slot];
+		++slot;
+		if (!child) continue;
+		color_tree_cleanup(child);
+		lodepng_free(child);
+	}
+}
+
+/*Looks up a color. Returns -1 if no path for the color exists in the tree,
+otherwise the index stored in the node reached after all 8 levels.*/
+static int color_tree_get(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+	int level = 0;
+	while (level < 8)
+	{
+		/*one bit of each channel, least significant bit first, selects the child*/
+		const int red_bit = (r >> level) & 1;
+		const int green_bit = (g >> level) & 1;
+		const int blue_bit = (b >> level) & 1;
+		const int alpha_bit = (a >> level) & 1;
+		ColorTree* next = tree->children[8 * red_bit + 4 * green_bit + 2 * blue_bit + alpha_bit];
+		if (!next) return -1;
+		tree = next;
+		++level;
+	}
+	return tree->index;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Returns 1 if color_tree_get finds a non-negative index for the color, 0 otherwise.*/
+static int color_tree_has(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+	const int found_index = color_tree_get(tree, r, g, b, a);
+	return found_index < 0 ? 0 : 1;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*
+Stores index for the given color, creating the nodes on its path as needed.
+The color is not allowed to already exist.
+Index should be >= 0 (it's stored as signed int to be compatible with using -1
+for "doesn't exist").
+Returns 0 on success or 83 when a node could not be allocated; in that case
+the nodes created so far stay in the tree (color_tree_cleanup releases them)
+and no index is stored for the color.
+*/
+static unsigned color_tree_add(ColorTree* tree,
+	unsigned char r, unsigned char g, unsigned char b, unsigned char a, unsigned index)
+{
+	int bit;
+	for (bit = 0; bit < 8; ++bit)
+	{
+		/*one bit of each channel, least significant bit first, selects the child*/
+		int i = 8 * ((r >> bit) & 1) + 4 * ((g >> bit) & 1) + 2 * ((b >> bit) & 1) + 1 * ((a >> bit) & 1);
+		if (!tree->children[i])
+		{
+			ColorTree* child = (ColorTree*)lodepng_malloc(sizeof(ColorTree));
+			if (!child) return 83; /*alloc fail: do not dereference a null node*/
+			color_tree_init(child);
+			tree->children[i] = child;
+		}
+		tree = tree->children[i];
+	}
+	tree->index = (int)index;
+	return 0;
+}
+
+/*
+Stores the 8-bit RGBA color as pixel number i of the image `out`, which uses
+color mode `mode`.
+Greyscale outputs take their grey value from r only. 16-bit outputs get the
+8-bit value written to both bytes of the channel. For palette output the
+color is looked up in `tree`.
+Returns 0 on success or 82 if the color is not present in the palette tree.
+*/
+static unsigned rgba8ToPixel(unsigned char* out, size_t i,
+	const LodePNGColorMode* mode, ColorTree* tree /*for palette*/,
+	unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+	unsigned char* px;
+
+	switch (mode->colortype)
+	{
+	case LCT_GREY:
+	{
+		unsigned char grey = r;
+		if (mode->bitdepth == 8)
+		{
+			out[i] = grey;
+		}
+		else if (mode->bitdepth == 16)
+		{
+			px = out + i * 2;
+			px[0] = grey;
+			px[1] = grey;
+		}
+		else
+		{
+			/*keep only the most significant bits of grey*/
+			grey = (grey >> (8 - mode->bitdepth)) & ((1 << mode->bitdepth) - 1);
+			addColorBits(out, i, mode->bitdepth, grey);
+		}
+		break;
+	}
+	case LCT_RGB:
+		if (mode->bitdepth == 8)
+		{
+			px = out + i * 3;
+			px[0] = r;
+			px[1] = g;
+			px[2] = b;
+		}
+		else
+		{
+			px = out + i * 6;
+			px[0] = px[1] = r;
+			px[2] = px[3] = g;
+			px[4] = px[5] = b;
+		}
+		break;
+	case LCT_PALETTE:
+	{
+		const int index = color_tree_get(tree, r, g, b, a);
+		if (index < 0) return 82; /*color not in palette*/
+		if (mode->bitdepth == 8) out[i] = index;
+		else addColorBits(out, i, mode->bitdepth, (unsigned)index);
+		break;
+	}
+	case LCT_GREY_ALPHA:
+		/*other bit depths write nothing for this color type*/
+		if (mode->bitdepth == 8)
+		{
+			px = out + i * 2;
+			px[0] = r;
+			px[1] = a;
+		}
+		else if (mode->bitdepth == 16)
+		{
+			px = out + i * 4;
+			px[0] = px[1] = r;
+			px[2] = px[3] = a;
+		}
+		break;
+	case LCT_RGBA:
+		if (mode->bitdepth == 8)
+		{
+			px = out + i * 4;
+			px[0] = r;
+			px[1] = g;
+			px[2] = b;
+			px[3] = a;
+		}
+		else
+		{
+			px = out + i * 8;
+			px[0] = px[1] = r;
+			px[2] = px[3] = g;
+			px[4] = px[5] = b;
+			px[6] = px[7] = a;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0; /*no error*/
+}
+
+/*
+Stores the 16-bit RGBA color as pixel number i of the image `out`. Each
+channel is written high byte first. Only the color types grey, RGB, grey+alpha
+and RGBA are handled; for any other color type nothing is written. Greyscale
+outputs take their grey value from r only.
+*/
+static void rgba16ToPixel(unsigned char* out, size_t i,
+	const LodePNGColorMode* mode,
+	unsigned short r, unsigned short g, unsigned short b, unsigned short a)
+{
+	unsigned char* px;
+
+	switch (mode->colortype)
+	{
+	case LCT_GREY:
+		px = out + i * 2;
+		px[0] = (r >> 8) & 255;
+		px[1] = r & 255;
+		break;
+	case LCT_RGB:
+		px = out + i * 6;
+		px[0] = (r >> 8) & 255;
+		px[1] = r & 255;
+		px[2] = (g >> 8) & 255;
+		px[3] = g & 255;
+		px[4] = (b >> 8) & 255;
+		px[5] = b & 255;
+		break;
+	case LCT_GREY_ALPHA:
+		px = out + i * 4;
+		px[0] = (r >> 8) & 255;
+		px[1] = r & 255;
+		px[2] = (a >> 8) & 255;
+		px[3] = a & 255;
+		break;
+	case LCT_RGBA:
+		px = out + i * 8;
+		px[0] = (r >> 8) & 255;
+		px[1] = r & 255;
+		px[2] = (g >> 8) & 255;
+		px[3] = g & 255;
+		px[4] = (b >> 8) & 255;
+		px[5] = b & 255;
+		px[6] = (a >> 8) & 255;
+		px[7] = a & 255;
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+Reads pixel number i (y * width + x) from the raw image `in`, which uses color
+mode `mode`, and returns it as 8-bit RGBA through r, g, b and a.
+16-bit channels are reduced to their high byte. Greyscale values below 8 bits
+are scaled up to the 0..255 range. A pixel matching the mode's color key gets
+alpha 0. A palette index outside the palette yields opaque black.
+*/
+static void getPixelColorRGBA8(unsigned char* r, unsigned char* g,
+	unsigned char* b, unsigned char* a,
+	const unsigned char* in, size_t i,
+	const LodePNGColorMode* mode)
+{
+	const unsigned char* px;
+
+	switch (mode->colortype)
+	{
+	case LCT_GREY:
+		if (mode->bitdepth == 8)
+		{
+			const unsigned char grey = in[i];
+			*r = *g = *b = grey;
+			*a = (mode->key_defined && grey == mode->key_r) ? 0 : 255;
+		}
+		else if (mode->bitdepth == 16)
+		{
+			px = in + i * 2;
+			*r = *g = *b = px[0];
+			*a = (mode->key_defined && 256U * px[0] + px[1] == mode->key_r) ? 0 : 255;
+		}
+		else
+		{
+			const unsigned highest = ((1U << mode->bitdepth) - 1U); /*largest value this bit depth can hold*/
+			size_t bitpos = i * mode->bitdepth;
+			const unsigned value = readBitsFromReversedStream(&bitpos, in, mode->bitdepth);
+			*r = *g = *b = (value * 255) / highest;
+			*a = (mode->key_defined && value == mode->key_r) ? 0 : 255;
+		}
+		break;
+	case LCT_RGB:
+		if (mode->bitdepth == 8)
+		{
+			px = in + i * 3;
+			*r = px[0];
+			*g = px[1];
+			*b = px[2];
+			*a = (mode->key_defined && px[0] == mode->key_r
+				&& px[1] == mode->key_g && px[2] == mode->key_b) ? 0 : 255;
+		}
+		else
+		{
+			px = in + i * 6;
+			*r = px[0];
+			*g = px[2];
+			*b = px[4];
+			*a = (mode->key_defined
+				&& 256U * px[0] + px[1] == mode->key_r
+				&& 256U * px[2] + px[3] == mode->key_g
+				&& 256U * px[4] + px[5] == mode->key_b) ? 0 : 255;
+		}
+		break;
+	case LCT_PALETTE:
+	{
+		unsigned index;
+		if (mode->bitdepth == 8)
+		{
+			index = in[i];
+		}
+		else
+		{
+			size_t bitpos = i * mode->bitdepth;
+			index = readBitsFromReversedStream(&bitpos, in, mode->bitdepth);
+		}
+
+		if (index < mode->palettesize)
+		{
+			px = mode->palette + index * 4;
+			*r = px[0];
+			*g = px[1];
+			*b = px[2];
+			*a = px[3];
+		}
+		else
+		{
+			/*This is an error according to the PNG spec, but common PNG decoders make it black instead.
+			Done here too, slightly faster due to no error handling needed.*/
+			*r = *g = *b = 0;
+			*a = 255;
+		}
+		break;
+	}
+	case LCT_GREY_ALPHA:
+		if (mode->bitdepth == 8)
+		{
+			px = in + i * 2;
+			*r = *g = *b = px[0];
+			*a = px[1];
+		}
+		else
+		{
+			px = in + i * 4;
+			*r = *g = *b = px[0];
+			*a = px[2];
+		}
+		break;
+	case LCT_RGBA:
+		if (mode->bitdepth == 8)
+		{
+			px = in + i * 4;
+			*r = px[0];
+			*g = px[1];
+			*b = px[2];
+			*a = px[3];
+		}
+		else
+		{
+			px = in + i * 8;
+			*r = px[0];
+			*g = px[2];
+			*b = px[4];
+			*a = px[6];
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+Bulk version of getPixelColorRGBA8: converts numpixels pixels of `in` (color
+mode `mode`) to 8 bit per channel output in `buffer`. The loops sit inside the
+color mode cases so the mode is not re-tested for every pixel.
+The output is RGBA when has_alpha is true and RGB otherwise; buffer must have
+enough room for it.
+*/
+static void getPixelColorsRGBA8(unsigned char* buffer, size_t numpixels,
+	unsigned has_alpha, const unsigned char* in,
+	const LodePNGColorMode* mode)
+{
+	const unsigned stride = has_alpha ? 4 : 3; /*output bytes per pixel*/
+	unsigned char* dst = buffer;
+	const unsigned char* px;
+	size_t n;
+
+	switch (mode->colortype)
+	{
+	case LCT_GREY:
+		if (mode->bitdepth == 8)
+		{
+			for (n = 0; n < numpixels; ++n)
+			{
+				dst[0] = dst[1] = dst[2] = in[n];
+				if (has_alpha) dst[3] = (mode->key_defined && in[n] == mode->key_r) ? 0 : 255;
+				dst += stride;
+			}
+		}
+		else if (mode->bitdepth == 16)
+		{
+			for (n = 0; n < numpixels; ++n)
+			{
+				px = in + n * 2;
+				dst[0] = dst[1] = dst[2] = px[0];
+				if (has_alpha) dst[3] = (mode->key_defined && 256U * px[0] + px[1] == mode->key_r) ? 0 : 255;
+				dst += stride;
+			}
+		}
+		else
+		{
+			const unsigned highest = ((1U << mode->bitdepth) - 1U); /*largest value this bit depth can hold*/
+			size_t bitpos = 0;
+			for (n = 0; n < numpixels; ++n)
+			{
+				const unsigned value = readBitsFromReversedStream(&bitpos, in, mode->bitdepth);
+				dst[0] = dst[1] = dst[2] = (value * 255) / highest;
+				if (has_alpha) dst[3] = (mode->key_defined && value == mode->key_r) ? 0 : 255;
+				dst += stride;
+			}
+		}
+		break;
+	case LCT_RGB:
+		if (mode->bitdepth == 8)
+		{
+			for (n = 0; n < numpixels; ++n)
+			{
+				px = in + n * 3;
+				dst[0] = px[0];
+				dst[1] = px[1];
+				dst[2] = px[2];
+				if (has_alpha) dst[3] = (mode->key_defined && dst[0] == mode->key_r
+					&& dst[1] == mode->key_g && dst[2] == mode->key_b) ? 0 : 255;
+				dst += stride;
+			}
+		}
+		else
+		{
+			for (n = 0; n < numpixels; ++n)
+			{
+				px = in + n * 6;
+				dst[0] = px[0];
+				dst[1] = px[2];
+				dst[2] = px[4];
+				if (has_alpha) dst[3] = (mode->key_defined
+					&& 256U * px[0] + px[1] == mode->key_r
+					&& 256U * px[2] + px[3] == mode->key_g
+					&& 256U * px[4] + px[5] == mode->key_b) ? 0 : 255;
+				dst += stride;
+			}
+		}
+		break;
+	case LCT_PALETTE:
+	{
+		size_t bitpos = 0;
+		for (n = 0; n < numpixels; ++n)
+		{
+			unsigned index;
+			if (mode->bitdepth == 8) index = in[n];
+			else index = readBitsFromReversedStream(&bitpos, in, mode->bitdepth);
+
+			if (index < mode->palettesize)
+			{
+				px = mode->palette + index * 4;
+				dst[0] = px[0];
+				dst[1] = px[1];
+				dst[2] = px[2];
+				if (has_alpha) dst[3] = px[3];
+			}
+			else
+			{
+				/*This is an error according to the PNG spec, but most PNG decoders make it black instead.
+				Done here too, slightly faster due to no error handling needed.*/
+				dst[0] = dst[1] = dst[2] = 0;
+				if (has_alpha) dst[3] = 255;
+			}
+			dst += stride;
+		}
+		break;
+	}
+	case LCT_GREY_ALPHA:
+		if (mode->bitdepth == 8)
+		{
+			for (n = 0; n < numpixels; ++n)
+			{
+				px = in + n * 2;
+				dst[0] = dst[1] = dst[2] = px[0];
+				if (has_alpha) dst[3] = px[1];
+				dst += stride;
+			}
+		}
+		else
+		{
+			for (n = 0; n < numpixels; ++n)
+			{
+				px = in + n * 4;
+				dst[0] = dst[1] = dst[2] = px[0];
+				if (has_alpha) dst[3] = px[2];
+				dst += stride;
+			}
+		}
+		break;
+	case LCT_RGBA:
+		if (mode->bitdepth == 8)
+		{
+			for (n = 0; n < numpixels; ++n)
+			{
+				px = in + n * 4;
+				dst[0] = px[0];
+				dst[1] = px[1];
+				dst[2] = px[2];
+				if (has_alpha) dst[3] = px[3];
+				dst += stride;
+			}
+		}
+		else
+		{
+			for (n = 0; n < numpixels; ++n)
+			{
+				px = in + n * 8;
+				dst[0] = px[0];
+				dst[1] = px[2];
+				dst[2] = px[4];
+				if (has_alpha) dst[3] = px[6];
+				dst += stride;
+			}
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+Reads pixel number i (y * width + x) from the raw image `in` and returns it as
+16-bit RGBA through r, g, b and a. The input color mode must itself be 16-bit;
+channels are read high byte first. A pixel matching the mode's color key gets
+alpha 0, other pixels without an alpha channel get alpha 65535. Color types
+other than grey, RGB, grey+alpha and RGBA leave the outputs untouched.
+*/
+static void getPixelColorRGBA16(unsigned short* r, unsigned short* g, unsigned short* b, unsigned short* a,
+	const unsigned char* in, size_t i, const LodePNGColorMode* mode)
+{
+	const unsigned char* px;
+
+	switch (mode->colortype)
+	{
+	case LCT_GREY:
+	{
+		unsigned grey;
+		px = in + i * 2;
+		grey = 256U * px[0] + px[1];
+		*r = *g = *b = grey;
+		*a = (mode->key_defined && grey == mode->key_r) ? 0 : 65535;
+		break;
+	}
+	case LCT_RGB:
+	{
+		unsigned red, green, blue;
+		px = in + i * 6;
+		red = 256u * px[0] + px[1];
+		green = 256u * px[2] + px[3];
+		blue = 256u * px[4] + px[5];
+		*r = red;
+		*g = green;
+		*b = blue;
+		*a = (mode->key_defined
+			&& red == mode->key_r
+			&& green == mode->key_g
+			&& blue == mode->key_b) ? 0 : 65535;
+		break;
+	}
+	case LCT_GREY_ALPHA:
+		px = in + i * 4;
+		*r = *g = *b = 256u * px[0] + px[1];
+		*a = 256u * px[2] + px[3];
+		break;
+	case LCT_RGBA:
+		px = in + i * 8;
+		*r = 256u * px[0] + px[1];
+		*g = 256u * px[2] + px[3];
+		*b = 256u * px[4] + px[5];
+		*a = 256u * px[6] + px[7];
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+Converts the w x h image `in` from color mode mode_in to color mode mode_out
+and writes the result to `out`, which must be large enough for it.
+When both modes are equal the raw bytes are copied unchanged.
+For palette output the palette of mode_out is used; if mode_out has no palette
+entries, the palette of mode_in is used instead. No palette is ever created.
+Returns 0 on success, or 82 when a pixel's color is not in the output palette.
+*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+	const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+	unsigned w, unsigned h)
+{
+	size_t i;
+	ColorTree tree;
+	/*widen before multiplying: w * h evaluated in unsigned arithmetic wraps
+	around before the result would be converted to size_t*/
+	size_t numpixels = (size_t)w * (size_t)h;
+	unsigned error = 0;
+
+	if (lodepng_color_mode_equal(mode_out, mode_in))
+	{
+		size_t numbytes = lodepng_get_raw_size(w, h, mode_in);
+		for (i = 0; i != numbytes; ++i) out[i] = in[i];
+		return 0;
+	}
+
+	if (mode_out->colortype == LCT_PALETTE)
+	{
+		size_t palettesize = mode_out->palettesize;
+		const unsigned char* palette = mode_out->palette;
+		size_t palsize = 1u << mode_out->bitdepth;
+		/*if the user specified output palette but did not give the values, assume
+		they want the values of the input color type (assuming that one is palette).
+		Note that we never create a new palette ourselves.*/
+		if (palettesize == 0)
+		{
+			palettesize = mode_in->palettesize;
+			palette = mode_in->palette;
+		}
+		/*only as many entries as the output bit depth can index are usable*/
+		if (palettesize < palsize) palsize = palettesize;
+		color_tree_init(&tree);
+		for (i = 0; i != palsize; ++i)
+		{
+			const unsigned char* p = &palette[i * 4];
+			color_tree_add(&tree, p[0], p[1], p[2], p[3], (unsigned)i);
+		}
+	}
+
+	if (mode_in->bitdepth == 16 && mode_out->bitdepth == 16)
+	{
+		for (i = 0; i != numpixels; ++i)
+		{
+			unsigned short r = 0, g = 0, b = 0, a = 0;
+			getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+			rgba16ToPixel(out, i, mode_out, r, g, b, a);
+		}
+	}
+	else if (mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGBA)
+	{
+		getPixelColorsRGBA8(out, numpixels, 1, in, mode_in);
+	}
+	else if (mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGB)
+	{
+		getPixelColorsRGBA8(out, numpixels, 0, in, mode_in);
+	}
+	else
+	{
+		unsigned char r = 0, g = 0, b = 0, a = 0;
+		for (i = 0; i != numpixels; ++i)
+		{
+			getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode_in);
+			error = rgba8ToPixel(out, i, mode_out, &tree, r, g, b, a);
+			/*leave the loop instead of returning, so the color tree built
+			above is still released below*/
+			if (error) break;
+		}
+	}
+
+	if (mode_out->colortype == LCT_PALETTE)
+	{
+		color_tree_cleanup(&tree);
+	}
+
+	return error;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*Resets profile to its starting state: not colored, no color key (key color
+0,0,0), no alpha, zero colors counted and a required bit depth of 1.*/
+void lodepng_color_profile_init(LodePNGColorProfile* profile)
+{
+	profile->bits = 1;
+	profile->numcolors = 0;
+	profile->alpha = 0;
+	profile->key_b = 0;
+	profile->key_g = 0;
+	profile->key_r = 0;
+	profile->key = 0;
+	profile->colored = 0;
+}
+
+/*function used for debug purposes with C++*/
+/*void printColorProfile(LodePNGColorProfile* p)
+{
+  std::cout << "colored: " << (int)p->colored << ", ";
+  std::cout << "key: " << (int)p->key << ", ";
+  std::cout << "key_r: " << (int)p->key_r << ", ";
+  std::cout << "key_g: " << (int)p->key_g << ", ";
+  std::cout << "key_b: " << (int)p->key_b << ", ";
+  std::cout << "alpha: " << (int)p->alpha << ", ";
+  std::cout << "numcolors: " << (int)p->numcolors << ", ";
+  std::cout << "bits: " << (int)p->bits << std::endl;
+}*/
+
+/*Returns the smallest bit depth (1, 2, 4 or 8) at which the given 8-bit value
+can be stored and scaled back without loss.*/
+static unsigned getValueRequiredBits(unsigned char value)
+{
+	/*the two extremes are the only values a single bit can express*/
+	if (value == 0 || value == 255) return 1;
+	/*4-bit values scale to multiples of 17; anything else needs all 8 bits*/
+	if (value % 17 != 0) return 8;
+	/*2-bit values scale to multiples of 85*/
+	if (value % 85 == 0) return 2;
+	return 4;
+}
+
+/*
+Scans the w x h image `in` (color mode `mode`) and records in profile which
+color features it uses: whether it is colored, whether it needs an alpha
+channel or can use a color key, how many distinct colors it has (counted up
+to a limit that depends on the bits per pixel, with the first 256 stored in
+the profile's palette) and how many bits per channel are required.
+profile must already have been inited with lodepng_color_profile_init.
+It's ok to set some parameters of profile to done already.
+The key color in profile is always stored as 16-bit values.
+Returns the local error variable, which this function never sets to anything
+but 0.
+*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+	const unsigned char* in, unsigned w, unsigned h,
+	const LodePNGColorMode* mode)
+{
+	unsigned error = 0;
+	size_t i;
+	ColorTree tree;
+	/*widen before multiplying: w * h evaluated in unsigned arithmetic wraps
+	around before the result would be converted to size_t*/
+	size_t numpixels = (size_t)w * (size_t)h;
+
+	/*each *_done flag means that property no longer needs to be examined*/
+	unsigned colored_done = lodepng_is_greyscale_type(mode) ? 1 : 0;
+	unsigned alpha_done = lodepng_can_have_alpha(mode) ? 0 : 1;
+	unsigned numcolors_done = 0;
+	unsigned bpp = lodepng_get_bpp(mode);
+	unsigned bits_done = bpp == 1 ? 1 : 0;
+	unsigned maxnumcolors = 257;
+	unsigned sixteen = 0;
+	if (bpp <= 8) maxnumcolors = bpp == 1 ? 2 : (bpp == 2 ? 4 : (bpp == 4 ? 16 : 256));
+
+	color_tree_init(&tree);
+
+	/*Check if the 16-bit input is truly 16-bit*/
+	if (mode->bitdepth == 16)
+	{
+		unsigned short r, g, b, a;
+		for (i = 0; i != numpixels; ++i)
+		{
+			getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+			if ((r & 255) != ((r >> 8) & 255) || (g & 255) != ((g >> 8) & 255) ||
+				(b & 255) != ((b >> 8) & 255) || (a & 255) != ((a >> 8) & 255)) /*first and second byte differ*/
+			{
+				sixteen = 1;
+				break;
+			}
+		}
+	}
+
+	if (sixteen)
+	{
+		unsigned short r = 0, g = 0, b = 0, a = 0;
+		profile->bits = 16;
+		bits_done = numcolors_done = 1; /*counting colors no longer useful, palette doesn't support 16-bit*/
+
+		for (i = 0; i != numpixels; ++i)
+		{
+			getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+
+			if (!colored_done && (r != g || r != b))
+			{
+				profile->colored = 1;
+				colored_done = 1;
+			}
+
+			if (!alpha_done)
+			{
+				unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+				if (a != 65535 && (a != 0 || (profile->key && !matchkey)))
+				{
+					/*translucent pixel, or a second distinct fully transparent color*/
+					profile->alpha = 1;
+					profile->key = 0;
+					alpha_done = 1;
+				}
+				else if (a == 0 && !profile->alpha && !profile->key)
+				{
+					/*first fully transparent pixel: remember its color as key candidate*/
+					profile->key = 1;
+					profile->key_r = r;
+					profile->key_g = g;
+					profile->key_b = b;
+				}
+				else if (a == 65535 && profile->key && matchkey)
+				{
+					/* Color key cannot be used if an opaque pixel also has that RGB color. */
+					profile->alpha = 1;
+					profile->key = 0;
+					alpha_done = 1;
+				}
+			}
+			if (alpha_done && numcolors_done && colored_done && bits_done) break;
+		}
+
+		/*an opaque pixel with the key color may have come before the key was
+		found, so check the whole image once more*/
+		if (profile->key && !profile->alpha)
+		{
+			for (i = 0; i != numpixels; ++i)
+			{
+				getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+				if (a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+				{
+					/* Color key cannot be used if an opaque pixel also has that RGB color. */
+					profile->alpha = 1;
+					profile->key = 0;
+					alpha_done = 1;
+					break; /*the outcome is decided, further pixels cannot change it*/
+				}
+			}
+		}
+	}
+	else /* < 16-bit */
+	{
+		unsigned char r = 0, g = 0, b = 0, a = 0;
+		for (i = 0; i != numpixels; ++i)
+		{
+			getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+
+			if (!bits_done && profile->bits < 8)
+			{
+				/*only r is checked, < 8 bits is only relevant for greyscale*/
+				unsigned bits = getValueRequiredBits(r);
+				if (bits > profile->bits) profile->bits = bits;
+			}
+			bits_done = (profile->bits >= bpp);
+
+			if (!colored_done && (r != g || r != b))
+			{
+				profile->colored = 1;
+				colored_done = 1;
+				if (profile->bits < 8) profile->bits = 8; /*PNG has no colored modes with less than 8-bit per channel*/
+			}
+
+			if (!alpha_done)
+			{
+				unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+				if (a != 255 && (a != 0 || (profile->key && !matchkey)))
+				{
+					/*translucent pixel, or a second distinct fully transparent color*/
+					profile->alpha = 1;
+					profile->key = 0;
+					alpha_done = 1;
+					if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+				}
+				else if (a == 0 && !profile->alpha && !profile->key)
+				{
+					/*first fully transparent pixel: remember its color as key candidate*/
+					profile->key = 1;
+					profile->key_r = r;
+					profile->key_g = g;
+					profile->key_b = b;
+				}
+				else if (a == 255 && profile->key && matchkey)
+				{
+					/* Color key cannot be used if an opaque pixel also has that RGB color. */
+					profile->alpha = 1;
+					profile->key = 0;
+					alpha_done = 1;
+					if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+				}
+			}
+
+			if (!numcolors_done)
+			{
+				if (!color_tree_has(&tree, r, g, b, a))
+				{
+					color_tree_add(&tree, r, g, b, a, profile->numcolors);
+					if (profile->numcolors < 256)
+					{
+						unsigned char* p = profile->palette;
+						unsigned n = profile->numcolors;
+						p[n * 4 + 0] = r;
+						p[n * 4 + 1] = g;
+						p[n * 4 + 2] = b;
+						p[n * 4 + 3] = a;
+					}
+					++profile->numcolors;
+					numcolors_done = profile->numcolors >= maxnumcolors;
+				}
+			}
+
+			if (alpha_done && numcolors_done && colored_done && bits_done) break;
+		}
+
+		/*an opaque pixel with the key color may have come before the key was
+		found, so check the whole image once more*/
+		if (profile->key && !profile->alpha)
+		{
+			for (i = 0; i != numpixels; ++i)
+			{
+				getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+				if (a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+				{
+					/* Color key cannot be used if an opaque pixel also has that RGB color. */
+					profile->alpha = 1;
+					profile->key = 0;
+					alpha_done = 1;
+					if (profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+					break; /*the outcome is decided, further pixels cannot change it*/
+				}
+			}
+		}
+
+		/*make the profile's key always 16-bit for consistency - repeat each byte twice*/
+		profile->key_r += (profile->key_r << 8);
+		profile->key_g += (profile->key_g << 8);
+		profile->key_b += (profile->key_b << 8);
+	}
+
+	color_tree_cleanup(&tree);
+	return error;
+}
+
+/*
+Automatically chooses the color type that gives the smallest amount of bits in
+the output image, e.g. grey if there are only greyscale pixels, palette if
+there are at most 256 colors, ...
+mode_out should contain the user chosen color model, but is overwritten with
+the newly chosen one. `image` is the w x h input in color mode mode_in.
+Returns 0 on success, otherwise the error code of the profiling, palette or
+color mode copy step that failed.
+*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+	const unsigned char* image, unsigned w, unsigned h,
+	const LodePNGColorMode* mode_in)
+{
+	LodePNGColorProfile prof;
+	unsigned error = 0;
+	unsigned i, n, palettebits, palette_ok;
+	/*widen before multiplying: w * h evaluated in unsigned arithmetic wraps
+	around, which made very large images look like tiny ones to the size
+	heuristics below*/
+	size_t numpixels = (size_t)w * (size_t)h;
+
+	lodepng_color_profile_init(&prof);
+	error = lodepng_get_color_profile(&prof, image, w, h, mode_in);
+	if (error) return error;
+	mode_out->key_defined = 0;
+
+	if (prof.key && numpixels <= 16)
+	{
+		prof.alpha = 1; /*too few pixels to justify tRNS chunk overhead*/
+		prof.key = 0;
+		if (prof.bits < 8) prof.bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+	}
+	n = prof.numcolors;
+	palettebits = n <= 2 ? 1 : (n <= 4 ? 2 : (n <= 16 ? 4 : 8));
+	palette_ok = n <= 256 && prof.bits <= 8;
+	if (numpixels < n * 2) palette_ok = 0; /*don't add palette overhead if image has only a few pixels*/
+	if (!prof.colored && prof.bits <= palettebits) palette_ok = 0; /*grey is less overhead*/
+
+	if (palette_ok)
+	{
+		unsigned char* p = prof.palette;
+		lodepng_palette_clear(mode_out); /*remove potential earlier palette*/
+		for (i = 0; i != prof.numcolors; ++i)
+		{
+			error = lodepng_palette_add(mode_out, p[i * 4 + 0], p[i * 4 + 1], p[i * 4 + 2], p[i * 4 + 3]);
+			if (error) break;
+		}
+
+		mode_out->colortype = LCT_PALETTE;
+		mode_out->bitdepth = palettebits;
+
+		if (mode_in->colortype == LCT_PALETTE && mode_in->palettesize >= mode_out->palettesize
+			&& mode_in->bitdepth == mode_out->bitdepth)
+		{
+			/*If input should have same palette colors, keep original to preserve its order and prevent conversion*/
+			unsigned copy_error;
+			lodepng_color_mode_cleanup(mode_out);
+			copy_error = lodepng_color_mode_copy(mode_out, mode_in);
+			/*report a failed copy, but keep an earlier palette error if there was one*/
+			if (!error) error = copy_error;
+		}
+	}
+	else /*8-bit or 16-bit per channel*/
+	{
+		mode_out->bitdepth = prof.bits;
+		mode_out->colortype = prof.alpha ? (prof.colored ? LCT_RGBA : LCT_GREY_ALPHA)
+			: (prof.colored ? LCT_RGB : LCT_GREY);
+
+		if (prof.key)
+		{
+			unsigned mask = (1u << mode_out->bitdepth) - 1u; /*profile always uses 16-bit, mask converts it*/
+			mode_out->key_r = prof.key_r & mask;
+			mode_out->key_g = prof.key_g & mask;
+			mode_out->key_b = prof.key_b & mask;
+			mode_out->key_defined = 1;
+		}
+	}
+
+	return error;
+}
+
+#endif /* #ifdef LODEPNG_COMPILE_ENCODER */
+
+/*
+Paeth predictor, used by PNG filter type 4.
+As called by the unfilter code, a is the corresponding byte of the previous pixel on the current line,
+b is the byte at the same position on the previous line and c is the byte of the previous pixel on the
+previous line.
+The parameters are of type short, but should come from unsigned chars; the shorts
+are only needed so that the intermediate differences come out correct.
+Returns c if its distance is strictly the smallest, else b if its distance beats that of a, else a.
+*/
+static unsigned char paethPredictor(short a, short b, short c)
+{
+	/*distances of the estimate p = a + b - c to each of the three candidates*/
+	const short dist_a = abs(b - c);
+	const short dist_b = abs(a - c);
+	const short dist_c = abs(a + b - c - c);
+	short chosen = a;
+
+	if (dist_c < dist_a && dist_c < dist_b) chosen = c;
+	else if (dist_b < dist_a) chosen = b;
+
+	return (unsigned char)chosen;
+}
+
+/*shared values used by multiple Adam7 related functions; each table has one entry per pass (index 0..6)*/
+
+static const unsigned ADAM7_IX[7] = { 0, 4, 0, 2, 0, 1, 0 }; /*x start values: column of the first pixel of the pass*/
+static const unsigned ADAM7_IY[7] = { 0, 0, 4, 0, 2, 0, 1 }; /*y start values: row of the first pixel of the pass*/
+static const unsigned ADAM7_DX[7] = { 8, 8, 4, 4, 2, 2, 1 }; /*x delta values: column step between pixels of the pass*/
+static const unsigned ADAM7_DY[7] = { 8, 8, 8, 4, 4, 2, 2 }; /*y delta values: row step between pixels of the pass*/
+
+/*
+Outputs various dimensions and positions in the image related to the Adam7 reduced images.
+passw: output containing the width of the 7 passes
+passh: output containing the height of the 7 passes
+filter_passstart: output containing the index of the start and end of each
+ reduced image with filter bytes
+padded_passstart: output containing the index of the start and end of each
+ reduced image when without filter bytes but with padded scanlines
+passstart: output containing the index of the start and end of each reduced
+ image without padding between scanlines, but still padding between the images
+w, h: width and height of non-interlaced image
+bpp: bits per pixel
+"padded" is only relevant if bpp is less than 8 and a scanline or image does not
+ end at a full byte
+The three start arrays have 8 values: the 8th one indicates the byte after the end of the 7th (= last) pass.
+*/
+static void Adam7_getpassvalues(unsigned passw[7], unsigned passh[7], size_t filter_passstart[8],
+	size_t padded_passstart[8], size_t passstart[8], unsigned w, unsigned h, unsigned bpp)
+{
+	unsigned i;
+
+	/*calculate width and height in pixels of each pass*/
+	for (i = 0; i != 7; ++i)
+	{
+		passw[i] = (w + ADAM7_DX[i] - ADAM7_IX[i] - 1) / ADAM7_DX[i];
+		passh[i] = (h + ADAM7_DY[i] - ADAM7_IY[i] - 1) / ADAM7_DY[i];
+		/*a pass without columns or without rows contains no pixels at all*/
+		if (passw[i] == 0) passh[i] = 0;
+		if (passh[i] == 0) passw[i] = 0;
+	}
+
+	filter_passstart[0] = padded_passstart[0] = passstart[0] = 0;
+	for (i = 0; i != 7; ++i)
+	{
+		/*sizes are widened to size_t before multiplying: in unsigned arithmetic passw * bpp and
+		passh * passw * bpp can wrap around for large images, which would yield wrong offsets*/
+		size_t rows = passh[i];
+		size_t linebits = (size_t)passw[i] * bpp;
+		size_t linebytes = (linebits + 7) / 8;
+
+		/*if passw[i] is 0, it's 0 bytes, not 1 (no filtertype-byte)*/
+		filter_passstart[i + 1] = filter_passstart[i]
+			+ ((passw[i] && passh[i]) ? rows * (1 + linebytes) : 0);
+		/*bits padded if needed to fill full byte at end of each scanline*/
+		padded_passstart[i + 1] = padded_passstart[i] + rows * linebytes;
+		/*only padded at end of reduced image*/
+		passstart[i + 1] = passstart[i] + (rows * linebits + 7) / 8;
+	}
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Decoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Reads the PNG signature and the IHDR chunk at the start of the buffer and stores the header values in
+state->info_png.
+w, h: outputs receiving the image width and height from the header
+state: info_png is cleaned up and re-initialized before it is filled in; decoder.ignore_crc decides whether
+ the IHDR CRC is verified; state->error receives the returned error code
+in, insize: the PNG data, at least 33 bytes (signature + IHDR chunk) are required
+return value is error
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h, LodePNGState* state,
+	const unsigned char* in, size_t insize)
+{
+	static const unsigned char png_signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+	LodePNGInfo* const info = &state->info_png;
+	const unsigned char* ihdr;
+	unsigned k;
+
+	/*error: the given data is empty*/
+	if (in == 0 || insize == 0) CERROR_RETURN_ERROR(state->error, 48);
+	/*error: the data length is smaller than the length of a PNG header*/
+	if (insize < 33) CERROR_RETURN_ERROR(state->error, 27);
+
+	/*when decoding a new PNG image, make sure all parameters created after previous decoding are reset*/
+	lodepng_info_cleanup(info);
+	lodepng_info_init(info);
+
+	for (k = 0; k != 8; ++k)
+	{
+		/*error: the first 8 bytes are not the correct PNG signature*/
+		if (in[k] != png_signature[k]) CERROR_RETURN_ERROR(state->error, 28);
+	}
+
+	ihdr = in + 8; /*the first chunk starts right after the signature*/
+	/*error: header size must be 13 bytes*/
+	if (lodepng_chunk_length(ihdr) != 13) CERROR_RETURN_ERROR(state->error, 94);
+	/*error: it doesn't start with a IHDR chunk!*/
+	if (!lodepng_chunk_type_equals(ihdr, "IHDR")) CERROR_RETURN_ERROR(state->error, 29);
+
+	/*read the values given in the header*/
+	*w = lodepng_read32bitInt(&in[16]);
+	*h = lodepng_read32bitInt(&in[20]);
+	info->color.bitdepth = in[24];
+	info->color.colortype = (LodePNGColorType)in[25];
+	info->compression_method = in[26];
+	info->filter_method = in[27];
+	info->interlace_method = in[28];
+
+	/*error: an image dimension is 0*/
+	if (*w == 0 || *h == 0) CERROR_RETURN_ERROR(state->error, 93);
+
+	/*the stored CRC at offset 29 is compared with the CRC of the 17 bytes starting at the chunk type*/
+	if (!state->decoder.ignore_crc && lodepng_read32bitInt(&in[29]) != lodepng_crc32(&in[12], 17))
+	{
+		CERROR_RETURN_ERROR(state->error, 57); /*invalid CRC*/
+	}
+
+	/*error: only compression method 0 is allowed in the specification*/
+	if (info->compression_method != 0) CERROR_RETURN_ERROR(state->error, 32);
+	/*error: only filter method 0 is allowed in the specification*/
+	if (info->filter_method != 0) CERROR_RETURN_ERROR(state->error, 33);
+	/*error: only interlace methods 0 and 1 exist in the specification*/
+	if (info->interlace_method > 1) CERROR_RETURN_ERROR(state->error, 34);
+
+	/*the last check decides the final result: is this colortype / bitdepth combination valid?*/
+	state->error = checkColorValidity(info->color.colortype, info->color.bitdepth);
+	return state->error;
+}
+
+/*
+For PNG filter method 0: unfilters a single scanline.
+recon: output, the reconstructed scanline
+scanline: the filtered input bytes, WITHOUT the filtertype byte (that one is passed in filterType)
+precon: the previous unfiltered scanline, or 0 if there is none
+bytewidth: distance in bytes back to the corresponding byte of the previous pixel; when the pixels are
+ smaller than 1 byte the filter works byte per byte (bytewidth = 1)
+filterType: filter type of this scanline, 0..4
+length: number of bytes in the scanline
+recon and scanline MAY be the same memory address! precon must be disjoint.
+return value is error: 36 for a filter type that does not exist
+*/
+static unsigned unfilterScanline(unsigned char* recon, const unsigned char* scanline, const unsigned char* precon,
+	size_t bytewidth, unsigned char filterType, size_t length)
+{
+	size_t k;
+
+	if (filterType == 0) /*None: plain copy*/
+	{
+		for (k = 0; k != length; ++k) recon[k] = scanline[k];
+	}
+	else if (filterType == 1) /*Sub: add the previous pixel of this line*/
+	{
+		for (k = 0; k != bytewidth; ++k) recon[k] = scanline[k];
+		for (; k < length; ++k) recon[k] = scanline[k] + recon[k - bytewidth];
+	}
+	else if (filterType == 2) /*Up: add the byte above; without previous line that is a plain copy*/
+	{
+		if (!precon)
+		{
+			for (k = 0; k != length; ++k) recon[k] = scanline[k];
+		}
+		else
+		{
+			for (k = 0; k != length; ++k) recon[k] = scanline[k] + precon[k];
+		}
+	}
+	else if (filterType == 3) /*Average: add the rounded down mean of the previous pixel and the byte above*/
+	{
+		if (!precon)
+		{
+			for (k = 0; k != bytewidth; ++k) recon[k] = scanline[k];
+			for (; k < length; ++k) recon[k] = scanline[k] + (recon[k - bytewidth] >> 1);
+		}
+		else
+		{
+			for (k = 0; k != bytewidth; ++k) recon[k] = scanline[k] + (precon[k] >> 1);
+			for (; k < length; ++k) recon[k] = scanline[k] + ((recon[k - bytewidth] + precon[k]) >> 1);
+		}
+	}
+	else if (filterType == 4) /*Paeth*/
+	{
+		if (!precon)
+		{
+			for (k = 0; k != bytewidth; ++k) recon[k] = scanline[k];
+			/*paethPredictor(recon[k - bytewidth], 0, 0) is always recon[k - bytewidth]*/
+			for (; k < length; ++k) recon[k] = (scanline[k] + recon[k - bytewidth]);
+		}
+		else
+		{
+			/*paethPredictor(0, precon[k], 0) is always precon[k]*/
+			for (k = 0; k != bytewidth; ++k) recon[k] = (scanline[k] + precon[k]);
+			for (; k < length; ++k)
+			{
+				recon[k] = (scanline[k] + paethPredictor(recon[k - bytewidth], precon[k], precon[k - bytewidth]));
+			}
+		}
+	}
+	else
+	{
+		return 36; /*error: unexisting filter type given*/
+	}
+
+	return 0;
+}
+
+/*
+For PNG filter method 0
+this function unfilters a single image (e.g. without interlacing this is called once, with Adam7 seven times)
+out: must have enough bytes allocated already
+in: must have the scanlines + 1 filtertype byte per scanline
+w, h: image dimensions or dimensions of reduced image
+bpp: bits per pixel
+in and out are allowed to be the same memory address (but aren't the same size since in has the extra filter bytes)
+return value is error, as reported by unfilterScanline for the first scanline that fails
+*/
+static unsigned unfilter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+	unsigned y;
+	unsigned char* prevline = 0; /*stays 0 for the first scanline: it has no previous line*/
+
+	/*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+	size_t bytewidth = (bpp + 7) / 8;
+	/*widened to size_t before multiplying: w * bpp in unsigned arithmetic can wrap around for wide
+	images with many bits per pixel, which would give a too small scanline length*/
+	size_t linebytes = ((size_t)w * bpp + 7) / 8;
+
+	for (y = 0; y < h; ++y)
+	{
+		size_t outindex = linebytes * y;
+		size_t inindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+		unsigned char filterType = in[inindex];
+
+		CERROR_TRY_RETURN(unfilterScanline(&out[outindex], &in[inindex + 1], prevline, bytewidth, filterType, linebytes));
+
+		prevline = &out[outindex];
+	}
+
+	return 0;
+}
+
+/*
+Re-orders the pixels of the 7 Adam7 reduced images into one non-interlaced image.
+in: Adam7 interlaced image, with no padding bits between scanlines, but between
+ reduced images so that each reduced image starts at a byte.
+out: the same pixels, but re-ordered so that they're now a non-interlaced image with size w*h
+w, h: dimensions of the non-interlaced image
+bpp: bits per pixel
+out has the following size in bits: w * h * bpp.
+in is possibly bigger due to padding bits between reduced images.
+out must be big enough AND must be 0 everywhere if bpp < 8 in the current implementation
+(because that's likely a little bit faster)
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_deinterlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+	unsigned passw[7], passh[7];
+	size_t filter_passstart[8], padded_passstart[8], passstart[8];
+	unsigned pass;
+
+	Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+	for (pass = 0; pass != 7; ++pass)
+	{
+		/*position of the first pixel of this pass in the full image, and the step between its pixels*/
+		const unsigned x0 = ADAM7_IX[pass], y0 = ADAM7_IY[pass];
+		const unsigned dx = ADAM7_DX[pass], dy = ADAM7_DY[pass];
+		const unsigned pw = passw[pass], ph = passh[pass];
+		unsigned col, row, k;
+
+		if (bpp >= 8)
+		{
+			/*whole bytes per pixel: copy each pixel byte by byte*/
+			const size_t bytewidth = bpp / 8;
+			for (row = 0; row < ph; ++row)
+			{
+				for (col = 0; col < pw; ++col)
+				{
+					const unsigned char* src = &in[passstart[pass] + (row * pw + col) * bytewidth];
+					unsigned char* dst = &out[((y0 + row * dy) * w + x0 + col * dx) * bytewidth];
+					for (k = 0; k < bytewidth; ++k) dst[k] = src[k];
+				}
+			}
+		}
+		else
+		{
+			/*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/
+			const unsigned ilinebits = bpp * pw;
+			const unsigned olinebits = bpp * w;
+			for (row = 0; row < ph; ++row)
+			{
+				for (col = 0; col < pw; ++col)
+				{
+					/*bit pointers (for in and out buffer)*/
+					size_t ibp = (8 * passstart[pass]) + (row * ilinebits + col * bpp);
+					size_t obp = (y0 + row * dy) * olinebits + (x0 + col * dx) * bpp;
+					for (k = 0; k < bpp; ++k)
+					{
+						unsigned char bit = readBitFromReversedStream(&ibp, in);
+						/*note that this function assumes the out buffer is completely 0, use setBitOfReversedStream otherwise*/
+						setBitOfReversedStream0(&obp, out, bit);
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+After filtering there are still padding bits if scanlines have non multiple of 8 bit amounts. They need
+to be removed (except at last scanline of (Adam7-reduced) image) before working with pure image buffers
+for the Adam7 code, the color convert code and the output to the user.
+out, in: allowed to be the same buffer, in may also be higher but still overlapping; in must
+ have >= ilinebits*h bits, out must have >= olinebits*h bits
+olinebits, ilinebits: bits per output / input scanline, olinebits must be <= ilinebits
+h: number of scanlines
+Also used to move bits after earlier such operations happened, e.g. in a sequence of reduced images from Adam7.
+Only useful if (ilinebits - olinebits) is a value in the range 1..7
+*/
+static void removePaddingBits(unsigned char* out, const unsigned char* in,
+	size_t olinebits, size_t ilinebits, unsigned h)
+{
+	const size_t skipbits = ilinebits - olinebits; /*padding bits at the end of every input scanline*/
+	size_t inpos = 0, outpos = 0; /*input and output bit pointers*/
+	unsigned row;
+
+	for (row = 0; row != h; ++row)
+	{
+		size_t todo;
+		/*move the payload bits of this scanline one by one*/
+		for (todo = olinebits; todo != 0; --todo)
+		{
+			unsigned char bit = readBitFromReversedStream(&inpos, in);
+			setBitOfReversedStream(&outpos, out, bit);
+		}
+		inpos += skipbits; /*jump over the padding*/
+	}
+}
+
+/*
+Converts the filtered-padded-interlaced data into a pure 2D image buffer with the PNG's colortype.
+out: must be a buffer big enough to contain the full image
+in: must contain the full decompressed data from the IDAT chunks (with filter index bytes and possible
+ padding bits). NOTE: the in buffer will be overwritten with intermediate data!
+w, h: image dimensions
+info_png: supplies the color mode (for the bits per pixel) and the interlace method
+Steps:
+*) if no Adam7: 1) unfilter 2) remove padding bits (= possible extra bits per scanline if bpp < 8)
+*) if adam7: 1) 7x unfilter 2) 7x remove padding bits 3) Adam7_deinterlace
+return value is error
+*/
+static unsigned postProcessScanlines(unsigned char* out, unsigned char* in,
+	unsigned w, unsigned h, const LodePNGInfo* info_png)
+{
+	const unsigned bpp = lodepng_get_bpp(&info_png->color);
+	if (bpp == 0) return 31; /*error: invalid colortype*/
+
+	if (info_png->interlace_method != 0) /*interlace_method is 1 (Adam7)*/
+	{
+		unsigned passw[7], passh[7];
+		size_t filter_passstart[8], padded_passstart[8], passstart[8];
+		unsigned pass;
+
+		Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+		for (pass = 0; pass != 7; ++pass)
+		{
+			CERROR_TRY_RETURN(unfilter(&in[padded_passstart[pass]], &in[filter_passstart[pass]],
+				passw[pass], passh[pass], bpp));
+			/*TODO: possible efficiency improvement: if in this reduced image the bits fit nicely in 1 scanline,
+			move bytes instead of bits or move not at all*/
+			if (bpp < 8)
+			{
+				/*remove padding bits in scanlines; after this there still may be padding
+				bits between the different reduced images: each reduced image still starts nicely at a byte*/
+				removePaddingBits(&in[passstart[pass]], &in[padded_passstart[pass]], passw[pass] * bpp,
+					((passw[pass] * bpp + 7) / 8) * 8, passh[pass]);
+			}
+		}
+
+		Adam7_deinterlace(out, in, w, h, bpp);
+	}
+	else /*no interlacing*/
+	{
+		const unsigned linebits = w * bpp;
+		const unsigned paddedbits = ((w * bpp + 7) / 8) * 8; /*scanline size rounded up to whole bytes*/
+
+		if (bpp < 8 && linebits != paddedbits)
+		{
+			/*scanlines end in padding bits: unfilter in place, then squeeze the padding out into out*/
+			CERROR_TRY_RETURN(unfilter(in, in, w, h, bpp));
+			removePaddingBits(out, in, linebits, paddedbits, h);
+		}
+		else
+		{
+			/*we can immediately filter into the out buffer, no other steps needed*/
+			CERROR_TRY_RETURN(unfilter(out, in, w, h, bpp));
+		}
+	}
+
+	return 0;
+}
+
+/*
+palette chunk (PLTE): replaces the palette of the color mode with the RGB triplets from the chunk.
+color: color mode whose palette and palettesize are replaced
+data, chunkLength: the chunk data; every 3 bytes form one R, G, B entry, alpha is set to 255
+return value is error: 38 if there are more than 256 entries, 83 if allocation fails. In both error
+cases the color mode is left with no palette (palette 0, palettesize 0).
+*/
+static unsigned readChunk_PLTE(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+	size_t numentries = chunkLength / 3;
+	size_t i;
+
+	/*drop a palette from an earlier PLTE chunk, leaving the mode in a consistent empty state*/
+	if (color->palette) lodepng_free(color->palette);
+	color->palette = 0;
+	color->palettesize = 0;
+
+	/*error: palette too big. This is checked before allocating, so that a chunk with a bogus
+	length cannot cause a needlessly large allocation*/
+	if (numentries > 256) return 38;
+
+	color->palette = (unsigned char*)lodepng_malloc(4 * numentries);
+	if (!color->palette && numentries) return 83; /*alloc fail*/
+	color->palettesize = numentries;
+
+	for (i = 0; i != numentries; ++i)
+	{
+		color->palette[4 * i + 0] = data[3 * i + 0]; /*R*/
+		color->palette[4 * i + 1] = data[3 * i + 1]; /*G*/
+		color->palette[4 * i + 2] = data[3 * i + 2]; /*B*/
+		color->palette[4 * i + 3] = 255; /*alpha*/
+	}
+
+	return 0; /* OK */
+}
+
+/*
+transparency chunk (tRNS): for palette images it fills in the alpha values of the palette entries, for
+greyscale and RGB images it defines the color key.
+color: color mode that is updated; its colortype decides how the chunk is interpreted
+data, chunkLength: the chunk data
+return value is error
+*/
+static unsigned readChunk_tRNS(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+	unsigned k;
+
+	switch (color->colortype)
+	{
+	case LCT_PALETTE:
+		/*error: more alpha values given than there are palette entries*/
+		if (chunkLength > color->palettesize) return 38;
+
+		/*one alpha byte per palette entry, entries without a value keep their alpha*/
+		for (k = 0; k != chunkLength; ++k) color->palette[4 * k + 3] = data[k];
+		break;
+
+	case LCT_GREY:
+		/*error: this chunk must be 2 bytes for greyscale image*/
+		if (chunkLength != 2) return 30;
+
+		/*a single big endian 16-bit value, stored in all three key channels*/
+		color->key_r = color->key_g = color->key_b = 256u * data[0] + data[1];
+		color->key_defined = 1;
+		break;
+
+	case LCT_RGB:
+		/*error: this chunk must be 6 bytes for RGB image*/
+		if (chunkLength != 6) return 41;
+
+		/*three big endian 16-bit values: R, G, B*/
+		color->key_r = 256u * data[0] + data[1];
+		color->key_g = 256u * data[2] + data[3];
+		color->key_b = 256u * data[4] + data[5];
+		color->key_defined = 1;
+		break;
+
+	default:
+		return 42; /*error: tRNS chunk not allowed for other color models*/
+	}
+
+	return 0; /* OK */
+}
+
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+background color chunk (bKGD): stores the background color of the chunk in info.
+info: receives background_defined and background_r/g/b; its color mode decides how the chunk is read
+data, chunkLength: the chunk data
+For color types not handled below nothing is stored and no error is returned.
+return value is error
+*/
+static unsigned readChunk_bKGD(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+	switch (info->color.colortype)
+	{
+	case LCT_PALETTE:
+		/*error: this chunk must be 1 byte for indexed color image*/
+		if (chunkLength != 1) return 43;
+
+		/*a single byte, stored in all three channels*/
+		info->background_r = info->background_g = info->background_b = data[0];
+		info->background_defined = 1;
+		break;
+
+	case LCT_GREY:
+	case LCT_GREY_ALPHA:
+		/*error: this chunk must be 2 bytes for greyscale image*/
+		if (chunkLength != 2) return 44;
+
+		/*a single big endian 16-bit value, stored in all three channels*/
+		info->background_r = info->background_g = info->background_b = 256u * data[0] + data[1];
+		info->background_defined = 1;
+		break;
+
+	case LCT_RGB:
+	case LCT_RGBA:
+		/*error: this chunk must be 6 bytes for RGB image*/
+		if (chunkLength != 6) return 45;
+
+		/*three big endian 16-bit values: R, G, B*/
+		info->background_r = 256u * data[0] + data[1];
+		info->background_g = 256u * data[2] + data[3];
+		info->background_b = 256u * data[4] + data[5];
+		info->background_defined = 1;
+		break;
+
+	default:
+		break;
+	}
+
+	return 0; /* OK */
+}
+
+/*
+text chunk (tEXt): a keyword of 1 to 79 bytes, a null separator, then the text filling the rest of the chunk.
+Both strings are copied into temporary null terminated buffers, handed to lodepng_add_text, and the
+temporary buffers are freed again before returning.
+return value is error
+*/
+static unsigned readChunk_tEXt(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+	unsigned error = 0;
+	char *keyword = 0, *text = 0;
+	unsigned k;
+
+	for (;;) /*not really a loop: runs once, only used to break on error*/
+	{
+		unsigned keylen = 0, textpos, textlen;
+
+		/*the keyword runs up to the first null byte, or to the end of the chunk*/
+		while (keylen < chunkLength && data[keylen] != 0) ++keylen;
+		/*even though it's not allowed by the standard, no error is thrown if
+		there's no null termination char, if the text is empty*/
+		if (keylen < 1 || keylen > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+		keyword = (char*)lodepng_malloc(keylen + 1);
+		if (!keyword) CERROR_BREAK(error, 83); /*alloc fail*/
+
+		for (k = 0; k != keylen; ++k) keyword[k] = (char)data[k];
+		keyword[keylen] = 0;
+
+		textpos = keylen + 1; /*skip keyword null terminator*/
+		textlen = chunkLength < textpos ? 0 : chunkLength - textpos;
+
+		text = (char*)lodepng_malloc(textlen + 1);
+		if (!text) CERROR_BREAK(error, 83); /*alloc fail*/
+
+		for (k = 0; k != textlen; ++k) text[k] = (char)data[textpos + k];
+		text[textlen] = 0;
+
+		error = lodepng_add_text(info, keyword, text);
+
+		break;
+	}
+
+	lodepng_free(keyword);
+	lodepng_free(text);
+
+	return error;
+}
+
+/*
+compressed text chunk (zTXt): a keyword of 1 to 79 bytes, a null separator, a compression method byte
+that must be 0, then the zlib compressed text filling the rest of the chunk.
+info: receives the keyword and the decompressed text through lodepng_add_text
+zlibsettings: settings handed to zlib_decompress
+data, chunkLength: the chunk data
+return value is error
+*/
+static unsigned readChunk_zTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+	const unsigned char* data, size_t chunkLength)
+{
+	unsigned error = 0;
+	unsigned i;
+
+	unsigned length, string2_begin;
+	char *key = 0;
+	ucvector decoded;
+
+	ucvector_init(&decoded);
+
+	while (!error) /*not really a while loop, only used to break on error*/
+	{
+		for (length = 0; length < chunkLength && data[length] != 0; ++length);
+		if (length + 2 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+		if (length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+		key = (char*)lodepng_malloc(length + 1);
+		if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+		key[length] = 0;
+		for (i = 0; i != length; ++i) key[i] = (char)data[i];
+
+		if (data[length + 1] != 0) CERROR_BREAK(error, 72); /*the compression method byte must be 0*/
+
+		string2_begin = length + 2; /*skip the keyword terminator and the compression method byte*/
+		if (string2_begin > chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+
+		length = chunkLength - string2_begin;
+		/*will fail if zlib error, e.g. if length is too small*/
+		error = zlib_decompress(&decoded.data, &decoded.size,
+			(unsigned char*)(&data[string2_begin]),
+			length, zlibsettings);
+		if (error) break;
+		/*the decompressed text is used as a C string below, so the terminator is mandatory: if it
+		could not be appended the buffer must not be passed on*/
+		if (!ucvector_push_back(&decoded, 0)) CERROR_BREAK(error, 83); /*alloc fail*/
+
+		error = lodepng_add_text(info, key, (char*)decoded.data);
+
+		break;
+	}
+
+	lodepng_free(key);
+	ucvector_cleanup(&decoded);
+
+	return error;
+}
+
+/*
+international text chunk (iTXt): a keyword of 1 to 79 bytes, a null separator, a compression flag byte,
+a compression method byte that must be 0, a null terminated language tag, a null terminated translated
+keyword, then the text (zlib compressed if the compression flag is set) filling the rest of the chunk.
+info: receives the four strings through lodepng_add_itext
+zlibsettings: settings handed to zlib_decompress for compressed text
+data, chunkLength: the chunk data
+return value is error
+*/
+static unsigned readChunk_iTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+	const unsigned char* data, size_t chunkLength)
+{
+	unsigned error = 0;
+	unsigned i;
+
+	unsigned length, begin, compressed;
+	char *key = 0, *langtag = 0, *transkey = 0;
+	ucvector decoded;
+	ucvector_init(&decoded);
+
+	while (!error) /*not really a while loop, only used to break on error*/
+	{
+		/*Quick check if the chunk length isn't too small. Even without check
+		it'd still fail with other error checks below if it's too short. This just gives a different error code.*/
+		if (chunkLength < 5) CERROR_BREAK(error, 30); /*iTXt chunk too short*/
+
+		/*read the key*/
+		for (length = 0; length < chunkLength && data[length] != 0; ++length);
+		if (length + 3 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination char, corrupt?*/
+		if (length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+		key = (char*)lodepng_malloc(length + 1);
+		if (!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+		key[length] = 0;
+		for (i = 0; i != length; ++i) key[i] = (char)data[i];
+
+		/*read the compression flag and the compression method*/
+		compressed = data[length + 1];
+		if (data[length + 2] != 0) CERROR_BREAK(error, 72); /*the compression method byte must be 0*/
+
+		/*even though it's not allowed by the standard, no error is thrown if
+		there's no null termination char, if the text is empty for the next 3 texts*/
+
+		/*read the langtag*/
+		begin = length + 3;
+		length = 0;
+		for (i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+		langtag = (char*)lodepng_malloc(length + 1);
+		if (!langtag) CERROR_BREAK(error, 83); /*alloc fail*/
+
+		langtag[length] = 0;
+		for (i = 0; i != length; ++i) langtag[i] = (char)data[begin + i];
+
+		/*read the transkey*/
+		begin += length + 1;
+		length = 0;
+		for (i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+		transkey = (char*)lodepng_malloc(length + 1);
+		if (!transkey) CERROR_BREAK(error, 83); /*alloc fail*/
+
+		transkey[length] = 0;
+		for (i = 0; i != length; ++i) transkey[i] = (char)data[begin + i];
+
+		/*read the actual text*/
+		begin += length + 1;
+
+		/*begin may lie past the end of the chunk if terminators are missing: the text is empty then*/
+		length = chunkLength < begin ? 0 : chunkLength - begin;
+
+		if (compressed)
+		{
+			/*will fail if zlib error, e.g. if length is too small*/
+			error = zlib_decompress(&decoded.data, &decoded.size,
+				(unsigned char*)(&data[begin]),
+				length, zlibsettings);
+			if (error) break;
+			if (decoded.allocsize < decoded.size) decoded.allocsize = decoded.size;
+			/*the decompressed text is used as a C string below, so the terminator is mandatory: if it
+			could not be appended the buffer must not be passed on*/
+			if (!ucvector_push_back(&decoded, 0)) CERROR_BREAK(error, 83); /*alloc fail*/
+		}
+		else
+		{
+			if (!ucvector_resize(&decoded, length + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+
+			decoded.data[length] = 0;
+			for (i = 0; i != length; ++i) decoded.data[i] = data[begin + i];
+		}
+
+		error = lodepng_add_itext(info, key, langtag, transkey, (char*)decoded.data);
+
+		break;
+	}
+
+	lodepng_free(key);
+	lodepng_free(langtag);
+	lodepng_free(transkey);
+	ucvector_cleanup(&decoded);
+
+	return error;
+}
+
+/*
+time chunk (tIME): stores the 7 bytes of the chunk in info->time and marks the time as defined.
+return value is error: 73 if the chunk is not exactly 7 bytes
+*/
+static unsigned readChunk_tIME(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+	if (chunkLength != 7) return 73; /*invalid tIME chunk size*/
+
+	/*the year is a big endian 16-bit value, all other fields are a single byte*/
+	info->time.year = ((unsigned)data[0] << 8) | data[1];
+	info->time.month = data[2];
+	info->time.day = data[3];
+	info->time.hour = data[4];
+	info->time.minute = data[5];
+	info->time.second = data[6];
+	info->time_defined = 1;
+
+	return 0; /* OK */
+}
+
+/*
+physical dimensions chunk (pHYs): stores the two 32-bit values and the unit byte of the chunk in info
+and marks them as defined.
+return value is error: 74 if the chunk is not exactly 9 bytes
+*/
+static unsigned readChunk_pHYs(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+	if (chunkLength != 9) return 74; /*invalid pHYs chunk size*/
+
+	/*x and y are big endian 32-bit values*/
+	info->phys_x = ((unsigned)data[0] << 24) | ((unsigned)data[1] << 16) | ((unsigned)data[2] << 8) | data[3];
+	info->phys_y = ((unsigned)data[4] << 24) | ((unsigned)data[5] << 16) | ((unsigned)data[6] << 8) | data[7];
+	info->phys_unit = data[8];
+	info->phys_defined = 1;
+
+	return 0; /* OK */
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+read a PNG, the result will be in the same color type as the PNG (hence "generic")
+out: receives a newly allocated buffer with the decoded pixels, or 0 if decoding failed before the
+ buffer was allocated
+w, h: receive the image dimensions read by lodepng_inspect
+state: state->info_png is filled in, state->decoder supplies the settings, state->error receives the result
+in, insize: the complete PNG file data
+Steps: 1) lodepng_inspect reads the header 2) the chunks are walked, IDAT data is collected and the other
+known chunks are parsed 3) the collected IDAT data is decompressed and its size is verified against the
+size predicted from the header 4) postProcessScanlines turns it into the raw image.
+NOTE(review): if postProcessScanlines fails, *out still points to the allocated buffer when this function
+returns - presumably the caller is expected to free it; confirm.
+*/
+static void decodeGeneric(unsigned char** out, unsigned* w, unsigned* h,
+	LodePNGState* state,
+	const unsigned char* in, size_t insize)
+{
+	unsigned char IEND = 0;
+	const unsigned char* chunk;
+	size_t i;
+	ucvector idat; /*the data from idat chunks*/
+	ucvector scanlines;
+	size_t predict;
+	size_t numpixels;
+	size_t outsize = 0;
+
+	/*for unknown chunk order*/
+	/*NOTE(review): unknown is set to 1 at the first unknown chunk and never reset, so the CRC check at the
+	end of the chunk loop is skipped for every later chunk as well - confirm whether that is intended*/
+	unsigned unknown = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	unsigned critical_pos = 1; /*1 = after IHDR, 2 = after PLTE, 3 = after IDAT*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+  /*provide some proper output values if error will happen*/
+	*out = 0;
+
+	state->error = lodepng_inspect(w, h, state, in, insize); /*reads header and resets other parameters in state->info_png*/
+	if (state->error) return;
+
+	numpixels = *w * *h;
+
+	/*multiplication overflow*/
+	if (*h != 0 && numpixels / *h != *w) CERROR_RETURN(state->error, 92);
+	/*multiplication overflow possible further below. Allows up to 2^31-1 pixel
+	bytes with 16-bit RGBA, the rest is room for filter bytes.*/
+	if (numpixels > 268435455) CERROR_RETURN(state->error, 92);
+
+	ucvector_init(&idat);
+	chunk = &in[33]; /*first byte of the first chunk after the header*/
+
+	/*loop through the chunks, ignoring unknown chunks and stopping at IEND chunk.
+	The data of all IDAT chunks is appended to the idat vector*/
+	while (!IEND && !state->error)
+	{
+		unsigned chunkLength;
+		const unsigned char* data; /*the data in the chunk*/
+
+		/*error: size of the in buffer too small to contain next chunk*/
+		/*(12 = 4 length bytes + 4 chunk type bytes + 4 CRC bytes)*/
+		if ((size_t)((chunk - in) + 12) > insize || chunk < in) CERROR_BREAK(state->error, 30);
+
+		/*length of the data of the chunk, excluding the length bytes, chunk type and CRC bytes*/
+		chunkLength = lodepng_chunk_length(chunk);
+		/*error: chunk length larger than the max PNG chunk size*/
+		if (chunkLength > 2147483647) CERROR_BREAK(state->error, 63);
+
+		if ((size_t)((chunk - in) + chunkLength + 12) > insize || (chunk + chunkLength + 12) < in)
+		{
+			CERROR_BREAK(state->error, 64); /*error: size of the in buffer too small to contain next chunk*/
+		}
+
+		data = lodepng_chunk_data_const(chunk);
+
+		/*IDAT chunk, containing compressed image data*/
+		if (lodepng_chunk_type_equals(chunk, "IDAT"))
+		{
+			size_t oldsize = idat.size;
+			if (!ucvector_resize(&idat, oldsize + chunkLength)) CERROR_BREAK(state->error, 83 /*alloc fail*/);
+			for (i = 0; i != chunkLength; ++i) idat.data[oldsize + i] = data[i];
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+			critical_pos = 3;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+		}
+		/*IEND chunk*/
+		else if (lodepng_chunk_type_equals(chunk, "IEND"))
+		{
+			IEND = 1;
+		}
+		/*palette chunk (PLTE)*/
+		else if (lodepng_chunk_type_equals(chunk, "PLTE"))
+		{
+			state->error = readChunk_PLTE(&state->info_png.color, data, chunkLength);
+			if (state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+			critical_pos = 2;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+		}
+		/*palette transparency chunk (tRNS)*/
+		else if (lodepng_chunk_type_equals(chunk, "tRNS"))
+		{
+			state->error = readChunk_tRNS(&state->info_png.color, data, chunkLength);
+			if (state->error) break;
+		}
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+		/*background color chunk (bKGD)*/
+		else if (lodepng_chunk_type_equals(chunk, "bKGD"))
+		{
+			state->error = readChunk_bKGD(&state->info_png, data, chunkLength);
+			if (state->error) break;
+		}
+		/*text chunk (tEXt)*/
+		else if (lodepng_chunk_type_equals(chunk, "tEXt"))
+		{
+			if (state->decoder.read_text_chunks)
+			{
+				state->error = readChunk_tEXt(&state->info_png, data, chunkLength);
+				if (state->error) break;
+			}
+		}
+		/*compressed text chunk (zTXt)*/
+		else if (lodepng_chunk_type_equals(chunk, "zTXt"))
+		{
+			if (state->decoder.read_text_chunks)
+			{
+				state->error = readChunk_zTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+				if (state->error) break;
+			}
+		}
+		/*international text chunk (iTXt)*/
+		else if (lodepng_chunk_type_equals(chunk, "iTXt"))
+		{
+			if (state->decoder.read_text_chunks)
+			{
+				state->error = readChunk_iTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+				if (state->error) break;
+			}
+		}
+		/*time chunk (tIME)*/
+		else if (lodepng_chunk_type_equals(chunk, "tIME"))
+		{
+			state->error = readChunk_tIME(&state->info_png, data, chunkLength);
+			if (state->error) break;
+		}
+		/*physical dimensions chunk (pHYs)*/
+		else if (lodepng_chunk_type_equals(chunk, "pHYs"))
+		{
+			state->error = readChunk_pHYs(&state->info_png, data, chunkLength);
+			if (state->error) break;
+		}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+		else /*it's not an implemented chunk type, so ignore it: skip over the data*/
+		{
+			/*error: unknown critical chunk (5th bit of first byte of chunk type is 0)*/
+			if (!lodepng_chunk_ancillary(chunk)) CERROR_BREAK(state->error, 69);
+
+			unknown = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+			if (state->decoder.remember_unknown_chunks)
+			{
+				/*the chunk is stored in the list that matches its position relative to the critical chunks*/
+				state->error = lodepng_chunk_append(&state->info_png.unknown_chunks_data[critical_pos - 1],
+					&state->info_png.unknown_chunks_size[critical_pos - 1], chunk);
+				if (state->error) break;
+			}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+		}
+
+		if (!state->decoder.ignore_crc && !unknown) /*check CRC if wanted, only on known chunk types*/
+		{
+			if (lodepng_chunk_check_crc(chunk)) CERROR_BREAK(state->error, 57); /*invalid CRC*/
+		}
+
+		if (!IEND) chunk = lodepng_chunk_next_const(chunk);
+	}
+
+	ucvector_init(&scanlines);
+	/*predict output size, to allocate exact size for output buffer to avoid more dynamic allocation.
+	If the decompressed size does not match the prediction, the image must be corrupt.*/
+	if (state->info_png.interlace_method == 0)
+	{
+		/*The extra *h is added because these are the filter bytes every scanline starts with*/
+		predict = lodepng_get_raw_size_idat(*w, *h, &state->info_png.color) + *h;
+	}
+	else
+	{
+		/*Adam-7 interlaced: predicted size is the sum of the 7 sub-images sizes*/
+		/*passes 2, 4 and 6 are only counted if the image is wide enough for them to contain pixels*/
+		const LodePNGColorMode* color = &state->info_png.color;
+		predict = 0;
+		predict += lodepng_get_raw_size_idat((*w + 7) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+		if (*w > 4) predict += lodepng_get_raw_size_idat((*w + 3) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+		predict += lodepng_get_raw_size_idat((*w + 3) >> 2, (*h + 3) >> 3, color) + ((*h + 3) >> 3);
+		if (*w > 2) predict += lodepng_get_raw_size_idat((*w + 1) >> 2, (*h + 3) >> 2, color) + ((*h + 3) >> 2);
+		predict += lodepng_get_raw_size_idat((*w + 1) >> 1, (*h + 1) >> 2, color) + ((*h + 1) >> 2);
+		if (*w > 1) predict += lodepng_get_raw_size_idat((*w + 0) >> 1, (*h + 1) >> 1, color) + ((*h + 1) >> 1);
+		predict += lodepng_get_raw_size_idat((*w + 0), (*h + 0) >> 1, color) + ((*h + 0) >> 1);
+	}
+	if (!state->error && !ucvector_reserve(&scanlines, predict)) state->error = 83; /*alloc fail*/
+	if (!state->error)
+	{
+		state->error = zlib_decompress(&scanlines.data, &scanlines.size, idat.data,
+			idat.size, &state->decoder.zlibsettings);
+		if (!state->error && scanlines.size != predict) state->error = 91; /*decompressed size doesn't match prediction*/
+	}
+	/*the compressed data is no longer needed, whether decompression succeeded or not*/
+	ucvector_cleanup(&idat);
+
+	if (!state->error)
+	{
+		outsize = lodepng_get_raw_size(*w, *h, &state->info_png.color);
+		*out = (unsigned char*)lodepng_malloc(outsize);
+		if (!*out) state->error = 83; /*alloc fail*/
+	}
+	if (!state->error)
+	{
+		/*the output is zeroed first: Adam7_deinterlace requires an all 0 out buffer if bpp < 8*/
+		for (i = 0; i < outsize; i++) (*out)[i] = 0;
+		state->error = postProcessScanlines(*out, scanlines.data, *w, *h, &state->info_png);
+	}
+	ucvector_cleanup(&scanlines);
+}
+
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+	LodePNGState* state,
+	const unsigned char* in, size_t insize)
+{
+	unsigned char* decoded;
+	size_t convertedsize;
+
+	/*decode the PNG into a buffer that uses the PNG's own color mode*/
+	*out = 0;
+	decodeGeneric(out, w, h, state, in, insize);
+	if (state->error) return state->error;
+
+	if (!state->decoder.color_convert)
+	{
+		/*no conversion requested: make info_raw describe the color mode of the returned pixels*/
+		state->error = lodepng_color_mode_copy(&state->info_raw, &state->info_png.color);
+		return state->error;
+	}
+
+	/*requested color mode already equals the PNG's: the decoded buffer is returned as is*/
+	if (lodepng_color_mode_equal(&state->info_raw, &state->info_png.color)) return state->error;
+
+	/*TODO: check if this works according to the statement in the documentation: "The converter can convert
+	from greyscale input color type, to 8-bit greyscale or greyscale with alpha"*/
+	if (state->info_raw.colortype != LCT_RGB && state->info_raw.colortype != LCT_RGBA
+		&& state->info_raw.bitdepth != 8)
+	{
+		return 56; /*unsupported color mode conversion*/
+	}
+
+	/*convert into a freshly allocated buffer, then release the intermediate one*/
+	decoded = *out;
+	convertedsize = lodepng_get_raw_size(*w, *h, &state->info_raw);
+	*out = (unsigned char*)lodepng_malloc(convertedsize);
+	if (*out)
+	{
+		state->error = lodepng_convert(*out, decoded, &state->info_raw,
+			&state->info_png.color, *w, *h);
+	}
+	else
+	{
+		state->error = 83; /*alloc fail*/
+	}
+	lodepng_free(decoded);
+	return state->error;
+}
+
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in,
+	size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+{
+	/*decode with a temporary state whose requested raw color mode is colortype/bitdepth*/
+	LodePNGState tempstate;
+	unsigned result;
+
+	lodepng_state_init(&tempstate);
+	tempstate.info_raw.bitdepth = bitdepth;
+	tempstate.info_raw.colortype = colortype;
+	result = lodepng_decode(out, w, h, &tempstate, in, insize);
+	lodepng_state_cleanup(&tempstate);
+
+	return result;
+}
+
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+	/*request RGBA output with 8 bits per channel*/
+	const LodePNGColorType colortype = LCT_RGBA;
+	const unsigned bitdepth = 8;
+	return lodepng_decode_memory(out, w, h, in, insize, colortype, bitdepth);
+}
+
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+	/*request RGB output with 8 bits per channel*/
+	const LodePNGColorType colortype = LCT_RGB;
+	const unsigned bitdepth = 8;
+	return lodepng_decode_memory(out, w, h, in, insize, colortype, bitdepth);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename,
+	LodePNGColorType colortype, unsigned bitdepth)
+{
+	unsigned char* filedata = 0;
+	size_t filesize;
+	unsigned status;
+
+	/*load the whole file, decode it from memory, then release the file contents*/
+	status = lodepng_load_file(&filedata, &filesize, filename);
+	if (status == 0)
+	{
+		status = lodepng_decode_memory(out, w, h, filedata, filesize, colortype, bitdepth);
+	}
+	lodepng_free(filedata);
+
+	return status;
+}
+
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+	/*request RGBA output with 8 bits per channel*/
+	const LodePNGColorType colortype = LCT_RGBA;
+	const unsigned bitdepth = 8;
+	return lodepng_decode_file(out, w, h, filename, colortype, bitdepth);
+}
+
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+	/*request RGB output with 8 bits per channel*/
+	const LodePNGColorType colortype = LCT_RGB;
+	const unsigned bitdepth = 8;
+	return lodepng_decode_file(out, w, h, filename, colortype, bitdepth);
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings)
+{
+	/*defaults of the embedded decompression settings*/
+	lodepng_decompress_settings_init(&settings->zlibsettings);
+
+	/*defaults of the decoder's own switches*/
+	settings->ignore_crc = 0;
+	settings->color_convert = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	settings->remember_unknown_chunks = 0;
+	settings->read_text_chunks = 1;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+
+void lodepng_state_init(LodePNGState* state)
+{
+	/*image description members*/
+	lodepng_info_init(&state->info_png);
+	lodepng_color_mode_init(&state->info_raw);
+
+	/*settings members, for whichever halves of the library are compiled in*/
+#ifdef LODEPNG_COMPILE_ENCODER
+	lodepng_encoder_settings_init(&state->encoder);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#ifdef LODEPNG_COMPILE_DECODER
+	lodepng_decoder_settings_init(&state->decoder);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+	/*a fresh state starts with error value 1*/
+	state->error = 1;
+}
+
+void lodepng_state_cleanup(LodePNGState* state)
+{
+	/*clean up the two image description members of the state*/
+	lodepng_info_cleanup(&state->info_png);
+	lodepng_color_mode_cleanup(&state->info_raw);
+}
+
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source)
+{
+	/*drop whatever dest held, then start from a member-wise copy of source*/
+	lodepng_state_cleanup(dest);
+	*dest = *source;
+
+	/*re-initialize the two info members and fill them through their own copy functions
+	(presumably because they own heap data that must not be shared with source)*/
+	lodepng_info_init(&dest->info_png);
+	lodepng_color_mode_init(&dest->info_raw);
+
+	dest->error = lodepng_color_mode_copy(&dest->info_raw, &source->info_raw);
+	if (!dest->error)
+	{
+		dest->error = lodepng_info_copy(&dest->info_png, &source->info_png);
+	}
+}
+
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Encoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*chunkName must be string of 4 characters*/
+static unsigned addChunk(ucvector* out, const char* chunkName, const unsigned char* data, size_t length)
+{
+	unsigned chunklength = (unsigned)length;
+
+	/*append the chunk to the raw buffer; the macro returns the error code on failure*/
+	CERROR_TRY_RETURN(lodepng_chunk_create(&out->data, &out->size, chunklength, chunkName, data));
+
+	/*the buffer was resized behind the vector's back, so resynchronize allocsize with size*/
+	out->allocsize = out->size;
+	return 0;
+}
+
+static void writeSignature(ucvector* out)
+{
+	/*8 bytes PNG signature, aka the magic bytes*/
+	static const unsigned char signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+	size_t pos;
+
+	for (pos = 0; pos != 8; ++pos) ucvector_push_back(out, signature[pos]);
+}
+
+static unsigned addChunk_IHDR(ucvector* out, unsigned w, unsigned h,
+	LodePNGColorType colortype, unsigned bitdepth, unsigned interlace_method)
+{
+	unsigned status;
+	ucvector payload;
+
+	ucvector_init(&payload);
+
+	/*image dimensions, 4 bytes each*/
+	lodepng_add32bitInt(&payload, w);
+	lodepng_add32bitInt(&payload, h);
+
+	/*five single-byte fields*/
+	ucvector_push_back(&payload, (unsigned char)bitdepth);
+	ucvector_push_back(&payload, (unsigned char)colortype);
+	ucvector_push_back(&payload, 0); /*compression method*/
+	ucvector_push_back(&payload, 0); /*filter method*/
+	ucvector_push_back(&payload, interlace_method);
+
+	status = addChunk(out, "IHDR", payload.data, payload.size);
+	ucvector_cleanup(&payload);
+
+	return status;
+}
+
+static unsigned addChunk_PLTE(ucvector* out, const LodePNGColorMode* info)
+{
+	unsigned status;
+	size_t entry, channel;
+	ucvector payload;
+
+	ucvector_init(&payload);
+
+	/*the palette is stored as RGBA quadruples; write only the R, G and B byte of each entry*/
+	for (entry = 0; entry != info->palettesize; ++entry)
+	{
+		for (channel = 0; channel != 3; ++channel)
+		{
+			ucvector_push_back(&payload, info->palette[4 * entry + channel]);
+		}
+	}
+
+	status = addChunk(out, "PLTE", payload.data, payload.size);
+	ucvector_cleanup(&payload);
+
+	return status;
+}
+
+static unsigned addChunk_tRNS(ucvector* out, const LodePNGColorMode* info)
+{
+	unsigned status;
+	size_t k;
+	ucvector payload;
+
+	ucvector_init(&payload);
+
+	switch (info->colortype)
+	{
+	case LCT_PALETTE:
+	{
+		/*the tail of palette values that all have 255 as alpha, does not have to be encoded*/
+		size_t count = info->palettesize;
+		while (count != 0 && info->palette[4 * (count - 1) + 3] == 255) --count;
+
+		/*one alpha byte per remaining palette entry*/
+		for (k = 0; k != count; ++k) ucvector_push_back(&payload, info->palette[4 * k + 3]);
+		break;
+	}
+	case LCT_GREY:
+		if (info->key_defined)
+		{
+			/*the grey key is written as one 16-bit value, high byte first*/
+			ucvector_push_back(&payload, (unsigned char)(info->key_r >> 8));
+			ucvector_push_back(&payload, (unsigned char)(info->key_r & 255));
+		}
+		break;
+	case LCT_RGB:
+		if (info->key_defined)
+		{
+			/*the RGB key is written as three 16-bit values, high byte first*/
+			ucvector_push_back(&payload, (unsigned char)(info->key_r >> 8));
+			ucvector_push_back(&payload, (unsigned char)(info->key_r & 255));
+			ucvector_push_back(&payload, (unsigned char)(info->key_g >> 8));
+			ucvector_push_back(&payload, (unsigned char)(info->key_g & 255));
+			ucvector_push_back(&payload, (unsigned char)(info->key_b >> 8));
+			ucvector_push_back(&payload, (unsigned char)(info->key_b & 255));
+		}
+		break;
+	default:
+		/*other color types: the chunk is written with an empty payload*/
+		break;
+	}
+
+	status = addChunk(out, "tRNS", payload.data, payload.size);
+	ucvector_cleanup(&payload);
+
+	return status;
+}
+
+static unsigned addChunk_IDAT(ucvector* out, const unsigned char* data, size_t datasize,
+	LodePNGCompressSettings* zlibsettings)
+{
+	unsigned status;
+	ucvector compressed;
+
+	ucvector_init(&compressed);
+
+	/*zlib-compress the data; only a successful result is written as the IDAT chunk*/
+	status = zlib_compress(&compressed.data, &compressed.size, data, datasize, zlibsettings);
+	if (status == 0)
+	{
+		status = addChunk(out, "IDAT", compressed.data, compressed.size);
+	}
+	ucvector_cleanup(&compressed);
+
+	return status;
+}
+
+static unsigned addChunk_IEND(ucvector* out)
+{
+	/*IEND has no payload*/
+	return addChunk(out, "IEND", 0, 0);
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*
+Appends a tEXt chunk to out: keyword bytes, a 0 separator, then the text bytes.
+keyword and textstring are 0-terminated strings; keyword must be 1 to 79 characters long.
+Returns 0 on success, 89 if the keyword length is invalid, or the error code of addChunk.
+*/
+static unsigned addChunk_tEXt(ucvector* out, const char* keyword, const char* textstring)
+{
+	unsigned error = 0;
+	size_t i;
+	ucvector text;
+	ucvector_init(&text);
+	for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)keyword[i]);
+	if (i < 1 || i > 79)
+	{
+		/*release the keyword bytes already collected, otherwise they leak on this error path*/
+		ucvector_cleanup(&text);
+		return 89; /*error: invalid keyword size*/
+	}
+	ucvector_push_back(&text, 0); /*0 termination char*/
+	for (i = 0; textstring[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)textstring[i]);
+	error = addChunk(out, "tEXt", text.data, text.size);
+	ucvector_cleanup(&text);
+
+	return error;
+}
+
+/*
+Appends a zTXt chunk to out: keyword bytes, a 0 separator, compression method 0, then the
+zlib-compressed text.
+keyword and textstring are 0-terminated strings; keyword must be 1 to 79 characters long.
+Returns 0 on success, 89 if the keyword length is invalid, or the error code of zlib_compress/addChunk.
+*/
+static unsigned addChunk_zTXt(ucvector* out, const char* keyword, const char* textstring,
+	LodePNGCompressSettings* zlibsettings)
+{
+	unsigned error = 0;
+	ucvector data, compressed;
+	size_t i, textsize = strlen(textstring);
+
+	ucvector_init(&data);
+	ucvector_init(&compressed);
+	for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+	if (i < 1 || i > 79)
+	{
+		/*release both vectors before bailing out, otherwise the keyword bytes leak*/
+		ucvector_cleanup(&compressed);
+		ucvector_cleanup(&data);
+		return 89; /*error: invalid keyword size*/
+	}
+	ucvector_push_back(&data, 0); /*0 termination char*/
+	ucvector_push_back(&data, 0); /*compression method: 0*/
+
+	error = zlib_compress(&compressed.data, &compressed.size,
+		(unsigned char*)textstring, textsize, zlibsettings);
+	if (!error)
+	{
+		for (i = 0; i != compressed.size; ++i) ucvector_push_back(&data, compressed.data[i]);
+		error = addChunk(out, "zTXt", data.data, data.size);
+	}
+
+	ucvector_cleanup(&compressed);
+	ucvector_cleanup(&data);
+	return error;
+}
+
+/*
+Appends an iTXt chunk to out: keyword, 0, compression flag, compression method 0, language tag, 0,
+translated keyword, 0, then the text (zlib-compressed when compressed is nonzero, raw otherwise).
+All string parameters are 0-terminated; keyword must be 1 to 79 characters long.
+Returns 0 on success, 89 if the keyword length is invalid, or the error code of zlib_compress/addChunk.
+*/
+static unsigned addChunk_iTXt(ucvector* out, unsigned compressed, const char* keyword, const char* langtag,
+	const char* transkey, const char* textstring, LodePNGCompressSettings* zlibsettings)
+{
+	unsigned error = 0;
+	ucvector data;
+	size_t i, textsize = strlen(textstring);
+
+	ucvector_init(&data);
+
+	for (i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+	if (i < 1 || i > 79)
+	{
+		/*release the keyword bytes already collected, otherwise they leak on this error path*/
+		ucvector_cleanup(&data);
+		return 89; /*error: invalid keyword size*/
+	}
+	ucvector_push_back(&data, 0); /*null termination char*/
+	ucvector_push_back(&data, compressed ? 1 : 0); /*compression flag*/
+	ucvector_push_back(&data, 0); /*compression method*/
+	for (i = 0; langtag[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)langtag[i]);
+	ucvector_push_back(&data, 0); /*null termination char*/
+	for (i = 0; transkey[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)transkey[i]);
+	ucvector_push_back(&data, 0); /*null termination char*/
+
+	if (compressed)
+	{
+		ucvector compressed_data;
+		ucvector_init(&compressed_data);
+		error = zlib_compress(&compressed_data.data, &compressed_data.size,
+			(unsigned char*)textstring, textsize, zlibsettings);
+		if (!error)
+		{
+			for (i = 0; i != compressed_data.size; ++i) ucvector_push_back(&data, compressed_data.data[i]);
+		}
+		ucvector_cleanup(&compressed_data);
+	}
+	else /*not compressed*/
+	{
+		for (i = 0; textstring[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)textstring[i]);
+	}
+
+	if (!error) error = addChunk(out, "iTXt", data.data, data.size);
+	ucvector_cleanup(&data);
+	return error;
+}
+
+static unsigned addChunk_bKGD(ucvector* out, const LodePNGInfo* info)
+{
+	unsigned status;
+	ucvector payload;
+
+	ucvector_init(&payload);
+
+	switch (info->color.colortype)
+	{
+	case LCT_GREY:
+	case LCT_GREY_ALPHA:
+		/*one 16-bit grey value, high byte first*/
+		ucvector_push_back(&payload, (unsigned char)(info->background_r >> 8));
+		ucvector_push_back(&payload, (unsigned char)(info->background_r & 255));
+		break;
+	case LCT_RGB:
+	case LCT_RGBA:
+		/*three 16-bit values (R, G, B), high byte first*/
+		ucvector_push_back(&payload, (unsigned char)(info->background_r >> 8));
+		ucvector_push_back(&payload, (unsigned char)(info->background_r & 255));
+		ucvector_push_back(&payload, (unsigned char)(info->background_g >> 8));
+		ucvector_push_back(&payload, (unsigned char)(info->background_g & 255));
+		ucvector_push_back(&payload, (unsigned char)(info->background_b >> 8));
+		ucvector_push_back(&payload, (unsigned char)(info->background_b & 255));
+		break;
+	case LCT_PALETTE:
+		ucvector_push_back(&payload, (unsigned char)(info->background_r & 255)); /*palette index*/
+		break;
+	default:
+		/*any other color type: the chunk is written with an empty payload*/
+		break;
+	}
+
+	status = addChunk(out, "bKGD", payload.data, payload.size);
+	ucvector_cleanup(&payload);
+
+	return status;
+}
+
+static unsigned addChunk_tIME(ucvector* out, const LodePNGTime* time)
+{
+	unsigned status;
+	unsigned char* payload = (unsigned char*)lodepng_malloc(7);
+
+	if (payload == 0) return 83; /*alloc fail*/
+
+	/*2-byte year (high byte first), then one byte each for month, day, hour, minute, second*/
+	payload[0] = (unsigned char)(time->year >> 8);
+	payload[1] = (unsigned char)(time->year & 255);
+	payload[2] = (unsigned char)time->month;
+	payload[3] = (unsigned char)time->day;
+	payload[4] = (unsigned char)time->hour;
+	payload[5] = (unsigned char)time->minute;
+	payload[6] = (unsigned char)time->second;
+
+	status = addChunk(out, "tIME", payload, 7);
+	lodepng_free(payload);
+
+	return status;
+}
+
+static unsigned addChunk_pHYs(ucvector* out, const LodePNGInfo* info)
+{
+	unsigned status;
+	ucvector payload;
+
+	ucvector_init(&payload);
+
+	/*two 32-bit values (x, y) followed by the unit byte*/
+	lodepng_add32bitInt(&payload, info->phys_x);
+	lodepng_add32bitInt(&payload, info->phys_y);
+	ucvector_push_back(&payload, info->phys_unit);
+
+	status = addChunk(out, "pHYs", payload.data, payload.size);
+	ucvector_cleanup(&payload);
+
+	return status;
+}
+
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+static void filterScanline(unsigned char* out, const unsigned char* scanline, const unsigned char* prevline,
+	size_t length, size_t bytewidth, unsigned char filterType)
+{
+	/*
+	Writes the filtered version of scanline (length bytes) to out, for PNG filter type filterType (0-4).
+	prevline is the previous scanline, or null when there is none (it then counts as all zeroes).
+	bytewidth is the distance in bytes to the byte "to the left". Unknown filter types write nothing.
+	*/
+	size_t k;
+
+	if (filterType == 0) /*None*/
+	{
+		for (k = 0; k != length; ++k) out[k] = scanline[k];
+	}
+	else if (filterType == 1) /*Sub*/
+	{
+		for (k = 0; k != bytewidth; ++k) out[k] = scanline[k];
+		for (; k < length; ++k) out[k] = scanline[k] - scanline[k - bytewidth];
+	}
+	else if (filterType == 2) /*Up*/
+	{
+		if (!prevline)
+		{
+			for (k = 0; k != length; ++k) out[k] = scanline[k];
+		}
+		else
+		{
+			for (k = 0; k != length; ++k) out[k] = scanline[k] - prevline[k];
+		}
+	}
+	else if (filterType == 3) /*Average*/
+	{
+		if (!prevline)
+		{
+			for (k = 0; k != bytewidth; ++k) out[k] = scanline[k];
+			for (; k < length; ++k) out[k] = scanline[k] - (scanline[k - bytewidth] >> 1);
+		}
+		else
+		{
+			for (k = 0; k != bytewidth; ++k) out[k] = scanline[k] - (prevline[k] >> 1);
+			for (; k < length; ++k) out[k] = scanline[k] - ((scanline[k - bytewidth] + prevline[k]) >> 1);
+		}
+	}
+	else if (filterType == 4) /*Paeth*/
+	{
+		if (!prevline)
+		{
+			for (k = 0; k != bytewidth; ++k) out[k] = scanline[k];
+			/*paethPredictor(scanline[k - bytewidth], 0, 0) is always scanline[k - bytewidth]*/
+			for (; k < length; ++k) out[k] = (scanline[k] - scanline[k - bytewidth]);
+		}
+		else
+		{
+			/*paethPredictor(0, prevline[k], 0) is always prevline[k]*/
+			for (k = 0; k != bytewidth; ++k) out[k] = (scanline[k] - prevline[k]);
+			for (; k < length; ++k)
+			{
+				out[k] = (scanline[k] - paethPredictor(scanline[k - bytewidth], prevline[k], prevline[k - bytewidth]));
+			}
+		}
+	}
+	/*any other value: unexisting filter type given, nothing is written*/
+}
+
+/* log2 approximation. A slight bit faster than std::log. */
+static float flog2(float f)
+{
+	/*bring f down to at most 2 by repeated division, counting the extracted powers of two*/
+	float exponent = 0;
+	for (; f > 32; f /= 16) exponent += 4;
+	for (; f > 2; f /= 2) exponent += 1;
+	/*cubic polynomial in the remaining f, scaled by 1.442695*/
+	return exponent + 1.442695f * (f * f * f / 3 - 3 * f * f / 2 + 3 * f - 1.83333f);
+}
+
+/*
+Applies PNG filter method 0 to an image of h scanlines of w pixels each.
+in: the unfiltered scanlines, each (w * bpp + 7) / 8 bytes, without filter type bytes.
+out: receives, per scanline, one filter type byte followed by the filtered scanline bytes.
+info: color mode of the pixels; settings: selects the filter strategy.
+Returns 0 on success, 31 for an invalid color type (bpp 0), 83 on allocation failure,
+88 for an unknown filter strategy.
+*/
+static unsigned filter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h,
+	const LodePNGColorMode* info, const LodePNGEncoderSettings* settings)
+{
+	/*
+	For PNG filter method 0
+	out must be a buffer with as size: h + (w * h * bpp + 7) / 8, because there are
+	the scanlines with 1 extra byte per scanline
+	*/
+
+	unsigned bpp = lodepng_get_bpp(info);
+	/*the width of a scanline in bytes, not including the filter type*/
+	size_t linebytes = (w * bpp + 7) / 8;
+	/*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+	size_t bytewidth = (bpp + 7) / 8;
+	const unsigned char* prevline = 0;
+	unsigned x, y;
+	unsigned error = 0;
+	LodePNGFilterStrategy strategy = settings->filter_strategy;
+
+	/*
+	There is a heuristic called the minimum sum of absolute differences heuristic, suggested by the PNG standard:
+	 *  If the image type is Palette, or the bit depth is smaller than 8, then do not filter the image (i.e.
+		 use fixed filtering, with the filter None).
+	 * (The other case) If the image type is Grayscale or RGB (with or without Alpha), and the bit depth is
+		not smaller than 8, then use adaptive filtering heuristic as follows: independently for each row, apply
+		all five filters and select the filter that produces the smallest sum of absolute values per row.
+	This heuristic is used if filter strategy is LFS_MINSUM and filter_palette_zero is true.
+
+	If filter_palette_zero is true and filter_strategy is not LFS_MINSUM, the above heuristic is followed,
+	but for "the other case", whatever strategy filter_strategy is set to instead of the minimum sum
+	heuristic is used.
+	*/
+	if (settings->filter_palette_zero &&
+		(info->colortype == LCT_PALETTE || info->bitdepth < 8)) strategy = LFS_ZERO;
+
+	if (bpp == 0) return 31; /*error: invalid color type*/
+
+	if (strategy == LFS_ZERO)
+	{
+		for (y = 0; y != h; ++y)
+		{
+			size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+			size_t inindex = linebytes * y;
+			out[outindex] = 0; /*filter type byte*/
+			filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, 0);
+			prevline = &in[inindex];
+		}
+	}
+	else if (strategy == LFS_MINSUM)
+	{
+		/*adaptive filtering*/
+		size_t sum[5];
+		unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+		size_t smallest = 0;
+		unsigned char type, bestType = 0;
+
+		/*Try all five allocations even if one fails, so that every attempt[] entry holds either a
+		buffer or a null pointer and the cleanup loop below can release them all without leaking.*/
+		for (type = 0; type != 5; ++type)
+		{
+			attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+			if (!attempt[type]) error = 83; /*alloc fail*/
+		}
+
+		if (!error)
+		{
+			for (y = 0; y != h; ++y)
+			{
+				/*try the 5 filter types*/
+				for (type = 0; type != 5; ++type)
+				{
+					filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+
+					/*calculate the sum of the result*/
+					sum[type] = 0;
+					if (type == 0)
+					{
+						for (x = 0; x != linebytes; ++x) sum[type] += (unsigned char)(attempt[type][x]);
+					}
+					else
+					{
+						for (x = 0; x != linebytes; ++x)
+						{
+							/*For differences, each byte should be treated as signed, values above 127 are negative
+							(converted to signed char). Filtertype 0 isn't a difference though, so use unsigned there.
+							This means filtertype 0 is almost never chosen, but that is justified.*/
+							unsigned char s = attempt[type][x];
+							sum[type] += s < 128 ? s : (255U - s);
+						}
+					}
+
+					/*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+					if (type == 0 || sum[type] < smallest)
+					{
+						bestType = type;
+						smallest = sum[type];
+					}
+				}
+
+				prevline = &in[y * linebytes];
+
+				/*now fill the out values*/
+				out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+				for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+			}
+		}
+
+		for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+	}
+	else if (strategy == LFS_ENTROPY)
+	{
+		float sum[5];
+		unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+		float smallest = 0;
+		unsigned type, bestType = 0;
+		unsigned count[256];
+
+		/*allocate all five buffers; on failure only record the error so the cleanup loop still runs*/
+		for (type = 0; type != 5; ++type)
+		{
+			attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+			if (!attempt[type]) error = 83; /*alloc fail*/
+		}
+
+		if (!error)
+		{
+			for (y = 0; y != h; ++y)
+			{
+				/*try the 5 filter types*/
+				for (type = 0; type != 5; ++type)
+				{
+					filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+					for (x = 0; x != 256; ++x) count[x] = 0;
+					for (x = 0; x != linebytes; ++x) ++count[attempt[type][x]];
+					++count[type]; /*the filter type itself is part of the scanline*/
+					sum[type] = 0;
+					for (x = 0; x != 256; ++x)
+					{
+						float p = count[x] / (float)(linebytes + 1);
+						sum[type] += count[x] == 0 ? 0 : flog2(1 / p) * p;
+					}
+					/*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+					if (type == 0 || sum[type] < smallest)
+					{
+						bestType = type;
+						smallest = sum[type];
+					}
+				}
+
+				prevline = &in[y * linebytes];
+
+				/*now fill the out values*/
+				out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+				for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+			}
+		}
+
+		for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+	}
+	else if (strategy == LFS_PREDEFINED)
+	{
+		for (y = 0; y != h; ++y)
+		{
+			size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+			size_t inindex = linebytes * y;
+			unsigned char type = settings->predefined_filters[y];
+			out[outindex] = type; /*filter type byte*/
+			filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, type);
+			prevline = &in[inindex];
+		}
+	}
+	else if (strategy == LFS_BRUTE_FORCE)
+	{
+		/*brute force filter chooser.
+		deflate the scanline after every filter attempt to see which one deflates best.
+		This is very slow and gives only slightly smaller, sometimes even larger, result*/
+		size_t size[5];
+		unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+		size_t smallest = 0;
+		unsigned type = 0, bestType = 0;
+		unsigned char* dummy;
+		LodePNGCompressSettings zlibsettings = settings->zlibsettings;
+		/*use fixed tree on the attempts so that the tree is not adapted to the filtertype on purpose,
+		to simulate the true case where the tree is the same for the whole image. Sometimes it gives
+		better result with dynamic tree anyway. Using the fixed tree sometimes gives worse, but in rare
+		cases better compression. It does make this a bit less slow, so it's worth doing this.*/
+		zlibsettings.btype = 1;
+		/*a custom encoder likely doesn't read the btype setting and is optimized for complete PNG
+		images only, so disable it*/
+		zlibsettings.custom_zlib = 0;
+		zlibsettings.custom_deflate = 0;
+		/*allocate all five buffers; on failure only record the error so the cleanup loop still runs*/
+		for (type = 0; type != 5; ++type)
+		{
+			attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+			if (!attempt[type]) error = 83; /*alloc fail*/
+		}
+		if (!error)
+		{
+			for (y = 0; y != h; ++y) /*try the 5 filter types*/
+			{
+				for (type = 0; type != 5; ++type)
+				{
+					unsigned testsize = linebytes;
+					/*if(testsize > 8) testsize /= 8;*/ /*it already works good enough by testing a part of the row*/
+
+					filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+					size[type] = 0;
+					dummy = 0;
+					zlib_compress(&dummy, &size[type], attempt[type], testsize, &zlibsettings);
+					lodepng_free(dummy);
+					/*check if this is smallest size (or if type == 0 it's the first case so always store the values)*/
+					if (type == 0 || size[type] < smallest)
+					{
+						bestType = type;
+						smallest = size[type];
+					}
+				}
+				prevline = &in[y * linebytes];
+				out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+				for (x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+			}
+		}
+		for (type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+	}
+	else return 88; /* unknown filter strategy */
+
+	return error;
+}
+
+static void addPaddingBits(unsigned char* out, const unsigned char* in,
+	size_t olinebits, size_t ilinebits, unsigned h)
+{
+	/*
+	The opposite of the removePaddingBits function: copies h lines of ilinebits bits each from in
+	to out, extending every line to olinebits bits. olinebits must be >= ilinebits.
+	*/
+	size_t padbits = olinebits - ilinebits;
+	size_t outpos = 0, inpos = 0; /*bit pointers*/
+	unsigned row;
+
+	for (row = 0; row != h; ++row)
+	{
+		size_t remaining;
+
+		/*copy the payload bits of this line*/
+		for (remaining = ilinebits; remaining > 0; --remaining)
+		{
+			unsigned char bit = readBitFromReversedStream(&inpos, in);
+			setBitOfReversedStream(&outpos, out, bit);
+		}
+
+		/*write the padding bits as zeroes instead of skipping them, to avoid
+		"Use of uninitialised value of size ###" warning from valgrind*/
+		for (remaining = padbits; remaining != 0; --remaining)
+		{
+			setBitOfReversedStream(&outpos, out, 0);
+		}
+	}
+}
+
+/*
+in: non-interlaced image with size w*h
+out: the same pixels, but re-ordered according to PNG's Adam7 interlacing, with
+ no padding bits between scanlines, but between reduced images so that each
+ reduced image starts at a byte.
+bpp: bits per pixel
+there are no padding bits, not between scanlines, not between reduced images
+in has the following size in bits: w * h * bpp.
+out is possibly bigger due to padding bits between reduced images
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_interlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+	unsigned passw[7], passh[7];
+	size_t filter_passstart[8], padded_passstart[8], passstart[8];
+	unsigned pass;
+
+	Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+	for (pass = 0; pass != 7; ++pass)
+	{
+		unsigned px, py, part;
+
+		if (bpp >= 8)
+		{
+			/*whole-byte pixels: copy bytewidth bytes per pixel*/
+			size_t bytewidth = bpp / 8;
+			for (py = 0; py < passh[pass]; ++py)
+			{
+				for (px = 0; px < passw[pass]; ++px)
+				{
+					size_t src = ((ADAM7_IY[pass] + py * ADAM7_DY[pass]) * w + ADAM7_IX[pass] + px * ADAM7_DX[pass]) * bytewidth;
+					size_t dst = passstart[pass] + (py * passw[pass] + px) * bytewidth;
+					for (part = 0; part < bytewidth; ++part)
+					{
+						out[dst + part] = in[src + part];
+					}
+				}
+			}
+		}
+		else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/
+		{
+			unsigned ilinebits = bpp * passw[pass];
+			unsigned olinebits = bpp * w;
+			size_t dstbit, srcbit; /*bit pointers (for out and in buffer)*/
+			for (py = 0; py < passh[pass]; ++py)
+			{
+				for (px = 0; px < passw[pass]; ++px)
+				{
+					srcbit = (ADAM7_IY[pass] + py * ADAM7_DY[pass]) * olinebits + (ADAM7_IX[pass] + px * ADAM7_DX[pass]) * bpp;
+					dstbit = (8 * passstart[pass]) + (py * ilinebits + px * bpp);
+					for (part = 0; part < bpp; ++part)
+					{
+						unsigned char bit = readBitFromReversedStream(&srcbit, in);
+						setBitOfReversedStream(&dstbit, out, bit);
+					}
+				}
+			}
+		}
+	}
+}
+
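+/*Converts the full raw image in (w * h pixels in the PNG's color type) into the uncompressed IDAT data.
+*out is allocated by this function with lodepng_malloc and *outsize receives its size in bytes.
+The return value is an error code (0 means success).*/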
+static unsigned preProcessScanlines(unsigned char** out, size_t* outsize, const unsigned char* in,
+	unsigned w, unsigned h,
+	const LodePNGInfo* info_png, const LodePNGEncoderSettings* settings)
+{
+	/*
+	Turns the plain 2D image (in the PNG's colortype) into filtered-padded-interlaced data:
+	*) without Adam7: 1) add padding bits (= possible extra bits per scanline if bpp < 8) 2) filter
+	*) with Adam7: 1) Adam7_interlace 2) 7x add padding bits 3) 7x filter
+	*/
+	unsigned bpp = lodepng_get_bpp(&info_png->color);
+	unsigned error = 0;
+
+	if (info_png->interlace_method != 0) /*interlace_method is 1 (Adam7)*/
+	{
+		unsigned passw[7], passh[7];
+		size_t filter_passstart[8], padded_passstart[8], passstart[8];
+		unsigned char* interlaced;
+
+		Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+		*outsize = filter_passstart[7]; /*image size plus an extra byte per scanline + possible padding bits*/
+		*out = (unsigned char*)lodepng_malloc(*outsize);
+		if (!(*out)) error = 83; /*alloc fail*/
+
+		interlaced = (unsigned char*)lodepng_malloc(passstart[7]);
+		if (!interlaced && passstart[7]) error = 83; /*alloc fail*/
+
+		if (!error)
+		{
+			unsigned pass;
+
+			Adam7_interlace(interlaced, in, w, h, bpp);
+			for (pass = 0; pass != 7; ++pass)
+			{
+				unsigned char* dest = &(*out)[filter_passstart[pass]];
+
+				if (bpp >= 8)
+				{
+					error = filter(dest, &interlaced[padded_passstart[pass]],
+						passw[pass], passh[pass], &info_png->color, settings);
+				}
+				else
+				{
+					/*each scanline of the pass is padded to a whole number of bytes before filtering*/
+					unsigned char* padded = (unsigned char*)lodepng_malloc(padded_passstart[pass + 1] - padded_passstart[pass]);
+					if (!padded) ERROR_BREAK(83); /*alloc fail*/
+					addPaddingBits(padded, &interlaced[passstart[pass]],
+						((passw[pass] * bpp + 7) / 8) * 8, passw[pass] * bpp, passh[pass]);
+					error = filter(dest, padded,
+						passw[pass], passh[pass], &info_png->color, settings);
+					lodepng_free(padded);
+				}
+
+				if (error) break;
+			}
+		}
+
+		lodepng_free(interlaced);
+	}
+	else /*no interlacing*/
+	{
+		/*bytes per scanline, rounded up to a whole byte*/
+		unsigned linebytes = (w * bpp + 7) / 8;
+
+		*outsize = h + (h * linebytes); /*image size plus an extra byte per scanline + possible padding bits*/
+		*out = (unsigned char*)lodepng_malloc(*outsize);
+		if (!(*out) && (*outsize)) error = 83; /*alloc fail*/
+
+		if (!error)
+		{
+			if (bpp < 8 && w * bpp != linebytes * 8)
+			{
+				/*non multiple of 8 bits per scanline, padding bits needed per scanline*/
+				unsigned char* padded = (unsigned char*)lodepng_malloc(h * linebytes);
+				if (padded)
+				{
+					addPaddingBits(padded, in, linebytes * 8, w * bpp, h);
+					error = filter(*out, padded, w, h, &info_png->color, settings);
+				}
+				else
+				{
+					error = 83; /*alloc fail*/
+				}
+				lodepng_free(padded);
+			}
+			else
+			{
+				/*we can immediately filter into the out buffer, no other steps needed*/
+				error = filter(*out, in, w, h, &info_png->color, settings);
+			}
+		}
+	}
+
+	return error;
+}
+
+/*
+palette must have 4 * palettesize bytes allocated, and given in format RGBARGBARGBARGBA...
+returns 0 if the palette is opaque,
+returns 1 if the palette's only non-opaque entries have alpha 0 and share one RGB value that no
+opaque entry uses ==> color key
+returns 2 otherwise (semi-translucent entries, several transparent colors, or the transparent RGB
+value also occurs as an opaque entry).
+*/
+static unsigned getPaletteTranslucency(const unsigned char* palette, size_t palettesize)
+{
+	size_t i;
+	unsigned key = 0;
+	unsigned r = 0, g = 0, b = 0; /*the value of the color with alpha 0, so long as color keying is possible*/
+
+	/*first pass: take the first fully transparent entry as the candidate color key*/
+	for (i = 0; i != palettesize; ++i)
+	{
+		if (palette[4 * i + 3] == 0)
+		{
+			r = palette[4 * i + 0]; g = palette[4 * i + 1]; b = palette[4 * i + 2];
+			key = 1;
+			break;
+		}
+	}
+
+	/*second pass: every entry must be compatible with that key (or with full opacity if there is none)*/
+	for (i = 0; i != palettesize; ++i)
+	{
+		unsigned alpha = palette[4 * i + 3];
+		unsigned same = key && r == palette[4 * i + 0] && g == palette[4 * i + 1] && b == palette[4 * i + 2];
+
+		if (alpha == 0)
+		{
+			/*the key entry itself (or a duplicate of it) is fine; a second transparent color is not*/
+			if (!same) return 2;
+		}
+		else if (alpha != 255) return 2;
+		/*when key, no opaque RGB may have key's RGB*/
+		else if (same) return 2;
+	}
+	return key;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*
+Appends every chunk stored back-to-back in data (datasize bytes in total) to out.
+Each chunk is copied with lodepng_chunk_append and the walk advances with lodepng_chunk_next.
+Returns 0 once all chunks are appended. NOTE(review): CERROR_TRY_RETURN is defined elsewhere;
+it is presumably what propagates a nonzero lodepng_chunk_append result - confirm.
+*/
+static unsigned addUnknownChunks(ucvector* out, unsigned char* data, size_t datasize)
+{
+	unsigned char* chunk;
+	for (chunk = data; (size_t)(chunk - data) < datasize; chunk = lodepng_chunk_next(chunk))
+	{
+		CERROR_TRY_RETURN(lodepng_chunk_append(&out->data, &out->size, chunk));
+		/*lodepng_chunk_append works on the raw data/size pair, so bring allocsize back in sync*/
+		out->allocsize = out->size;
+	}
+	return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Encodes the raw pixels in image (w x h, laid out as described by state->info_raw) into a PNG
+held in a newly allocated buffer.
+out, outsize: receive the buffer and its size; both are set to 0 first. When a settings check
+  fails they stay 0. When an error happens later, whatever was produced so far (possibly
+  nothing) is still handed over and the caller remains responsible for freeing it.
+state: encoder settings, raw color mode and PNG info; state->error receives the result.
+Return value: the same error code as stored in state->error (0 means no error).
+*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+	const unsigned char* image, unsigned w, unsigned h,
+	LodePNGState* state)
+{
+	LodePNGInfo info;
+	ucvector outv;
+	unsigned char* data = 0; /*uncompressed version of the IDAT chunk data*/
+	size_t datasize = 0;
+
+	/*provide some proper output values if error will happen*/
+	*out = 0;
+	*outsize = 0;
+	state->error = 0;
+
+	/*work on a private copy of the PNG info: auto_convert may change its color mode*/
+	lodepng_info_init(&info);
+	lodepng_info_copy(&info, &state->info_png);
+
+	/*Validate the settings in a fixed order; the first failure wins. All failures leave through
+	the single exit below so that the info copy is always released.*/
+	if ((info.color.colortype == LCT_PALETTE || state->encoder.force_palette)
+		&& (info.color.palettesize == 0 || info.color.palettesize > 256))
+	{
+		state->error = 68; /*invalid palette size, it is only allowed to be 1-256*/
+	}
+	if (!state->error && state->encoder.auto_convert)
+	{
+		state->error = lodepng_auto_choose_color(&info.color, image, w, h, &state->info_raw);
+	}
+	if (!state->error && state->encoder.zlibsettings.btype > 2)
+	{
+		state->error = 61; /*error: unexisting btype*/
+	}
+	if (!state->error && state->info_png.interlace_method > 1)
+	{
+		state->error = 71; /*error: unexisting interlace mode*/
+	}
+	if (!state->error)
+	{
+		state->error = checkColorValidity(info.color.colortype, info.color.bitdepth);
+	}
+	if (!state->error)
+	{
+		state->error = checkColorValidity(state->info_raw.colortype, state->info_raw.bitdepth);
+	}
+	if (state->error)
+	{
+		lodepng_info_cleanup(&info);
+		return state->error;
+	}
+
+	if (!lodepng_color_mode_equal(&state->info_raw, &info.color))
+	{
+		unsigned char* converted;
+		/*widen before multiplying so that w * h can not wrap around in unsigned arithmetic*/
+		size_t size = ((size_t)w * (size_t)h * (size_t)lodepng_get_bpp(&info.color) + 7) / 8;
+
+		converted = (unsigned char*)lodepng_malloc(size);
+		if (!converted && size) state->error = 83; /*alloc fail*/
+		if (!state->error)
+		{
+			state->error = lodepng_convert(converted, image, &info.color, &state->info_raw, w, h);
+		}
+		if (!state->error)
+		{
+			state->error = preProcessScanlines(&data, &datasize, converted, w, h, &info, &state->encoder);
+		}
+		lodepng_free(converted);
+	}
+	else
+	{
+		state->error = preProcessScanlines(&data, &datasize, image, w, h, &info, &state->encoder);
+	}
+
+	ucvector_init(&outv);
+	while (!state->error) /*while only executed once, to break on error*/
+	{
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+		size_t i;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+		/*write signature and chunks*/
+		writeSignature(&outv);
+		/*IHDR*/
+		addChunk_IHDR(&outv, w, h, info.color.colortype, info.color.bitdepth, info.interlace_method);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+		/*unknown chunks between IHDR and PLTE*/
+		if (info.unknown_chunks_data[0])
+		{
+			state->error = addUnknownChunks(&outv, info.unknown_chunks_data[0], info.unknown_chunks_size[0]);
+			if (state->error) break;
+		}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+		/*PLTE*/
+		if (info.color.colortype == LCT_PALETTE)
+		{
+			addChunk_PLTE(&outv, &info.color);
+		}
+		if (state->encoder.force_palette && (info.color.colortype == LCT_RGB || info.color.colortype == LCT_RGBA))
+		{
+			addChunk_PLTE(&outv, &info.color);
+		}
+		/*tRNS*/
+		if (info.color.colortype == LCT_PALETTE && getPaletteTranslucency(info.color.palette, info.color.palettesize) != 0)
+		{
+			addChunk_tRNS(&outv, &info.color);
+		}
+		if ((info.color.colortype == LCT_GREY || info.color.colortype == LCT_RGB) && info.color.key_defined)
+		{
+			addChunk_tRNS(&outv, &info.color);
+		}
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+		/*bKGD (must come between PLTE and the IDAT chunks)*/
+		if (info.background_defined) addChunk_bKGD(&outv, &info);
+		/*pHYs (must come before the IDAT chunks)*/
+		if (info.phys_defined) addChunk_pHYs(&outv, &info);
+
+		/*unknown chunks between PLTE and IDAT*/
+		if (info.unknown_chunks_data[1])
+		{
+			state->error = addUnknownChunks(&outv, info.unknown_chunks_data[1], info.unknown_chunks_size[1]);
+			if (state->error) break;
+		}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+		/*IDAT (multiple IDAT chunks must be consecutive)*/
+		state->error = addChunk_IDAT(&outv, data, datasize, &state->encoder.zlibsettings);
+		if (state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+		/*tIME*/
+		if (info.time_defined) addChunk_tIME(&outv, &info.time);
+		/*tEXt and/or zTXt*/
+		for (i = 0; i != info.text_num; ++i)
+		{
+			if (strlen(info.text_keys[i]) > 79)
+			{
+				state->error = 66; /*text chunk too large*/
+				break;
+			}
+			if (strlen(info.text_keys[i]) < 1)
+			{
+				state->error = 67; /*text chunk too small*/
+				break;
+			}
+			if (state->encoder.text_compression)
+			{
+				addChunk_zTXt(&outv, info.text_keys[i], info.text_strings[i], &state->encoder.zlibsettings);
+			}
+			else
+			{
+				addChunk_tEXt(&outv, info.text_keys[i], info.text_strings[i]);
+			}
+		}
+		/*the break above only leaves the for loop; stop writing chunks after a keyword error*/
+		if (state->error) break;
+		/*LodePNG version id in text chunk*/
+		if (state->encoder.add_id)
+		{
+			unsigned alread_added_id_text = 0;
+			for (i = 0; i != info.text_num; ++i)
+			{
+				if (!strcmp(info.text_keys[i], "LodePNG"))
+				{
+					alread_added_id_text = 1;
+					break;
+				}
+			}
+			if (alread_added_id_text == 0)
+			{
+				addChunk_tEXt(&outv, "LodePNG", LODEPNG_VERSION_STRING); /*it's shorter as tEXt than as zTXt chunk*/
+			}
+		}
+		/*iTXt*/
+		for (i = 0; i != info.itext_num; ++i)
+		{
+			if (strlen(info.itext_keys[i]) > 79)
+			{
+				state->error = 66; /*text chunk too large*/
+				break;
+			}
+			if (strlen(info.itext_keys[i]) < 1)
+			{
+				state->error = 67; /*text chunk too small*/
+				break;
+			}
+			addChunk_iTXt(&outv, state->encoder.text_compression,
+				info.itext_keys[i], info.itext_langtags[i], info.itext_transkeys[i], info.itext_strings[i],
+				&state->encoder.zlibsettings);
+		}
+		if (state->error) break;
+
+		/*unknown chunks between IDAT and IEND*/
+		if (info.unknown_chunks_data[2])
+		{
+			state->error = addUnknownChunks(&outv, info.unknown_chunks_data[2], info.unknown_chunks_size[2]);
+			if (state->error) break;
+		}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+		addChunk_IEND(&outv);
+
+		break; /*this isn't really a while loop; no error happened so break out now!*/
+	}
+
+	lodepng_info_cleanup(&info);
+	lodepng_free(data);
+	/*instead of cleaning the vector up, give it to the output*/
+	*out = outv.data;
+	*outsize = outv.size;
+
+	return state->error;
+}
+
+/*
+Encodes raw pixels of the given colortype/bitdepth into a PNG in memory, using a temporary
+state whose raw and PNG color modes both start out as that colortype/bitdepth.
+Returns the error code the temporary state holds after encoding.
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize, const unsigned char* image,
+	unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+	LodePNGState state;
+	unsigned result;
+
+	lodepng_state_init(&state);
+
+	/*describe the input pixels ...*/
+	state.info_raw.colortype = colortype;
+	state.info_raw.bitdepth = bitdepth;
+	/*... and request the same mode for the PNG*/
+	state.info_png.color.colortype = colortype;
+	state.info_png.color.bitdepth = bitdepth;
+
+	lodepng_encode(out, outsize, image, w, h, &state);
+	result = state.error; /*read the code before the state is torn down*/
+
+	lodepng_state_cleanup(&state);
+	return result;
+}
+
+/*Convenience wrapper: encodes with lodepng_encode_memory using LCT_RGBA at bit depth 8.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{
+	const LodePNGColorType colortype = LCT_RGBA;
+	const unsigned bitdepth = 8;
+	return lodepng_encode_memory(out, outsize, image, w, h, colortype, bitdepth);
+}
+
+/*Convenience wrapper: encodes with lodepng_encode_memory using LCT_RGB at bit depth 8.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{
+	const LodePNGColorType colortype = LCT_RGB;
+	const unsigned bitdepth = 8;
+	return lodepng_encode_memory(out, outsize, image, w, h, colortype, bitdepth);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Encodes the raw image to a PNG in memory and, if that succeeded, writes it to filename with
+lodepng_save_file. The temporary PNG buffer is released in all cases.
+Returns the encoder's error code, or else the result of lodepng_save_file.
+*/
+unsigned lodepng_encode_file(const char* filename, const unsigned char* image, unsigned w, unsigned h,
+	LodePNGColorType colortype, unsigned bitdepth)
+{
+	unsigned char* png = 0;
+	size_t pngsize = 0;
+	unsigned error = lodepng_encode_memory(&png, &pngsize, image, w, h, colortype, bitdepth);
+	if (error == 0)
+	{
+		error = lodepng_save_file(png, pngsize, filename);
+	}
+	lodepng_free(png);
+	return error;
+}
+
+/*Convenience wrapper: lodepng_encode_file with LCT_RGBA at bit depth 8.*/
+unsigned lodepng_encode32_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{
+	const LodePNGColorType colortype = LCT_RGBA;
+	const unsigned bitdepth = 8;
+	return lodepng_encode_file(filename, image, w, h, colortype, bitdepth);
+}
+
+/*Convenience wrapper: lodepng_encode_file with LCT_RGB at bit depth 8.*/
+unsigned lodepng_encode24_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{
+	const LodePNGColorType colortype = LCT_RGB;
+	const unsigned bitdepth = 8;
+	return lodepng_encode_file(filename, image, w, h, colortype, bitdepth);
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/*
+Fills settings with the encoder defaults: default zlib settings, automatic color conversion
+on, LFS_MINSUM filtering with filter_palette_zero set, no forced palette, no predefined
+filters and, when ancillary chunks are compiled in, compressed text without the LodePNG id chunk.
+*/
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings)
+{
+	/*compression*/
+	lodepng_compress_settings_init(&settings->zlibsettings);
+
+	/*color handling*/
+	settings->auto_convert = 1;
+	settings->force_palette = 0;
+
+	/*scanline filtering*/
+	settings->filter_strategy = LFS_MINSUM;
+	settings->filter_palette_zero = 1;
+	settings->predefined_filters = 0;
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	/*text chunks*/
+	settings->text_compression = 1;
+	settings->add_id = 0;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*
+Maps a numerical LodePNG error code to its English description. The table below doubles as
+the documentation of all the error codes. Codes that are not listed yield "unknown error code".
+*/
+const char* lodepng_error_text(unsigned code)
+{
+	static const struct { unsigned code; const char* text; } messages[] =
+	{
+		{ 0, "no error, everything went ok" },
+		/*the Encoder/Decoder has done nothing yet, error checking makes no sense yet*/
+		{ 1, "nothing done yet" },
+		/*10 and 11 happen while huffman decoding*/
+		{ 10, "end of input memory reached without huffman end code" },
+		{ 11, "error in code tree made it jump outside of huffman tree" },
+		{ 13, "problem while processing dynamic deflate block" },
+		{ 14, "problem while processing dynamic deflate block" },
+		{ 15, "problem while processing dynamic deflate block" },
+		{ 16, "unexisting code while processing dynamic deflate block" },
+		{ 17, "end of out buffer memory reached while inflating" },
+		{ 18, "invalid distance code while inflating" },
+		{ 19, "end of out buffer memory reached while inflating" },
+		{ 20, "invalid deflate block BTYPE encountered while decoding" },
+		{ 21, "NLEN is not ones complement of LEN in a deflate block" },
+		/*end of out buffer memory reached while inflating:
+		This can happen if the inflated deflate data is longer than the amount of bytes required to fill up
+		all the pixels of the image, given the color depth and image dimensions. Something that doesn't
+		happen in a normal, well encoded, PNG image.*/
+		{ 22, "end of out buffer memory reached while inflating" },
+		{ 23, "end of in buffer memory reached while inflating" },
+		{ 24, "invalid FCHECK in zlib header" },
+		{ 25, "invalid compression method in zlib header" },
+		{ 26, "FDICT encountered in zlib header while it's not used for PNG" },
+		{ 27, "PNG file is smaller than a PNG header" },
+		/*Checks the magic file header, the first 8 bytes of the PNG file*/
+		{ 28, "incorrect PNG signature, it's no PNG or corrupted" },
+		{ 29, "first chunk is not the header chunk" },
+		{ 30, "chunk length too large, chunk broken off at end of file" },
+		{ 31, "illegal PNG color type or bpp" },
+		{ 32, "illegal PNG compression method" },
+		{ 33, "illegal PNG filter method" },
+		{ 34, "illegal PNG interlace method" },
+		{ 35, "chunk length of a chunk is too large or the chunk too small" },
+		{ 36, "illegal PNG filter type encountered" },
+		{ 37, "illegal bit depth for this color type given" },
+		/*more than 256 colors*/
+		{ 38, "the palette is too big" },
+		{ 39, "more palette alpha values given in tRNS chunk than there are colors in the palette" },
+		{ 40, "tRNS chunk has wrong size for greyscale image" },
+		{ 41, "tRNS chunk has wrong size for RGB image" },
+		{ 42, "tRNS chunk appeared while it was not allowed for this color type" },
+		{ 43, "bKGD chunk has wrong size for palette image" },
+		{ 44, "bKGD chunk has wrong size for greyscale image" },
+		{ 45, "bKGD chunk has wrong size for RGB image" },
+		{ 48, "empty input buffer given to decoder. Maybe caused by non-existing file?" },
+		{ 49, "jumped past memory while generating dynamic huffman tree" },
+		{ 50, "jumped past memory while generating dynamic huffman tree" },
+		{ 51, "jumped past memory while inflating huffman block" },
+		{ 52, "jumped past memory while inflating" },
+		{ 53, "size of zlib data too small" },
+		{ 54, "repeat symbol in tree while there was no value symbol yet" },
+		/*jumped past tree while generating huffman tree, this could be when the
+		tree will have more leaves than symbols after generating it out of the
+		given lengths. They call this an oversubscribed dynamic bit lengths tree in zlib.*/
+		{ 55, "jumped past tree while generating huffman tree" },
+		{ 56, "given output image colortype or bitdepth not supported for color conversion" },
+		{ 57, "invalid CRC encountered (checking CRC can be disabled)" },
+		{ 58, "invalid ADLER32 encountered (checking ADLER32 can be disabled)" },
+		{ 59, "requested color conversion not supported" },
+		{ 60, "invalid window size given in the settings of the encoder (must be 0-32768)" },
+		{ 61, "invalid BTYPE given in the settings of the encoder (only 0, 1 and 2 are allowed)" },
+		/*LodePNG leaves the choice of RGB to greyscale conversion formula to the user.*/
+		{ 62, "conversion from color to greyscale not supported" },
+		/*(2^31-1)*/
+		{ 63, "length of a chunk too long, max allowed for PNG is 2147483647 bytes per chunk" },
+		/*this would result in the inability of a deflated block to ever contain an end code. It must be at least 1.*/
+		{ 64, "the length of the END symbol 256 in the Huffman tree is 0" },
+		{ 66, "the length of a text chunk keyword given to the encoder is longer than the maximum of 79 bytes" },
+		{ 67, "the length of a text chunk keyword given to the encoder is smaller than the minimum of 1 byte" },
+		{ 68, "tried to encode a PLTE chunk with a palette that has less than 1 or more than 256 colors" },
+		{ 69, "unknown chunk type with 'critical' flag encountered by the decoder" },
+		{ 71, "unexisting interlace mode given to encoder (must be 0 or 1)" },
+		{ 72, "while decoding, unexisting compression method encountering in zTXt or iTXt chunk (it must be 0)" },
+		{ 73, "invalid tIME chunk size" },
+		{ 74, "invalid pHYs chunk size" },
+		/*length could be wrong, or data chopped off*/
+		{ 75, "no null termination char found while decoding text chunk" },
+		{ 76, "iTXt chunk too short to contain required bytes" },
+		{ 77, "integer overflow in buffer size" },
+		/*file doesn't exist or couldn't be opened for reading*/
+		{ 78, "failed to open file for reading" },
+		{ 79, "failed to open file for writing" },
+		{ 80, "tried creating a tree of 0 symbols" },
+		{ 81, "lazy matching at pos 0 is impossible" },
+		{ 82, "color conversion to palette requested while a color isn't in palette" },
+		{ 83, "memory allocation failed" },
+		{ 84, "given image too small to contain all pixels to be encoded" },
+		{ 86, "impossible offset in lz77 encoding (internal bug)" },
+		{ 87, "must provide custom zlib function pointer if LODEPNG_COMPILE_ZLIB is not defined" },
+		{ 88, "invalid filter strategy given for LodePNGEncoderSettings.filter_strategy" },
+		{ 89, "text chunk keyword too short or long: must have size 1-79" },
+		/*the windowsize in the LodePNGCompressSettings. Requiring POT(==> & instead of %) makes encoding 12% faster.*/
+		{ 90, "windowsize must be a power of two" },
+		{ 91, "invalid decompressed idat size" },
+		{ 92, "too many pixels, not supported" },
+		{ 93, "zero width or height is invalid" },
+		{ 94, "header chunk must have a size of 13 bytes" }
+	};
+	unsigned i;
+	const unsigned count = (unsigned)(sizeof(messages) / sizeof(messages[0]));
+
+	for (i = 0; i != count; ++i)
+	{
+		if (messages[i].code == code) return messages[i].text;
+	}
+	return "unknown error code";
+}
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // C++ Wrapper                                                          // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+
+#ifdef LODEPNG_COMPILE_DISK
+	/*
+	Reads the whole file into buffer, resizing buffer to the size reported by lodepng_filesize.
+	Returns 78 when that size is negative, 0 for an empty file, otherwise the result of
+	lodepng_buffer_file.
+	*/
+	unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename)
+	{
+		const char* path = filename.c_str();
+		const long filesize = lodepng_filesize(path);
+		if (filesize < 0) return 78;
+
+		const size_t count = (size_t)filesize;
+		buffer.resize(count);
+		if (filesize == 0) return 0; /*nothing to read, and &buffer[0] would not be valid*/
+		return lodepng_buffer_file(&buffer[0], count, path);
+	}
+
+	/*Writes the given buffer to the file with lodepng_save_file (overwriting, not appending).
+	An empty buffer is passed on as a null pointer with size 0.*/
+	unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename)
+	{
+		const unsigned char* bytes = buffer.empty() ? 0 : &buffer[0];
+		return lodepng_save_file(bytes, buffer.size(), filename.c_str());
+	}
+#endif /* LODEPNG_COMPILE_DISK */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+	/*
+	Runs zlib_decompress on in/insize and appends whatever buffer it produced to out, then
+	frees that buffer. The bytes are appended whenever a buffer was returned, independent of
+	the error code. Returns the error code of zlib_decompress.
+	*/
+	unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+		const LodePNGDecompressSettings& settings)
+	{
+		unsigned char* inflated = 0;
+		size_t inflatedsize = 0;
+		const unsigned error = zlib_decompress(&inflated, &inflatedsize, in, insize, &settings);
+		if (!inflated) return error;
+
+		out.insert(out.end(), inflated, inflated + inflatedsize);
+		lodepng_free(inflated);
+		return error;
+	}
+
+	/*Vector overload: forwards to the pointer overload, passing a null pointer for empty input.*/
+	unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+		const LodePNGDecompressSettings& settings)
+	{
+		const unsigned char* data = in.empty() ? 0 : &in[0];
+		return decompress(out, data, in.size(), settings);
+	}
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+	/*
+	Runs zlib_compress on in/insize and appends whatever buffer it produced to out, then frees
+	that buffer. The bytes are appended whenever a buffer was returned, independent of the
+	error code. Returns the error code of zlib_compress.
+	*/
+	unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+		const LodePNGCompressSettings& settings)
+	{
+		unsigned char* deflated = 0;
+		size_t deflatedsize = 0;
+		const unsigned error = zlib_compress(&deflated, &deflatedsize, in, insize, &settings);
+		if (!deflated) return error;
+
+		out.insert(out.end(), deflated, deflated + deflatedsize);
+		lodepng_free(deflated);
+		return error;
+	}
+
+	/*Vector overload: forwards to the pointer overload, passing a null pointer for empty input.*/
+	unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+		const LodePNGCompressSettings& settings)
+	{
+		const unsigned char* data = in.empty() ? 0 : &in[0];
+		return compress(out, data, in.size(), settings);
+	}
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+
+
+#ifdef LODEPNG_COMPILE_PNG
+
+	/*Default constructor: puts the underlying LodePNGState into its initialized state.*/
+	State::State()
+	{
+		lodepng_state_init(this);
+	}
+
+	/*Copy constructor: initializes first so that lodepng_state_copy receives a valid destination.*/
+	State::State(const State& other)
+	{
+		lodepng_state_init(this);
+		lodepng_state_copy(this, &other);
+	}
+
+	/*Destructor: releases whatever the state owns.*/
+	State::~State()
+	{
+		lodepng_state_cleanup(this);
+	}
+
+	/*
+	Copy assignment. Self-assignment is a no-op: lodepng_state_copy would otherwise be handed
+	the same object as destination and source.
+	NOTE(review): lodepng_state_copy is defined outside this view; presumably it clears the
+	destination before copying, which would lose the data on self-assignment - confirm.
+	*/
+	State& State::operator=(const State& other)
+	{
+		if (this != &other)
+		{
+			lodepng_state_copy(this, &other);
+		}
+		return *this;
+	}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+	/*
+	Decodes the PNG in in/insize to raw pixels of the requested colortype/bitdepth and appends
+	them to out; w and h receive the image dimensions.
+	Pixels are appended only when decoding succeeded. The temporary buffer is released on every
+	path, matching the State overload of decode.
+	Returns the error code of lodepng_decode_memory.
+	*/
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const unsigned char* in,
+		size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+	{
+		unsigned char* buffer = 0; /*defined value even if the decoder never assigns it*/
+		unsigned error = lodepng_decode_memory(&buffer, &w, &h, in, insize, colortype, bitdepth);
+		if (buffer && !error)
+		{
+			/*a temporary State only serves to describe the raw color mode for the size computation*/
+			State state;
+			state.info_raw.colortype = colortype;
+			state.info_raw.bitdepth = bitdepth;
+			size_t buffersize = lodepng_get_raw_size(w, h, &state.info_raw);
+			out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+		}
+		lodepng_free(buffer); /*also on the error path, so a partially decoded buffer is not leaked*/
+		return error;
+	}
+
+	/*
+	Vector overload: forwards to the pointer overload, passing a null pointer for empty input.
+	The size is passed on as size_t without narrowing, as the callee's insize parameter expects.
+	*/
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+		const std::vector<unsigned char>& in, LodePNGColorType colortype, unsigned bitdepth)
+	{
+		return decode(out, w, h, in.empty() ? 0 : &in[0], in.size(), colortype, bitdepth);
+	}
+
+	/*
+	Decodes the PNG in in/insize with the settings in state and appends the raw pixels to out
+	when decoding succeeded; w and h receive the image dimensions. The amount appended is
+	lodepng_get_raw_size for state.info_raw. The temporary buffer is always released.
+	Returns the error code of lodepng_decode.
+	*/
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+		State& state,
+		const unsigned char* in, size_t insize)
+	{
+		unsigned char* pixels = NULL;
+		const unsigned error = lodepng_decode(&pixels, &w, &h, &state, in, insize);
+		const bool usable = pixels && !error;
+		if (usable)
+		{
+			const size_t pixelbytes = lodepng_get_raw_size(w, h, &state.info_raw);
+			out.insert(out.end(), pixels, pixels + pixelbytes);
+		}
+		lodepng_free(pixels);
+		return error;
+	}
+
+	/*Vector overload of the State-based decode: forwards to the pointer overload, passing a
+	null pointer for empty input.*/
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+		State& state,
+		const std::vector<unsigned char>& in)
+	{
+		const unsigned char* data = in.empty() ? 0 : &in[0];
+		return decode(out, w, h, state, data, in.size());
+	}
+
+#ifdef LODEPNG_COMPILE_DISK
+	/*
+	Loads the file with load_file and decodes its contents with the vector overload of decode.
+	Returns the load_file error if loading failed, otherwise the decode result.
+	*/
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const std::string& filename,
+		LodePNGColorType colortype, unsigned bitdepth)
+	{
+		std::vector<unsigned char> filedata;
+		const unsigned loaderror = load_file(filedata, filename);
+		return loaderror ? loaderror : decode(out, w, h, filedata, colortype, bitdepth);
+	}
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+	/*
+	Encodes the raw pixels with lodepng_encode_memory and appends whatever buffer it produced
+	to out, then frees that buffer. The bytes are appended whenever a buffer was returned,
+	independent of the error code. Returns the error code of lodepng_encode_memory.
+	*/
+	unsigned encode(std::vector<unsigned char>& out, const unsigned char* in, unsigned w, unsigned h,
+		LodePNGColorType colortype, unsigned bitdepth)
+	{
+		unsigned char* png;
+		size_t pngsize;
+		const unsigned error = lodepng_encode_memory(&png, &pngsize, in, w, h, colortype, bitdepth);
+		if (!png) return error;
+
+		out.insert(out.end(), png, png + pngsize);
+		lodepng_free(png);
+		return error;
+	}
+
+	/*
+	Vector overload: returns 84 when in holds fewer bytes than lodepng_get_raw_size_lct reports
+	for w, h, colortype and bitdepth; otherwise forwards to the pointer overload.
+	*/
+	unsigned encode(std::vector<unsigned char>& out,
+		const std::vector<unsigned char>& in, unsigned w, unsigned h,
+		LodePNGColorType colortype, unsigned bitdepth)
+	{
+		const bool toosmall = lodepng_get_raw_size_lct(w, h, colortype, bitdepth) > in.size();
+		if (toosmall) return 84;
+
+		const unsigned char* pixels = in.empty() ? 0 : &in[0];
+		return encode(out, pixels, w, h, colortype, bitdepth);
+	}
+
+	/*
+	Encodes the raw pixels with the settings in state via lodepng_encode and appends whatever
+	buffer it produced to out, then frees that buffer. The bytes are appended whenever a buffer
+	was returned, independent of the error code. Returns the error code of lodepng_encode.
+	*/
+	unsigned encode(std::vector<unsigned char>& out,
+		const unsigned char* in, unsigned w, unsigned h,
+		State& state)
+	{
+		unsigned char* png;
+		size_t pngsize;
+		const unsigned error = lodepng_encode(&png, &pngsize, in, w, h, &state);
+		if (!png) return error;
+
+		out.insert(out.end(), png, png + pngsize);
+		lodepng_free(png);
+		return error;
+	}
+
+	/*
+	Vector overload of the State-based encode: returns 84 when in holds fewer bytes than
+	lodepng_get_raw_size reports for state.info_raw; otherwise forwards to the pointer overload.
+	*/
+	unsigned encode(std::vector<unsigned char>& out,
+		const std::vector<unsigned char>& in, unsigned w, unsigned h,
+		State& state)
+	{
+		const bool toosmall = lodepng_get_raw_size(w, h, &state.info_raw) > in.size();
+		if (toosmall) return 84;
+
+		const unsigned char* pixels = in.empty() ? 0 : &in[0];
+		return encode(out, pixels, w, h, state);
+	}
+
+#ifdef LODEPNG_COMPILE_DISK
+	/*
+	Encodes the raw pixels to a PNG in memory and, if that succeeded, writes it to filename
+	with save_file. Returns the encode error, or else the result of save_file.
+	*/
+	unsigned encode(const std::string& filename,
+		const unsigned char* in, unsigned w, unsigned h,
+		LodePNGColorType colortype, unsigned bitdepth)
+	{
+		std::vector<unsigned char> png;
+		const unsigned encodeerror = encode(png, in, w, h, colortype, bitdepth);
+		if (encodeerror) return encodeerror;
+		return save_file(png, filename);
+	}
+
+	/*
+	Vector overload of the file encoder: returns 84 when in holds fewer bytes than
+	lodepng_get_raw_size_lct reports for w, h, colortype and bitdepth; otherwise forwards to
+	the pointer overload.
+	*/
+	unsigned encode(const std::string& filename,
+		const std::vector<unsigned char>& in, unsigned w, unsigned h,
+		LodePNGColorType colortype, unsigned bitdepth)
+	{
+		const bool toosmall = lodepng_get_raw_size_lct(w, h, colortype, bitdepth) > in.size();
+		if (toosmall) return 84;
+
+		const unsigned char* pixels = in.empty() ? 0 : &in[0];
+		return encode(filename, pixels, w, h, colortype, bitdepth);
+	}
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_PNG */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
diff --git a/lodepng.h b/lodepng.h
new file mode 100644
index 0000000..df0f1e7
--- /dev/null
+++ b/lodepng.h
@@ -0,0 +1,1761 @@
+/*
+LodePNG version 20161127
+
+Copyright (c) 2005-2016 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+	 1. The origin of this software must not be misrepresented; you must not
+	 claim that you wrote the original software. If you use this software
+	 in a product, an acknowledgment in the product documentation would be
+	 appreciated but is not required.
+
+	 2. Altered source versions must be plainly marked as such, and must not be
+	 misrepresented as being the original software.
+
+	 3. This notice may not be removed or altered from any source
+	 distribution.
+*/
+
+#ifndef LODEPNG_H
+#define LODEPNG_H
+
+#include <string.h> /*for size_t*/
+
+extern const char* LODEPNG_VERSION_STRING;
+
+/*
+The following #defines are used to create code sections. They can be disabled
+to disable code sections, which can give faster compile time and smaller binary.
+The "NO_COMPILE" defines are designed to be used to pass as defines to the
+compiler command to disable them without modifying this header, e.g.
+-DLODEPNG_NO_COMPILE_ZLIB for gcc.
+In addition to those below, you can also define LODEPNG_NO_COMPILE_CRC to
+allow implementing a custom lodepng_crc32.
+*/
+/*deflate & zlib. If disabled, you must specify alternative zlib functions in
+the custom_zlib field of the compress and decompress settings*/
+#ifndef LODEPNG_NO_COMPILE_ZLIB
+#define LODEPNG_COMPILE_ZLIB
+#endif
+/*png encoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_PNG
+#define LODEPNG_COMPILE_PNG
+#endif
+/*deflate&zlib decoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_DECODER
+#define LODEPNG_COMPILE_DECODER
+#endif
+/*deflate&zlib encoder and png encoder*/
+#ifndef LODEPNG_NO_COMPILE_ENCODER
+#define LODEPNG_COMPILE_ENCODER
+#endif
+/*the optional built in harddisk file loading and saving functions*/
+#ifndef LODEPNG_NO_COMPILE_DISK
+#define LODEPNG_COMPILE_DISK
+#endif
+/*support for chunks other than IHDR, IDAT, PLTE, tRNS, IEND: ancillary and unknown chunks*/
+#ifndef LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
+#define LODEPNG_COMPILE_ANCILLARY_CHUNKS
+#endif
+/*ability to convert error numerical codes to English text string*/
+#ifndef LODEPNG_NO_COMPILE_ERROR_TEXT
+#define LODEPNG_COMPILE_ERROR_TEXT
+#endif
+/*Compile the default allocators (C's free, malloc and realloc). If you disable this,
+you can define the functions lodepng_free, lodepng_malloc and lodepng_realloc in your
+source files with custom allocators.*/
+#ifndef LODEPNG_NO_COMPILE_ALLOCATORS
+#define LODEPNG_COMPILE_ALLOCATORS
+#endif
+/*compile the C++ version (you can disable the C++ wrapper here even when compiling for C++)*/
+#ifdef __cplusplus
+#ifndef LODEPNG_NO_COMPILE_CPP
+#define LODEPNG_COMPILE_CPP
+#endif
+#endif
+
+#ifdef LODEPNG_COMPILE_CPP
+#include <vector>
+#include <string>
+#endif /*LODEPNG_COMPILE_CPP*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*The PNG color types (also used for raw).
+Each entry lists the bit depths that are valid for it. The values 1 and 5 are not assigned.*/
+typedef enum LodePNGColorType
+{
+	LCT_GREY = 0, /*greyscale: 1,2,4,8,16 bit*/
+	LCT_RGB = 2, /*RGB: 8,16 bit*/
+	LCT_PALETTE = 3, /*palette: 1,2,4,8 bit*/
+	LCT_GREY_ALPHA = 4, /*greyscale with alpha: 8,16 bit*/
+	LCT_RGBA = 6 /*RGB with alpha: 8,16 bit*/
+} LodePNGColorType;
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Converts PNG data in memory to raw pixel data.
+out: Output parameter. Pointer to buffer that will contain the raw pixel data.
+	  After decoding, its size is w * h * (bytes per pixel) bytes larger than
+	  initially. Bytes per pixel depends on colortype and bitdepth.
+	  Must be freed after usage with free(*out).
+	  Note: for 16-bit per channel colors, uses big endian format like PNG does.
+w: Output parameter. Pointer to width of pixel data.
+h: Output parameter. Pointer to height of pixel data.
+in: Memory buffer with the PNG file.
+insize: size of the in buffer.
+colortype: the desired color type for the raw output image. See explanation on PNG color types.
+bitdepth: the desired bit depth for the raw output image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h,
+	const unsigned char* in, size_t insize,
+	LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_memory, but always decodes to 32-bit RGBA raw image*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h,
+	const unsigned char* in, size_t insize);
+
+/*Same as lodepng_decode_memory, but always decodes to 24-bit RGB raw image*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h,
+	const unsigned char* in, size_t insize);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load PNG from disk, from file with given name.
+Same as the other decode functions, but instead takes a filename as input.
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h,
+	const char* filename,
+	LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_file, but always decodes to 32-bit RGBA raw image.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h,
+	const char* filename);
+
+/*Same as lodepng_decode_file, but always decodes to 24-bit RGB raw image.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h,
+	const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Converts raw pixel data into a PNG image in memory. The colortype and bitdepth
+  of the output PNG image cannot be chosen, they are automatically determined
+  by the colortype, bitdepth and content of the input pixel data.
+  Note: for 16-bit per channel colors, needs big endian format like PNG does.
+out: Output parameter. Pointer to buffer that will contain the PNG image data.
+	  Must be freed after usage with free(*out).
+outsize: Output parameter. Pointer to the size in bytes of the out buffer.
+image: The raw pixel data to encode. The size of this buffer should be
+		 w * h * (bytes per pixel), bytes per pixel depends on colortype and bitdepth.
+w: width of the raw pixel data in pixels.
+h: height of the raw pixel data in pixels.
+colortype: the color type of the raw input image. See explanation on PNG color types.
+bitdepth: the bit depth of the raw input image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize,
+	const unsigned char* image, unsigned w, unsigned h,
+	LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_memory, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize,
+	const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_memory, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize,
+	const unsigned char* image, unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts raw pixel data into a PNG file on disk.
+Same as the other encode functions, but instead takes a filename as output.
+NOTE: This overwrites existing files without warning!
+*/
+unsigned lodepng_encode_file(const char* filename,
+	const unsigned char* image, unsigned w, unsigned h,
+	LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_file, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32_file(const char* filename,
+	const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_file, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24_file(const char* filename,
+	const unsigned char* image, unsigned w, unsigned h);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_DECODER
+	/*Same as lodepng_decode_memory, but decodes to an std::vector. The colortype and bitdepth
+	give the format to output the pixels to. Default is RGBA 8-bit per channel.*/
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+		const unsigned char* in, size_t insize,
+		LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+		const std::vector<unsigned char>& in,
+		LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+	/*
+	Converts a PNG file from disk to raw pixel data in memory.
+	Same as the other decode functions, but takes a filename as input instead of a buffer.
+	*/
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+		const std::string& filename,
+		LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+	/*Same as lodepng_encode_memory, but encodes to an std::vector. colortype and bitdepth
+	are those of the raw input data. The output PNG color type will be auto chosen.*/
+	unsigned encode(std::vector<unsigned char>& out,
+		const unsigned char* in, unsigned w, unsigned h,
+		LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+	unsigned encode(std::vector<unsigned char>& out,
+		const std::vector<unsigned char>& in, unsigned w, unsigned h,
+		LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+	/*
+	Converts raw pixel data (32-bit RGBA unless colortype/bitdepth say otherwise) into a PNG file on disk.
+	Same as the other encode functions, but takes a filename as output instead of a buffer.
+	NOTE: This overwrites existing files without warning!
+	*/
+	unsigned encode(const std::string& filename,
+		const unsigned char* in, unsigned w, unsigned h,
+		LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+	unsigned encode(const std::string& filename,
+		const std::vector<unsigned char>& in, unsigned w, unsigned h,
+		LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*Returns an English description of the numerical error code.*/
+const char* lodepng_error_text(unsigned code);
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Settings for zlib decompression*/
+typedef struct LodePNGDecompressSettings LodePNGDecompressSettings;
+struct LodePNGDecompressSettings
+{
+	unsigned ignore_adler32; /*if 1, continue and don't give an error message if the Adler32 checksum is corrupted*/
+
+	/*use custom zlib decoder instead of built in one (default: null)*/
+	unsigned(*custom_zlib)(unsigned char**, size_t*,
+		const unsigned char*, size_t,
+		const LodePNGDecompressSettings*);
+	/*use custom deflate decoder (inflate) instead of built in one (default: null)
+	if custom_zlib is used, custom_inflate is ignored since only the built in
+	zlib function will call custom_inflate*/
+	unsigned(*custom_inflate)(unsigned char**, size_t*,
+		const unsigned char*, size_t,
+		const LodePNGDecompressSettings*);
+
+	const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGDecompressSettings lodepng_default_decompress_settings;
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Settings for zlib compression. Tweaking these settings tweaks the balance
+between speed and compression ratio.
+*/
+typedef struct LodePNGCompressSettings LodePNGCompressSettings;
+struct LodePNGCompressSettings /*deflate = compress*/
+{
+	/*LZ77 related settings*/
+	unsigned btype; /*the deflate block type (0 = stored, 1 = fixed Huffman, 2 = dynamic Huffman; 3 is reserved, see RFC 1951). Should be 2 for proper compression.*/
+	unsigned use_lz77; /*whether or not to use LZ77. Should be 1 for proper compression.*/
+	unsigned windowsize; /*must be a power of two <= 32768. Higher compresses more but is slower. Default value: 2048.*/
+	unsigned minmatch; /*minimum LZ77 match length. 3 is normally best, 6 can be better for some PNGs. Default: 0 (NOTE(review): odd next to "3 is normally best" - confirm against lodepng_compress_settings_init)*/
+	unsigned nicematch; /*stop searching if a match of >= this length is found. Set to 258 for best compression. Default: 128*/
+	unsigned lazymatching; /*use lazy matching: better compression but a bit slower. Default: true*/
+
+	/*use custom zlib encoder instead of built in one (default: null)*/
+	unsigned(*custom_zlib)(unsigned char**, size_t*,
+		const unsigned char*, size_t,
+		const LodePNGCompressSettings*);
+	/*use custom deflate encoder instead of built in one (default: null)
+	if custom_zlib is used, custom_deflate is ignored since only the built in
+	zlib function will call custom_deflate*/
+	unsigned(*custom_deflate)(unsigned char**, size_t*,
+		const unsigned char*, size_t,
+		const LodePNGCompressSettings*);
+
+	const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGCompressSettings lodepng_default_compress_settings;
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*
+Color mode of an image. Contains all information required to decode the pixel
+bits to RGBA colors. This information is the same as used in the PNG file
+format, and is used both for PNG and raw image data in LodePNG.
+*/
+typedef struct LodePNGColorMode
+{
+	/*header (IHDR)*/
+	LodePNGColorType colortype; /*color type, see PNG standard or documentation further in this header file*/
+	unsigned bitdepth;  /*bits per sample, see PNG standard or documentation further in this header file*/
+
+	/*
+	palette (PLTE and tRNS)
+
+	Dynamically allocated with the colors of the palette, including alpha.
+	When encoding a PNG, to store your colors in the palette of the LodePNGColorMode, first use
+	lodepng_palette_clear, then for each color use lodepng_palette_add.
+	If you encode an image without alpha with palette, don't forget to put value 255 in each A byte of the palette.
+
+	When decoding, by default you can ignore this palette, since LodePNG already
+	fills the palette colors in the pixels of the raw RGBA output.
+
+	The palette is only supported for color type 3.
+	*/
+	unsigned char* palette; /*palette in RGBARGBA... order. Must be either 0 (nothing allocated), or an allocated buffer of size 1024 bytes*/
+	size_t palettesize; /*palette size in number of colors (amount of bytes is 4 * palettesize)*/
+
+	/*
+	transparent color key (tRNS)
+
+	This color uses the same bit depth as the bitdepth value in this struct, which can be 1-bit to 16-bit.
+	For greyscale PNGs, r, g and b will all three be set to the same value.
+
+	When decoding, by default you can ignore this information, since LodePNG sets
+	pixels with this key to transparent already in the raw RGBA output.
+
+	The color key is only supported for color types 0 and 2.
+	*/
+	unsigned key_defined; /*is a transparent color key given? 0 = false, 1 = true*/
+	unsigned key_r;       /*red/greyscale component of color key*/
+	unsigned key_g;       /*green component of color key*/
+	unsigned key_b;       /*blue component of color key*/
+} LodePNGColorMode;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_color_mode_init(LodePNGColorMode* info);
+void lodepng_color_mode_cleanup(LodePNGColorMode* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source);
+
+void lodepng_palette_clear(LodePNGColorMode* info);
+/*add 1 color to the palette*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+	unsigned char r, unsigned char g, unsigned char b, unsigned char a);
+
+/*get the total amount of bits per pixel, based on colortype and bitdepth in the struct*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info);
+/*get the amount of color channels used, based on colortype in the struct.
+If a palette is used, it counts as 1 channel.*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info);
+/*is it a greyscale type? (only colortype 0 or 4)*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info);
+/*has it got an alpha channel? (only colortype 4 or 6)*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info);
+/*has it got a palette? (only colortype 3)*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info);
+/*only returns true if there is a palette and there is a value in the palette with alpha < 255.
+Loops through the palette to check this.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info);
+/*
+Check if the given color info indicates the possibility of having non-opaque pixels in the PNG image.
+Returns true if the image can have translucent or invisible pixels (it may still be opaque if it doesn't use such pixels).
+Returns false if the image can only have opaque pixels.
+In detail, it returns true only if it's a color type with alpha, or has a palette with non-opaque values,
+or if "key_defined" is true.
+*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info);
+/*Returns the byte size of a raw image buffer with given width, height and color mode*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*The information of a time chunk (tIME) in PNG.*/
+typedef struct LodePNGTime
+{
+	unsigned year;    /*2 bytes used (0-65535)*/
+	unsigned month;   /*1-12*/
+	unsigned day;     /*1-31*/
+	unsigned hour;    /*0-23*/
+	unsigned minute;  /*0-59*/
+	unsigned second;  /*0-60 (to allow for leap seconds)*/
+} LodePNGTime;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Information about the PNG image, except pixels, width and height.*/
+typedef struct LodePNGInfo
+{
+	/*header (IHDR), palette (PLTE) and transparency (tRNS) chunks*/
+	unsigned compression_method;/*compression method of the original file. Always 0.*/
+	unsigned filter_method;     /*filter method of the original file*/
+	unsigned interlace_method;  /*interlace method of the original file*/
+	LodePNGColorMode color;     /*color type and bits, palette and transparency of the PNG file*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  /*
+  suggested background color chunk (bKGD)
+  This color uses the same color mode as the PNG (except alpha channel), which can be 1-bit to 16-bit.
+
+  For greyscale PNGs, r, g and b will all three be set to the same value. When encoding
+  the encoder writes the red one. For palette PNGs: When decoding, the RGB value
+  will be stored, not a palette index. But when encoding, specify the index of
+  the palette in background_r, the other two are then ignored.
+
+  The decoder does not use this background color to edit the color of pixels.
+  */
+	unsigned background_defined; /*is a suggested background color given?*/
+	unsigned background_r;       /*red component of suggested background color*/
+	unsigned background_g;       /*green component of suggested background color*/
+	unsigned background_b;       /*blue component of suggested background color*/
+
+	/*
+	non-international text chunks (tEXt and zTXt)
+
+	The char** arrays each contain num strings. The actual messages are in
+	text_strings, while text_keys are keywords that give a short description what
+	the actual text represents, e.g. Title, Author, Description, or anything else.
+
+	A keyword is minimum 1 character and maximum 79 characters long. It's
+	discouraged to use a single line length longer than 79 characters for texts.
+
+	Don't allocate these text buffers yourself. Use the init/cleanup functions
+	correctly and use lodepng_add_text and lodepng_clear_text.
+	*/
+	size_t text_num; /*the amount of texts in these char** buffers (there may be more texts in itext)*/
+	char** text_keys; /*the keyword of a text chunk (e.g. "Comment")*/
+	char** text_strings; /*the actual text*/
+
+	/*
+	international text chunks (iTXt)
+	Similar to the non-international text chunks, but with additional strings
+	"langtags" and "transkeys".
+	*/
+	size_t itext_num; /*the amount of international texts in this PNG*/
+	char** itext_keys; /*the English keyword of the text chunk (e.g. "Comment")*/
+	char** itext_langtags; /*language tag for this text's language, ISO/IEC 646 string, e.g. ISO 639 language tag*/
+	char** itext_transkeys; /*keyword translated to the international language - UTF-8 string*/
+	char** itext_strings; /*the actual international text - UTF-8 string*/
+
+	/*time chunk (tIME)*/
+	unsigned time_defined; /*set to 1 to make the encoder generate a tIME chunk*/
+	LodePNGTime time;
+
+	/*phys chunk (pHYs)*/
+	unsigned phys_defined; /*if 0, there is no pHYs chunk and the values below are undefined; if 1, there is one*/
+	unsigned phys_x; /*pixels per unit in x direction*/
+	unsigned phys_y; /*pixels per unit in y direction*/
+	unsigned phys_unit; /*may be 0 (unknown unit) or 1 (metre)*/
+
+	/*
+	unknown chunks
+	There are 3 buffers, one for each position in the PNG where unknown chunks can appear.
+	Each buffer contains all unknown chunks for that position consecutively.
+	The 3 buffers are the unknown chunks between certain critical chunks:
+	0: IHDR-PLTE, 1: PLTE-IDAT, 2: IDAT-IEND
+	Do not allocate or traverse this data yourself. Use the chunk traversing functions declared
+	later, such as lodepng_chunk_next and lodepng_chunk_append, to read/write this struct.
+	*/
+	unsigned char* unknown_chunks_data[3];
+	size_t unknown_chunks_size[3]; /*size in bytes of the unknown chunks, given for protection*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGInfo;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_info_init(LodePNGInfo* info);
+void lodepng_info_cleanup(LodePNGInfo* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+void lodepng_clear_text(LodePNGInfo* info); /*use this to clear the texts again after you filled them in*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str); /*push back both texts at once*/
+
+void lodepng_clear_itext(LodePNGInfo* info); /*use this to clear the itexts again after you filled them in*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+	const char* transkey, const char* str); /*push back the 4 texts of 1 chunk at once*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Converts raw buffer from one color type to another color type, based on
+LodePNGColorMode structs to describe the input and output color type.
+See the reference manual at the end of this header file to see which color conversions are supported.
+return value = LodePNG error code (0 if all went ok, an error if the conversion isn't supported)
+The out buffer must have size (w * h * bpp + 7) / 8, where bpp is the bits per pixel
+of the output color type (lodepng_get_bpp).
+For < 8 bpp images, there should not be padding bits at the end of scanlines.
+For 16-bit per channel colors, uses big endian format like PNG does.
+Return value is LodePNG error code
+*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+	const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+	unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Settings for the decoder. This contains settings for the PNG and the Zlib
+decoder, but not the Info settings from the Info structs.
+*/
+typedef struct LodePNGDecoderSettings
+{
+	LodePNGDecompressSettings zlibsettings; /*in here is the setting to ignore Adler32 checksums*/
+
+	unsigned ignore_crc; /*ignore CRC checksums*/
+
+	unsigned color_convert; /*whether to convert the PNG to the color type you want. Default: yes*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	unsigned read_text_chunks; /*whether to read the text chunks; if false but remember_unknown_chunks is true, they're stored in the unknown chunks*/
+	/*store all bytes from unknown chunks in the LodePNGInfo (off by default, useful for a png editor)*/
+	unsigned remember_unknown_chunks;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGDecoderSettings;
+
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Strategy the encoder uses to choose the PNG filter type of each scanline (see filter_strategy in LodePNGEncoderSettings).*/
+typedef enum LodePNGFilterStrategy
+{
+	/*every filter at zero*/
+	LFS_ZERO,
+	/*Use filter that gives minimum sum, as described in the official PNG filter heuristic.*/
+	LFS_MINSUM,
+	/*Use the filter type that gives smallest Shannon entropy for this scanline. Depending
+	on the image, this is better or worse than minsum.*/
+	LFS_ENTROPY,
+	/*
+	Brute-force-search PNG filters by compressing each filter for each scanline.
+	Experimental, very slow, and only rarely gives better compression than MINSUM.
+	*/
+	LFS_BRUTE_FORCE,
+	/*use predefined_filters buffer: you specify the filter type for each scanline*/
+	LFS_PREDEFINED
+} LodePNGFilterStrategy;
+
+/*Gives characteristics about the colors of the image, which helps decide which color model to use for encoding.
+Used internally by default if "auto_convert" is enabled. Public because it's useful for custom algorithms.*/
+typedef struct LodePNGColorProfile
+{
+	unsigned colored; /*not greyscale*/
+	unsigned key; /*image is not opaque and color key is possible instead of full alpha*/
+	unsigned short key_r; /*key values, always as 16-bit, in 8-bit case the byte is duplicated, e.g. 65535 means 255*/
+	unsigned short key_g; /*green key value, same 16-bit convention as key_r*/
+	unsigned short key_b; /*blue key value, same 16-bit convention as key_r*/
+	unsigned alpha; /*image is not opaque and alpha channel or alpha palette required*/
+	unsigned numcolors; /*amount of colors, up to 257. Not valid if bits == 16.*/
+	unsigned char palette[1024]; /*Remembers up to the first 256 RGBA colors, in no particular order*/
+	unsigned bits; /*bits per channel (not for palette). 1,2 or 4 for greyscale only. 16 if 16-bit per channel required.*/
+} LodePNGColorProfile;
+
+void lodepng_color_profile_init(LodePNGColorProfile* profile);
+
+/*Get a LodePNGColorProfile of the image.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+	const unsigned char* image, unsigned w, unsigned h,
+	const LodePNGColorMode* mode_in);
+/*The function LodePNG uses internally to decide the PNG color with auto_convert.
+Chooses an optimal color model, e.g. grey if only grey pixels, palette if < 256 colors, ...*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+	const unsigned char* image, unsigned w, unsigned h,
+	const LodePNGColorMode* mode_in);
+
+/*Settings for the encoder.*/
+typedef struct LodePNGEncoderSettings
+{
+	LodePNGCompressSettings zlibsettings; /*settings for the zlib encoder, such as window size, ...*/
+
+	unsigned auto_convert; /*automatically choose output PNG color type. Default: true*/
+
+	/*If true, follows the official PNG heuristic: if the PNG uses a palette or lower than
+	8 bit depth, set all filters to zero. Otherwise use the filter_strategy. Note that to
+	completely follow the official PNG heuristic, filter_palette_zero must be true and
+	filter_strategy must be LFS_MINSUM*/
+	unsigned filter_palette_zero;
+	/*Which filter strategy to use when not using zeroes due to filter_palette_zero.
+	Set filter_palette_zero to 0 to ensure always using your chosen strategy. Default: LFS_MINSUM*/
+	LodePNGFilterStrategy filter_strategy;
+	/*used if filter_strategy is LFS_PREDEFINED. In that case, this must point to a buffer with
+	the same length as the amount of scanlines in the image, and each value must be <= 5 (NOTE(review): PNG
+	defines filter types 0-4 only - confirm the bound). You have to cleanup this buffer, LodePNG will never free
+	it. Don't forget that filter_palette_zero must be set to 0 to ensure this is also used on palette or low bitdepth images.*/
+	const unsigned char* predefined_filters;
+
+	/*force creating a PLTE chunk if colortype is 2 or 6 (= a suggested palette).
+	If colortype is 3, PLTE is _always_ created.*/
+	unsigned force_palette;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+	/*add LodePNG identifier and version as a text chunk, for debugging*/
+	unsigned add_id;
+	/*encode text chunks as zTXt chunks instead of tEXt chunks, and use compression in iTXt chunks*/
+	unsigned text_compression;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGEncoderSettings;
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+/*The settings, state and information for extended encoding and decoding.*/
+typedef struct LodePNGState
+{
+#ifdef LODEPNG_COMPILE_DECODER
+	LodePNGDecoderSettings decoder; /*the decoding settings*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+	LodePNGEncoderSettings encoder; /*the encoding settings*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+	LodePNGColorMode info_raw; /*specifies the format in which you would like to get the raw pixel buffer*/
+	LodePNGInfo info_png; /*info of the PNG image obtained after decoding*/
+	unsigned error; /*NOTE(review): presumably the LodePNG error code of the last operation that used this state - confirm*/
+#ifdef LODEPNG_COMPILE_CPP
+	/* Virtual destructor, needed because the C++ class lodepng::State derives from this struct. */
+	virtual ~LodePNGState() {}
+#endif
+} LodePNGState;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_state_init(LodePNGState* state);
+void lodepng_state_cleanup(LodePNGState* state);
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source);
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Same as lodepng_decode_memory, but uses a LodePNGState to allow custom settings and
+getting much more information about the PNG image and color mode.
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+	LodePNGState* state,
+	const unsigned char* in, size_t insize);
+
+/*
+Read the PNG header, but not the actual data. This returns only the information
+that is in the header chunk of the PNG, such as width, height and color type. The
+information is placed in the info_png field of the LodePNGState.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h,
+	LodePNGState* state,
+	const unsigned char* in, size_t insize);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*This function allocates the out buffer with standard malloc and stores the size in *outsize.*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+	const unsigned char* image, unsigned w, unsigned h,
+	LodePNGState* state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*
+The lodepng_chunk functions are normally not needed, except to traverse the
+unknown chunks stored in the LodePNGInfo struct, or add new ones to it.
+It also allows traversing the chunks of an encoded PNG file yourself.
+
+PNG standard chunk naming conventions:
+First byte: uppercase = critical, lowercase = ancillary
+Second byte: uppercase = public, lowercase = private
+Third byte: must be uppercase
+Fourth byte: uppercase = unsafe to copy, lowercase = safe to copy
+*/
+
+/*
+Gets the length of the data of the chunk. Total chunk length has 12 bytes more.
+There must be at least 4 bytes to read from. If the result value is too large,
+it may be corrupt data.
+*/
+unsigned lodepng_chunk_length(const unsigned char* chunk);
+
+/*puts the 4-byte type in null terminated string*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk);
+
+/*check if the type is the given type*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type);
+
+/*0: it's one of the critical chunk types, 1: it's an ancillary chunk (see PNG standard)*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk);
+
+/*0: public, 1: private (see PNG standard)*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk);
+
+/*0: the chunk is unsafe to copy, 1: the chunk is safe to copy (see PNG standard)*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk);
+
+/*get pointer to the data of the chunk, where the input points to the header of the chunk*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk);
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk);
+
+/*returns 0 if the crc is correct, 1 if it's incorrect (0 for OK as usual!)*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk);
+
+/*generates the correct CRC from the data and puts it in the last 4 bytes of the chunk*/
+void lodepng_chunk_generate_crc(unsigned char* chunk);
+
+/*iterate to next chunks. don't use on IEND chunk, as there is no next chunk then*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk);
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk);
+
+/*
+Appends chunk to the data in out. The given chunk should already have its chunk header.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk);
+
+/*
+Appends new chunk to out. The chunk to append is given by giving its length, type
+and data separately. The type is a 4-letter string.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+	const char* type, const unsigned char* data);
+
+
+/*Calculate CRC32 of buffer*/
+unsigned lodepng_crc32(const unsigned char* buf, size_t len);
+#endif /*LODEPNG_COMPILE_PNG*/
+
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*
+This zlib part can be used independently to zlib compress and decompress a
+buffer. It cannot be used to create gzip files however, and it only supports the
+part of zlib that is required for PNG, it does not support dictionaries.
+*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Inflate a buffer. Inflate is the decompression step of deflate. Out buffer must be freed after use.*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+	const unsigned char* in, size_t insize,
+	const LodePNGDecompressSettings* settings);
+
+/*
+Decompresses Zlib data. Reallocates the out buffer and appends the data. The
+data must be according to the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize,
+	const unsigned char* in, size_t insize,
+	const LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Compresses data with Zlib. Reallocates the out buffer and appends the data.
+Zlib adds a small header and trailer around the deflate data.
+The data is output in the format of the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize,
+	const unsigned char* in, size_t insize,
+	const LodePNGCompressSettings* settings);
+
+/*
+Find length-limited Huffman code for given frequencies. This function is in the
+public interface only for tests, it's used internally by lodepng_deflate.
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+	size_t numcodes, unsigned maxbitlen);
+
+/*Compress a buffer with deflate. See RFC 1951. Out buffer must be freed after use.*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+	const unsigned char* in, size_t insize,
+	const LodePNGCompressSettings* settings);
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load a file from disk into buffer. The function allocates the out buffer, and
+after usage you should free it.
+out: output parameter, contains pointer to loaded buffer.
+outsize: output parameter, size of the allocated out buffer
+filename: the path to the file to load
+return value: error code (0 means ok)
+*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename);
+
+/*
+Save a file from buffer to disk. Warning, if it exists, this function overwrites
+the file without warning!
+buffer: the buffer to write
+buffersize: size of the buffer to write
+filename: the path to the file to save to
+return value: error code (0 means ok)
+*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+
+#ifdef LODEPNG_COMPILE_CPP
+/* The LodePNG C++ wrapper uses std::vectors instead of manually allocated memory buffers. */
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_PNG
+	class State : public LodePNGState /* C++ version of LodePNGState with constructor, copy, assignment and destructor */
+	{
+	public:
+		State();
+		State(const State& other);
+		virtual ~State();
+		State& operator=(const State& other);
+	};
+
+#ifdef LODEPNG_COMPILE_DECODER
+	/* Same as the other lodepng::decode functions, but using a State for more settings and information. */
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+		State& state,
+		const unsigned char* in, size_t insize);
+	unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+		State& state,
+		const std::vector<unsigned char>& in);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+	/* Same as the other lodepng::encode functions, but using a State for more settings and information. */
+	unsigned encode(std::vector<unsigned char>& out,
+		const unsigned char* in, unsigned w, unsigned h,
+		State& state);
+	unsigned encode(std::vector<unsigned char>& out,
+		const std::vector<unsigned char>& in, unsigned w, unsigned h,
+		State& state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DISK
+	/*
+	Load a file from disk into an std::vector.
+	return value: error code (0 means ok)
+	*/
+	unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename);
+
+	/*
+	Save the binary data in an std::vector to a file on disk. The file is overwritten
+	without warning.
+	*/
+	unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_PNG */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+	/* Zlib-decompress an unsigned char buffer */
+	unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+		const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+
+	/* Zlib-decompress an std::vector */
+	unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+		const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+	/* Zlib-compress an unsigned char buffer */
+	unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+		const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+
+	/* Zlib-compress an std::vector */
+	unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+		const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+
+/*
+TODO:
+[.] test if there are no memory leaks or security exploits - done a lot but needs to be checked often
+[.] check compatibility with various compilers  - done but needs to be redone for every newer version
+[X] converting color to 16-bit per channel types
+[ ] read all public PNG chunk types (but never let the color profile and gamma ones touch RGB values)
+[ ] make sure encoder generates no chunks with size > (2^31)-1
+[ ] partial decoding (stream processing)
+[X] let the "isFullyOpaque" function check color keys and transparent palettes too
+[X] better name for the variables "codes", "codesD", "codelengthcodes", "clcl" and "lldl"
+[ ] don't stop decoding on errors like 69, 57, 58 (make warnings)
+[ ] make warnings like: oob palette, checksum fail, data after iend, wrong/unknown crit chunk, no null terminator in text, ...
+[ ] let the C++ wrapper catch exceptions coming from the standard library and return LodePNG error codes
+[ ] allow user to provide custom color conversion functions, e.g. for premultiplied alpha, padding bits or not, ...
+[ ] allow user to give data (void*) to custom allocator
+*/
+
+#endif /*LODEPNG_H inclusion guard*/
+
+/*
+LodePNG Documentation
+---------------------
+
+0. table of contents
+--------------------
+
+  1. about
+	1.1. supported features
+	1.2. features not supported
+  2. C and C++ version
+  3. security
+  4. decoding
+  5. encoding
+  6. color conversions
+	 6.1. PNG color types
+	 6.2. color conversions
+	 6.3. padding bits
+	 6.4. A note about 16-bits per channel and endianness
+  7. error values
+  8. chunks and PNG editing
+  9. compiler support
+  10. examples
+	10.1. decoder C++ example
+	10.2. decoder C example
+  11. state settings reference
+  12. changes
+  13. contact information
+
+
+1. about
+--------
+
+PNG is a file format to store raster images losslessly with good compression,
+supporting different color types and alpha channel.
+
+LodePNG is a PNG codec according to the Portable Network Graphics (PNG)
+Specification (Second Edition) - W3C Recommendation 10 November 2003.
+
+The specifications used are:
+
+*) Portable Network Graphics (PNG) Specification (Second Edition):
+	  http://www.w3.org/TR/2003/REC-PNG-20031110
+*) RFC 1950 ZLIB Compressed Data Format version 3.3:
+	  http://www.gzip.org/zlib/rfc-zlib.html
+*) RFC 1951 DEFLATE Compressed Data Format Specification ver 1.3:
+	  http://www.gzip.org/zlib/rfc-deflate.html
+
+The most recent version of LodePNG can currently be found at
+http://lodev.org/lodepng/
+
+LodePNG works both in C (ISO C90) and C++, with a C++ wrapper that adds
+extra functionality.
+
+LodePNG consists of two files:
+-lodepng.h: the header file for both C and C++
+-lodepng.c(pp): give it the name lodepng.c or lodepng.cpp (or .cc) depending on your usage
+
+If you want to start using LodePNG right away without reading this doc, get the
+examples from the LodePNG website to see how to use it in code, or check the
+smaller examples in chapter 10 here.
+
+LodePNG is simple but only supports the basic requirements. To achieve
+simplicity, the following design choices were made: There are no dependencies
+on any external library. There are functions to decode and encode a PNG with
+a single function call, and extended versions of these functions taking a
+LodePNGState struct allowing to specify or get more information. By default
+the colors of the raw image are always RGB or RGBA, no matter what color type
+the PNG file uses. To read and write files, there are simple functions to
+convert the files to/from buffers in memory.
+
+This all makes LodePNG suitable for loading textures in games, demos and small
+programs, ... It's less suitable for full fledged image editors, loading PNGs
+over network (it requires all the image data to be available before decoding can
+begin), life-critical systems, ...
+
+1.1. supported features
+-----------------------
+
+The following features are supported:
+
+*) decoding of PNGs with any color type, bit depth and interlace mode, to a 24- or 32-bit color raw image,
+	or the same color type as the PNG
+*) encoding of PNGs, from any raw image to 24- or 32-bit color, or the same color type as the raw image
+*) Adam7 interlace and deinterlace for any color type
+*) loading the image from harddisk or decoding it from a buffer from other sources than harddisk
+*) support for alpha channels, including RGBA color model, translucent palettes and color keying
+*) zlib decompression (inflate)
+*) zlib compression (deflate)
+*) CRC32 and ADLER32 checksums
+*) handling of unknown chunks, allowing making a PNG editor that stores custom and unknown chunks.
+*) the following chunks are supported (generated/interpreted) by both encoder and decoder:
+	 IHDR: header information
+	 PLTE: color palette
+	 IDAT: pixel data
+	 IEND: the final chunk
+	 tRNS: transparency for palettized images
+	 tEXt: textual information
+	 zTXt: compressed textual information
+	 iTXt: international textual information
+	 bKGD: suggested background color
+	 pHYs: physical dimensions
+	 tIME: modification time
+
+1.2. features not supported
+---------------------------
+
+The following features are _not_ supported:
+
+*) some features needed to make a conformant PNG-Editor might be still missing.
+*) partial loading/stream processing. All data must be available and is processed in one call.
+*) The following public chunks are not supported but treated as unknown chunks by LodePNG
+	 cHRM, gAMA, iCCP, sRGB, sBIT, hIST, sPLT
+	Some of these are not supported on purpose: LodePNG wants to provide the RGB values
+	stored in the pixels, not values modified by system dependent gamma or color models.
+
+
+2. C and C++ version
+--------------------
+
+The C version uses buffers allocated with alloc that you need to free()
+yourself. You need to use init and cleanup functions for each struct whenever
+using a struct from the C version to avoid exploits and memory leaks.
+
+The C++ version has extra functions with std::vectors in the interface and the
+lodepng::State class which is a LodePNGState with constructor and destructor.
+
+These files work without modification for both C and C++ compilers because all
+the additional C++ code is in "#ifdef __cplusplus" blocks that make C-compilers
+ignore it, and the C code is made to compile both with strict ISO C90 and C++.
+
+To use the C++ version, you need to rename the source file to lodepng.cpp
+(instead of lodepng.c), and compile it with a C++ compiler.
+
+To use the C version, you need to rename the source file to lodepng.c (instead
+of lodepng.cpp), and compile it with a C compiler.
+
+
+3. Security
+-----------
+
+Even if carefully designed, it's always possible that LodePNG contains possible
+exploits. If you discover one, please let me know, and it will be fixed.
+
+When using LodePNG, care has to be taken with the C version of LodePNG, as well
+as the C-style structs when working with C++. The following conventions are used
+for all C-style structs:
+
+-if a struct has a corresponding init function, always call the init function when making a new one
+-if a struct has a corresponding cleanup function, call it before the struct disappears to avoid memory leaks
+-if a struct has a corresponding copy function, use the copy function instead of "=".
+ The destination must also be inited already.
+
+
+4. Decoding
+-----------
+
+Decoding converts a PNG compressed image to a raw pixel buffer.
+
+Most documentation on using the decoder is at its declarations in the header
+above. For C, simple decoding can be done with functions such as
+lodepng_decode32, and more advanced decoding can be done with the struct
+LodePNGState and lodepng_decode. For C++, all decoding can be done with the
+various lodepng::decode functions, and lodepng::State can be used for advanced
+features.
+
+When using the LodePNGState, it uses the following fields for decoding:
+*) LodePNGInfo info_png: it stores extra information about the PNG (the input) in here
+*) LodePNGColorMode info_raw: here you can say what color mode of the raw image (the output) you want to get
+*) LodePNGDecoderSettings decoder: you can specify a few extra settings for the decoder to use
+
+LodePNGInfo info_png
+--------------------
+
+After decoding, this contains extra information of the PNG image, except the actual
+pixels, width and height because these are already gotten directly from the decoder
+functions.
+
+It contains for example the original color type of the PNG image, text comments,
+suggested background color, etc... More details about the LodePNGInfo struct are
+at its declaration documentation.
+
+LodePNGColorMode info_raw
+-------------------------
+
+When decoding, here you can specify which color type you want
+the resulting raw image to be. If this is different from the colortype of the
+PNG, then the decoder will automatically convert the result. This conversion
+always works, except if you want it to convert a color PNG to greyscale or to
+a palette with missing colors.
+
+By default, 32-bit color is used for the result.
+
+LodePNGDecoderSettings decoder
+------------------------------
+
+The settings can be used to ignore the errors created by invalid CRC and Adler32
+chunks, and to disable the decoding of tEXt chunks.
+
+There's also a setting color_convert, true by default. If false, no conversion
+is done, the resulting data will be as it was in the PNG (after decompression)
+and you'll have to puzzle the colors of the pixels together yourself using the
+color type information in the LodePNGInfo.
+
+
+5. Encoding
+-----------
+
+Encoding converts a raw pixel buffer to a PNG compressed image.
+
+Most documentation on using the encoder is at its declarations in the header
+above. For C, simple encoding can be done with functions such as
+lodepng_encode32, and more advanced encoding can be done with the struct
+LodePNGState and lodepng_encode. For C++, all encoding can be done with the
+various lodepng::encode functions, and lodepng::State can be used for advanced
+features.
+
+Like the decoder, the encoder can also give errors. However it gives fewer errors
+since the encoder input is trusted, the decoder input (a PNG image that could
+be forged by anyone) is not trusted.
+
+When using the LodePNGState, it uses the following fields for encoding:
+*) LodePNGInfo info_png: here you specify how you want the PNG (the output) to be.
+*) LodePNGColorMode info_raw: here you say what color type the raw image (the input) has
+*) LodePNGEncoderSettings encoder: you can specify a few settings for the encoder to use
+
+LodePNGInfo info_png
+--------------------
+
+When encoding, you use this the opposite way as when decoding: for encoding,
+you fill in the values you want the PNG to have before encoding. By default it's
+not needed to specify a color type for the PNG since it's automatically chosen,
+but it's possible to choose it yourself given the right settings.
+
+The encoder will not always exactly match the LodePNGInfo struct you give,
+it tries to match it as closely as possible. Some things are ignored by the encoder. The
+encoder uses, for example, the following settings from it when applicable:
+colortype and bitdepth, text chunks, time chunk, the color key, the palette, the
+background color, the interlace method, unknown chunks, ...
+
+When encoding to a PNG with colortype 3, the encoder will generate a PLTE chunk.
+If the palette contains any colors for which the alpha channel is not 255 (so
+there are translucent colors in the palette), it'll add a tRNS chunk.
+
+LodePNGColorMode info_raw
+-------------------------
+
+You specify the color type of the raw image that you give to the input here,
+including a possible transparent color key and palette you happen to be using in
+your raw image data.
+
+By default, 32-bit color is assumed, meaning your input has to be in RGBA
+format with 4 bytes (unsigned chars) per pixel.
+
+LodePNGEncoderSettings encoder
+------------------------------
+
+The following settings are supported (some are in sub-structs):
+*) auto_convert: when this option is enabled, the encoder will
+automatically choose the smallest possible color mode (including color key) that
+can encode the colors of all pixels without information loss.
+*) btype: the block type for LZ77. 0 = uncompressed, 1 = fixed huffman tree,
+	2 = dynamic huffman tree (best compression). Should be 2 for proper
+	compression.
+*) use_lz77: whether or not to use LZ77 for compressed block types. Should be
+	true for proper compression.
+*) windowsize: the window size used by the LZ77 encoder (1 - 32768). Has value
+	2048 by default, but can be set to 32768 for better, but slow, compression.
+*) force_palette: if colortype is 2 or 6, you can make the encoder write a PLTE
+	chunk if force_palette is true. This can be used as a suggested palette to convert
+	to by viewers that don't support more than 256 colors (if those still exist)
+*) add_id: add text chunk "Encoder: LodePNG <version>" to the image.
+*) text_compression: default 1. If 1, it'll store texts as zTXt instead of tEXt chunks.
+  zTXt chunks use zlib compression on the text. This gives a smaller result on
+  large texts but a larger result on small texts (such as a single program name).
+  It's all tEXt or all zTXt though, there's no separate setting per text yet.
+
+
+6. color conversions
+--------------------
+
+An important thing to note about LodePNG, is that the color type of the PNG, and
+the color type of the raw image, are completely independent. By default, when
+you decode a PNG, you get the result as a raw image in the color type you want,
+no matter whether the PNG was encoded with a palette, greyscale or RGBA color.
+And if you encode an image, by default LodePNG will automatically choose the PNG
+color type that gives good compression based on the values of colors and amount
+of colors in the image. It can be configured to let you control it instead as
+well, though.
+
+To be able to do this, LodePNG does conversions from one color mode to another.
+It can convert from almost any color type to any other color type, except the
+following conversions: RGB to greyscale is not supported, and converting to a
+palette when the palette doesn't have a required color is not supported. This is
+not supported on purpose: this is information loss which requires a color
+reduction algorithm that is beyond the scope of a PNG encoder (yes, RGB to grey
+is easy, but there are multiple ways if you want to give some channels more
+weight).
+
+By default, when decoding, you get the raw image in 32-bit RGBA or 24-bit RGB
+color, no matter what color type the PNG has. And by default when encoding,
+LodePNG automatically picks the best color model for the output PNG, and expects
+the input image to be 32-bit RGBA or 24-bit RGB. So, unless you want to control
+the color format of the images yourself, you can skip this chapter.
+
+6.1. PNG color types
+--------------------
+
+A PNG image can have many color types, ranging from 1-bit color to 64-bit color,
+as well as palettized color modes. After the zlib decompression and unfiltering
+in the PNG image is done, the raw pixel data will have that color type and thus
+a certain amount of bits per pixel. If you want the output raw image after
+decoding to have another color type, a conversion is done by LodePNG.
+
+The PNG specification gives the following color types:
+
+0: greyscale, bit depths 1, 2, 4, 8, 16
+2: RGB, bit depths 8 and 16
+3: palette, bit depths 1, 2, 4 and 8
+4: greyscale with alpha, bit depths 8 and 16
+6: RGBA, bit depths 8 and 16
+
+Bit depth is the amount of bits per pixel per color channel. So the total amount
+of bits per pixel is: amount of channels * bitdepth.
+
+6.2. color conversions
+----------------------
+
+As explained in the sections about the encoder and decoder, you can specify
+color types and bit depths in info_png and info_raw to change the default
+behaviour.
+
+If, when decoding, you want the raw image to be something else than the default,
+you need to set the color type and bit depth you want in the LodePNGColorMode,
+or the parameters colortype and bitdepth of the simple decoding function.
+
+If, when encoding, you use another color type than the default in the raw input
+image, you need to specify its color type and bit depth in the LodePNGColorMode
+of the raw image, or use the parameters colortype and bitdepth of the simple
+encoding function.
+
+If, when encoding, you don't want LodePNG to choose the output PNG color type
+but control it yourself, you need to set auto_convert in the encoder settings
+to false, and specify the color type you want in the LodePNGInfo of the
+encoder (including palette: it can generate a palette if auto_convert is true,
+otherwise not).
+
+If the input and output color type differ (whether user chosen or auto chosen),
+LodePNG will do a color conversion, which follows the rules below, and may
+sometimes result in an error.
+
+To avoid some confusion:
+-the decoder converts from PNG to raw image
+-the encoder converts from raw image to PNG
+-the colortype and bitdepth in LodePNGColorMode info_raw, are those of the raw image
+-the colortype and bitdepth in the color field of LodePNGInfo info_png, are those of the PNG
+-when encoding, the color type in LodePNGInfo is ignored if auto_convert
+ is enabled, it is automatically generated instead
+-when decoding, the color type in LodePNGInfo is set by the decoder to that of the original
+ PNG image, but it can be ignored since the raw image has the color type you requested instead
+-if the color type of the LodePNGColorMode and PNG image aren't the same, a conversion
+ between the color types is done if the color types are supported. If it is not
+ supported, an error is returned. If the types are the same, no conversion is done.
+-even though some conversions aren't supported, LodePNG supports loading PNGs from any
+ colortype and saving PNGs to any colortype, sometimes it just requires preparing
+ the raw image correctly before encoding.
+-both encoder and decoder use the same color converter.
+
+Non supported color conversions:
+-color to greyscale: no error is thrown, but the result will look ugly because
+only the red channel is taken
+-anything to palette when that palette does not have that color in it: in this
+case an error is thrown
+
+Supported color conversions:
+-anything to 8-bit RGB, 8-bit RGBA, 16-bit RGB, 16-bit RGBA
+-any grey or grey+alpha, to grey or grey+alpha
+-anything to a palette, as long as the palette has the requested colors in it
+-removing alpha channel
+-higher to smaller bitdepth, and vice versa
+
+If you want no color conversion to be done (e.g. for speed or control):
+-In the encoder, you can make it save a PNG with any color type by giving the
+raw color mode and LodePNGInfo the same color mode, and setting auto_convert to
+false.
+-In the decoder, you can make it store the pixel data in the same color type
+as the PNG has, by setting the color_convert setting to false. Settings in
+info_raw are then ignored.
+
+The function lodepng_convert does the color conversion. It is available in the
+interface but normally isn't needed since the encoder and decoder already call
+it.
+
+6.3. padding bits
+-----------------
+
+In the PNG file format, if a less than 8-bit per pixel color type is used and the scanlines
+have a bit amount that isn't a multiple of 8, then padding bits are used so that each
+scanline starts at a fresh byte. But that is NOT true for the LodePNG raw input and output.
+The raw input image you give to the encoder, and the raw output image you get from the decoder
+will NOT have these padding bits, e.g. in the case of a 1-bit image with a width
+of 7 pixels, the first pixel of the second scanline will be the 8th bit of the first byte,
+not the first bit of a new byte.
+
+6.4. A note about 16-bits per channel and endianness
+----------------------------------------------------
+
+LodePNG uses unsigned char arrays for 16-bit per channel colors too, just like
+for any other color format. The 16-bit values are stored in big endian (most
+significant byte first) in these arrays. This is the opposite order of the
+little endian used by x86 CPU's.
+
+LodePNG always uses big endian because the PNG file format does so internally.
+Conversions to formats other than the one PNG uses internally are not supported by
+LodePNG on purpose, there are myriads of formats, including endianness of 16-bit
+colors, the order in which you store R, G, B and A, and so on. Supporting and
+converting to/from all that is outside the scope of LodePNG.
+
+This may mean that, depending on your use case, you may want to convert the big
+endian output of LodePNG to little endian with a for loop. This is certainly not
+always needed, many applications and libraries support big endian 16-bit colors
+anyway, but it means you cannot simply cast the unsigned char* buffer to an
+unsigned short* buffer on x86 CPUs.
+
+
+7. error values
+---------------
+
+All functions in LodePNG that return an error code, return 0 if everything went
+OK, or a non-zero code if there was an error.
+
+The meaning of the LodePNG error values can be retrieved with the function
+lodepng_error_text: given the numerical error code, it returns a description
+of the error in English as a string.
+
+Check the implementation of lodepng_error_text to see the meaning of each code.
+
+
+8. chunks and PNG editing
+-------------------------
+
+If you want to add extra chunks to a PNG you encode, or use LodePNG for a PNG
+editor that should follow the rules about handling of unknown chunks, or if your
+program is able to read other types of chunks than the ones handled by LodePNG,
+then that's possible with the chunk functions of LodePNG.
+
+A PNG chunk has the following layout:
+
+4 bytes length
+4 bytes type name
+length bytes data
+4 bytes CRC
+
+8.1. iterating through chunks
+-----------------------------
+
+If you have a buffer containing the PNG image data, then the first chunk (the
+IHDR chunk) starts at byte number 8 of that buffer. The first 8 bytes are the
+signature of the PNG and are not part of a chunk. But if you start at byte 8
+then you have a chunk, and can check the following things of it.
+
+NOTE: none of these functions check for memory buffer boundaries. To avoid
+exploits, always make sure the buffer contains all the data of the chunks.
+When using lodepng_chunk_next, make sure the returned value is within the
+allocated memory.
+
+unsigned lodepng_chunk_length(const unsigned char* chunk):
+
+Get the length of the chunk's data. The total chunk length is this length + 12.
+
+void lodepng_chunk_type(char type[5], const unsigned char* chunk):
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type):
+
+Get the type of the chunk or compare if it's a certain type
+
+unsigned char lodepng_chunk_critical(const unsigned char* chunk):
+unsigned char lodepng_chunk_private(const unsigned char* chunk):
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk):
+
+Check if the chunk is critical in the PNG standard (only IHDR, PLTE, IDAT and IEND are).
+Check if the chunk is private (public chunks are part of the standard, private ones not).
+Check if the chunk is safe to copy. If it's not, then, when modifying data in a critical
+chunk, unsafe to copy chunks of the old image may NOT be saved in the new one if your
+program doesn't handle that type of unknown chunk.
+
+unsigned char* lodepng_chunk_data(unsigned char* chunk):
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk):
+
+Get a pointer to the start of the data of the chunk.
+
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk):
+void lodepng_chunk_generate_crc(unsigned char* chunk):
+
+Check if the crc is correct or generate a correct one.
+
+unsigned char* lodepng_chunk_next(unsigned char* chunk):
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk):
+
+Iterate to the next chunk. This works if you have a buffer with consecutive chunks. Note that these
+functions do no boundary checking of the allocated data whatsoever, so make sure there is enough
+data available in the buffer to be able to go to the next chunk.
+
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk):
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+										const char* type, const unsigned char* data):
+
+These functions are used to create new chunks that are appended to the data in *out that has
+length *outlength. The append function appends an existing chunk to the new data. The create
+function creates a new chunk with the given parameters and appends it. Type is the 4-letter
+name of the chunk.
+
+8.2. chunks in info_png
+-----------------------
+
+The LodePNGInfo struct contains fields with the unknown chunk in it. It has 3
+buffers (each with size) to contain 3 types of unknown chunks:
+the ones that come before the PLTE chunk, the ones that come between the PLTE
+and the IDAT chunks, and the ones that come after the IDAT chunks.
+It's necessary to make the distinction between these 3 cases because the PNG
+standard forces to keep the ordering of unknown chunks compared to the critical
+chunks, but does not force any other ordering rules.
+
+info_png.unknown_chunks_data[0] is the chunks before PLTE
+info_png.unknown_chunks_data[1] is the chunks after PLTE, before IDAT
+info_png.unknown_chunks_data[2] is the chunks after IDAT
+
+The chunks in these 3 buffers can be iterated through and read by using the same
+way described in the previous subchapter.
+
+When using the decoder to decode a PNG, you can make it store all unknown chunks
+if you set the option settings.remember_unknown_chunks to 1. By default, this
+option is off (0).
+
+The encoder will always encode unknown chunks that are stored in the info_png.
+If you need it to add a particular chunk that isn't known by LodePNG, you can
+use lodepng_chunk_append or lodepng_chunk_create to add it to the chunk data in
+info_png.unknown_chunks_data[x].
+
+Chunks that are known by LodePNG should not be added in that way. E.g. to make
+LodePNG add a bKGD chunk, set background_defined to true and add the correct
+parameters there instead.
+
+
+9. compiler support
+-------------------
+
+No libraries other than the current standard C library are needed to compile
+LodePNG. For the C++ version, only the standard C++ library is needed on top.
+Add the files lodepng.c(pp) and lodepng.h to your project, include
+lodepng.h where needed, and your program can read/write PNG files.
+
+It is compatible with C90 and up, and C++03 and up.
+
+If performance is important, use optimization when compiling! For both the
+encoder and decoder, this makes a large difference.
+
+Make sure that LodePNG is compiled with the same compiler of the same version
+and with the same settings as the rest of the program, or the interfaces with
+std::vectors and std::strings in C++ can be incompatible.
+
+CHAR_BIT must be 8 or higher, because LodePNG uses unsigned chars for octets.
+
+*) gcc and g++
+
+LodePNG is developed in gcc so this compiler is natively supported. It gives no
+warnings with compiler options "-Wall -Wextra -pedantic -ansi", with gcc and g++
+version 4.7.1 on Linux, 32-bit and 64-bit.
+
+*) Clang
+
+Fully supported and warning-free.
+
+*) Mingw
+
+The Mingw compiler (a port of gcc for Windows) should be fully supported by
+LodePNG.
+
+*) Visual Studio and Visual C++ Express Edition
+
+LodePNG should be warning-free with warning level W4. Two warnings were disabled
+with pragmas though: warning 4244 about implicit conversions, and warning 4996
+where it wants to use a non-standard function fopen_s instead of the standard C
+fopen.
+
+Visual Studio may want "stdafx.h" files to be included in each source file and
+give an error "unexpected end of file while looking for precompiled header".
+This is not standard C++ and will not be added to the stock LodePNG. You can
+disable it for lodepng.cpp only by right clicking it, Properties, C/C++,
+Precompiled Headers, and set it to Not Using Precompiled Headers there.
+
+NOTE: Modern versions of VS should be fully supported, but old versions, e.g.
+VS6, are not guaranteed to work.
+
+*) Compilers on Macintosh
+
+LodePNG has been reported to work both with gcc and LLVM for Macintosh, both for
+C and C++.
+
+*) Other Compilers
+
+If you encounter problems on any compilers, feel free to let me know and I may
+try to fix it if the compiler is modern and standards compliant.
+
+
+10. examples
+------------
+
+This decoder example shows the most basic usage of LodePNG. More complex
+examples can be found on the LodePNG website.
+
+10.1. decoder C++ example
+-------------------------
+
+#include "lodepng.h"
+#include <iostream>
+
+int main(int argc, char *argv[])
+{
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  //load and decode
+  std::vector<unsigned char> image;
+  unsigned width, height;
+  unsigned error = lodepng::decode(image, width, height, filename);
+
+  //if there's an error, display it
+  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
+
+  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
+}
+
+10.2. decoder C example
+-----------------------
+
+#include "lodepng.h"
+
+int main(int argc, char *argv[])
+{
+  unsigned error;
+  unsigned char* image;
+  unsigned width, height;
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  error = lodepng_decode32_file(&image, &width, &height, filename);
+
+  if(error) printf("decoder error %u: %s\n", error, lodepng_error_text(error));
+
+  / * use image here * /
+
+  free(image);
+  return 0;
+}
+
+11. state settings reference
+----------------------------
+
+A quick reference of some settings to set on the LodePNGState
+
+For decoding:
+
+state.decoder.zlibsettings.ignore_adler32: ignore ADLER32 checksums
+state.decoder.zlibsettings.custom_...: use custom inflate function
+state.decoder.ignore_crc: ignore CRC checksums
+state.decoder.color_convert: convert internal PNG color to chosen one
+state.decoder.read_text_chunks: whether to read in text metadata chunks
+state.decoder.remember_unknown_chunks: whether to read in unknown chunks
+state.info_raw.colortype: desired color type for decoded image
+state.info_raw.bitdepth: desired bit depth for decoded image
+state.info_raw....: more color settings, see struct LodePNGColorMode
+state.info_png....: no settings for decoder but output, see struct LodePNGInfo
+
+For encoding:
+
+state.encoder.zlibsettings.btype: disable compression by setting it to 0
+state.encoder.zlibsettings.use_lz77: use LZ77 in compression
+state.encoder.zlibsettings.windowsize: tweak LZ77 windowsize
+state.encoder.zlibsettings.minmatch: tweak min LZ77 length to match
+state.encoder.zlibsettings.nicematch: tweak LZ77 match where to stop searching
+state.encoder.zlibsettings.lazymatching: try one more LZ77 matching
+state.encoder.zlibsettings.custom_...: use custom deflate function
+state.encoder.auto_convert: choose optimal PNG color type, if 0 uses info_png
+state.encoder.filter_palette_zero: PNG filter strategy for palette
+state.encoder.filter_strategy: PNG filter strategy to encode with
+state.encoder.force_palette: add palette even if not encoding to one
+state.encoder.add_id: add LodePNG identifier and version as a text chunk
+state.encoder.text_compression: use compressed text chunks for metadata
+state.info_raw.colortype: color type of raw input image you provide
+state.info_raw.bitdepth: bit depth of raw input image you provide
+state.info_raw: more color settings, see struct LodePNGColorMode
+state.info_png.color.colortype: desired color type if auto_convert is false
+state.info_png.color.bitdepth: desired bit depth if auto_convert is false
+state.info_png.color....: more color settings, see struct LodePNGColorMode
+state.info_png....: more PNG related settings, see struct LodePNGInfo
+
+
+12. changes
+-----------
+
+The version number of LodePNG is the date of the change given in the format
+yyyymmdd.
+
+Some changes aren't backwards compatible. Those are indicated with a (!)
+symbol.
+
+*) 27 nov 2016: grey+alpha auto color model detection bugfix
+*) 18 apr 2016: Changed qsort to custom stable sort (for platforms w/o qsort).
+*) 09 apr 2016: Fixed colorkey usage detection, and better file loading (within
+	the limits of pure C90).
+*) 08 dec 2015: Made load_file function return error if file can't be opened.
+*) 24 oct 2015: Bugfix with decoding to palette output.
+*) 18 apr 2015: Boundary PM instead of just package-merge for faster encoding.
+*) 23 aug 2014: Reduced needless memory usage of decoder.
+*) 28 jun 2014: Removed fix_png setting, always support palette OOB for
+	 simplicity. Made ColorProfile public.
+*) 09 jun 2014: Faster encoder by fixing hash bug and more zeros optimization.
+*) 22 dec 2013: Power of two windowsize required for optimization.
+*) 15 apr 2013: Fixed bug with LAC_ALPHA and color key.
+*) 25 mar 2013: Added an optional feature to ignore some PNG errors (fix_png).
+*) 11 mar 2013 (!): Bugfix with custom free. Changed from "my" to "lodepng_"
+	 prefix for the custom allocators and made it possible with a new #define to
+	 use custom ones in your project without needing to change lodepng's code.
+*) 28 jan 2013: Bugfix with color key.
+*) 27 oct 2012: Tweaks in text chunk keyword length error handling.
+*) 8 oct 2012 (!): Added new filter strategy (entropy) and new auto color mode
+	 (no palette). Better deflate tree encoding. New compression tweak settings.
+	 Faster color conversions while decoding. Some internal cleanups.
+*) 23 sep 2012: Reduced warnings in Visual Studio a little bit.
+*) 1 sep 2012 (!): Removed #define's for giving custom (de)compression functions
+	 and made it work with function pointers instead.
+*) 23 jun 2012: Added more filter strategies. Made it easier to use custom alloc
+	 and free functions and toggle #defines from compiler flags. Small fixes.
+*) 6 may 2012 (!): Made plugging in custom zlib/deflate functions more flexible.
+*) 22 apr 2012 (!): Made interface more consistent, renaming a lot. Removed
+	 redundant C++ codec classes. Reduced amount of structs. Everything changed,
+	 but it is cleaner now imho and functionality remains the same. Also fixed
+	 several bugs and shrunk the implementation code. Made new samples.
+*) 6 nov 2011 (!): By default, the encoder now automatically chooses the best
+	 PNG color model and bit depth, based on the amount and type of colors of the
+	 raw image. For this, autoLeaveOutAlphaChannel replaced by auto_choose_color.
+*) 9 oct 2011: simpler hash chain implementation for the encoder.
+*) 8 sep 2011: lz77 encoder lazy matching instead of greedy matching.
+*) 23 aug 2011: tweaked the zlib compression parameters after benchmarking.
+	 A bug with the PNG filtertype heuristic was fixed, so that it chooses much
+	 better ones (it's quite significant). A setting to do an experimental, slow,
+	 brute force search for PNG filter types is added.
+*) 17 aug 2011 (!): changed some C zlib related function names.
+*) 16 aug 2011: made the code less wide (max 120 characters per line).
+*) 17 apr 2011: code cleanup. Bugfixes. Convert low to 16-bit per sample colors.
+*) 21 feb 2011: fixed compiling for C90. Fixed compiling with sections disabled.
+*) 11 dec 2010: encoding is made faster, based on suggestion by Peter Eastman
+	 to optimize long sequences of zeros.
+*) 13 nov 2010: added LodePNG_InfoColor_hasPaletteAlpha and
+	 LodePNG_InfoColor_canHaveAlpha functions for convenience.
+*) 7 nov 2010: added LodePNG_error_text function to get error code description.
+*) 30 oct 2010: made decoding slightly faster
+*) 26 oct 2010: (!) changed some C function and struct names (more consistent).
+	  Reorganized the documentation and the declaration order in the header.
+*) 08 aug 2010: only changed some comments and external samples.
+*) 05 jul 2010: fixed bug thanks to warnings in the new gcc version.
+*) 14 mar 2010: fixed bug where too much memory was allocated for char buffers.
+*) 02 sep 2008: fixed bug where it could create empty tree that linux apps could
+	 read by ignoring the problem but windows apps couldn't.
+*) 06 jun 2008: added more error checks for out of memory cases.
+*) 26 apr 2008: added a few more checks here and there to ensure more safety.
+*) 06 mar 2008: crash with encoding of strings fixed
+*) 02 feb 2008: support for international text chunks added (iTXt)
+*) 23 jan 2008: small cleanups, and #defines to divide code in sections
+*) 20 jan 2008: support for unknown chunks allowing using LodePNG for an editor.
+*) 18 jan 2008: support for tIME and pHYs chunks added to encoder and decoder.
+*) 17 jan 2008: ability to encode and decode compressed zTXt chunks added
+	 Also various fixes, such as in the deflate and the padding bits code.
+*) 13 jan 2008: Added ability to encode Adam7-interlaced images. Improved
+	 filtering code of encoder.
+*) 07 jan 2008: (!) changed LodePNG to use ISO C90 instead of C++. A
+	 C++ wrapper around this provides an interface almost identical to before.
+	 Having LodePNG be pure ISO C90 makes it more portable. The C and C++ code
+	 are together in these files but it works both for C and C++ compilers.
+*) 29 dec 2007: (!) changed most integer types to unsigned int + other tweaks
+*) 30 aug 2007: bug fixed which makes this Borland C++ compatible
+*) 09 aug 2007: some VS2005 warnings removed again
+*) 21 jul 2007: deflate code placed in new namespace separate from zlib code
+*) 08 jun 2007: fixed bug with 2- and 4-bit color, and small interlaced images
+*) 04 jun 2007: improved support for Visual Studio 2005: crash with accessing
+	 invalid std::vector element [0] fixed, and level 3 and 4 warnings removed
+*) 02 jun 2007: made the encoder add a tag with version by default
+*) 27 may 2007: zlib and png code separated (but still in the same file),
+	 simple encoder/decoder functions added for more simple usage cases
+*) 19 may 2007: minor fixes, some code cleaning, new error added (error 69),
+	 moved some examples from here to lodepng_examples.cpp
+*) 12 may 2007: palette decoding bug fixed
+*) 24 apr 2007: changed the license from BSD to the zlib license
+*) 11 mar 2007: very simple addition: ability to encode bKGD chunks.
+*) 04 mar 2007: (!) tEXt chunk related fixes, and support for encoding
+	 palettized PNG images. Plus little interface change with palette and texts.
+*) 03 mar 2007: Made it encode dynamic Huffman shorter with repeat codes.
+	 Fixed a bug where the end code of a block had length 0 in the Huffman tree.
+*) 26 feb 2007: Huffman compression with dynamic trees (BTYPE 2) now implemented
+	 and supported by the encoder, resulting in smaller PNGs at the output.
+*) 27 jan 2007: Made the Adler-32 test faster so that a timewaste is gone.
+*) 24 jan 2007: gave encoder an error interface. Added color conversion from any
+	 greyscale type to 8-bit greyscale with or without alpha.
+*) 21 jan 2007: (!) Totally changed the interface. It allows more color types
+	 to convert to and is more uniform. See the manual for how it works now.
+*) 07 jan 2007: Some cleanup & fixes, and a few changes over the last days:
+	 encode/decode custom tEXt chunks, separate classes for zlib & deflate, and
+	 at last made the decoder give errors for incorrect Adler32 or Crc.
+*) 01 jan 2007: Fixed bug with encoding PNGs with less than 8 bits per channel.
+*) 29 dec 2006: Added support for encoding images without alpha channel, and
+	 cleaned out code as well as making certain parts faster.
+*) 28 dec 2006: Added "Settings" to the encoder.
+*) 26 dec 2006: The encoder now does LZ77 encoding and produces much smaller files.
+	 Removed some code duplication in the decoder. Fixed little bug in an example.
+*) 09 dec 2006: (!) Placed output parameters of public functions as first parameter.
+	 Fixed a bug of the decoder with 16-bit per color.
+*) 15 oct 2006: Changed documentation structure
+*) 09 oct 2006: Encoder class added. It encodes a valid PNG image from the
+	 given image buffer, however for now it's not compressed.
+*) 08 sep 2006: (!) Changed to interface with a Decoder class
+*) 30 jul 2006: (!) LodePNG_InfoPng, width and height are now retrieved in a different
+	 way. Renamed decodePNG to decodePNGGeneric.
+*) 29 jul 2006: (!) Changed the interface: image info is now returned as a
+	 struct of type LodePNG::LodePNG_Info, instead of a vector, which was a bit clumsy.
+*) 28 jul 2006: Cleaned the code and added new error checks.
+	 Corrected terminology "deflate" into "inflate".
+*) 23 jun 2006: Added SDL example in the documentation in the header, this
+	 example allows easy debugging by displaying the PNG and its transparency.
+*) 22 jun 2006: (!) Changed way to obtain error value. Added
+	 loadFile function for convenience. Made decodePNG32 faster.
+*) 21 jun 2006: (!) Changed type of info vector to unsigned.
+	 Changed position of palette in info vector. Fixed an important bug that
+	 happened on PNGs with an uncompressed block.
+*) 16 jun 2006: Internally changed unsigned into unsigned where
+	 needed, and performed some optimizations.
+*) 07 jun 2006: (!) Renamed functions to decodePNG and placed them
+	 in LodePNG namespace. Changed the order of the parameters. Rewrote the
+	 documentation in the header. Renamed files to lodepng.cpp and lodepng.h
+*) 22 apr 2006: Optimized and improved some code
+*) 07 sep 2005: (!) Changed to std::vector interface
+*) 12 aug 2005: Initial release (C++, decoder only)
+
+
+13. contact information
+-----------------------
+
+Feel free to contact me with suggestions, problems, comments, ... concerning
+LodePNG. If you encounter a PNG image that doesn't work properly with this
+decoder, feel free to send it and I'll use it to find and fix the problem.
+
+My email address is (puzzle the account and domain together with an @ symbol):
+Domain: gmail dot com.
+Account: lode dot vandevenne.
+
+
+Copyright (c) 2005-2016 Lode Vandevenne
+*/