Skip to content
This repository
tree: 71f17ac460
Fetching contributors…

Octocat-spinner-32-eaf2f5

Cannot retrieve contributors at this time

file 159 lines (143 sloc) 6.272 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
<?php

/**
* Class to safely store UTF-8 in a Filename
*
* Encodes a utf8 string using only the following characters 0-9a-z_.-/[]%
* The characters 0-9a-z and '-', '.', '/', '[', '_' in the original string
* are preserved, "plain".
* All other characters are "converted": each one is represented by a
* substring that starts with '%'.
* The transition from converted substrings to plain characters is
* marked with a ']'
*
* @author Christopher Smith <chris@jalakai.co.uk>
* @date 2010-04-02
*/
class SafeFN {

    // 'safe' characters are a superset of $plain, $pre_indicator and $post_indicator
    private static $plain = '-./[_0123456789abcdefghijklmnopqrstuvwxyz'; // these characters aren't converted
    private static $pre_indicator = '%';  // introduces a converted codepoint
    private static $post_indicator = ']'; // marks the switch from converted back to plain

    /**
     * Convert an UTF-8 string to a safe ASCII string
     *
     * The string is turned into codepoints by utf8_to_unicode() (defined
     * outside this class) and each codepoint is then handled as follows:
     *
     * - plain character or post_indicator character:
     *     - if the previous character was "converted", append the
     *       post_indicator to the output and clear the "converted" flag
     *     - append the character itself
     *
     * - pre_indicator character:
     *     - append the character itself and set the "converted" flag
     *
     * - any other character:
     *     - subtract 32 from the codepoint (space becomes zero)
     *     - append the pre_indicator followed by that value in base36 (0-9a-z)
     *       and set the "converted" flag
     *
     * If the last character was converted a closing post_indicator is appended.
     *
     * Declared static so the class can be used as SafeFN::encode(); calling
     * it on an instance keeps working.
     *
     * @param string $filename a utf8 string, should only include printable
     *                         characters - not 0x00-0x1f (those do not round-trip)
     * @return string an encoded representation of $filename using only 'safe' ASCII characters
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public static function encode($filename) {
        return self::unicode_to_safe(utf8_to_unicode($filename));
    }

    /**
     * Convert a 'safe' encoded string back to UTF-8
     *
     * The input is lower-cased, then split into substrings in front of every
     * pre_indicator or post_indicator character. For each substring:
     *
     * - it does not start with the pre_indicator:
     *     - if the previous substring was converted, skip the leading
     *       post_indicator character
     *     - copy the byte values of the remaining characters to the output
     *     - clear the "converted" flag
     *
     * - it is a lone pre_indicator (length 1):
     *     - copy the pre_indicator character itself to the output
     *
     * - it is a pre_indicator followed by more characters:
     *     - read the remainder as a base36 number and add 32 (0x20)
     *     - append the resulting codepoint to the output
     *
     * The codepoints are turned back into UTF-8 by unicode_to_utf8()
     * (defined outside this class).
     *
     * @param string $filename a 'safe' encoded ASCII string
     * @return string decoded utf8 representation of $filename
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    public static function decode($filename) {
        return unicode_to_utf8(self::safe_to_unicode(strtolower($filename)));
    }

    /**
     * Check that a string contains no ASCII control bytes
     *
     * NUL (0x00) is rejected along with 0x01-0x1f: encode() subtracts 32
     * from every converted codepoint, so none of these values survive an
     * encode/decode round trip.
     *
     * @param string $printable_utf8 the string to check
     * @return bool true if no byte in the range 0x00-0x1f is present
     */
    public static function validate_printable_utf8($printable_utf8) {
        return !preg_match('#[\x00-\x1f]#', $printable_utf8);
    }

    /**
     * Check that a string consists only of 'safe' characters
     *
     * @param string $safe the string to check
     * @return bool true if every character is a plain, pre_indicator or
     *              post_indicator character (an empty string is valid)
     */
    public static function validate_safe($safe) {
        // the post_indicator is ']' - without quoting it would close the
        // character class early and nearly every string would pass
        $allowed = preg_quote(self::$plain.self::$post_indicator.self::$pre_indicator, '#');
        return !preg_match('#[^'.$allowed.']#', $safe);
    }

    /**
     * convert an array of unicode codepoints into 'safe_filename' format
     *
     * @param array int $unicode an array of unicode codepoints
     * @return string the unicode represented in 'safe_filename' format
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    private static function unicode_to_safe($unicode) {

        $safe = '';
        $converted = false;

        // characters copied to the output unchanged
        $passthrough = self::$plain.self::$post_indicator;
        $pre_codepoint = ord(self::$pre_indicator);

        foreach ($unicode as $codepoint) {
            if ($codepoint < 127 && strpos($passthrough, chr($codepoint)) !== false) {
                // leaving a converted run: mark the transition first
                if ($converted) {
                    $safe .= self::$post_indicator;
                    $converted = false;
                }
                $safe .= chr($codepoint);

            } else if ($codepoint === $pre_codepoint) {
                // a literal pre_indicator stands for itself; the decoder
                // recognises it because no base36 digits follow it
                $safe .= self::$pre_indicator;
                $converted = true;
            } else {
                // offset by 32 so that space, the first printable character, becomes zero
                $safe .= self::$pre_indicator.base_convert((string)($codepoint-32),10,36);
                $converted = true;
            }
        }

        // close a trailing converted run
        if ($converted) $safe .= self::$post_indicator;
        return $safe;
    }

    /**
     * convert a 'safe_filename' string into an array of unicode codepoints
     *
     * @param string $safe a filename in 'safe_filename' format
     * @return array int an array of unicode codepoints
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    private static function safe_to_unicode($safe) {

        $unicode = array();

        // split in front of every indicator character, so each substring
        // starts with an indicator (except possibly the very first one)
        $indicators = preg_quote(self::$post_indicator.self::$pre_indicator, '#');
        $split = preg_split('#(?=['.$indicators.'])#', $safe, -1, PREG_SPLIT_NO_EMPTY);

        $converted = false;
        foreach ($split as $sub) {
            $len = strlen($sub);
            if ($sub[0] !== self::$pre_indicator) {
                // plain (unconverted) characters, optionally starting with a post_indicator
                // set initial value to skip any post_indicator
                for ($i = ($converted ? 1 : 0); $i < $len; $i++) {
                    $unicode[] = ord($sub[$i]);
                }
                $converted = false;
            } else if ($len === 1) {
                // a pre_indicator character in the real data
                $unicode[] = ord($sub);
                $converted = true;
            } else {
                // a single codepoint in base36, adjusted for initial 32 non-printable chars
                $unicode[] = 32 + (int)base_convert(substr($sub,1),36,10);
                $converted = true;
            }
        }

        return $unicode;
    }

}
Something went wrong with that request. Please try again.