In [1]:
import json
from dataclasses import dataclass
from pprint import  pprint
from typing import List, Dict, Any, Optional
from enum import Enum


In [2]:
# Helpful for types
@dataclass
class Rarity:
    id: str
    name: str
    color: str

@dataclass
class Sticker:
    id: str
    name: str
    description: str
    rarity: Rarity
    crates: List[Dict]
    image: str

In [3]:
# Load the stickers data and Filter
with open("data/stickers.json", 'r') as f:
    stickers_main = json.load(f)

# Filter for the stickers that are autographed or text like "matching"
stickers = list(
    filter(lambda x: (
        (x.get("matching") is not None) or
        ('autographed' in x['description'].lower())
        ), stickers_main))

pprint((stickers[0:1], f"Total Stickers: {len(stickers)}"))

([{'crates': [{'id': 'crate-4007',
               'image': 'https://steamcdn-a.akamaihd.net/apps/730/icons/econ/weapon_cases/crate_sticker_pack01.050d07ce442dc326f33bbb10ade6df941136b479.png',
               'name': 'Sticker Capsule'}],
   'description': 'This sticker can be applied to any weapon you own and can '
                  'be scraped to look more worn. You can scrape the same '
                  'sticker multiple times, making it a bit more worn each '
                  'time, until it is removed from the weapon.',
   'id': 'sticker-13',
   'image': 'https://steamcdn-a.akamaihd.net/apps/730/icons/econ/stickers/standard/thirteen_large.ee7d4fe40792b2bda887c2eaae9f8fb5e7516aef.png',
   'matching': 'luckythirteen',
   'name': 'Sticker | Lucky 13',
   'rarity': {'color': '#4b69ff', 'id': 'rarity_rare', 'name': 'High Grade'}}],
 'Total Stickers: 5871')


In [4]:

def mutate_sticker(sticker: dict) -> dict:
    """
    Mutate the sticker data to extract the player name from the autograph sticker title
    and remove unnecessary fields.
    """

    # Extract player name from the autograph sticker title
    is_autographed: callable = lambda x: (sticker.get("matching", None) == None)

    if (is_autographed(sticker)):
        main_part: str = sticker["name"].split(" | ")[1]

        for tag in ["(Foil)", "(Gold)", "(Holo)", "(Holo, Champion)", "(Glitter)", "(Glitter, Champion)", "(Gold, Champion)", "(Champion)", "(Golden)"]:
            main_part = main_part.replace(tag, "")

        # Interpret certain numbers as letters
        main_part = main_part.strip().lower().replace("$", "s").replace("1", "i").replace("3", "e")

        sticker["matching"] = main_part

    # remove unnecessary fields
    sticker.pop('description', None)
    sticker.pop('crates', None)
    sticker.pop('rarity', None)

    return sticker

stickers = [mutate_sticker(sticker) for sticker in stickers]

pprint(stickers[1])

{'id': 'sticker-16',
 'image': 'https://steamcdn-a.akamaihd.net/apps/730/icons/econ/stickers/standard/conquered_large.dbdf99d6c3a5505caf5692b5d7c24271d96b1072.png',
 'matching': 'venividivici',
 'name': 'Sticker | I Conquered'}


In [5]:
# Get all unique words from the stickers
all_full_words = list(set([sticker["matching"] for sticker in stickers]))
pprint(all_full_words[:10])

['frozen',
 'loveyy',
 'gg',
 'seized',
 'rickeh',
 'mou',
 'styko',
 'brollan',
 'hunter',
 'attacker']


In [6]:
# Want to Link text back to a list of sticker objects with that text
stickers_by_matched_full_word : Dict[str,Sticker] = {}
for matched_word in all_full_words:
    stickers_with_that_pattern = []
    for sticker in stickers:
        if sticker["matching"] == matched_word:
            stickers_with_that_pattern.append(sticker)
    stickers_by_matched_full_word[matched_word] = stickers_with_that_pattern

# Want to Link text back to a list of sticker ids with that text (leaner)
sticker_ids_by_matched_full_word : Dict[str,str] = {}
for matched_word in all_full_words:
    stickers_with_that_pattern = []
    for sticker in stickers:
        if sticker["matching"] == matched_word:
            stickers_with_that_pattern.append(sticker["id"])
    sticker_ids_by_matched_full_word[matched_word] = stickers_with_that_pattern

In [7]:
def group_letters(string) -> List[List[str]]:
    """
    Generate all possible groupings of letters in a string. 

    """
    result = []

    def backtrack(start, current, result):
        if start == len(string):
            result.append(current)
            return
        for end in range(start + 1, len(string) + 1):
            backtrack(end, current + [string[start:end]], result)

    backtrack(0, [], result)
    return result

In [61]:
from enum import Enum

class Token:
    
    class Location(Enum):
        START = "START"
        MID = "MID"
        END = "END"
        TOTAL = "TOTAL"

        def __str__(self):
            return self.value
        def __repr__(self):
            return self.value



    def __init__(self, text : str, grouping:List[str], idx: Optional[int], loc:'Token.Location'=None) -> None:
        """
        text is the part of the sticker work we are using ie the token

        full is the particular text grouping of the full sticker and is used to determine loc with the idx if loc not given

        idx is the index in the grouping where the text occurs

        loc specifies if the specified token is start mid end or total

        **It is assumed that the given text is a substring of the full text
        """
        assert (text is not None and grouping is not None and idx is not None) or (text is not None and loc is not None), \
            "Not enough information provided to genrate token. Either provide text, full, and idx to derive loc, or provide text and loc directly."

        assert len(text) <= (len("".join(grouping)) if grouping != None else float('inf')), "Ensure the token is not longer than the full text, unless None"

        self.text = text
        self.full = grouping
        if loc == None:
            if idx == 0 and len(grouping) == 1:
                #  The grouping is singular ["scream"]
                self.loc = Token.Location.TOTAL
            elif idx == 0:
                self.loc = Token.Location.START
            elif idx == len(grouping)-1:
                self.loc = Token.Location.END
            else:
                self.loc = Token.Location.MID
        else:
            self.loc = loc

    def stringify(self):

        """
        Return the string representation of the token
        """
        return self.__repr__()
    

    def __repr__(self) -> str:

        return f"<Token:{self.loc}:{self.text}>"

    def __eq__(self, __value: object) -> bool:
        return (self.text == __value.text) and (self.loc == __value.loc)

    def __hash__(self) -> int:
        return hash(F"{self.text}&&{self.loc}")


def flatten_concatenation(matrix:List[List[Any]]) -> List[Any]:
    """
    List[List[Any]] -> List[Any]
    """
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list


def tokenize_all_stickers() -> Dict[str, List[Token]]:
    """
    For each sticker_text, return a list of all unique Tokens present in that sticker_text
    """
    word_tokens = {}
    for word in all_full_words:

        r = group_letters(word)
        word_tokens[word] = word_tokens.get(word, [])
        word_tokens[word].append(r)

    # Type Dict[str, List[List[List[str]]]  ]
    """
    {
    ...,
    'scream' : [ [[s,c,r,e,a,m],[s,c,r,e,am],...]  ],
    ...
    }
    """

    # this is just squeezing out  the first dimension
    word_tokens : Dict[str, List[List[str]]] = {word: letter_groupings[0] for word, letter_groupings in word_tokens.items()}






    # Create the token objects
    word_tokens : Dict[str,List[List[Token]]] = {
        word:
        [
            [
                Token(partial_text, grouping, i) for i, partial_text in enumerate(grouping)
            ]
            for grouping in letter_groupings
        ]
        for word, letter_groupings in word_tokens.items()}
    

    # Now for each sticker_text we have a list of different ways to build it with various tokens,
    # We want for each sticker_text a list of unique tokens that can be found in that sticker_text

    word_tokens = {word: 
        list(
            set(flatten_concatenation(token_groupings))
            )
        for word, token_groupings in word_tokens.items()}


    return word_tokens


def invert_dictionary(original_dict) -> Dict:

    inverted_dict = {}
    for key, lst in original_dict.items():

        for element in lst:
            if element not in inverted_dict:
                inverted_dict[element] = [key]
            else:
                if key not in inverted_dict[element]:
                    inverted_dict[element].append(key)

    return inverted_dict


In [11]:
tokens_by_matched_full_word : Dict[str, List[Token]] = tokenize_all_stickers()

In [13]:
inverted_dict : Dict[Token,List[str]] = invert_dictionary(tokens_by_matched_full_word)


### Testing Inverted Dict


In [None]:
inverted_dict[Token("ff", None, None, Token.Location.MID).stringify()]

['office', 'saffee']

## Saving Inverted Dict and Stickers By Full Matched Word


In [None]:
with open('./data/stickers_by_matched_full_word.json','w') as f:
    json.dump({key:value for key,value in stickers_by_matched_full_word.items()},f)
    
with open('./data/sticker_ids_by_matched_full_word.json','w') as f:
    json.dump({key:value for key,value in sticker_ids_by_matched_full_word.items()},f)

with open('./data/inverted_dict.json','w') as f:
    json.dump({str(key):value for key,value in inverted_dict.items()},f)



## Bulding Tokenizer Map from smaller files


In [21]:
with open('./data/stickers_by_matched_full_word.json','r') as f:
    stickers_by_matched_full_word = json.load(f)
with open('./data/sticker_ids_by_matched_full_word.json','r') as f:
    sticker_ids_by_matched_full_word = json.load(f)
with open('./data/inverted_dict.json','r') as f:
    inverted_dict = json.load(f)

In [22]:
from time import perf_counter
 

In [33]:
t1_start = perf_counter() 
tokenizer_map = {token: [sticker_ids_by_matched_full_word[i] for i in formatted_names]
             for token, formatted_names in inverted_dict.items()}


# tokenizer_map is saved as Dict[ __repr__(Token), List[Sticker id strings] ]
tokenizer_map : Dict[Token,List[str]] = {key: flatten_concatenation(
    value) for key, value in tokenizer_map.items()}

t1_stop = perf_counter()
print("Elapsed time during the whole program in seconds:",
                                        t1_stop-t1_start)

Elapsed time during the whole program in seconds: 0.0034887459987658076


In [34]:
pprint(tokenizer_map)

{'<Token:END: b>': ['sticker-488',
                    'sticker-489',
                    'sticker-490',
                    'sticker-782',
                    'sticker-783',
                    'sticker-784',
                    'sticker-2298',
                    'sticker-2299',
                    'sticker-2300',
                    'sticker-2651',
                    'sticker-2652',
                    'sticker-2653',
                    'sticker-3248',
                    'sticker-3249',
                    'sticker-3250',
                    'sticker-3615',
                    'sticker-3616',
                    'sticker-3617'],
 '<Token:END: doplan>': ['sticker-1886', 'sticker-1887', 'sticker-1888'],
 '<Token:END:->': ['sticker-5712',
                   'sticker-5713',
                   'sticker-5714',
                   'sticker-5715',
                   'sticker-6266',
                   'sticker-6267',
                   'sticker-6268',
                   'sticker-6269',
   

In [66]:
from typing import List
from itertools import product
import numpy as np
from pprint import pprint



def with_other(letter_grouping: List[str]):

    def aplit(partial_text: str):
        token_possibilities = []


        for token_location in (Token.Location.START, Token.Location.END, Token.Location.MID, Token.Location.TOTAL):
            #Create a token object for each possible location
            token_possibilities.append(Token(partial_text, None, None, token_location))


        return token_possibilities
    

    all_possible_for_each_tok : List[List[Token]] = []
    # [c  a  r] --> [Token(c,start), end c, mid c, total c], [start a, end a mid a.] ... all combinations of locations

    for partial_text in letter_grouping:
        all_possible_for_each_tok.append(aplit(partial_text))
    return all_possible_for_each_tok

def check_viability(prev_token, potential_next_token) -> bool:
    """
    Check wether or not two tokens can be next to each other.
    """
    if potential_next_token.loc == Token.Location.TOTAL:
        return (
            (prev_token.loc == Token.Location.START) or
            (prev_token.loc == Token.Location.END) or
            (prev_token.loc == Token.Location.MID) or
            (prev_token.loc == Token.Location.TOTAL)
        )
    if potential_next_token.loc == Token.Location.MID:
        return (
            (prev_token.loc == Token.Location.END) or
            (prev_token.loc == Token.Location.TOTAL)
        )
    if potential_next_token.loc == Token.Location.START:
        return (
            (prev_token.loc == Token.Location.START) or
            (prev_token.loc == Token.Location.END) or
            (prev_token.loc == Token.Location.MID) or
            (prev_token.loc == Token.Location.TOTAL)
        )
    if potential_next_token.loc == Token.Location.END:
        return (
            (prev_token.loc == Token.Location.END) or
            (prev_token.loc == Token.Location.TOTAL)
        )

def stickerfy_word(word: str):

    groupings = group_letters(word)
    groupings = [combo for combo in groupings if len(combo) <= 5] #5 Sticker maximum limitation

    results = []
    for group in groupings:
        groupingWithLoc = with_other(group)

        # Create all possible combinations of one element from each sublist
        # Say word = car, let c^ denote a List of all possible tokens for c in each location
        # permute(c^, a^, r^) to get all possible combinations of tokens
        allGroupingPermutations = list(product(*groupingWithLoc))


        # Filter out the combinations that are not viable spacially
        idx_to_keep = []
        for combo_idx, combo in enumerate(allGroupingPermutations):
            status = True
            if len(combo) > 5:
                continue
            for lst, t in enumerate(combo):
                if lst-1 >= 0:
                    current_token = t
                    last_token = combo[lst-1]

                    if not check_viability(last_token, current_token):
                        status = False
                        break
            if status:
                idx_to_keep.append(combo_idx)

        # idx_to_keep = list(set(idx_to_keep))

        allGroupingPermutations = np.array(
            allGroupingPermutations, dtype=object)
        filtered_all_combinations = allGroupingPermutations[idx_to_keep]

        aggregated_list = []
        combo_set = []
        for combo in filtered_all_combinations:

            stickers_matching_tokens = []
            status = True
            for positionalToken in combo:
                # check if token exists
                list_of_players_match = tokenizer_map.get(positionalToken.stringify(), None)

                if list_of_players_match != None:
                    stickers_matching_tokens.append(list_of_players_match)

                else:
                    status = False

            if status:
                stickers_needed = len(combo)

                if (combo[0].loc is Token.Location.END) or (combo[0].loc is Token.Location.MID):
                    stickers_needed += 1
                if (combo[-1].loc is Token.Location.START) or (combo[-1].loc is Token.Location.MID):
                    stickers_needed += 1

                if stickers_needed > 5:
                    continue
                combo_set.append(combo)
                aggregated_list.append(stickers_matching_tokens)

        if (len(combo_set) > 0):

            for k, comb in enumerate(combo_set):

                possible_text_split = []
                for lst, j in zip(aggregated_list[k], comb):

                    matched_part = j.text
                    matched_loc = j.loc

                    matching_stickers = lst

                    sticker_objects_to_append = []

                    sticker_token_to_add = {
                        "matchedPart": matched_part,
                        "matchedLoc": matched_loc,
                        "stickers": matching_stickers
                    }

                    possible_text_split.append(sticker_token_to_add)

                results.append(possible_text_split)


    return results

In [67]:

tokenizer_map[Token("fn", None, None, Token.Location.START).stringify()]

['sticker-1165',
 'sticker-1166',
 'sticker-1167',
 'sticker-1484',
 'sticker-1485',
 'sticker-1486',
 'sticker-2579',
 'sticker-2580',
 'sticker-2581',
 'sticker-5684',
 'sticker-5685',
 'sticker-5686',
 'sticker-5687',
 'sticker-124',
 'sticker-125',
 'sticker-157',
 'sticker-198',
 'sticker-199',
 'sticker-200',
 'sticker-244',
 'sticker-306',
 'sticker-307',
 'sticker-308',
 'sticker-309',
 'sticker-620',
 'sticker-621',
 'sticker-622',
 'sticker-953',
 'sticker-954',
 'sticker-955',
 'sticker-1063',
 'sticker-1064',
 'sticker-1065',
 'sticker-1066',
 'sticker-1373',
 'sticker-1374',
 'sticker-1375',
 'sticker-1376',
 'sticker-2071',
 'sticker-2072',
 'sticker-2073',
 'sticker-2074',
 'sticker-2452',
 'sticker-2453',
 'sticker-2454',
 'sticker-2455',
 'sticker-2983',
 'sticker-2984',
 'sticker-2985',
 'sticker-2986',
 'sticker-3488',
 'sticker-3489',
 'sticker-3490',
 'sticker-3491',
 'sticker-593',
 'sticker-594',
 'sticker-595',
 'sticker-674',
 'sticker-675',
 'sticker-676']

# Spell Out `word`


In [68]:
word = "apple"
res = stickerfy_word(word)

In [75]:
res

[[{'matchedPart': 'a',
   'matchedLoc': START,
   'stickers': ['sticker-3921',
    'sticker-3922',
    'sticker-3923',
    'sticker-4330',
    'sticker-4331',
    'sticker-4332',
    'sticker-2876',
    'sticker-2877',
    'sticker-2878',
    'sticker-4321',
    'sticker-4322',
    'sticker-4323',
    'sticker-5156',
    'sticker-5157',
    'sticker-5158',
    'sticker-836',
    'sticker-837',
    'sticker-838',
    'sticker-1240',
    'sticker-1241',
    'sticker-1242',
    'sticker-1297',
    'sticker-1298',
    'sticker-1299',
    'sticker-1607',
    'sticker-1608',
    'sticker-1609',
    'sticker-2045',
    'sticker-2046',
    'sticker-2047',
    'sticker-2223',
    'sticker-2224',
    'sticker-2225',
    'sticker-2561',
    'sticker-2562',
    'sticker-2563',
    'sticker-3212',
    'sticker-3213',
    'sticker-3214',
    'sticker-3684',
    'sticker-3685',
    'sticker-3686',
    'sticker-4294',
    'sticker-4295',
    'sticker-4296',
    'sticker-500',
    'sticker-501',
    's

### Make small sticker_id --> sticker map


In [40]:
stickers_by_id = {}
for sticker in stickers_main:
    sticker_copy = dict(sticker)

    sticker_copy.pop('rarity',None)

    sticker_copy.pop('description',None)
    sticker_copy.pop('crates',None)

    stickers_by_id[sticker_copy['id']] = sticker_copy

In [None]:
with open('stickers_by_id.json','w') as f:
    json.dump(stickers_by_id,f)