# Priprema podataka

**Ucitavanje podataka iz datoteke**
  - u prvoj liniji su reci za koje treba naci regex (skup M)
  - u drugoj liniji su reci za koje ne treba naci regex (skup U)
  - reci u razdvojene sa: ", "

In [379]:
def readFile(filename):
    with open(filename, 'r') as f:
        match = [word for word in f.readline().split(", ")]
        # uklanjanje novog reda iz poslednje reci
        match[-1] = match[-1][:-1]
        unmatch = [word for word in f.readline().split(", ")]
        
    return match, unmatch

In [380]:
match, unmatch = readFile("/content/example_1.txt")
print("Prvi skup: ", match)
print("Drugi skup: ", unmatch)

Prvi skup:  ['afoot', 'catfoot', 'dogfoot', 'fanfoot', 'foody', 'foolery', 'foolish', 'fooster', 'footage', 'foothot', 'footle', 'footpad', 'footway', 'hotfoot', 'jawfoot', 'mafoo', 'nonfood', 'padfoot', 'prefool', 'sfoot', 'unfool']
Drugi skup:  ['Atlas', 'Aymoro', 'Iberic', 'Mahran', 'Ormazd', 'Silipan', 'altared', 'chandoo', 'crenel', 'crooked', 'fardo', 'folksy', 'forest', 'hebamic', 'idgah', 'manlike', 'marly', 'palazzi', 'sixfold', 'tarrock', 'unfold']


In [381]:
# skup za poredjenje rezultata iz dokumentacije
# match = ['can', 'banana', 'and', 'ball']
# match = ['bar', 'den', 'foo', 'can']
# unmatch = ['indy', 'call', 'name', 'man']

In [382]:
# Broj reci u skupovima
num_m = len(match)
num_u = len(unmatch)
print(num_m)
print(num_u)

21
21


**Karakteri koji se pojavljuju u jednom skupu reci**

In [383]:
def charsInSet(wordSet):
    #stavicemo d chars bude set da ne bismo brinuli o duplikatima
    charsSet = set({})

    for word in wordSet:
        for c in word:
          charsSet.add(c)
    chars = []
    for el in charsSet:
      chars.append(el)
    chars.sort()

    return chars

In [384]:
chars_in_M = charsInSet(match)
print(chars_in_M)

['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y']


**Opsezi reci (partial ranges) koji se javljaju u skupu M**

In [385]:
def makeRanges(chars_in_M):
    ranges = []

    i = 0
    #za svako slovo niza proveravamo
    while i < len(chars_in_M)-1:
        distance = 0
        for j in range(i+1, len(chars_in_M)):
            if ord(chars_in_M[j]) - ord(chars_in_M[i]) == distance + 1:
                distance += 1
            else:
                if chars_in_M[i] != chars_in_M[j-1]:
                    ranges.append(chars_in_M[i] + '-' + chars_in_M[j-1])
                i = j
                break

    return ranges

In [386]:
ranges = makeRanges(chars_in_M)
print(ranges)

['c-j', 'l-p', 'r-u']


**n-grami**

In [387]:
def ngram(M, U):
    res = {}

    # n-grami su duzine od 2 do 4
    for n in range(2, 5):
        # prolazimo kroz sve reci iz M i iz U 
        # (radi ako su skupovi iste duzine)
        for i in range(0, len(M)):
            word_m = M[i]
            word_u = U[i]
            
            word_m_visited = False
            word_u_visited = False
            
            # pravimo n-grame od jedne reci iz M i jedne iz U
            ngrams_m = zip(*[word_m[i:] for i in range(n)])
            ngrams_u = zip(*[word_u[i:] for i in range(n)])

            gram_m = ["".join(gr) for gr in ngrams_m]
            gram_u = ["".join(gr) for gr in ngrams_u]

            # ne smemo imati ponavljanje n-grama u istoj reci (pravimo skup)
            gram_m = set(gram_m)
            gram_u = set(gram_u)

            # azuriramo score za ngram u zavisnosti u kom skupu se nalazi
            for g in gram_m: # povecavamo score ako je u M
                if g not in res:
                    res[g] = 1
                elif g in res: 
                    res[g] += 1

            for g in gram_u: # smanjujemo score ako je u U
                if g not in res:
                    res[g] = -1
                elif g in res:
                    res[g] -= 1

    return res

In [388]:
ngrams = ngram(match, unmatch)
ngrams = sorted(ngrams.items(), key=lambda x: x[1], reverse=True)

# ngram_subset je najmanji podskup od ngrams tako da je skor reci bar |M|
ngram_subset = []
score = 0

for i in range(len(ngrams)):
    if ngrams[i][1] > 0: # azuriramo samo ako je skor pozitivan
        score += ngrams[i][1]
        ngram_subset.append(ngrams[i][0])

        if score >= num_m:
            break 

print(ngram_subset)
# print(score)

['foo']


**Terminal i Function skupovi**

In [449]:
# . je placeholder za dete cvor
FUNCTION_SET = [".*", ".+", ".?", ".{.,.}+", # possessive quantifiers
                "(.)",                          # group
                "[.]",                          # character class
                "[^.]",                         # negated character
                "..",                           # concatenator (binary node) mislim da treba da promenimo prikaz ovog noda tipa ` jer je kod njih tacka na sredini a ovo znaci concat
                ".|.",                          # disjunction
                ]

In [450]:
TERMINAL_SET = ["a-z", "A-Z", "0-9", "^", "$", "%", # instance independent terminals
                "\w", "\W", "\d", "\D", "\b", "\B", "\A", "\Z", "\s", "\S"
               ]

In [451]:
for c in chars_in_M:
    TERMINAL_SET.extend(c)

#Upisujemo n_grame u terminal set
for n in ngram_subset:
    TERMINAL_SET.extend(ngram_subset)

#ispisujemo range-ove u terminal set
for r in ranges:
    TERMINAL_SET.append(r)

print(TERMINAL_SET)

['a-z', 'A-Z', '0-9', '^', '$', '%', '\\w', '\\W', '\\d', '\\D', '\x08', '\\B', '\\A', '\\Z', '\\s', '\\S', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'w', 'y', 'foo', 'c-j', 'l-p', 'r-u']


# Parametri za algoritam genetskog programiranja



In [452]:
# pocetni parametri (zasnovani na dokumentaciji)
POPULATION_SIZE = 500
GENERATIONS_NUM = 1000
POPULATION_NUM = 32
TOURNAMENT_SIZE = 7

# Algoritam genetskog programiranja

U nastavku je samo kostur/pseudokod GP-a

Svaka jedinka ima 2 vrste fitnesa:
- funkcija n_m - n_u treba da se maksimizuje
- duzina r treba da se minimizuje

r je trenutni regex, n_m je broj reci iz M koje su poklopljene sa r, n_u je broj reci iz U koje su poklopljene sa r

In [453]:
import re
import random

In [454]:
def getRandom():
      pickSet = random.choice(['f', 't'])
      if pickSet == 't':
        value = random.choice(FUNCTION_SET)
        if value in [".{.,.}+"]:
          return value, 3
        elif value in [".|.", ".."]:
          return value, 2
        else:
          return value, 1
      else:
        value = random.choice(TERMINAL_SET)
        return value, 0
class Node(object):
  def __init__(self, depth, root):
    self.depth = depth
    self.value = ""
    self.childrenNum = 0
    
    
    if root:
      self.value = "."
      self.childrenNum = 2
    else:
      self.value, self.childrenNum = getRandom()
    
    self.left = None
    self.right = None
    self.third = None

    if self.childrenNum == 3:
      self.left= Node(depth+1, False)
      self.right = Node(depth +1, False)
      self.third = Node(depth+1, False)
    elif self.childrenNum == 2:
      self.left = Node(depth+1, False)
      self.right = Node(depth+1, False)
    elif self.childrenNum == 1:
      self.left = Node(depth+1, False)  

In [492]:
def obidji(node):
  if node.value in TERMINAL_SET:
    if node.value == "%":
      return "."
    return node.value
  
  rl= obidji(node.left)
  if node.childrenNum == 2:
    rr = obidji(node.right)
  if node.childrenNum == 3:
    rr = obidji(node.right)
    rt = obidji(node.third)
  
  if node.value in FUNCTION_SET:
    if node.value == ".*":
      string = rl + "*"
      return string
    if node.value == ".+":
      string = rl + "+"
      return string
    if node.value == ".?":
      string = rl + "?"
      return string
    if node.value == "(.)":
      string = "(" + rl + ")"
      return string
    if node.value == "[.]":
      string = "[" + rl + "]"
      return string
    if node.value == "[^.]":
      string = "[^" + rl + "]"
      return string
    if node.value == "..":
      string = rl + rr
      return string
    if node.value == ".|.":
      string = rl + "|" + rr
      return string
    if node.value == ".{.,.}+":
      string = rl + "{" + rr + "," + rt + "}+"
      return string
    print("MATCHOVO SE SA FUNCTIONAL A NIJE FUNCTIONAL " + node.value)
  #ako je dosao do ovde je root
  string = rl + rr
  return string


In [493]:
n = Node(0, True)
p = obidji(n)

In [494]:
class Individual:
  #Kako resavamo problem plain strings nase wi= 10
    def __init__(self, setM, setU):
        #kod je ustvari neko apstrakto drvo koje cuva odredjeni regex u sebi
        #napraviti f-ju initialize koja poziva konstruktor noda sve dok ne dobije regex koji je validan
        self.code = Node(0, True)
        self.wi = 10
        self.setM = setM.copy()
        self.setU = setU.copy()

        self.n_m = 0
        self.n_u = 0
        #n_m - n_u - maximize
        self.fitnessFunction = self.calculateFitnessFunction()
        #length of r - minimize
        self.fitnessRegex = self.calculateFitnessRegex()
        #zapravo fitness racunamo kao wi*(n_m - n_u) - length(r)
        self.fitness = self.wi * self.fitnessFuction - self.fitnessRegex
    
    def __lt__(self, other):
      #Dakle zapravo zelimo da maksimizujemo taj skor
      return self.fitness > other.fitness

    def calculateFitnessFunction(self):
        #pretvoriti drvo u regex i videti i izracunati n_m - n_u
        string = obidji(self.code)
        for wordM, wordU in zip(self.setM, self.setU): 
          #vratice nam listu stringova koji se poklapaju
          matchM = re.findall(string, wordM)
          matchU = re.findall(string, wordU)
          for m in matchM:
            if len(m) == len(wordM):
              n_m +=1
          for m in  matchU:
            if len(m) == len(wordU):
              n_u += 1
        return n_m - n_u

    def calculateFitnessRegex(self):
        #pretvoriti drvo u string i vratiti njegovu duzinu
        string = obidji(self.code)
        return len(string)

In [495]:

def selection(population):
  TOURNAMENT_SIZE = 7
  betsFitness = float('-inf')
  bestIndex = -1

  for i in range(TOURNAMENT_SIZE):
    index = random.randrange(len(population))
    if population[index].fitness > betsFitness:
      betsFitness = population[index].fitness
      bestIndex = index
  return bestIndex

- Prvih |M| jedinki populacije se formiraju tako sto se koriste samo slova iz trenutne reci i operator konkatenacije (..)
- Ostalih POPULATION_SIZE-|M| jedinki se formiraju random, dubine drveta 1-5

Formiranje nove populacije:
- 10% random
- 10% mutacijom
- 80% ukrstanje

In [None]:
def genetic_programming():
    population = [Individual() for _ in range(POPULATION_SIZE)]
    newPopulation = [Individual() for _ in range(POPULATION_SIZE)]

    solutionFound = False

    for i in range(NUMBER_GENERATIONS):
        population.sort()

        if population[0].fitnessFunction == num_m:
            solutionFound = True
            break

        for j in range(0, POPULATION_SIZE, 2):
            parent1Index = selection(population)
            parent2Index = selection(population)

            crossover(population[parent1Index], population[parent2Index], newPopulation[j], newPopulation[j+1])

            mutation(newPopulation[j])
            mutation(newPopulation[j+1])

            newPopulation[j].fitness = newPopulation[j].calculateFitness()
            newPopulation[j+1].fitness = newPopulation[j+1].calculateFitness()

        population = newPopulation