In [None]:
# Porter Stemmer

# Fuertemente inspirado en el código disponible en https://developer.ibm.com/tutorials/awb-stemming-text-porter-stemmer-algorithm-python/

In [2]:
class PorterStemmer:
    def __init__(self):
        self.b = ""  # buffer for word to be stemmed
        self.k = 0   # offset to the end of the string
        self.k0 = 0  # offset to the beginning of the string
        self.j = 0   # a general offset into the string

    def cons(self, i):
        """cons(i) is TRUE <=> b[i] is a consonant."""
        if self.b[i] in 'aeiou':
            return False
        if self.b[i] == 'y':
            if i == self.k0:
                return True
            else:
                return not self.cons(i - 1)
        return True

    def m(self):
        """m() measures the number of consonant sequences between k0 and j."""
        n = 0
        i = self.k0
        while True:
            if i > self.j:
                return n
            if not self.cons(i):
                break
            i += 1
        i += 1
        while True:
            while True:
                if i > self.j:
                    return n
                if self.cons(i):
                    break
                i += 1
            i += 1
            n += 1
            while True:
                if i > self.j:
                    return n
                if not self.cons(i):
                    break
                i += 1
            i += 1

    def vowelinstem(self):
        """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
        for i in range(self.k0, self.j + 1):
            if not self.cons(i):
                return True
        return False

    def doublec(self, j):
        """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
        if j < (self.k0 + 1):
            return False
        if self.b[j] != self.b[j - 1]:
            return False
        return self.cons(j)

    def cvc(self, i):
        """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
           and also if the second c is not w,x or y. this is used when trying to
           restore an e at the end of a short word. e.g.
              cav(e), lov(e), hop(e), crim(e), but
              snow, box, tray.
        """
        if i < (self.k0 + 2) or not self.cons(i) or self.cons(i - 1) or not self.cons(i - 2):
            return False
        ch = self.b[i]
        if ch == 'w' or ch == 'x' or ch == 'y':
            return False
        return True

    def ends(self, s):
        """ends(s) is TRUE <=> k0,...k ends with the string s."""
        length = len(s)
        if s != self.b[self.k - length + 1:self.k + 1]:
            return False
        self.j = self.k - length
        return True

    def setto(self, s):
        """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
        length = len(s)
        self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:]
        self.k = self.j + length

    def r(self, s):
        """r(s) is used further down."""
        if self.m() > 0:
            self.setto(s)

    def step1ab(self):
        if self.b[self.k] == 's':
            if self.ends("sses"):
                self.k -= 2
            elif self.ends("ies"):
                self.setto("i")
            elif self.b[self.k - 1] != 's':
                self.k -= 1
        if self.ends("eed"):
            if self.m() > 0:
                self.k -= 1
        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
            self.k = self.j
            if self.ends("at"):
                self.setto("ate")
            elif self.ends("bl"):
                self.setto("ble")
            elif self.ends("iz"):
                self.setto("ize")
            elif self.doublec(self.k):
                self.k -= 1
                ch = self.b[self.k]
                if ch == 'l' or ch == 's' or ch == 'z':
                    self.k += 1
            elif self.m() == 1 and self.cvc(self.k):
                self.setto("e")

    def step1c(self):
        if self.ends("y") and self.vowelinstem():
            self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:]

    def step2(self):
        if self.k == self.k0:
            return
        if self.b[self.k - 1] == 'a':
            if self.ends("ational"):
                self.r("ate")
            elif self.ends("tional"):
                self.r("tion")
        elif self.b[self.k - 1] == 'c':
            if self.ends("enci"):
                self.r("ence")
            elif self.ends("anci"):
                self.r("ance")
        elif self.b[self.k - 1] == 'e':
            if self.ends("izer"):
                self.r("ize")
        elif self.b[self.k - 1] == 'l':
            if self.ends("bli"):
                self.r("ble")
            elif self.ends("alli"):
                self.r("al")
            elif self.ends("entli"):
                self.r("ent")
            elif self.ends("eli"):
                self.r("e")
            elif self.ends("ousli"):
                self.r("ous")
        elif self.b[self.k - 1] == 'o':
            if self.ends("ization"):
                self.r("ize")
            elif self.ends("ation"):
                self.r("ate")
            elif self.ends("ator"):
                self.r("ate")
        elif self.b[self.k - 1] == 's':
            if self.ends("alism"):
                self.r("al")
            elif self.ends("iveness"):
                self.r("ive")
            elif self.ends("fulness"):
                self.r("ful")
            elif self.ends("ousness"):
                self.r("ous")
        elif self.b[self.k - 1] == 't':
            if self.ends("aliti"):
                self.r("al")
            elif self.ends("iviti"):
                self.r("ive")
            elif self.ends("biliti"):
                self.r("ble")
        elif self.b[self.k - 1] == 'g':
            if self.ends("logi"):
                self.r("log")

    def step3(self):
        if self.k == self.k0:
            return
        if self.b[self.k] == 'e':
            if self.ends("icate"):
                self.r("ic")
            elif self.ends("ative"):
                self.r("")
            elif self.ends("alize"):
                self.r("al")
        elif self.b[self.k] == 'i':
            if self.ends("iciti"):
                self.r("ic")
        elif self.b[self.k] == 'l':
            if self.ends("ical"):
                self.r("ic")
            elif self.ends("ful"):
                self.r("")
        elif self.b[self.k] == 's':
            if self.ends("ness"):
                self.r("")

    def step4(self):
        if self.k == self.k0:
            return
        if self.b[self.k - 1] in 'aeiou':
            return
        if self.ends("al"):
            pass
        elif self.ends("ance"):
            pass
        elif self.ends("ence"):
            pass
        elif self.ends("er"):
            pass
        elif self.ends("ic"):
            pass
        elif self.ends("able"):
            pass
        elif self.ends("ible"):
            pass
        elif self.ends("ant"):
            pass
        elif self.ends("ement"):
            pass
        elif self.ends("ment"):
            pass
        elif self.ends("ent"):
            pass
        elif self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'):
            pass
        elif self.ends("ou"):
            pass
        elif self.ends("ism"):
            pass
        elif self.ends("ate"):
            pass
        elif self.ends("iti"):
            pass
        elif self.ends("ous"):
            pass
        elif self.ends("ive"):
            pass
        elif self.ends("ize"):
            pass
        else:
            return
        if self.m() > 1:
            self.k = self.j

    def step5(self):
        self.j = self.k
        if self.b[self.k] == 'e':
            a = self.m()
            if a > 1 or (a == 1 and not self.cvc(self.k - 1)):
                self.k -= 1
        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k -= 1

    def stem(self, p, i, j):
        """In stem(p,i,j), p is a char array, and the string to be stemmed is from p[i] to p[j] inclusive."""
        self.b = p
        self.k = j
        self.k0 = i
        if self.k <= self.k0 + 1:
            return self.b  # --DEPARTURE--
        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k + 1]


In [3]:
stemmer = PorterStemmer()
words = ["illusion", "illusional", "illusionsit", "illusive", "illusory", "desilusional"]
for word in words:
    print(f"{word} -> {stemmer.stem(word, 0, len(word) - 1)}")

illusion -> illusion
illusional -> illusional
illusionsit -> illusionsit
illusive -> illus
illusory -> illusori
desilusional -> desilusional


In [6]:
from nltk.stem import PorterStemmer

nltk_stemmer = PorterStemmer()
for word in words:
    print(f"{word} -> {nltk_stemmer.stem(word)}")

illusion -> illus
illusional -> illusion
illusionsit -> illusionsit
illusive -> illus
illusory -> illusori
desilusional -> desilusion
