From 5014512d1d26f4fb6a056052aa86a5524c052595 Mon Sep 17 00:00:00 2001
From: Mohammad Sadegh Rasooli <rasooli@fb.com>
Date: Fri, 1 Feb 2019 19:31:44 -0800
Subject: [PATCH] EM algorithm for unsupervised morphology

Summary: This diff implements the EM algorithm for morphological segmentation. A next diff should write methods to save and load models as well as segmenting new files.

Differential Revision: D13827777

fbshipit-source-id: 7bae107d5f194c6b92766c31b08ff25e94e19318
---
 .../test/test_unsupervised_morphology.py      |  27 ++++
 .../unsupervised_morphology.py                | 144 ++++++++++++++++++
 2 files changed, 171 insertions(+)

diff --git a/pytorch_translate/research/test/test_unsupervised_morphology.py b/pytorch_translate/research/test/test_unsupervised_morphology.py
index 83d196db..28b393da 100644
--- a/pytorch_translate/research/test/test_unsupervised_morphology.py
+++ b/pytorch_translate/research/test/test_unsupervised_morphology.py
@@ -314,3 +314,30 @@ def test_forward_backward_long_str(self):
             assert t[("suffix", "prefix")] == 0
             assert t[("suffix", "stem")] == 0
             assert t[("suffix", "suffix")] > 0
+
+    def test_EM(self):
+        with patch("builtins.open") as mock_open:
+            txt_content = [
+                "work",
+                "works",
+                "worked",
+                "working",
+                "go",
+                "goes",
+                "gone",
+                "going",
+                "do",
+                "does",
+                "did",
+                "doing",
+                "see",
+                "saw",
+                "seen",
+                "seeing",
+            ]
+            mock_open.return_value.__enter__ = mock_open
+            mock_open.return_value.__iter__ = Mock(return_value=iter(txt_content))
+            unsupervised_model = unsupervised_morphology.UnsupervisedMorphology(
+                "no_exist_file.txt", smoothing_const=0.0
+            )
+            unsupervised_model.expectation_maximization(100, 10)
diff --git a/pytorch_translate/research/unsupervised_morphology/unsupervised_morphology.py b/pytorch_translate/research/unsupervised_morphology/unsupervised_morphology.py
index df4f9e20..b8de763b 100644
--- a/pytorch_translate/research/unsupervised_morphology/unsupervised_morphology.py
+++ b/pytorch_translate/research/unsupervised_morphology/unsupervised_morphology.py
@@ -2,6 +2,8 @@
 
 import math
 from collections import Counter, defaultdict
+from itertools import chain, zip_longest
+from multiprocessing import Pool
 
 
 class MorphologyHMMParams(object):
@@ -373,3 +375,145 @@ def forward_backward(self, word):
                         ) / denominator
 
         return emission_expectations, transition_expectations
+
+    @staticmethod
+    def group_to(max_size, iterable):
+        return list(zip_longest(*[iter(iterable)] * max_size, fillvalue=None))
+
+    def expectation_substep(self, words):
+        """
+        This method is subprocess for the expectation method.
+        """
+        emissions, transitions, freqs = [], [], []
+
+        for (word, freq) in words:
+            e, t = self.forward_backward(word)
+            emissions.append(e)
+            transitions.append(t)
+            freqs.append(freq)
+
+        emission_expectations = defaultdict(float)
+        transition_expectations = defaultdict(float)
+
+        for e_key in set(chain(*[list(e.keys()) for e in emissions])):
+            emission_expectations[e_key] = sum(
+                e[e_key] * freqs[i] for i, e in enumerate(emissions)
+            )
+        for t_key in set(chain(*[list(t.keys()) for t in transitions])):
+            transition_expectations[t_key] = sum(
+                t[t_key] * freqs[i] for i, e in enumerate(transitions)
+            )
+        return emission_expectations, transition_expectations
+
+    def expectation(self, pool, train_words_chunks):
+        """
+        This method runs the expectation step with a chunked list of training words.
+        Args:
+            pool: Pool object for multi-threading.
+            train_words_chunks: a list of word+frequency-lists (chunked for
+                    multi-threading).
+        """
+        expectations = pool.map(self.expectation_substep, train_words_chunks)
+
+        emission_expectations = {"prefix": {}, "stem": {}, "suffix": {}}
+        transition_expectations = {
+            "prefix": {},
+            "stem": {},
+            "suffix": {},
+            "START": {},
+            "END": {},
+        }
+
+        for e_key in set(chain(*[list(e[0].keys()) for e in expectations])):
+            emission_expectations[e_key[0]][e_key[1]] = sum(
+                e[0][e_key] for e in expectations
+            )
+        for t_key in set(chain(*[list(t[1].keys()) for t in expectations])):
+            transition_expectations[t_key[0]][t_key[1]] = sum(
+                t[1][t_key] for t in expectations
+            )
+
+        emission_denoms = {
+            e: sum(v for v in emission_expectations[e].values())
+            for e in emission_expectations.keys()
+        }
+        transition_denoms = {
+            t: sum(v for v in transition_expectations[t].values())
+            for t in transition_expectations.keys()
+        }
+        return (
+            emission_expectations,
+            emission_denoms,
+            transition_expectations,
+            transition_denoms,
+        )
+
+    def maximization(
+        self,
+        emission_expectations,
+        emission_denoms,
+        transition_expectations,
+        transition_denoms,
+    ):
+        """
+        Runs the maximization algorithm.
+        Args:
+            emission_expectations: the expected counts for each affix-morpheme pair.
+            emission_denoms: the sum-expected count of each morpheme class.
+            transition_expectations: the expected counts for each affix-affix pair
+                for transition.
+            transition_denoms: the sum-expected count of each morpheme class as
+                conditional in transition.
+        """
+        smoothing_const = self.params.smoothing_const
+        for morpheme_class in self.params.morph_emit_probs.keys():
+            num_morphs = len(self.params.morph_emit_probs[morpheme_class])
+            d = emission_denoms[morpheme_class]
+            for morpheme in self.params.morph_emit_probs[morpheme_class].keys():
+                e = emission_expectations[morpheme_class][morpheme]
+                if d > 0 or smoothing_const > 0:
+                    self.params.morph_emit_probs[morpheme_class][morpheme] = (
+                        e + smoothing_const
+                    ) / ((num_morphs * smoothing_const) + d)
+                else:  # for cases of underflowing
+                    self.params.morph_emit_probs[morpheme_class][morpheme] = (
+                        1.0 / num_morphs
+                    )
+
+        for m1 in self.params.affix_trans_probs.keys():
+            if m1 == "END":
+                continue  # "END" has zero probs for all.
+            for m2 in self.params.affix_trans_probs[m1].keys():
+                if m2 in transition_expectations[m1]:
+                    if transition_denoms[m1] > 0:
+                        self.params.affix_trans_probs[m1][m2] = (
+                            transition_expectations[m1][m2] / transition_denoms[m1]
+                        )
+                    else:  # for cases of underflow.
+                        self.params.affix_trans_probs[m1][m2] = 1.0 / len(
+                            self.params.affix_trans_probs[m1]
+                        )
+                else:
+                    self.params.affix_trans_probs[m1][m2] = 0.0
+
+    def expectation_maximization(self, num_iters, num_cpus=10):
+        """
+        Runs the EM algorithm.
+        Args:
+            num_iters: Number of EM epochs.
+            num_cpus: Number of cpus for parallel executation of the E step.
+        """
+        pool = Pool(num_cpus)
+        train_words = [
+            (word, self.params.word_counts[word])
+            for word in self.params.word_counts.keys()
+        ]
+        chunk_size = math.ceil(float(len(train_words)) / num_cpus)
+        train_words_chunks = UnsupervisedMorphology.group_to(chunk_size, train_words)
+        for epoch in range(num_iters):
+            print("starting epoch %i" % epoch)
+            print("starting expectation step")
+            ee, ed, te, td = self.expectation(pool, train_words_chunks)
+            print("starting maximization step")
+            self.maximization(ee, ed, te, td)
+            print("updated parameters after maximization")