Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Remake.

  • Loading branch information...
commit 357a97522011b592cac47937a680f480bead196f 1 parent fcbc3c1
Oscar Sjöberg authored February 13, 2012
19  Makefile
... ...
@@ -1,16 +1,17 @@
1 1
 REL_TOP := $(dir $(lastword $(MAKEFILE_LIST)))
2 2
 TOP=`readlink -f $(REL_TOP)`/dist
3  
-PROF_FLAGS=-prof -fforce-recomp -auto-all
4 3
 RTS_FLAGS=-rtsopts=all -threaded
5  
-all: build test
  4
+GHC_FLAGS=-XTypeSynonymInstances -XBangPatterns -O2 -fforce-recomp -funfolding-use-threshold=16 -fexcess-precision -fllvm
  5
+PROF_FLAGS=-prof -fforce-recomp -auto-all
  6
+all: build
6 7
 
7 8
 build:
8 9
 	@echo "Building..."
9  
-	cabal install --prefix=$(TOP) --user -O2 --ghc-options="$(RTS_FLAGS)"
  10
+	@cabal install --prefix=$(TOP) --user -O2 --ghc-options="$(GHC_FLAGS) $(RTS_FLAGS)"
10 11
 
11 12
 build-prof:
12 13
 	@echo "Building..."
13  
-	@cabal install --prefix=$(TOP) --user -O2 --ghc-options="$(PROF_FLAGS) $(RTS_FLAGS)"
  14
+	@cabal install --prefix=$(TOP) --user -O2 --ghc-options="$(GHC_FLAGS) $(RTS_FLAGS) $(PROF_FLAGS)"
14 15
 
15 16
 hlint:
16 17
 	@echo "Running hlint..."
@@ -24,11 +25,13 @@ clean:
24 25
 
25 26
 test:
26 27
 	@echo "Testing..."
27  
-	@runhaskell -w tests/AllTests.hs
  28
+	@runhaskell -w tests/Test1.hs
28 29
 
29 30
 run:
30  
-	dist/bin/sp +RTS -N -K100M -RTS
31  
-	@# +RTS -N -RTS # -K100M -RTS
  31
+	@dist/bin/sp +RTS -N -K100M -RTS
32 32
 
33 33
 run-prof:
34  
-	@dist/bin/sp +RTS -N1 -K100M -xc -RTS
  34
+	@dist/bin/sp +RTS -N1 -K100M -xc -L80 -RTS
  35
+
  36
+deps:
  37
+	@cabal install --reinstall -O2 -p hashable unordered-containers deepseq regex-compat regex-posix regex-base MissingH hslogger network parsec compact-string-fix cryptohash crypto-api entropy tagged semigroups data-default dlist cereal data-binary-ieee754 monad-control base-unicode-symbols random-shuffle parallel bson lifted-base list-extras largeword ConfigFile mongoDB
19  sp.cabal
@@ -9,15 +9,22 @@ Build-Type:          Simple
9 9
 Cabal-Version:       >= 1.9.2
10 10
 
11 11
 Library
12  
-  Build-Depends:     base >= 4, mtl, mongoDB, parallel, bytestring, 
13  
-                     list-extras, hashable, unordered-containers
14  
-  Exposed-Modules:   SP.Cluster, SP.Score, SP.Merge
  12
+  Build-Depends:     base >= 4, mtl, mongoDB, parallel, bytestring, bson,
  13
+                     list-extras, hashable, unordered-containers, array, 
  14
+                     deepseq, regex-compat, MissingH, ConfigFile, containers,
  15
+                     text, stream-fusion
  16
+  Exposed-Modules:   SP.Cluster, SP.Score.Score, SP.Preprocess.Preprocess
  17
+                     SP.ByteString, SP.DeepSeq, SP.Preprocess.Compound,
  18
+                     SP.Redirect, SP.Config, SP.Bootstrap.MongoDB,
  19
+                     SP.Score.Scorer, SP.Score.Argument, SP.Score.Object,
  20
+                     SP.Score.Math
15 21
   HS-Source-Dirs:    src
16  
-
17 22
 Executable sp
  23
+  Build-Depends:     base >= 4, mtl, mongoDB, parallel, bytestring, bson,
  24
+                     list-extras, hashable, unordered-containers, array, 
  25
+                     deepseq, regex-compat, MissingH, ConfigFile, containers,
  26
+                     text, stream-fusion
18 27
   Main-is:           SP.hs
19  
-  Build-Depends:     base >= 4, mtl, mongoDB, parallel, bytestring, 
20  
-                     list-extras, hashable, unordered-containers
21 28
   HS-Source-Dirs:    src
22 29
 
23 30
 Test-Suite test-sp
2  src/Makefile
... ...
@@ -0,0 +1,2 @@
  1
+all:
  2
+	(cd ..;make)
70  src/SP.hs
... ...
@@ -1,44 +1,38 @@
1  
-
2  
-import Control.Monad
3 1
 import Control.Monad.Trans (liftIO)
4  
-import Data.List (genericLength)
5  
-import Data.List.Extras
6  
-import SP.Bootstrapper
  2
+import Data.List
  3
+import Data.Ord
  4
+import SP.ByteString
7 5
 import SP.Cluster
8  
-import SP.Score
9  
-import SP.Merge
10  
-import System.CPUTime
11  
-import System.Environment
12  
-import Text.Printf
  6
+import SP.Config
  7
+import SP.Bootstrap.MongoDB
  8
+import SP.Preprocess.Compound
  9
+import SP.Preprocess.Preprocess
  10
+import SP.Score.Score
  11
+import SP.Score.Scorer
  12
+import Text.Printf (printf)
  13
+import Prelude
  14
+import System.Exit
13 15
 
14 16
 main :: IO ()
15  
-main = bootstrap cluster
16  
-
17  
-cluster :: [Art] -> [ObjClr] -> IO ()
18  
-cluster arts objClrs = liftIO $ do
19  
---print arts
20  
-  iter 1 objClrs
  17
+main = bootstrap start
21 18
 
22  
-iter :: Int -> [ObjClr] -> IO ()
23  
-iter n clrs = liftIO $ do
24  
-  printf "Making iteration %d...\n" n
25  
-  printf "%d object clusters.\n" $ length clrs
26  
-  start <- getCPUTime
27  
-  -- Scores.
28  
-  let scores = bestScrs clrs
29  
-  printf "Total score: %.2f\n" $ sum $ map val scores
30  
-  printf "Number of scores: %d\n" $ length scores
31  
-  -- Merge clusters.
32  
-  let result = mergeClusters clrs scores
33  
-  -- End and print timing.
34  
-  end <- getCPUTime
35  
-  let elapsed :: Double
36  
-      elapsed = fromIntegral (end - start) / 10^9
37  
-  -- Elapsed time, seconds.
38  
-  printf "Time elapsed: %.2f s\n" $ elapsed / 10^3 
39  
-  -- Elapsed time / cluster, milliseconds.
40  
-  printf "Time elapsed / cluster: %.2f ms\n" $ elapsed / genericLength clrs
41  
-  -- Print and contingently continue.
42  
-  print scores
43  
-  unless (null scores) $ iter (n + 1) result
  19
+start :: [Partition] -> IO ()
  20
+start ptns1 = liftIO $ do
  21
+  config <- getConfig
  22
+  let ptns2 = takePartitions config ptns1
  23
+      ptns3 = mergeNerCompounds ptns2
  24
+  iter 1 ptns3
  25
+  exitSuccess
44 26
 
  27
+iter :: Int -> [Partition] -> IO ()
  28
+iter n ptns = liftIO $ do
  29
+  printf "Starting iteration %d.\n" n
  30
+  let groups = groupByPos ptns
  31
+      best = maxOperatorScores 0.25 groups
  32
+--print ptns
  33
+  mapM_ (putStrLn.show) best
  34
+  {-nptns = merge best ptns
  35
+  if null best 
  36
+    then return nptns
  37
+    else return (iter (n+1) nptns)-}
  38
+--mapM_ ((putStrLn . unwords . map (unpack.text.head.parts)).sortBy (comparing ocId).ocs) ptns 
122  src/SP/Bootstrap/MongoDB.hs
... ...
@@ -0,0 +1,122 @@
  1
+{-# OPTIONS_GHC -fno-warn-missing-fields #-}
  2
+module SP.Bootstrap.MongoDB (bootstrap) where
  3
+
  4
+import Control.DeepSeq
  5
+import Control.Monad.Trans (liftIO)
  6
+import qualified Data.HashMap.Strict as M
  7
+import qualified Data.HashSet as S
  8
+import Database.MongoDB
  9
+import Data.Array.IArray hiding (elems)
  10
+import Data.Function (on)
  11
+import qualified Data.HashMap.Lazy as HM
  12
+import Data.IntMap (keys, findWithDefault, singleton)
  13
+import Data.List.Stream hiding (find)--(intersect, nub, union, foldl')
  14
+import Data.Maybe
  15
+import Data.UString (u)
  16
+import SP.ByteString as B
  17
+import SP.Cluster
  18
+import SP.Config
  19
+import SP.DeepSeq
  20
+import Prelude hiding (take,concat,concatMap,map,length,(++),filter,notElem,unzip,zipWith,head)
  21
+
  22
+-- | Get the news items from Mongo. c is the callback function.
  23
+bootstrap c = do                                   
  24
+  pipe <- runIOE $ connect (host "127.0.0.1")
  25
+  cfg <- getConfig
  26
+  e <- access pipe master (u"articles") (findItems >>= mkDomain cfg c)
  27
+  close pipe
  28
+  print e
  29
+
  30
+-- | Find news items.                                           
  31
+findItems = rest =<< find (select q $ u"reuters.mergers")
  32
+  where q = [u"status" =: u"ok", u"sentences" =: [u"$exists" =: True]]
  33
+
  34
+-- Retrieve typed values from documents.                                        
  35
+docStr :: String -> Document -> ByteString                                         
  36
+docStr lbl doc = pack $ typed $ valueAt (u lbl) doc                                   
  37
+docLst :: String -> Document -> [Document]                                         
  38
+docLst lbl doc = typed $ valueAt (u lbl) doc                                          
  39
+docInt :: String -> Document -> Int                                                
  40
+docInt lbl doc = typed $ valueAt (u lbl) doc
  41
+
  42
+mkDomain cfg c arts = liftIO $ c $ mkPtns cfg narts
  43
+  where narts = take (artSize cfg) arts -- Limit no. of articles.
  44
+
  45
+mkPtns cfg arts = go [] arts 1 1 1
  46
+  where
  47
+  -- Ignored dependencies and POS tags
  48
+  ignDeps = depIgn cfg; ignPos = posIgn cfg
  49
+
  50
+  -- Build the partitions using the worker go.
  51
+  go :: [Partition] -> [Document] -> Int -> Int -> Int -> [Partition]
  52
+  go ps []          _  _  _  = ps
  53
+  go ps (art:narts) pi oi ai = go (p:ps) narts npi noi nai
  54
+    where
  55
+    -- Partition
  56
+    p = Partition pi ocs acs
  57
+
  58
+    -- Object clusters and argument clusters.
  59
+    ocs = fst.fst $ osAsOiAi
  60
+    acs = nub $ concatMap (map fst.pars) ocs  --snd.fst $ osAsOiAi
  61
+
  62
+    -- New start IDs for the next batch of argument and object clusters.
  63
+    npi = 1 + pi -- Next partition index.
  64
+    noi = fst.snd $ osAsOiAi -- Next object cluster index.
  65
+    nai = snd.snd $ osAsOiAi -- Next argument cluster index.
  66
+
  67
+    -- The object clusters and id of the last argument cluster.
  68
+    osAsOiAi :: (([ObjectCluster],[ArgumentCluster]),(Int,Int))
  69
+    osAsOiAi = procSnts (docLst "sentences" art) [] [] 1 oi ai
  70
+
  71
+    -- Create argument and object clusters for sentences.
  72
+    procSnts :: [Document] -> [ObjectCluster] -> [ArgumentCluster] -> Int -> 
  73
+                Int -> Int -> (([ObjectCluster],[ArgumentCluster]),(Int,Int))
  74
+    procSnts []     os as _  oi ai = ((os,acs),(oi,ai))
  75
+    procSnts (s:ss) os as si oi ai = procSnts ss nos nas (si+1) noi nai
  76
+      where
  77
+      -- Next argument cluster index.
  78
+      nai = ai + length nas
  79
+      noi = oi + ufoslen
  80
+      -- New argument clusters.
  81
+      nas = nub.foldl' (++) [].HM.elems $ pm
  82
+      
  83
+      -- Object clusters --with argument clusters.
  84
+      nos = os ++ ufos --(HM.keys pm `union` HM.keys cm)
  85
+      
  86
+      -- Maps from object clusters to 
  87
+      pm = HM.fromListWith (++) $ fst ats
  88
+      cm = HM.fromListWith (++) $ snd ats
  89
+
  90
+      -- Create argument clusters.
  91
+      deps = filter (\d -> docStr "rel" d `notElem` ignDeps) (docLst "deps" s)
  92
+      ats = unzip $ zipWith mkArgumentCluster [ai..] deps
  93
+      mkArgumentCluster id dep = ((p,[ac]),(c,[ac]))
  94
+        where ac = ArgumentCluster id pm cm rm
  95
+              pm = singleton (ocId p) 1; p = oa ! docInt "gov_index" dep
  96
+              cm = singleton (ocId c) 1; c = oa ! docInt "dept_index" dep
  97
+              rm = HM.singleton (docStr "rel" dep) 1
  98
+
  99
+      -- Unfiltered object clusters, array for faster indexing.
  100
+      oa :: Array Int ObjectCluster
  101
+      oa = listArray (0, ufoslen - 1) ufos
  102
+      ufoslen = length ufos
  103
+      ufos = zipWith mkObjectCluster [oi..] (docLst "tokens" s)
  104
+      mkObjectCluster id tk = o
  105
+        where 
  106
+        o = ObjectCluster id [part] pars chdn sbls 
  107
+        part = Part {partId = id,  artId = pi,       sntId = si,   form = form,  
  108
+                     pos = g"pos", lemma = g"lemma", ner = g"ner", text = g"text"}
  109
+          where form = B.concat [g"lemma", pack ":", g"pos"]
  110
+                g str = docStr str tk
  111
+
  112
+        -- Incidence lists for argument clusters.
  113
+        pars = tpls $ HM.lookupDefault [] o cm -- Parents
  114
+        chdn = tpls $ HM.lookupDefault [] o pm -- Children
  115
+        sbls = concatMap acChdn pars           -- Siblings
  116
+        -- TODO Check that siblings uses the right map.
  117
+        acChdn pi = [(D2ArgumentCluster par par, 1) | -- Incidence
  118
+                     let par = fst pi
  119
+                         pidx = (head.keys.parMap) par,
  120
+                     chd <- HM.lookupDefault [] ObjectCluster {ocId = pidx} pm,
  121
+                     ocId o /= (head.keys.chdMap) chd] -- Don't point to o
  122
+        tpls = map $ \e -> (e,1)
91  src/SP/Bootstrapper.hs
... ...
@@ -1,91 +0,0 @@
1  
-{-# LANGUAGE OverloadedStrings #-}
2  
-module SP.Bootstrapper (bootstrap) where
3  
-
4  
-import Control.Monad.Trans (liftIO)
5  
-import Data.ByteString.Char8 (ByteString, pack)
6  
-import qualified Data.ByteString.Char8 as BS
7  
-import qualified Data.HashMap.Lazy as M
8  
-import Data.List (elemIndex)
9  
-import Data.Maybe
10  
-import Database.MongoDB
11  
-import SP.Cluster
12  
-
13  
--- | Get the news items from Mongo, where cb is the callback function that 
14  
--- | will be called upon once the domain has been created.
15  
-bootstrap cb = do
16  
-  pipe <- runIOE $ connect (host "127.0.0.1")
17  
-  e <- access pipe master "articles" (reutersMergers >>= mkDomain cb)
18  
-  close pipe
19  
-  print e
20  
-
21  
--- | Find Reuters Mergers news items.
22  
-reutersMergers = rest =<< find (select q "reuters.mergers")
23  
-  where 
24  
-    q = ["status" =: u"ok", "sentences" =: ["$exists" =: True]]
25  
-
26  
-mkDomain cb dArts = liftIO $ cb arts fObjClrs
27  
-  where
28  
-    arts = zipWith mkArt (take 10 dArts) [1..]
29  
-    parts = concatMap sParts $ concatMap aSnts arts
30  
-    objClrMap = M.fromList $ zip parts objClrs 
31  
-    objClrs = zipWith (mkObjClr objClrMap) parts [1..]
32  
-    fObjClrs = filter (not . null . ocArgClrs) objClrs
33  
-   
34  
-mkArt :: Document -> Int -> Art 
35  
-mkArt dArt aId = art
36  
-  where 
37  
-    art = Art aId parts aTitle aText
38  
-    parts = zipWith (mkSnt art) dSnts [1..]
39  
-    aTitle = dStr "title" dArt
40  
-    aText = dStr "content" dArt
41  
-    dSnts = dLst "sentences" dArt
42  
-
43  
-mkSnt :: Art -> Document -> Int -> Snt
44  
-mkSnt sArt dSnt sId = snt
45  
-  where
46  
-    snt = Snt sId sArt parts
47  
-    tks = dLst "tokens" dSnt
48  
-    sntDeps = map (mkDep parts) $ dLst "deps" dSnt
49  
-    parts = zipWith (mkPart snt sntDeps) tks [1..]
50  
-
51  
-mkDep :: [Part] -> Document ->  Dep
52  
-mkDep parts doc = Dep rel gov dpt
53  
-  where
54  
-    rel = dStr "rel" doc
55  
-    govIdx = dInt "gov_index" doc
56  
-    dptIdx = dInt "dept_index" doc
57  
-    gov = parts !! govIdx
58  
-    dpt = parts !! dptIdx
59  
-
60  
-mkPart :: Snt -> [Dep] -> Document -> Int -> Part
61  
-mkPart pSnt sntDeps tk pId = part
62  
-  where
63  
-    part = Part pId pSnt form lemma pos word parDeps chdDeps 
64  
-    pos = dStr "pos" tk
65  
-    lemma = dStr "lemma" tk
66  
-    word = dStr "text" tk
67  
-    form = BS.concat [pos, pack ":", lemma]
68  
-    parDeps = filter (\p -> dpt p == part) sntDeps
69  
-    chdDeps = filter (\p -> gov p == part) sntDeps
70  
-
71  
--- Retrieve typed values from documents.
72  
-dStr :: Label -> Document -> ByteString
73  
-dStr lbl doc = pack $ typed $ valueAt lbl doc
74  
-dLst :: Label -> Document -> [Document]
75  
-dLst lbl doc = typed $ valueAt lbl doc
76  
-dInt :: Label -> Document -> Int
77  
-dInt lbl doc = typed $ valueAt lbl doc
78  
-
79  
-mkObjClr :: M.HashMap Part ObjClr -> Part -> Int -> ObjClr
80  
-mkObjClr objClrMap part ocId = oc
81  
-  where
82  
-    oc = ObjClr ocId [part] parArgClrs chdArgClrs sblArgClrs 
83  
-    parArgClrs = [AdjArgClr id oc [arg] Par | (arg, id) <- zip parArgs [1..]]
84  
-    chdArgClrs = [AdjArgClr id oc [arg] Chd | (arg, id) <- zip chdArgs [1..]]
85  
-    sblArgClrs = [RmtArgClr id oc [arg] Sbl | (arg, id) <- zip sblArgs [1..]]
86  
-    parArgs = [AdjArg rel (toOC gov) | (Dep rel gov _) <- parDeps part ]
87  
-    chdArgs = [AdjArg rel (toOC dep) | (Dep rel _ dep) <- chdDeps part]
88  
-    sblArgs = [RmtArg (dRel pd) (dRel cd) (toOC $ gov cd) (toOC rmtPart) 
89  
-              | (pd,cd) <- sblDPs, let rmtPart = dpt pd, rmtPart /= part]
90  
-    sblDPs = concat [[(pd,cd) | cd <- chdDeps $ gov pd] | pd <- parDeps part]
91  
-    toOC p = fromJust $ M.lookup p objClrMap
26  src/SP/ByteString.hs
... ...
@@ -0,0 +1,26 @@
  1
+module SP.ByteString where
  2
+
  3
+import qualified Data.ByteString.Char8 as B
  4
+import Data.Hashable
  5
+import Data.Ord (comparing)
  6
+import Data.List.Stream
  7
+import Prelude hiding (map)
  8
+
  9
+data ByteString = ByteString {bsStr :: B.ByteString, bsHsh :: !Int}
  10
+                  deriving (Read,Show)
  11
+
  12
+instance Ord ByteString where
  13
+  compare = comparing bsStr
  14
+
  15
+instance Eq ByteString where
  16
+  x == y = bsHsh x == bsHsh y
  17
+
  18
+instance Hashable ByteString where
  19
+  hash = bsHsh
  20
+
  21
+{-instance Show ByteString where 
  22
+  show = show.bsStr-}
  23
+
  24
+pack s = ByteString b (hash b) where b = B.pack s
  25
+unpack bs = B.unpack $ bsStr bs
  26
+concat bs = ByteString b (hash b) where b = B.concat $ map bsStr bs
164  src/SP/Cluster.hs
... ...
@@ -1,108 +1,64 @@
  1
+{-# LANGUAGE TypeSynonymInstances #-}
1 2
 module SP.Cluster where
2 3
 
3  
-import Data.ByteString.Char8 (ByteString, unpack)
  4
+import Data.Function
4 5
 import Data.Hashable
5  
-import Data.List
6  
-
7  
--- | Article.
8  
-data Art = Art {aId::Int, aSnts::[Snt], aTitle, aText::ByteString} 
9  
--- | Sentence.
10  
-data Snt = Snt {sId::Int, sArt::Art, sParts::[Part]} deriving (Show)
11  
--- | Part.
12  
-data Part = Part {pId::Int, pSnt::Snt, form, lemma, pos, word::ByteString, 
13  
-                  parDeps, chdDeps::[Dep]}
14  
--- | Dependency.
15  
-data Dep = Dep {dRel::ByteString, gov, dpt::Part} deriving (Show)
16  
--- | Object cluster.
17  
-data ObjClr = ObjClr {ocId::Int, ocParts::[Part], parArgClrs, chdArgClrs, 
18  
-                      sblArgClrs::[ArgClr]}
19  
--- | Argument. Adjacent or remote.
20  
-data AdjArg = AdjArg {rel::ByteString, obj::ObjClr}
21  
-data RmtArg = RmtArg {intRel,rmtRel::ByteString, intObj,rmtObj::ObjClr} 
22  
--- | Argument cluster.
23  
-data ArgClr = AdjArgClr {aacId::Int, aacObj::ObjClr, aacArgs::[AdjArg], 
24  
-                         aacRole::Role} 
25  
-            | RmtArgClr {racId::Int, racObj::ObjClr, racArgs::[RmtArg], 
26  
-                         racRole::Role} 
27  
-data Role = Par | Chd | Sbl deriving (Show, Eq)
28  
-
29  
-
30  
--- Accessor methods for object clusters.
31  
-ocArgClrs :: ObjClr -> [ArgClr]
32  
-ocArgClrs (ObjClr _ _ ps cs ss) = ps ++ cs ++ ss
33  
-
34  
--- General accessor methods for argument clusters.
35  
-acId :: ArgClr -> Int
36  
-acId AdjArgClr {aacId = id} = id
37  
-acId RmtArgClr {racId = id} = id
38  
-acRole :: ArgClr -> Role
39  
-acRole AdjArgClr {aacRole = role} = role
40  
-acRole RmtArgClr {racRole = role} = role
41  
-
42  
--- Equality.
43  
-instance Eq Art where
44  
-  a1 == a2 = aId a1 == aId a2
45  
-
46  
-instance Eq Snt where
47  
-  s1 == s2 = sId s1 == sId s2 && sArt s1 == sArt s2
48  
-
49  
-instance Eq Part where
50  
-  p1 == p2 = pId p1 == pId p2 && pSnt p1 == pSnt p2
51  
-  
52  
-instance Eq ObjClr where
53  
-  c1 == c2 = ocId c1 == ocId c2
54  
-
55  
-instance Eq ArgClr where
56  
-  -- Must have same id, object cluster, and role.
57  
-  AdjArgClr id1 oc1 _ r1 == AdjArgClr id2 oc2 _ r2 = 
58  
-    id1 == id2 && oc1 == oc2 && r1 == r2
59  
-  RmtArgClr id1 oc1 _ r1 == RmtArgClr id2 oc2 _ r2 = 
60  
-    id1 == id2 && oc1 == oc2 && r1 == r2
61  
-
62  
--- Show.
63  
-instance Show Part where
64  
-  show p = unpack (word p) ++ " " ++ unpack (form p)
65  
-
66  
-instance Show ObjClr where
67  
-  show oc = "\n" ++ id ++ parts ++ parArgs ++ chdArgs ++ sblArgs
68  
-    where
69  
-      id = fuse ["ID: ", show (ocId oc)]
70  
-      parts = fuse ["Parts: ", show (ocParts oc)]
71  
-      parArgs = fuse ["ParArgClrs: ", show (parArgClrs oc)] 
72  
-      chdArgs = fuse ["ChdArgClrs: ", show (chdArgClrs oc)] 
73  
-      sblArgs = fuse ["SblArgClrs: ", show (sblArgClrs oc)] 
74  
-
75  
-fuse strs = concat strs ++ "\n"
76  
-
77  
-instance Show ArgClr where
78  
-  show AdjArgClr {aacArgs = args} = show args
79  
-  show RmtArgClr {racArgs = args} = show args
80  
-
81  
-instance Show AdjArg where
82  
-  show (AdjArg rel obj) = unpack rel ++ " " ++ show (ocId obj) ++ " " ++ 
83  
-    show (ocParts obj)
84  
-
85  
-instance Show RmtArg where
86  
-  show (RmtArg intRel rmtRel intObj rmtObj) = 
87  
-    unpack intRel ++ " " ++ show (ocId intObj) ++ " " ++ 
88  
-      show (ocParts intObj) ++ "\n" ++
89  
-      unpack rmtRel ++ " " ++ show (ocId rmtObj) ++ " " ++ 
90  
-      show (ocParts rmtObj)
91  
-
92  
-instance Show Art where
93  
-  show Art {aId = id, aTitle = title, aText = text} = 
94  
-    fuse ["ID: ", show id] ++ fuse ["Title: ", unpack title] ++ fuse ["Text: ", unpack text]
95  
-
96  
--- Hashable.
97  
-instance Hashable Art where
98  
-  hash = aId
99  
-
100  
-instance Hashable Snt where
101  
-  hash snt = sId snt `combine` hash (sArt snt)
102  
-
103  
-instance Hashable Part where
104  
-  hash part = pId part `combine` hash (pSnt part)
105  
-  
106  
-instance Hashable ObjClr where
107  
-  hash = ocId
  6
+import Data.Maybe
  7
+import SP.ByteString
  8
+import Data.IntMap (IntMap,keys)
  9
+import Data.HashMap.Lazy (HashMap,fromList)
  10
+import Text.Read
  11
+
  12
+data Partition = Partition { ptnId :: Id
  13
+                           , ocs :: [ObjectCluster]
  14
+                           , acs :: [ArgumentCluster]
  15
+                           } deriving (Read,Show)
  16
+
  17
+data ObjectCluster = ObjectCluster { ocId :: Id 
  18
+                                   , parts :: [Part]
  19
+                                   , pars, chdn, sbls :: IncidenceList
  20
+                                   } deriving (Read,Show)
  21
+
  22
+data Part = Part { partId, artId, sntId :: Id
  23
+                 , form, pos, lemma, ner, text :: ByteString
  24
+                 } deriving (Read,Show)
  25
+
  26
+data ArgumentCluster = ArgumentCluster { acId :: Id
  27
+                                       , parMap, chdMap :: ObjectMap
  28
+                                       , relMap :: RelationMap} |
  29
+                       D2ArgumentCluster { acFst, acSnd :: ArgumentCluster
  30
+                                         } deriving (Read,Show)
  31
+
  32
+instance Eq Partition where (==) = (==) `on` ptnId
  33
+instance Eq ObjectCluster where (==) = (==) `on` ocId
  34
+instance Eq Part where (==) = (==) `on` partId
  35
+instance Hashable ObjectCluster where hash = ocId
  36
+
  37
+instance Eq ArgumentCluster where
  38
+  D2ArgumentCluster x y == D2ArgumentCluster z w = x == z && y == w
  39
+  ArgumentCluster {acId = id1} == ArgumentCluster {acId = id2} = id1 == id2
  40
+  _ == _ = False
  41
+
  42
+instance Hashable ArgumentCluster where 
  43
+  hash (D2ArgumentCluster x y) = acId x `combine` acId y
  44
+  hash ArgumentCluster {acId = acId} = acId
  45
+
  46
+instance (Read k, Hashable k, Ord k, Read a) => Read (HashMap k a) where
  47
+  readPrec = parens $ prec 10 $ do
  48
+    Ident "fromList" <- lexP
  49
+    xs <- readPrec
  50
+    return (fromList xs)
  51
+  readListPrec = readListPrecDefault
  52
+
  53
+type Id = Int
  54
+type Relation = ByteString
  55
+type Incidence = Double
  56
+type IncidenceList = [(ArgumentCluster, Incidence)]
  57
+type ObjectMap = IntMap Incidence
  58
+type RelationMap = HashMap Relation Incidence
  59
+
  60
+parObjIds, chdObjIds, sblObjIds :: ObjectCluster -> [Id]
  61
+parObjIds = concatMap (keys.parMap.fst).pars 
  62
+chdObjIds = concatMap (keys.chdMap.fst).chdn
  63
+sblObjIds = concatMap (keys.chdMap.acSnd.fst).sbls
108 64
 
32  src/SP/Config.hs
... ...
@@ -0,0 +1,32 @@
  1
+module SP.Config where
  2
+
  3
+import Data.Char
  4
+import Data.ConfigFile                                                          
  5
+import Data.Either.Utils                                                        
  6
+import SP.ByteString (ByteString, pack)
  7
+import Text.Regex
  8
+
  9
+getOpt def sec opt = do
  10
+  -- Open config. file.                                                         
  11
+  val <- readfile emptyCP "sp.conf"
  12
+  let cfgPar = forceEither val                                                  
  13
+  return $ if has_option cfgPar sec opt
  14
+              then forceEither $ get cfgPar sec opt
  15
+              else def
  16
+
  17
+getConfig = do
  18
+  depIgnStr <- getOpt "" "Dependency" "ignore"
  19
+  posIgnStr <- getOpt "" "POS" "ignore"
  20
+  minSmpSizeStr <- getOpt "5" "Preprocess" "samples"
  21
+  artSizeStr <- getOpt "50" "Preprocess" "articles"
  22
+  let depIgn = optBList depIgnStr
  23
+      posIgn = optBList posIgnStr
  24
+      minSmpSize = read minSmpSizeStr::Int
  25
+      artSize = read artSizeStr::Int
  26
+  return $ Config depIgn posIgn minSmpSize artSize
  27
+
  28
+optBList opt = map pack $ splitRegex optListRegex $ map toLower opt
  29
+optListRegex = mkRegex ",\\s*"
  30
+
  31
+data Config = Config {depIgn, posIgn::[ByteString], minSmpSize, artSize::Int}
  32
+
28  src/SP/DeepSeq.hs
... ...
@@ -0,0 +1,28 @@
  1
+{-# LANGUAGE TypeSynonymInstances #-}
  2
+module SP.DeepSeq where
  3
+
  4
+import Control.DeepSeq
  5
+import qualified Data.ByteString.Internal as BS
  6
+import SP.ByteString
  7
+import SP.Cluster
  8
+import Data.HashMap.Lazy
  9
+import Data.IntMap
  10
+
  11
+instance NFData Partition where
  12
+  rnf (Partition _ ocs acs) = rnf ocs `seq` rnf acs
  13
+
  14
+instance NFData ObjectCluster where
  15
+  rnf (ObjectCluster _ parts pars chdn sbls) = (rnf parts `seq` rnf pars) `seq`
  16
+                                               (rnf chdn `seq` rnf sbls)
  17
+
  18
+instance NFData ArgumentCluster where
  19
+  rnf (ArgumentCluster _ pm cm rm) = rnf rm -- `seq` (rnf pm `seq` rnf cm)
  20
+  rnf (D2ArgumentCluster x y) = rnf x `seq` rnf y
  21
+
  22
+instance NFData Part where
  23
+  rnf Part {form = form, lemma = lemma, ner = ner, pos = pos} = 
  24
+    rnf form `seq` (rnf lemma `seq` (rnf ner `seq` rnf pos))
  25
+
  26
+instance NFData ByteString where
  27
+  rnf (ByteString _ h) = rnf h
  28
+
93  src/SP/Merge.hs
... ...
@@ -1,93 +0,0 @@
1  
-module SP.Merge (mergeClusters) where
2  
-
3  
-import Data.HashMap.Lazy (fromList, HashMap, lookupDefault)
4  
-import Data.List
5  
-import SP.Cluster
6  
-import SP.Score
7  
-
8  
-type ObjClrMap = HashMap ObjClr ObjClr
9  
-
10  
--- | Merge all clusters in accordance with a list of good scores.
11  
-mergeClusters :: [ObjClr] -> [OpScr] -> [ObjClr]
12  
-mergeClusters objClrs scores = redir ++ new
13  
-  where
14  
-    -- Object cluster map, mapping unused and used to redirected and new.
15  
-    ocm = fromList $ newTpls ++ redirTpls
16  
-    -- Redirected unused object clusters.
17  
-    redir = snd $ unzip redirTpls 
18  
-    -- New object clusters.
19  
-    new = nub $ snd $ unzip newTpls
20  
-    -- Tuples used for merge.
21  
-    used = concatMap (\s -> map ($ scr s) [sOc1, sOc2]) scores
22  
-    -- Unused object clusters.
23  
-    unused = filter (`notElem` used) objClrs
24  
-    -- New tuples (redirected).
25  
-    newTpls = concat $ zipWith (mergeObjClrs ocm) newIds scores
26  
-      where newIds = [ocId (snd $ last redirTpls) + 1..]
27  
-    -- Tuples of unused and their redirected copies.
28  
-    redirTpls = zipWith (mkObjClr ocm) [ocId (last objClrs) + 1..] unused
29  
-
30  
--- | Create a new argument cluster. 
31  
--- | Returns a list of pairs containing a used cluster as the first element, 
32  
--- | and the new cluster as the second
33  
-mergeObjClrs :: ObjClrMap -> Int -> OpScr -> [(ObjClr,ObjClr)]
34  
-mergeObjClrs ocm id s = [((oc1 . objScr . scr) s, oc), 
35  
-                         ((oc2 . objScr . scr) s, oc)]
36  
-  where 
37  
-    oc = ObjClr id parts (acsByRole Par) (acsByRole Chd) (acsByRole Sbl)
38  
-    parts = ocParts (oc1 os) ++ ocParts (oc2 os) where os = objScr $ scr s
39  
-    argClrs = mergeAllArgClrs ocm oc $ scr s
40  
-    -- Get argument clusters by role.
41  
-    acsByRole r = filter (\ac -> acRole ac == r) argClrs
42  
-
43  
--- | Make new argument clusters by merging existing ones or copying.
44  
-mergeAllArgClrs :: ObjClrMap -> ObjClr -> Scr -> [ArgClr]
45  
-mergeAllArgClrs ocm oc (Scr os ass u1 u2) = clustered ++ unclustered
46  
-  where
47  
-    -- Results of the cluster merges.
48  
-    clustered = zipWith (\s -> mergeArgClrs ocm oc (ac1 s) (ac2 s)) ass [1..]
49  
-    -- Copies of the scores that were not merged. orginals can't be used,
50  
-    -- since they point to the wrong object cluster and have the wrong id.
51  
-    unclustered = zipWith (mkArgClr ocm oc) orgUnclustered [length ass + 1..]
52  
-    -- orginal argument clusters not merged.
53  
-    orgUnclustered = u1 ++ u2 -- filter (`notElem` usedArgClrs) orgArgClrs
54  
-    -- The orginal argument clusters of both object clusters.
55  
-    orgArgClrs = argClrs (oc1 os) ++ argClrs (oc2 os)
56  
-      where argClrs oc = parArgClrs oc ++ chdArgClrs oc ++  sblArgClrs oc
57  
-    -- orginal argument clusters used for merging.
58  
-    usedArgClrs = concat [[ac1 as, ac2 as] | as <- ass]
59  
-
60  
--- | Create a new argument cluster by merging two existing ones.
61  
-mergeArgClrs :: ObjClrMap -> ObjClr -> ArgClr -> ArgClr -> Int -> ArgClr
62  
-mergeArgClrs ocm oc (AdjArgClr _ _ args1 role) (AdjArgClr _ _ args2 _) id = 
63  
-  AdjArgClr id oc (map (rplAdjArg ocm) $ args1 ++ args2) role
64  
-mergeArgClrs ocm oc (RmtArgClr _ _ args1 role) (RmtArgClr _ _ args2 _) id = 
65  
-  RmtArgClr id oc (map (rplRmtArg ocm) $ args1 ++ args2) role
66  
-
67  
--- | Create an argument cluster from an existing one.
68  
-mkArgClr :: ObjClrMap -> ObjClr -> ArgClr -> Int -> ArgClr
69  
-mkArgClr ocm oc ac@AdjArgClr {aacArgs=args} id = 
70  
-  ac {aacId=id, aacObj=oc, aacArgs=map (rplAdjArg ocm) args}
71  
-mkArgClr ocm oc ac@RmtArgClr {racArgs=args} id = 
72  
-  ac {racId=id, racObj=oc, racArgs=map (rplRmtArg ocm) args}
73  
-
74  
--- | Make a new object cluster with updated references.
75  
-mkObjClr :: ObjClrMap -> Int -> ObjClr -> (ObjClr, ObjClr)
76  
-mkObjClr ocm id oc@ObjClr {parArgClrs=oldPacs, chdArgClrs=oldCacs, 
77  
-                           sblArgClrs=oldSacs, ocParts=oldParts} = (oc, ocNew)
78  
-  where 
79  
-    ocNew = ObjClr id oldParts pacs cacs sacs
80  
-    pacs = mapArgClrs oldPacs -- Parent argument clusters.
81  
-    cacs = mapArgClrs oldCacs -- Child argument clusters.
82  
-    sacs = mapArgClrs oldSacs -- Sibling argument clusters.
83  
-    mapArgClrs = map (\ac -> mkArgClr ocm ocNew ac $ acId ac)
84  
-
85  
--- | Eventually substitutes an object cluster.
86  
-rplObjClr :: ObjClrMap -> ObjClr -> ObjClr
87  
-rplObjClr ocm oc = lookupDefault oc oc ocm
88  
-rplAdjArg :: ObjClrMap -> AdjArg -> AdjArg
89  
-rplAdjArg ocm arg@AdjArg {obj = oc} = arg {obj = rplObjClr ocm oc}
90  
-rplRmtArg :: ObjClrMap -> RmtArg -> RmtArg
91  
-rplRmtArg ocm arg@RmtArg {intObj = intObj, rmtObj = rmtObj} = 
92  
-  arg {intObj = rplObjClr ocm intObj, rmtObj = rplObjClr ocm rmtObj}
93  
-
116  src/SP/Preprocess/Compound.hs
... ...
@@ -0,0 +1,116 @@
  1
+module SP.Preprocess.Compound --(mkCompounds,mkNnGrps,mkNerCompounds,mkNerGrps) 
  2
+  where
  3
+
  4
+import qualified Data.HashMap.Lazy as HashMap
  5
+import qualified Data.IntMap as IntMap
  6
+import Data.Function
  7
+import Data.List (groupBy, intersect, intersperse, nubBy, sortBy)
  8
+import Data.Maybe 
  9
+import Data.Ord
  10
+import Prelude hiding (concat)
  11
+import SP.ByteString (pack, concat)
  12
+import SP.Cluster
  13
+import SP.Redirect
  14
+
  15
+mkCompounds :: Partition -> Partition
  16
+mkCompounds = mkNnCompounds.mkNerCompounds
  17
+
  18
+mkNnCompounds :: Partition -> Partition
  19
+mkNnCompounds ptn | null grps = ptn
  20
+                  | otherwise = nptn
  21
+  where nptn = redirect ptn $ mkRedirectMap grps
  22
+        grps = mkNnGrps $ ocs ptn
  23
+
  24
+mkNerCompounds :: Partition -> Partition
  25
+mkNerCompounds ptn | null grps = ptn -- TODO Improve grouping=>No recursion
  26
+                   | otherwise = if null (mkNerGrps $ ocs nptn) then nptn 
  27
+                                 else error "Ner fail!!!"
  28
+  where nptn = redirect ptn $ mkRedirectMap grps
  29
+        grps = mkNerGrps $ ocs ptn
  30
+
  31
+mkRedirectMap :: [[ObjectCluster]] -> HashMap.HashMap ObjectCluster ObjectCluster
  32
+mkRedirectMap grps = HashMap.fromList $ concatMap mkRedirectList grps
  33
+  where mkRedirectList grp = map (\o -> (o, merge grp)) grp
  34
+
  35
+-- Merge a group of object clusters. 
  36
+merge :: [ObjectCluster] -> ObjectCluster
  37
+merge grp = ObjectCluster ((ocId.head) grp) [part] parIs chdIs sblIs
  38
+  where -- Create part and form new token annotations.
  39
+        part = ((p.last) grp) {form = nform, lemma = nlemma, text = ntext}
  40
+        nform = concat [nlemma, pack ":", pos part]
  41
+        nlemma = concat.intersperse (pack "_") $ map (lemma.p) grp
  42
+        ntext = concat.intersperse (pack "_") $ map (text.p) grp
  43
+        p = head.parts
  44
+
  45
+        -- Remove argument clusters pointing to the same object cluster
  46
+        -- in the group. Remove multiple argument clusters.
  47
+        allSbls :: IncidenceList
  48
+        allSbls = concatMap sbls grp
  49
+
  50
+        -- Incidence lists for siblings, parents and children.
  51
+        sblIs = nubBy cmp2 oacs
  52
+          where oacs :: IncidenceList
  53
+                oacs = filter f allSbls 
  54
+                f :: (ArgumentCluster, Incidence) -> Bool
  55
+                f (a,_) = null $ [(intKey.parMap.acFst) a, 
  56
+                                  (intKey.chdMap.acSnd) a] 
  57
+                                  `intersect` 
  58
+                                  map ocId grp
  59
+                cmp2 (x,_) (y,_) = cmp parMap (acFst x) (acFst y) && 
  60
+                                   cmp chdMap (acSnd x) (acSnd y)
  61
+        parIs = map (\a -> (a,1)) $ nubBy (cmp parMap) $ acs pars parMap
  62
+        chdIs = map (\a -> (a,1)) $ nubBy (cmp chdMap) $ acs chdn chdMap
  63
+        -- Get all argument clusters via method acc, that aren't pointing
  64
+        -- within the group via object map m.
  65
+        acs acc m = [a | i <- concatMap acc grp, let a = fst i, inGrp a]
  66
+          where inGrp a = (intKey.m) a `notElem` map ocId grp
  67
+        -- Compare two incidence tuples with respect to where their 
  68
+        -- object clusters are pointing via object map m, and their 
  69
+        -- relation.
  70
+        cmp m ai aj = (intKey.m) ai == (intKey.m) aj && 
  71
+                      (key.relMap) ai == (key.relMap) aj
  72
+        intKey = head . IntMap.keys; key = head . HashMap.keys
  73
+
  74
+{-mkNerGrps :: [ObjectCluster] -> [[ObjectCluster]]
  75
+mkNerGrps ocs = filter (\g -> (cner.head) g /= pack "O" && length g > 1) groups
  76
+  where groups = groupBy pred $ sortBy (comparing ocId) ocs
  77
+          where pred oi oj = let conn | cner oi == pack "O" = True                       
  78
+                                      | otherwise = ocId oj `elem` neighbors oi
  79
+                             in (cner oi == cner oj) && conn
  80
+        neighbors oi = parObjIds oi ++ chdObjIds oi -- ++ sblObjIds oi
  81
+        cner = ner.head.parts -- Ner of the first (only) part in a cluster.
  82
+-}
  83
+
  84
+mkNerGrps ocs = filter (\g -> (cner.head) g /= pack "O" && length g > 1) groups
  85
+  where groups = groupBy pred $ sortBy (comparing ocId) ocs
  86
+        pred oi oj = let eqNer = (==) `on` cner
  87
+                         eqSnt = (==) `on` sntId.head.parts
  88
+                         eqArt = (==) `on` artId.head.parts
  89
+                     in eqNer oi oj && eqSnt oi oj && eqArt oi oj
  90
+        cner = ner.head.parts
  91
+
  92
+{-
  93
+mkNnGrps :: [ObjectCluster] -> [[ObjectCluster]]
  94
+mkNnGrps ocs = groupBy pred $ sortBy (comparing ocId) $ filter oHasRelNn ocs
  95
+  where pred oi oj = any (conn chdMap) (acs chdn oi) || 
  96
+                     any (conn parMap) (acs pars oi)
  97
+          where conn m a = aHasRelNn a && aHasO a oj m
  98
+        oHasRelNn o = any aHasRelNn (acs chdn o) || 
  99
+                      any aHasRelNn (acs pars o)
  100
+        aHasRelNn a = isJust $ HashMap.lookup (pack "nn") (relMap a)
  101
+        aHasO a o m = isJust $ IntMap.lookup (ocId o) (m a)
  102
+        acs l o = map fst $ l o
  103
+-}
  104
+
  105
+mkNnGrps :: [ObjectCluster] -> [[ObjectCluster]]
  106
+mkNnGrps ocs = map (sortBy (comparing ocId).getWithChdn).filter oHasRelNn $ ocs
  107
+  where oHasRelNn o = any aHasRelNn (map fst $ chdn o)
  108
+        aHasRelNn a = isJust $ HashMap.lookup (pack "nn") (relMap a)
  109
+        aHasO a o m = isJust $ IntMap.lookup (ocId o) (m a)
  110
+        idOcMap = IntMap.fromList $ map (\o -> (ocId o,o)) ocs
  111
+        getWithChdn x = x:(concatMap getWithChdn $ chdObjs x)
  112
+        chdObjs = idsToObjs.concatMap (IntMap.keys.chdMap).filter aHasRelNn.map fst.chdn
  113
+        idsToObjs = map $ \id -> fj "NnGrps" $ IntMap.lookup id idOcMap
  114
+
  115
+fj s m | m == Nothing = error s
  116
+       | otherwise = fromJust m
32  src/SP/Preprocess/Preprocess.hs
... ...
@@ -0,0 +1,32 @@
  1
+module SP.Preprocess.Preprocess where
  2
+
  3
+import Control.Parallel.Strategies as P
  4
+import Data.Function
  5
+import Data.HashMap.Lazy hiding (filter, map, null)
  6
+import Data.List.Stream
  7
+import Data.Maybe 
  8
+import Data.Ord
  9
+import SP.ByteString (pack, ByteString)
  10
+import SP.Config
  11
+import SP.Cluster
  12
+import SP.Preprocess.Compound
  13
+import SP.Redirect
  14
+import Prelude hiding (head,concatMap,take,map,filter,null)
  15
+
  16
+takePartitions :: Config -> [Partition] -> [Partition]
  17
+takePartitions cfg = take $ artSize cfg
  18
+
  19
+groupByPos :: [Partition] -> [[ObjectCluster]]
  20
+groupByPos ptns = let os :: [ObjectCluster]
  21
+                      os = concatMap ocs ptns
  22
+                      opos :: ObjectCluster -> ByteString
  23
+                      opos = pos.head.parts
  24
+                  in groupBy ((==) `on` opos) (sortBy (comparing opos) os)
  25
+
  26
+mergeNerCompounds :: [Partition] -> [Partition]
  27
+mergeNerCompounds = P.parMap rseq mkNerCompounds
  28
+
  29
+filterEmpty :: [Partition] -> [Partition]
  30
+filterEmpty ps = map rebuild ps
  31
+  where rebuild p = p {ocs = filterObjClrs $ ocs p}
  32
+        filterObjClrs = filter $ \o -> not $ null (pars o) || null (chdn o)
49  src/SP/Redirect.hs
... ...
@@ -0,0 +1,49 @@
  1
+{-# OPTIONS_GHC -fno-warn-missing-fields #-}
  2
+module SP.Redirect where
  3
+
  4
+import Control.Arrow (first)
  5
+import Control.DeepSeq
  6
+import Data.List.Stream
  7
+import Data.Maybe
  8
+import Data.HashMap.Lazy as HashMap hiding (filter, map)
  9
+import qualified Data.IntMap as IntMap
  10
+import SP.Cluster
  11
+import SP.DeepSeq
  12
+import Prelude hiding (filter,map,notElem,unzip,(++))
  13
+
  14
+-- | Redirects object clusters and argument clusters in a partion
  15
+-- given a map from object clusters to delete, to new ones.
  16
+redirect :: Partition -> HashMap ObjectCluster ObjectCluster -> Partition
  17
+redirect ptn objMap = newPtn `deepseq` newPtn 
  18
+  where
  19
+  newPtn = ptn {ocs = nub $ elems objMapDeep, acs = map aUpdate $ acs ptn}
  20
+--new, del, keep :: [ObjectCluster]
  21
+--delNew = unzip $ toList objMap
  22
+--del = fst delNew -- Object clusters to remove.
  23
+--new = nub $ snd delNew -- New object clusters.
  24
+--keep = filter (`notElem` del) (ocs ptn) -- Old object clusters to keep.
  25
+  
  26
+  -- Maps for old to new object and argument clusters.
  27
+  objMapDeep :: HashMap ObjectCluster ObjectCluster
  28
+  objMapDeep = unionWith merge objMap (fromList $ map toTpl $ ocs ptn) --keep ++ new ++ del)
  29
+    where toTpl o = (o, oUpdate o); merge o _ = oUpdate o
  30
+  argMap = fromList $ map (\a -> (a, aUpdate a)) (acs ptn)
  31
+  
  32
+  -- Redirect an object cluster.
  33
+  oUpdate :: ObjectCluster -> ObjectCluster
  34
+  oUpdate o@(ObjectCluster {pars = pars, chdn = chdn, sbls = sbls}) = 
  35
+    o {pars = itUpdate pars, chdn = itUpdate chdn, sbls = itUpdate sbls}
  36
+    where itUpdate = map get -- = map first aUpdate
  37
+          get it = (lookup (fst it) argMap, snd it)
  38
+  
  39
+  lookup k = fromJust.HashMap.lookup k
  40
+
  41
+  -- Redirect an argument cluster.
  42
+  aUpdate :: ArgumentCluster -> ArgumentCluster
  43
+  aUpdate a@(ArgumentCluster {parMap = parMap, chdMap = chdMap}) = 
  44
+    a {parMap = amUpd parMap, chdMap = amUpd chdMap}
  45
+    where amUpd m = IntMap.fromList $ map itUpdate (IntMap.toList m)
  46
+          itUpdate (id,inc) = (ocId $ lookup oKey objMapDeep, inc)
  47
+            where oKey = ObjectCluster {ocId = id}
  48
+  aUpdate (D2ArgumentCluster x y) = D2ArgumentCluster (aUpdate x) (aUpdate y)
  49
+  
158  src/SP/Score.hs
... ...
@@ -1,158 +0,0 @@
1  
-module SP.Score where
2  
-
3  
-import Control.Parallel
4  
-import Control.Parallel.Strategies
5  
-import Data.List
6  
-import Data.List.Extras (argmax, argmaxes)
7  
-import Data.Ord (comparing)
8  
-import SP.Cluster
9  
-
10  
--- | A generic score.
11  
-data Scr = Scr {objScr::ObjScr, argScrs::[ArgScr], u1,u2::[ArgClr]} 
12  
-  deriving (Show, Eq) 
13  
-
14  
--- | An operator score.
15  
-data OpScr = MrgScr {val::Double, scr::Scr}
16  
-           | AbsScr {val::Double, scr::Scr}
17  
-           | PrnResScr {val::Double, scr::Scr}
18  
-           | AddChdScr {val::Double, scr::Scr, parOc,chdOc::ObjClr}
19  
-           deriving (Show, Eq)
20  
-
21  
--- | Object score.
22  
-data ObjScr = ObjScr {objScrVal::Double, oc1, oc2::ObjClr} 
23  
-  deriving (Show, Eq)
24  
-
25  
--- | Argument score.
26  
-data ArgScr = ArgScr {argScrVal::Double, ac1, ac2::ArgClr} 
27  
-  deriving (Show, Eq)
28  
-
29  
-sOc1 = oc1 . objScr
30  
-sOc2 = oc2 . objScr
31  
-
32  
--- | Calculates the similarity between two argument clusters.
33  
-cmpac :: ArgClr -> ArgClr -> Double
34  
-cmpac (AdjArgClr _ _ xs _) (AdjArgClr _ _ ys _) = cmpAdjArgs xs ys
35  
-cmpac (RmtArgClr _ _ xs _) (RmtArgClr _ _ ys _) = cmpRmtArgs xs ys
36  
-cmpac _                     _                   = 0
37  
-
38  
--- | Calculates the similarity between two lists of adjacent arguments.
39  
-cmpAdjArgs :: [AdjArg] -> [AdjArg] -> Double
40  
-cmpAdjArgs xs ys = 0.5 * cmpFld rel xs ys + 0.5 * cmpFld obj xs ys
41  
-
42  
--- | Calculates the similarity between two lists of remote arguments.
43  
-cmpRmtArgs :: [RmtArg] -> [RmtArg] -> Double
44  
-cmpRmtArgs xs ys = scr intRel + scr rmtRel + scr intObj + scr rmtObj 
45  
-  where scr fld = 0.25 * cmpFld fld xs ys
46  
-
47  
--- | Calculates the similarity between two dependencies with respect to a 
48  
--- | field.
49  
-cmpFld :: (Eq b) => (a -> b) -> [a] -> [a] -> Double
50  
-cmpFld f xs ys = 
51  
-  if null uniques 
52  
-    then 0.0 
53  
-    else sum [1.0 - abs (freq f v xs - freq f v ys) | v <- uniques] 
54  
-           / genericLength uniques
55  
-  where uniques = unique f xs ys
56  
-
57  
--- | The frequency of the value v in the dependencies xs.
58  
-freq :: (Eq b) => (a -> b) -> b -> [a] -> Double
59  
-freq f v xs = genericLength (filter (\x -> f x == v) xs) / genericLength xs
60  
-
61  
--- | The unique values of the property, transformed by f, in the dependencies 
62  
--- in xs, ys.
63  
-unique :: (Eq b) => (a -> b) -> [a] -> [a] -> [b]
64  
-unique f xs ys = nub (map f xs `intersect` map f ys)
65  
-
66  
--- | Get the best combination of argument scores. Greedy search.
67  
-bestArgScrs :: ObjClr -> ObjClr -> [ArgScr]
68  
-bestArgScrs oc1 oc2 = takeBest ac1 ac2 $ sortBy (key argScrVal) allScrs
69  
-  where
70  
-    allScrs = scrFor parArgClrs ++ scrFor chdArgClrs ++ scrFor sblArgClrs 
71  
-    scrFor fld = allArgScrs (fld oc1) (fld oc2)
72  
-
73  
--- | Get the best combination of scores among the supplied scores.
74  
--- Input must be sorted descending by score value.
75  
-takeBest :: (Eq s, Eq c) => (s -> c) -> (s -> c) -> [s] -> [s]
76  
-takeBest _  _  []     = []
77  
-takeBest f1 f2 (s:ss) = s : takeBest f1 f2 (filter (indep f1 f2 s) ss)
78  
-
79  
--- | Returns true if the specified scores are independent.
80  
--- f1 and f2 are accessor methods for the clusters of the scores.
81  
-indep :: (Eq s, Eq c) => (s -> c) -> (s -> c) -> s -> s -> Bool
82  
-indep f1 f2 s1 s2 = null $ intersect [f1 s1, f2 s1] [f1 s2, f2 s2]
83  
-
84  
--- | Key for descending sort of scores.
85  
-key :: (a -> Double) -> a -> a -> Ordering
86  
-key f s1 s2 = comparing negate (f s1) (f s2)
87  
-
88  
--- | Get all argument scores.
89  
-allArgScrs :: [ArgClr] -> [ArgClr] -> [ArgScr]
90  
-allArgScrs xs ys = zipWith3 ArgScr (zipWith cmpac xs ys) xs ys
91  
-
92  
--- | Create an operator score for two object clusters.
93  
-scrObjClr :: ObjClr -> ObjClr -> OpScr
94  
-scrObjClr c1 c2 = toOpScr $ Scr os ass (fuacss c1) (fuacss c2)
95  
-  where
96  
-    ass = bestArgScrs c1 c2
97  
-    os = ObjScr (cmpFld form (ocParts c1) (ocParts c2)) c1 c2
98  
-    -- Filter Unmerged Argument Cluster ScoreS.
99  
-    fuacss c = filter (`notElem` uacss) $ ocArgClrs c
100  
-    uacss = concatMap (\as -> [ac1 as, ac2 as]) ass
101  
-
102  
-scrVal :: Scr -> (Double, Double)
103  
-scrVal s = (objScrVal $ objScr s, if argScrSize == 0 then 0 else avgArgScr)
104  
-  where avgArgScr = sum (map argScrVal $ argScrs s) / argScrSize
105  
-        argScrSize = genericLength $ argScrs s
106  
-
107  
--- | Calculate the best scores such that their object clusters are independent.
108  
-bestScrs :: [ObjClr] -> [OpScr]
109  
-bestScrs xs = takeBest (oc1 . objScr . scr) (oc2 . objScr . scr) $ 
110  
-                filter (\s -> val s > 0.0) $ 
111  
-                  argmaxes val (allScrs xs)
112  
-    
113  
--- | Calculate all scores. Parallel.
114  
-allScrs :: [ObjClr] -> [OpScr]
115  
-allScrs xs = parMap rpar f [(x,y) | x <- xs, y <- xs, x /= y]
116  
-  where f = uncurry scrObjClr
117  
-
118  
--- | Converts a generic score to an operator score.
119  
-toOpScr :: Scr -> OpScr
120  
-toOpScr s = argmax val opScrs
121  
-  where
122  
-    -- Score value tuple.
123  
-    sv = scrVal s
124  
-    -- Object Score Value.
125  
-    osv = fst sv
126  
-    -- Argument Score Value.
127  
-    asv = snd sv
128  
-    -- Length of cluster groups, and their sum.
129  
-    tot = x + y + z
130  
-    x = gl argScrs; y = gl u1; z = gl u2; gl f = genericLength $ f s
131  
-    -- Score for abstracting: 1 - the weighted coefficient of variation.
132  
-    -- No abs needed for mean in the coefficient of variation, since mean 
133  
-    -- currently always is positive.
134  
-    absScrVal = (1.0 - sqrt varW / mean) * asv / 2.0
135  
-      where
136  
-        mean = tot / 3.0
137  
-        varW = ((x - mean)^2 + (y - mean)^2 + (z - mean)^2) / 6.0
138  
-    -- Score for merging.
139  
-    -- The share of merged arg. clusters.
140  
-    mrgScrVal = (osv + x / tot * asv) / 2.0
141  
-    -- Create add child scores.
142  
-    dcv = cv x y - cv x z
143  
-      where
144  
-        -- Adjusted coefficient of variation for two samples.
145  
-        cv x1 x2 = sqrt( ( (x1 - mean)^2 + (x2 - mean)^2) / 6.0 ) / mean
146  
-          where mean = (x1 + x2) / 2.0
147  
-    -- List of add child scores.
148  
-    addChdScrs = 
149  
-      [AddChdScr ((1 - abs dcv) * asv / 2.0) s oc1 oc2 | y>x && x>z] ++
150  
-      [AddChdScr ((1 - abs dcv) * asv / 2.0) s oc2 oc1 | z>x && x>y] ++
151  
-      [AddChdScr (dcv * asv / 2.0)           s oc1 oc2 | dcv > 0 && y>x] ++
152  
-      [AddChdScr (dcv * asv / 2.0)           s oc2 oc1 | dcv > 0 && x>y] ++
153  
-      [AddChdScr ((-dcv) * asv / 2.0)        s oc1 oc2 | dcv < 0 && z<x] ++
154  
-      [AddChdScr ((-dcv) * asv / 2.0)        s oc2 oc1 | dcv < 0 && x<z]
155  
-      where oc1 = sOc1 s; oc2 = sOc2 s
156  
-    -- Operator score candidates.
157  
-    opScrs = MrgScr mrgScrVal s : AbsScr absScrVal s : addChdScrs
158  
-
102  src/SP/Score/Argument.hs
... ...
@@ -0,0 +1,102 @@
  1
+{-# LANGUAGE TypeSynonymInstances #-}
  2
+module SP.Score.Argument (bestArgumentScores) where
  3
+
  4
+import Data.Function
  5
+import Data.Hashable
  6
+import Data.HashMap.Lazy (keys, lookupDefault)
  7
+import Data.IntMap (intersectionWith, elems)
  8
+import Data.Maybe
  9
+import Data.Ord
  10
+import qualified Data.Set as Set
  11
+import SP.ByteString
  12
+import SP.Cluster
  13
+import SP.Score.Math
  14
+import SP.Score.Score
  15
+import Data.List.Stream -- (find, foldl', intersect, sortBy)
  16
+import Prelude hiding (any,take,head,null,sum,(++),filter,foldr,length)
  17
+
  18
+-- | Degree 2 argument scores.
  19
+argumentScore :: (ArgumentCluster -> ObjectMap) -> ArgumentCluster -> 
  20
+                 ArgumentCluster -> (Incidence -> Incidence -> ArgumentScore)
  21
+argumentScore m ai aj = ArgumentScore value ai aj
  22
+  where
  23
+  value = (srel + sobj) / 2
  24
+
  25
+  srel, sobj:: Double
  26
+  srel = meanBy (\k -> 1 - abs (get k ai - get k aj)) ks
  27
+    where ks = (intersect `on` (keys.relMap)) ai aj
  28
+          get k ak = lookupDefault 0 k $ relMap ak
  29
+  --srel | null ks   = 0
  30
+  --     | otherwise = --1 - 1 / (fromIntegral.length) ks * 
  31
+                     --foldl' (\r k -> r + abs (get k ai - get k aj)) 0 ks
  32
+
  33
+  sobj = mean es -- | null es   = 0
  34
+    where es = elems $ intersectionWith (\x y -> 1 - abs(x - y)) (m ai) (m aj)
  35
+       -- | otherwise = --1 - 1 / cardinality es * sum es
  36
+       -- 1 - mean es
  37
+
  38
+-- | Degree 2 argument scores. Currently a bit tailored to siblings.
  39
+d2ArgumentScore :: [ArgumentScore] -> ArgumentCluster -> ArgumentCluster -> 
  40
+                   (Incidence -> Incidence -> ArgumentScore)
  41
+d2ArgumentScore ss ai aj = ArgumentScore value ai aj 
  42
+  where value = hmean2 d1ScrVal d2ScrVal
  43
+        d2ScrVal = argScrVal $ argumentScore chdMap (acSnd ai) (acSnd aj) 0 0
  44
+        d1ScrVal = toVal $ find matches ss -- Find degree 1 scores.
  45
+          where matches s = a1 s == acFst ai && a2 s == acFst aj
  46
+                toVal (Just s) = argScrVal s
  47
+                toVal Nothing  = 0
  48
+
  49
+-- | All argument scores valued above zero. 
  50
+argumentScores :: ObjectCluster -> ObjectCluster -> [ArgumentScore]
  51
+argumentScores oi oj = parScrs ++ chdScrs ++ sblScrs
  52
+  where
  53
+  parScrs = as parMap pars; chdScrs = as chdMap chdn; sblScrs = as2 sbls
  54
+  -- Degree 1 argument scores.
  55
+  as m = scores $ argumentScore m
  56
+  -- Degree 2 argument scores.
  57
+  as2 = scores $ d2ArgumentScore parScrs
  58
+  -- Builds scores using score function scrFun, and fetches the argument
  59
+  -- clusters from the incidence lists accessed by iacc, filter for values > 0.
  60
+  scores scrFun iacc = [score | ih <- iacc oi, ik <- iacc oj, 
  61
+                        let score = scrFun (fst ih) (fst ik) (snd ih) (snd ik),
  62
+                        argScrVal score > 0]
  63
+
  64
+-- | Greedy search for the best combination of argument scores.
  65
+bestArgumentScores :: ObjectCluster -> ObjectCluster -> [ArgumentScore]
  66
+bestArgumentScores oi oj = align sorted
  67
+  where
  68
+  -- Descending sort of all argument scores valued above 0.
  69
+  sorted = sortBy (comparing (negate.argScrVal)) (argumentScores oi oj)
  70
+--align = foldr (\n r -> n:filter (isIndependentOf n) r) []
  71
+  align = let f r n | any (isDependentOn n) r = r
  72
+                    | otherwise               = n:r
  73
+          in foldl' f []
  74
+       
  75
+--3 funkar
  76
+--align :: [ArgumentScore] -> [ArgumentScore]
  77
+--align = fst.foldl' f ([], Set.empty)
  78
+--  where f :: ([ArgumentScore], Set.Set ArgumentCluster) -> ArgumentScore -> 
  79
+--             ([ArgumentScore], Set.Set ArgumentCluster)
  80
+--        f (as,s) n | Set.member (a1 n) s || Set.member (a2 n) s = (as,s)
  81
+--                   | otherwise = (n:as, Set.insert (a1 n).Set.insert (a2 n) $ s)
  82
+--1 funkar
  83
+--align (score:scores) = score:align (filter (isIndependentOf score) scores)
  84
+--align []             = []
  85
+--2 funkar ej
  86
+--align (score:scores) = foldr construct [score] scores
  87
+--  where construct n r = if n `isIndependentOf` head r then n:r else r
  88
+--align []             = []
  89
+
  90
+-- | Independence of argument scores.
  91
+isIndependentOf :: ArgumentScore -> ArgumentScore -> Bool
  92
+isIndependentOf ArgumentScore {a1 = ai, a2 = aj} 
  93
+                ArgumentScore {a1 = ak, a2 = al} = ai /= ak && aj /= al
  94
+
  95
+-- | Independence of argument scores.
  96
+isDependentOn :: ArgumentScore -> ArgumentScore -> Bool
  97
+isDependentOn ArgumentScore {a1 = ai, a2 = aj} 
  98
+              ArgumentScore {a1 = ak, a2 = al} = ai == ak || aj == al
  99
+
  100
+instance Ord ArgumentCluster where