In [43]:
import qualified Data.Text as T

import Control.Monad
import Control.Monad.Bayes.Class 
import Control.Monad.Bayes.Sampler
import Control.Monad.Bayes.Traced
import Control.Monad.Bayes.Weighted

import Graphics.Vega.VegaLite hiding (density)
import qualified Graphics.Vega.VegaLite as VL
import IHaskell.Display.Hvega (vlShow)

:e OverloadedStrings
:e BlockArguments



Inspired by the tutorials on probabilistic programming language Gen (https://www.gen.dev/tutorials/iterative-inference/tutorial), we'll use the example of a regression with outliers. The idea is that each datapoint $(x,y)$ has $y$ either linearly dependent on $x$, or randomly sampled (an outlier). So the goal of inference is to *jointly* work out what the linear relationship is and which points flout it.

In [68]:
paramPrior = do
    slope <- normal 0 2
    intercept <- normal 0 2
    noise <- gamma 1 1
    prob_outlier <- uniform 0 0.5 
    return (slope, intercept, noise, prob_outlier)

linear slope intercept x = x*slope + intercept

forward (slope, intercept, noise, probOutlier) x = do
    isOutlier <- bernoulli probOutlier
    let meanParams = if isOutlier
                    then (0, 20)
                    else (linear slope intercept x, noise)
    return (meanParams, isOutlier)

regressionWithOutliersData :: (MonadSample m, Traversable t) => t Double -> m (t (Double, Bool))
regressionWithOutliersData xs = do
    params <- paramPrior

    forM xs \x -> do
        ((mu, std), isOutlier) <- forward params x
        y <- normal mu std
        return (y, isOutlier)

This is our model. It describes a process for getting $y$ from $x$. Specifically, you start by drawing values for the slope $s$, bias $b$ and noise $n$. Then for each input $x$, you flip a coin. If it lands one way, you draw a $y$ value from a normal with mean $x*slope + bias$ and std $n$, and otherwise you draw from a centered normal with large variance.

Given a list of $x$ values, this gives a distribution over lists of $y$ values, from which we can sample:




In [63]:
range = [-10,-9.9..10] :: [Double]
samples <- sampleIOfixed $ regressionWithOutliersData range

In [64]:
baseData = dataFromColumns [ ]
  . dataColumn "X" (Numbers range)
  . dataColumn "Y" (Numbers (fst <$> samples))
  . dataColumn "Outlier" (Strings (T.pack . show . snd <$> samples))

baseEncoding = encoding
                    . position X [ PName "X" ]
                    . position Y [ PName "Y" ]
                    . color [ MName "Outlier"]

showPlot enc dat = vlShow $ toVegaLite [ 
            dat [],
            mark Point []
              , enc []
              , width 200
              , height 200
              ]

showPlot baseEncoding baseData

This is our dataset, with outliers shown in orange.

Given this dataset of $(x,y)$ pairs, we now consider the problem of inference. That is, we want to infer the slope, bias, noise and *for each datapoint*, whether it's an outlier.

Our first attempt will be to average a large number of samples, weighted by how likely they are. TODO: add the conditioning


Why did this result in a bad guess? Simply because there are so many more bad guesses than good ones, so that even if you weight the good guesses higher, if you have almost none, then it doesn't matter.

The classic solution to this problem (the solution space is too large for independent sampling) is to use a Markov Chain method, where each sample depends on the last. You can do so in a clever way, to obtain unbiased samples from the true posterior distribution in the limit of a sufficiently long chain. Monad-bayes implements the standard Markov Chain Monte Carlo approach with Metropolis Hastings transitions.

Here's how it goes:

In [174]:
regressionWithOutliers :: (MonadSample m, MonadCond m) =>
    [Double] -> [Double] -> m ((Double, Double, Double, Double), [Bool])
regressionWithOutliers xs ys = do
    params <- paramPrior
    
    outliers <- forM (zip xs ys) \(x, y) -> do
        ((mu, std), isOutlier) <- forward params x
        factor $ normalPdf mu std y
        return isOutlier
    return (params, outliers)

In [74]:
mhRuns <- sampleIOfixed $ prior $ mh 10000 $ regressionWithOutliers range (fst <$> samples)




In [75]:


-- countOutliers :: [[Bool]] -> [(Int, Int)]
countOutliers = foldr 
    (\(_,lb) li -> 
        [ if b then (num1+1, num2) else (num1,num2+1) | (b,(num1, num2)) <- zip lb li]) 
    (Prelude.repeat (0,0))


In [76]:
predData = baseData . 
  dataColumn "Outlier Prediction" 
      (Numbers ((\(x, y) -> log (fromIntegral y / (fromIntegral x+1))) 
        <$> take 9000 (countOutliers mhRuns)))
  
predEncoding = baseEncoding . color [ MName "Outlier Prediction", VL.MmType VL.Quantitative]
showPlot predEncoding predData

Running MCMC gives us a list of samples. The graph displays points as more opaque according to how many samples consider that point to not be an outlier. The results make sense: points that are very near the line are opaque and ones very far are transparent. 

It would be nice to make our approach more sample efficient though. The key to that is to choose a proposal distribution more cleverly. Again, we follow the approaches in Gen.

In [184]:
import Control.Monad.Bayes.Population
import Control.Monad.Bayes.Inference.RMSMC
smcRuns <- sampleIOfixed $ runPopulation $ rmsmc 200 10 100 $ regressionWithOutliers range (fst <$> samples)




In [145]:
import Data.Function
import Data.List
import Data.Ord


In [186]:
last smcRuns

(((-3.999910332767723,1.2421556781150982,1.51073704384082,0.17345226725403168),[True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False,False,False,False,False,False,False,False,True,False,True,False,True,True,True,True,True,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,

In [129]:
a = undefined :: Log Double

b = ln . exp $ a 

:t a

In [194]:
import Numeric.Log

countOutliersWithWeight :: [((a, [Bool]), Log Double)] -> [(Double, Double)]
countOutliersWithWeight = foldr 
    (\((_,lb),w) li -> 
        [ if b then (num1+ 1, num2) else (num1,num2+ 1) | ((b),(num1, num2)) <- zip lb li]) 
    (Prelude.repeat (0,0))

predData = baseData . 
  dataColumn "Outlier Prediction" 
      -- (Booleans $ (\((_,s),_) -> s) (maximumBy (compare `on` (snd)) smcRuns))
      (Booleans ((\(x, y) -> ( if x > y then False else True) )
        <$> (countOutliers (fst <$> smcRuns))))
  
predEncoding = baseEncoding . color [ MName "Outlier Prediction", VL.MmType VL.Quantitative]
showPlot predEncoding predData

# Particle Marginal Metropolis Hastings