Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[#64] Copy paste detection in lists #102

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ default-extensions:
- ConstraintKinds
- DataKinds
- DefaultSignatures
- DeriveAnyClass
- DeriveDataTypeable
- DeriveGeneric
- DerivingStrategies
Expand Down
52 changes: 30 additions & 22 deletions src/Xrefcheck/Core.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ import Control.Lens (makeLenses)
import Data.Aeson (FromJSON (..), withText)
import Data.Char (isAlphaNum)
import Data.Char qualified as C
import Data.DList (DList)
import Data.DList qualified as DList
import Data.Default (Default (..))
import Data.List qualified as L
import Data.Map qualified as M
Expand All @@ -26,8 +28,6 @@ import Text.Numeral.Roman (toRoman)

import Xrefcheck.Progress
import Xrefcheck.Util
import Data.DList (DList)
import Data.DList qualified as DList

-----------------------------------------------------------
-- Types
Expand All @@ -40,15 +40,10 @@ import Data.DList qualified as DList
data Flavor
= GitHub
| GitLab
deriving stock (Show)
deriving stock (Show, Enum, Bounded)

allFlavors :: [Flavor]
allFlavors = [GitHub, GitLab]
where
_exhaustivenessCheck = \case
GitHub -> ()
GitLab -> ()
-- if you update this, also update the list above
allFlavors = [minBound.. maxBound]

instance FromJSON Flavor where
parseJSON = withText "flavor" $ \txt ->
Expand All @@ -62,6 +57,7 @@ instance FromJSON Flavor where
-- representation of this thing, and it actually appears in reports only.
newtype Position = Position (Maybe Text)
deriving stock (Show, Eq, Generic)
deriving anyclass NFData

instance Buildable Position where
build (Position pos) = case pos of
Expand All @@ -77,7 +73,9 @@ data Reference = Reference
, rAnchor :: Maybe Text
-- ^ Section or custom anchor tag.
, rPos :: Position
} deriving stock (Show, Generic)
}
deriving stock (Show, Generic)
deriving anyclass NFData

-- | Context of anchor.
data AnchorType
Expand All @@ -88,35 +86,51 @@ data AnchorType
| BiblioAnchor
-- ^ Id of entry in bibliography
deriving stock (Show, Eq, Generic)
deriving anyclass NFData

-- | A referable anchor.
data Anchor = Anchor
{ aType :: AnchorType
, aName :: Text
, aPos :: Position
} deriving stock (Show, Eq, Generic)
}
deriving stock (Show, Eq, Generic)
deriving anyclass NFData

data CopyPaste = CopyPaste
{ cpAnchorText :: Text
Copy link
Contributor

@alyoanton9 alyoanton9 Nov 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like AnchorText and PlainText sound confusing. The issue doesn't focus on anchors, but all type of links, external and internal. Perhaps, more generic LinkText and URLText would be better?

UPD: Okay, I'm not sure if all the links were intended to check, as there is only example with files in issue description. But it's also not clear that this feature is for files only

, cpPlainText :: Text
, cpPosition :: Position
}
deriving stock (Show, Eq, Generic)
deriving anyclass NFData

data FileInfoDiff = FileInfoDiff
{ _fidReferences :: DList Reference
, _fidAnchors :: DList Anchor
, _fidCopyPastes :: DList CopyPaste
}
makeLenses ''FileInfoDiff

diffToFileInfo :: FileInfoDiff -> FileInfo
diffToFileInfo (FileInfoDiff refs anchors) =
FileInfo (DList.toList refs) (DList.toList anchors)
diffToFileInfo (FileInfoDiff refs anchors pastas) =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: I believe here should be pastes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🍝

FileInfo (DList.toList refs) (DList.toList anchors) (DList.toList pastas)

instance Semigroup FileInfoDiff where
FileInfoDiff a b <> FileInfoDiff c d = FileInfoDiff (a <> c) (b <> d)
FileInfoDiff a b e <> FileInfoDiff c d f = FileInfoDiff (a <> c) (b <> d) (e <> f)

instance Monoid FileInfoDiff where
mempty = FileInfoDiff mempty mempty
mempty = FileInfoDiff mempty mempty mempty

-- | All information regarding a single file we care about.
data FileInfo = FileInfo
{ _fiReferences :: [Reference]
, _fiAnchors :: [Anchor]
} deriving stock (Show, Generic)
, _fiCopyPastes :: [CopyPaste]
}
deriving stock (Show, Generic)
deriving anyclass NFData

makeLenses ''FileInfo

instance Default FileInfo where
Expand All @@ -129,12 +143,6 @@ newtype RepoInfo = RepoInfo (Map FilePath FileInfo)
-- Instances
-----------------------------------------------------------

instance NFData Position
instance NFData Reference
instance NFData AnchorType
instance NFData Anchor
instance NFData FileInfo

instance Buildable Reference where
build Reference{..} =
nameF ("reference " +| paren (build loc) |+ " " +| rPos |+ "") $
Expand Down
57 changes: 49 additions & 8 deletions src/Xrefcheck/Scanners/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@ module Xrefcheck.Scanners.Markdown
import Universum

import CMarkGFM (Node (..), NodeType (..), PosInfo (..), commonmarkToNode)
import Control.Lens hiding ((^?))
import Control.Monad.Except (MonadError, throwError)
import Data.Aeson.TH (deriveFromJSON)
import Data.ByteString.Lazy qualified as BSL
import Data.Char (isAlpha)
import Data.DList qualified as DList
import Data.Default (def)
import Data.List (isSubsequenceOf)
import Data.Text qualified as T
import Data.Text.Lazy qualified as LT
import Fmt (Buildable (..), blockListF, nameF, (+|), (|+))
Expand All @@ -37,6 +40,8 @@ data MarkdownConfig = MarkdownConfig

deriveFromJSON aesonConfigOption ''MarkdownConfig

makePrisms ''NodeType

defGithubMdConfig :: MarkdownConfig
defGithubMdConfig = MarkdownConfig
{ mcFlavor = GitHub
Expand Down Expand Up @@ -130,7 +135,7 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
else case removeIgnored input of
Left err -> throwError err
Right relevant ->
diffToFileInfo <$> foldNode extractor relevant
diffToFileInfo <$> foldNode (merge [extractor, copyPaste]) relevant

where
extractor :: Node -> m FileInfoDiff
Expand All @@ -144,17 +149,19 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
let aType = HeaderAnchor lvl
let aName = headerToAnchor flavor $ nodeExtractText node
let aPos = toPosition pos
return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos}
return mempty
{ _fidAnchors = DList.singleton $ Anchor {aType, aName, aPos}
}

HTML_INLINE text -> do
let mName = T.stripSuffix "\">" =<< T.stripPrefix "<a name=\"" text
case mName of
Just aName -> do
let aType = HandAnchor
aPos = toPosition pos
return $ FileInfoDiff
mempty
(pure $ Anchor {aType, aName, aPos})
return mempty
{ _fidAnchors = DList.singleton $ Anchor {aType, aName, aPos}
}

Nothing -> do
return mempty
Expand All @@ -167,12 +174,46 @@ nodeExtractInfo input@(Node _ _ nSubs) = do
[t] -> (t, Nothing)
t : ts -> (t, Just $ T.intercalate "#" ts)
[] -> error "impossible"
return $ FileInfoDiff
(DList.singleton $ Reference {rName, rPos, rLink, rAnchor})
DList.empty
return mempty
{ _fidReferences = DList.singleton $ Reference {rName, rPos, rLink, rAnchor}
}

_ -> return mempty

copyPaste :: Node -> m FileInfoDiff
copyPaste (Node _ (LIST _) nodes) = do
case items of
top : rest | urlIsASubsequence top -> do
let bad = filter (not . urlIsASubsequence) rest
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the acceptance criteria, we want to report copy-paste if there are two links [T1](L1) and [T2](L1) in a file, and T1 is substring of L1 modulo casing and all the non-alphanum characters, while T2 is not substring of L1 modulo the same things;

However, AFAIU here you check that the first item satisfy the law -- T1 is a subsequence of L1 -- and report all other links from the list that doesn't satisfy as invalid, so [Tn](Ln) would also be reported if Tn isn't a subsequence of Ln, e.g.:

- [Foo Bar](foo-bar) e
- [Foo Qux](foo-qux) e
- [Foo Kek](foo-kek) e
- [Just text](file) e

produces

  ➥  In file tests/markdowns/without-annotations/copy/copy-paste.md
     bad reference (local) at src:18:4-12:
       - text: ""
       - link:
       - anchor: -

     ⛂  Possibly incorrect copy-paste in list with references
        the url is file
           but the text is Just text

I am not sure if we need to consider such list items as bad copy-paste, I believe it would be better to don't take them into account at all and report only those which strictly satisfy the law from acceptance criteria -- there are two links [T1](L1) and [T2](L1) in a file, and T1 is substring of L1 modulo casing and all the non-alphanum characters, while T2 is not substring of L1 modulo the same things;

pure mempty { _fidCopyPastes = DList.fromList bad }
_ -> do
pure mempty
where
items = do
(_, nodes', _) <- takeOnly _ITEM nodes
(_, nodes'', _) <- takeOnly _PARAGRAPH nodes'
take 1 $ do
(_, texts, (url, _)) <- takeOnly _LINK nodes''
(pos, _, txt) <- take 1 $ takeOnly _TEXT texts
return (CopyPaste url txt (toPosition pos))

copyPaste _ = pure mempty

takeOnly prizm list = do
Node pos hdr nodes <- list
case hdr^?prizm of
Just res -> return (pos, nodes, res)
Nothing -> []

urlIsASubsequence :: CopyPaste -> Bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: AFAICS, here we check if link text is a subsequence of link URL, so if name is a URL subsequence, not URL is a subsequence of name

urlIsASubsequence paste =
gist (cpAnchorText paste) `isSubsequenceOf` gist (cpPlainText paste)
where
gist = T.unpack . T.toLower . T.filter isAlpha

merge :: (Monad m, Monoid b) => [a -> m b] -> a -> m b
merge fs a = mconcat <$> traverse ($ a) fs

checkIgnoreFile :: [Node] -> Bool
checkIgnoreFile nodes =
let isSimpleComment :: Node -> Bool
Expand Down
25 changes: 19 additions & 6 deletions src/Xrefcheck/Verify.hs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import Universum
import Control.Concurrent.Async (wait, withAsync)
import Control.Exception (throwIO)
import Control.Monad.Except (MonadError (..))
import Data.Bits (toIntegralSized)
import Data.ByteString qualified as BS
import Data.Map qualified as M
import Data.Text qualified as T
Expand All @@ -49,7 +50,6 @@ import Text.Regex.TDFA.Text (Regex, regexec)
import Text.URI (Authority (..), URI (..), mkURI)
import Time (RatioNat, Second, Time (..), ms, threadDelay, timeout)

import Data.Bits (toIntegralSized)
import Xrefcheck.Config
import Xrefcheck.Core
import Xrefcheck.Orphans ()
Expand All @@ -70,9 +70,7 @@ deriving newtype instance Semigroup (VerifyResult e)
deriving newtype instance Monoid (VerifyResult e)

instance Buildable e => Buildable (VerifyResult e) where
build vr = case verifyErrors vr of
Nothing -> "ok"
Just errs -> listF errs
build vr = maybe "ok" listF (verifyErrors vr)

verifyOk :: VerifyResult e -> Bool
verifyOk (VerifyResult errors) = null errors
Expand Down Expand Up @@ -114,6 +112,7 @@ data VerifyError
| ExternalFtpException FTPException
| FtpEntryDoesNotExist FilePath
| ExternalResourceSomeError Text
| PossiblyIncorrectCopyPaste Text Text
deriving stock (Show, Eq)

instance Buildable VerifyError where
Expand Down Expand Up @@ -156,10 +155,15 @@ instance Buildable VerifyError where
"⛂ FTP exception (" +| err |+ ")\n"

FtpEntryDoesNotExist entry ->
"⛂ File or directory does not exist:\n" +| entry |+ "\n"
"⛂ File or directory does not exist:\n" +| entry |+ "\n"

ExternalResourceSomeError err ->
"⛂ " +| build err |+ "\n\n"

PossiblyIncorrectCopyPaste url text ->
"⛂ Possibly incorrect copy-paste in list with references\n" +|
" the url is " +| build url |+ "\n " +|
" but the text is " +| build text |+ "\n\n"
where
anchorHints = \case
[] -> "\n"
Expand Down Expand Up @@ -219,10 +223,19 @@ verifyRepo

progressRef <- newIORef $ initVerifyProgress (map snd toScan)

errorss <- for (M.toList repoInfo) $ \(file, info) -> do
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and below

Suggested change
errorss <- for (M.toList repoInfo) $ \(file, info) -> do
errors <- for (M.toList repoInfo) $ \(file, info) -> do

let pasta = _fiCopyPastes info
return
$ VerifyResult
$ fmap (\(CopyPaste url txt pos) ->
WithReferenceLoc file (Reference "" "" Nothing pos)
$ PossiblyIncorrectCopyPaste url txt)
pasta

accumulated <- withAsync (printer progressRef) $ \_ ->
forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) ->
verifyReference config mode progressRef repoInfo' root file ref
return $ fold accumulated
return $ fold errorss <> fold accumulated
where
printer progressRef = forever $ do
readIORef progressRef >>= reprintAnalyseProgress rw mode
Expand Down
24 changes: 24 additions & 0 deletions tests/Test/Xrefcheck/CopyPasteInListsSpec.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{- SPDX-FileCopyrightText: 2019 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-}

module Test.Xrefcheck.CopyPasteInListsSpec where

import Universum

import Test.Hspec (Spec, describe, it, shouldBe)

import Test.Xrefcheck.Util
import Xrefcheck.Core

spec :: Spec
spec = do
describe "Possibly incorrect copy-paste" $ do
for_ allFlavors $ \fl -> do
it ("is detected (" <> show fl <> ")") $ do
fi <- getFI fl "tests/markdowns/without-annotations/copy-paste_in_lists.md"
getPasta fi `shouldBe`[("foo-bar","Foo Kek")]
where
getPasta :: FileInfo -> [(Text, Text)]
getPasta fi = map (cpAnchorText &&& cpPlainText) $ fi ^. fiCopyPastes
17 changes: 17 additions & 0 deletions tests/markdowns/without-annotations/copy-paste_in_lists.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!--
- SPDX-FileCopyrightText: 2018-2019 Serokell <https://serokell.io>
-
- SPDX-License-Identifier: MPL-2.0
-->

A list with bad copy-paste:

- [Foo Bar](foo-bar) e
- [Foo Qux](foo-qux) e
- [Foo Kek](foo-bar) e

A list that is completely fine:

- [Foo Bar](foo-bar) e
- [Foo Qux](foo-qux) e
- [Foo Kek](foo-kek) e