In [1]:
import os

if not os.path.exists('data'):
    os.chdir('..')
    assert os.getcwd().endswith('HoardingDisorderScripts')
os.getcwd()

'/home/danim/HoardingDisorderScripts'

# Identifying Duplicate Documents

## Identifying same-name documents

I'm going to start by specifically querying for every document name that appears twice.

In [25]:
from collections import Counter
import utils.datasaur as data


doc_names = [doc.name for doc in data.by_doc]
doc_name_cntr = Counter(doc_names)

duplicate_doc_names = {name : count for name, count in doc_name_cntr.items()
                       if count >= 2}
print(duplicate_doc_names)

{'050_617.txt': 2, '046_565.txt': 2, '046_554.txt': 2, '048_592.txt': 2, '050_610.txt': 2, '049_595.txt': 2, '047_569.txt': 2, '049_597.txt': 2, '050_621.txt': 2, '048_593.txt': 2, '050_614.txt': 2, '046_564.txt': 2, '048_586.txt': 2, '048_585.txt': 2, '046_566.txt': 2, '049_600.txt': 2, '046_562.txt': 2, '046_557.txt': 2, '046_558.txt': 2, '050_611.txt': 2, '047_572.txt': 2, '050_624.txt': 2, '046_556.txt': 2, '047_573.txt': 2, '046_561.txt': 2, '047_575.txt': 2, '048_584.txt': 2, '048_588.txt': 2, '046_563.txt': 2, '049_604.txt': 2, '048_587.txt': 2, '050_612.txt': 2, '049_605.txt': 2, '047_574.txt': 2, '046_567.txt': 2, '047_577.txt': 2, '050_618.txt': 2, '049_596.txt': 2, '050_622.txt': 2, '049_599.txt': 2, '049_607.txt': 2, '050_620.txt': 2, '048_591.txt': 2, '048_583.txt': 2, '047_576.txt': 2, '050_615.txt': 2, '050_613.txt': 2, '050_616.txt': 2, '047_579.txt': 2, '048_594.txt': 2, '047_580.txt': 2, '050_619.txt': 2, '048_582.txt': 2, '048_590.txt': 2, '050_625.txt': 2, '049_603.

So, we see a great deal of document names appear twice, unfortunately. Thankfully, we can see that duplicate document names appear no more than twice:

In [None]:
print(not any(count > 2 for count in duplicate_doc_names.values()))

Here, I'm going to organize documents of the same names into pairs:

In [23]:
from itertools import product


duplicate_docs = [doc for doc in data.by_doc 
                  if doc.name in duplicate_doc_names]
dupdoc_pairs = [(doc1, doc2) for doc1, doc2 in product(duplicate_docs, repeat=2)
                if doc1.name == doc2.name and doc1 is not doc2]
dupdoc_pairs

[(Document(name="050_617.txt", project="s1043-5_s2010-11_s3016-25"),
  Document(name="050_617.txt", project="s1046-50_s2012-13_s3026-50")),
 (Document(name="046_565.txt", project="s1043-5_s2010-11_s3016-25"),
  Document(name="046_565.txt", project="s1046-50_s2012-13_s3026-50")),
 (Document(name="046_554.txt", project="s1043-5_s2010-11_s3016-25"),
  Document(name="046_554.txt", project="s1046-50_s2012-13_s3026-50")),
 (Document(name="048_592.txt", project="s1043-5_s2010-11_s3016-25"),
  Document(name="048_592.txt", project="s1046-50_s2012-13_s3026-50")),
 (Document(name="050_610.txt", project="s1043-5_s2010-11_s3016-25"),
  Document(name="050_610.txt", project="s1046-50_s2012-13_s3026-50")),
 (Document(name="049_595.txt", project="s1043-5_s2010-11_s3016-25"),
  Document(name="049_595.txt", project="s1046-50_s2012-13_s3026-50")),
 (Document(name="047_569.txt", project="s1043-5_s2010-11_s3016-25"),
  Document(name="047_569.txt", project="s1046-50_s2012-13_s3026-50")),
 (Document(name="049

In [26]:
# Sanity check: make sure all pairs actually have the same name
assert all([doc_pair[0].name == doc_pair[1].name for doc_pair in dupdoc_pairs])

## All same-name documents have the same text content

And from the below code, we will see that all of these documents indeed share the same text content.

In [30]:
assert all([doc_pair[0].content == doc_pair[1].content for doc_pair in dupdoc_pairs])
assert all([doc_pair[0].lines == doc_pair[1].lines for doc_pair in dupdoc_pairs])
assert all([doc_pair[0].tokens == doc_pair[1].tokens for doc_pair in dupdoc_pairs])

Note that this is quite a strict requirement for these documents to satisfy. I was expected at least a few documents to not have the *exact* same content, maybe as a result of spacing being slightly different. But no, all of these same-name documents have the exact same text content, so they are in fact duplicates.

Let's see how many there are:

In [35]:
len(dupdoc_pairs)

142

## Checking duplicate document label data

It would be nice if all of these documents happened to have the same label data. However:

In [34]:
print(all([doc_pair[0].label_data == doc_pair[1].label_data for doc_pair in dupdoc_pairs]), 
      all([doc_pair[0].label_counts == doc_pair[1].label_counts for doc_pair in dupdoc_pairs]))

False False


We see that this is not the case.

In [40]:
dupdoc_pairs[0][0].label_data[0]['textPosition']

{'start': {'row': 16, 'column': 0, 'tokenIndex': 0, 'charIndex': 0},
 'end': {'row': 16, 'column': 0, 'tokenIndex': 5, 'charIndex': 9}}

### Request: Checking Missing Documents

As Dr. Moeller requested, I took the `datasaur` folder from from the Google Drive and I looked for any filenames that were not included in the datasaur data that was downloaded from datasaur following adjudication.

In [55]:
from pathlib import Path

# Google Drive document names
gd_docnames = [path.name for path in Path('datasaur').rglob('*.txt')]
print(gd_docnames)

['3001_056.txt', '3001_040.txt', '3001_032.txt', '3001_028.txt', '3001_095.txt', '3001_064.txt', '3001_042.txt', '3001_065.txt', '3001_029.txt', '3001_050.txt', '3001_070.txt', '3001_041.txt', '3001_076.txt', '3001_009.txt', '3001_071.txt', '3001_052.txt', '3001_006.txt', '3001_008.txt', '3001_063.txt', '3001_069.txt', '3001_011.txt', '3001_014.txt', '3001_020.txt', '3001_044.txt', '3001_087.txt', '3001_086.txt', '3001_083.txt', '3001_045.txt', '3001_068.txt', '3001_025.txt', '3001_017.txt', '3001_077.txt', '3001_037.txt', '3001_010.txt', '3001_018.txt', '3001_080.txt', '3001_019.txt', '3001_074.txt', '3001_058.txt', '3001_049.txt', '3001_013.txt', '3001_066.txt', '3001_090.txt', '3001_030.txt', '3001_034.txt', '3001_053.txt', '3001_024.txt', '3001_015.txt', '3001_079.txt', '3001_036.txt', '3001_088.txt', '3001_039.txt', '3001_081.txt', '3001_046.txt', '3001_047.txt', '3001_033.txt', '3001_089.txt', '3001_001.txt', '3001_002.txt', '3001_072.txt', '3001_061.txt', '3001_055.txt', '3001_0

It seems that the only name that got messed up is a file that separated its transcript number from its document number using a hyphen instead of the usual underscore. However, it seems that the mistake got fixed somehow anyway, since the assumed corresponding document is clearly in the imported datasaur data.

In [61]:
ds_docnames = [doc.name for doc in data.by_doc]
print([name for name in gd_docnames if name not in ds_docnames])
print([name for name in ds_docnames if name not in gd_docnames])

['2001-003.txt']
['2001_003.txt']
