Permalink
Browse files

Merge pull request #759 from samtools/rhl_validate_ac_af_no_gt_757

Validate VariantContext AC and AF without genotypes
  • Loading branch information...
2 parents 4d0070b + 7e91886 commit e69aff0a81e7ec59844574504fcc19a26eef509d @ronlevine ronlevine committed on GitHub Dec 7, 2016
@@ -25,7 +25,6 @@
package htsjdk.variant.variantcontext;
-import htsjdk.samtools.util.Tuple;
import htsjdk.tribble.Feature;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.util.ParsingUtils;
@@ -263,7 +262,7 @@
* @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first
*/
public List<String> calcVCFGenotypeKeys(final VCFHeader header) {
- final Set<String> keys = new HashSet<String>();
+ final Set<String> keys = new HashSet<>();
boolean sawGoodGT = false;
boolean sawGoodQual = false;
@@ -287,11 +286,11 @@
if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY);
if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
- List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
+ List<String> sortedList = ParsingUtils.sortList(new ArrayList<>(keys));
// make sure the GT is first
if (sawGoodGT) {
- final List<String> newList = new ArrayList<String>(sortedList.size()+1);
+ final List<String> newList = new ArrayList<>(sortedList.size() + 1);
newList.add(VCFConstants.GENOTYPE_KEY);
newList.addAll(sortedList);
sortedList = newList;
@@ -434,7 +433,7 @@ public VariantContext subContextFromSamples(Set<String> sampleNames, final boole
Set<Allele> allelesFromGenotypes = allelesOfGenotypes(newGenotypes);
// ensure original order of genotypes
- List<Allele> rederivedAlleles = new ArrayList<Allele>(allelesFromGenotypes.size());
+ List<Allele> rederivedAlleles = new ArrayList<>(allelesFromGenotypes.size());
for (Allele allele : alleles)
if (allelesFromGenotypes.contains(allele))
rederivedAlleles.add(allele);
@@ -469,7 +468,7 @@ public VariantContext subContextFromSample(String sampleName) {
* @return allele set
*/
private final Set<Allele> allelesOfGenotypes(Collection<Genotype> genotypes) {
- final Set<Allele> alleles = new HashSet<Allele>();
+ final Set<Allele> alleles = new HashSet<>();
boolean addedref = false;
for ( final Genotype g : genotypes ) {
@@ -863,7 +862,7 @@ private boolean hasAllele(final Allele allele, final boolean ignoreRefState, fin
return null;
}
- List<Integer> lengths = new ArrayList<Integer>();
+ List<Integer> lengths = new ArrayList<>();
for ( Allele a : getAlternateAlleles() ) {
lengths.add(a.length() - getReference().length());
}
@@ -973,7 +972,7 @@ public GenotypesContext getGenotypes(String sampleName) {
* @throws IllegalArgumentException if sampleName isn't bound to a genotype
*/
protected GenotypesContext getGenotypes(Collection<String> sampleNames) {
- return getGenotypes().subsetToSamples(new HashSet<String>(sampleNames));
+ return getGenotypes().subsetToSamples(new HashSet<>(sampleNames));
}
public GenotypesContext getGenotypes(Set<String> sampleNames) {
@@ -1049,7 +1048,7 @@ public int getCalledChrCount(Set<String> sampleIds) {
* @return chromosome count
*/
public int getCalledChrCount(Allele a) {
- return getCalledChrCount(a,new HashSet<String>(0));
+ return getCalledChrCount(a, new HashSet<>(0));
}
/**
@@ -1162,7 +1161,7 @@ public int getMixedCount() {
* Run all extra-strict validation tests on a Variant Context object
*
* @param reportedReference the reported reference allele
- * @param observedReference the actual reference allele
+ * @param observedReference the observed reference allele
* @param rsIDs the true dbSNP IDs
*/
public void extraStrictValidation(final Allele reportedReference, final Allele observedReference, final Set<String> rsIDs) {
@@ -1172,7 +1171,7 @@ public void extraStrictValidation(final Allele reportedReference, final Allele o
// validate the RS IDs
validateRSIDs(rsIDs);
- // validate the altenate alleles
+ // validate the alternate alleles
validateAlternateAlleles();
// validate the AN and AC fields
@@ -1201,7 +1200,7 @@ public void validateAlternateAlleles() {
if ( !hasGenotypes() )
return;
- // maintain a list of non-symbolic alleles reported in the REF and ALT fields of the record
+ // maintain a list of non-symbolic alleles expected in the REF and ALT fields of the record
// (we exclude symbolic alleles because it's commonly expected that they don't show up in the genotypes, e.g. with GATK gVCFs)
final List<Allele> reportedAlleles = new ArrayList<Allele>();
for ( final Allele allele : getAlleles() ) {
@@ -1210,7 +1209,7 @@ public void validateAlternateAlleles() {
}
// maintain a list of non-symbolic alleles observed in the genotypes
- final Set<Allele> observedAlleles = new HashSet<Allele>();
+ final Set<Allele> observedAlleles = new HashSet<>();
observedAlleles.add(getReference());
for ( final Genotype g : getGenotypes() ) {
if ( g.isCalled() ) {
@@ -1233,24 +1232,39 @@ public void validateAlternateAlleles() {
throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getContig(), getStart()));
}
+ private void validateAttributeIsExpectedSize(final String attributeKey, final int numAlternateAlleles ) {
+ final List<Object> actualValues = getAttributeAsList(attributeKey);
+ if (!actualValues.isEmpty()) {
+ // always have at least one actual value
+ final int expectedValuesSize = numAlternateAlleles > 0 ? numAlternateAlleles : 1;
+ if (actualValues.size() != expectedValuesSize) {
+ throw new TribbleException.InternalCodecException(String.format("the %s tag has the incorrect number of records at position %s:%d, %d vs. %d", attributeKey, getContig(), getStart(), actualValues.size(), expectedValuesSize));
+ }
+ }
+ }
+
public void validateChromosomeCounts() {
+ final int numberOfAlternateAlleles = alleles.size() - 1;
+ validateAttributeIsExpectedSize(VCFConstants.ALLELE_COUNT_KEY, numberOfAlternateAlleles);
+ validateAttributeIsExpectedSize(VCFConstants.ALLELE_FREQUENCY_KEY, numberOfAlternateAlleles);
+
if ( !hasGenotypes() )
return;
// AN
if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) {
- int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString());
- int observedAN = getCalledChrCount();
+ final int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString());
+ final int observedAN = getCalledChrCount();
if ( reportedAN != observedAN )
throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedAN, observedAN));
}
// AC
if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
- ArrayList<Integer> observedACs = new ArrayList<Integer>();
+ final ArrayList<Integer> observedACs = new ArrayList<>();
// if there are alternate alleles, record the relevant tags
- if (!getAlternateAlleles().isEmpty()) {
+ if ( numberOfAlternateAlleles > 0 ) {
for ( Allele allele : getAlternateAlleles() ) {
observedACs.add(getCalledChrCount(allele));
}
@@ -1259,22 +1273,13 @@ public void validateChromosomeCounts() {
observedACs.add(0);
}
- if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) {
- final List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY);
- if ( observedACs.size() != reportedACs.size() )
- throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have the correct number of values for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedACs.size(), observedACs.size()));
- for (int i = 0; i < observedACs.size(); i++) {
- // need to cast to int to make sure we don't have an issue below with object equals (earlier bug) - EB
- final int reportedAC = Integer.valueOf(reportedACs.get(i).toString());
- if ( reportedAC != observedACs.get(i) )
- throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getContig(), getStart(), reportedAC, observedACs.get(i)));
- }
- } else {
- if ( observedACs.size() != 1 )
- throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have enough values for the record at position %s:%d", getContig(), getStart()));
- int reportedAC = Integer.valueOf(getAttribute(VCFConstants.ALLELE_COUNT_KEY).toString());
- if ( reportedAC != observedACs.get(0) )
- throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedAC, observedACs.get(0)));
+ final List<Object> reportedACs = getAttributeAsList(VCFConstants.ALLELE_COUNT_KEY);
+
+ for (int i = 0; i < observedACs.size(); i++) {
+ // need to cast to int to make sure we don't have an issue below with object equals (earlier bug) - EB
+ final int reportedAC = Integer.valueOf(reportedACs.get(i).toString());
+ if ( reportedAC != observedACs.get(i) )
+ throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getContig(), getStart(), reportedAC, observedACs.get(i)));
}
}
}
@@ -1473,7 +1478,7 @@ public String toStringWithoutGenotypes() {
// protected basic manipulation routines
private static List<Allele> makeAlleles(Collection<Allele> alleles) {
- final List<Allele> alleleList = new ArrayList<Allele>(alleles.size());
+ final List<Allele> alleleList = new ArrayList<>(alleles.size());
boolean sawRef = false;
for ( final Allele a : alleles ) {
@@ -1544,7 +1549,7 @@ private final void fullyDecodeInfo(final VariantContextBuilder builder, final VC
private final Map<String, Object> fullyDecodeAttributes(final Map<String, Object> attributes,
final VCFHeader header,
final boolean lenientDecoding) {
- final Map<String, Object> newAttributes = new HashMap<String, Object>(10);
+ final Map<String, Object> newAttributes = new HashMap<>(10);
for ( final Map.Entry<String, Object> attr : attributes.entrySet() ) {
final String field = attr.getKey();
@@ -1582,7 +1587,7 @@ private final Object decodeValue(final String field, final Object value, final V
final String string = (String)value;
if ( string.indexOf(',') != -1 ) {
final String[] splits = string.split(",");
- final List<Object> values = new ArrayList<Object>(splits.length);
+ final List<Object> values = new ArrayList<>(splits.length);
for ( int i = 0; i < splits.length; i++ )
values.add(decodeOne(field, splits[i], format));
return values;
@@ -1591,7 +1596,7 @@ private final Object decodeValue(final String field, final Object value, final V
}
} else if ( value instanceof List && (((List) value).get(0)) instanceof String ) {
final List<String> asList = (List<String>)value;
- final List<Object> values = new ArrayList<Object>(asList.size());
+ final List<Object> values = new ArrayList<>(asList.size());
for ( final String s : asList )
values.add(decodeOne(field, s, format));
return values;
@@ -1259,13 +1259,34 @@ private VariantContext createValidateAlternateAllelesContext(final List<Allele>
final VariantContext vcACSetTwoAlts =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACTwoAlts, hetVarTC);
+ // with AC set, and two different ALTs (T and C), with no GT, we expect a 2 count values.
+ final Map<String, Object> attributesACNoGtTwoAlts = new HashMap<String, Object>();
+ attributesACNoGtTwoAlts.put(VCFConstants.ALLELE_COUNT_KEY, Arrays.asList("1", "1"));
+ final VariantContext vcACNoGtSetTwoAlts =
+ createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACNoGtTwoAlts, null);
+
+ // with AF set, and two different ALTs (T and C), with GT of 1/2, we expect two frequncy values.
+ // With two ALTs, a list is expected, so we set the attribute as a list of 0.5,0.5
+ final Map<String, Object> attributesAFTwoAlts = new HashMap<String, Object>();
+ attributesAFTwoAlts.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5", "0.5"));
+ final VariantContext vcAFSetTwoAlts =
+ createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFTwoAlts, hetVarTC);
+
+ // with AF set, and two different ALTs (T and C), with no GT, we expect two frequency values.
+ final Map<String, Object> attributesAFNoGtTwoAlts = new HashMap<String, Object>();
+ attributesAFNoGtTwoAlts.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5", "0.5"));
+ final VariantContext vcAFNoGtSetTwoAlts =
+ createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFNoGtTwoAlts, null);
+
return new Object[][]{
{vcNoGenotypes},
{vcANSet},
{vcANSetNoCall},
{vcACSet},
{vcACSetNoAlts},
- {vcACSetTwoAlts}
+ {vcACNoGtSetTwoAlts},
+ {vcAFSetTwoAlts},
+ {vcAFNoGtSetTwoAlts}
};
}
@Test(dataProvider = "testValidateChromosomeCountsDataProvider")
@@ -1319,13 +1340,34 @@ public void testValidateChromosomeCounts(final VariantContext vc) {
final VariantContext vcACSetTwoAltsOneAltCount =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACTwoAltsOneAltCount, hetVarTC);
+ // with AC set, no GT, two ALTs, but only count for one ALT (we expect two items in the list: 1,1)
+ final Map<String, Object> attributesACNoGtTwoAltsOneAltCount = new HashMap<String, Object>();
+ attributesACNoGtTwoAltsOneAltCount.put(VCFConstants.ALLELE_COUNT_KEY, Arrays.asList("1"));
+ final VariantContext vcACNoGtSetTwoAltsOneAltCount =
+ createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACNoGtTwoAltsOneAltCount, null);
+
+ // with AF set, two ALTs, but only frequency for one ALT (we expect two items in the list
+ final Map<String, Object> attributesAFTwoAltsWrongFreq = new HashMap<String, Object>();
+ attributesAFTwoAltsWrongFreq.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5"));
+ final VariantContext vcAFSetTwoAltsWrongFreq =
+ createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFTwoAltsWrongFreq, hetVarTC);
+
+ // with AF set, no GT, two ALTs, but only frequency for one ALT (we expect two items in the list
+ final Map<String, Object> attributesAFNoGtTwoAltsWrongCount = new HashMap<String, Object>();
+ attributesAFNoGtTwoAltsWrongCount.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5"));
+ final VariantContext vcAFNoGtSetTwoAltsWrongFreq =
+ createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFNoGtTwoAltsWrongCount, null);
+
return new Object[][]{
{vcANSet},
{vcANSetNoCall},
{vcACWrongCount},
{vcACSetTwoAlts},
{vcACSetTwoAltsWrongCount},
- {vcACSetTwoAltsOneAltCount}
+ {vcACSetTwoAltsOneAltCount},
+ {vcACNoGtSetTwoAltsOneAltCount},
+ {vcAFSetTwoAltsWrongFreq},
+ {vcAFNoGtSetTwoAltsWrongFreq}
};
}
@Test(dataProvider = "testValidateChromosomeCountsFailureDataProvider", expectedExceptions = TribbleException.class)

0 comments on commit e69aff0

Please sign in to comment.