In [56]:
import numpy
import matplotlib.pyplot as plt
import pandas
import scipy.stats
import scipy
import collections

In [57]:
def _shortenColumnName (column):
    """
    Collapse one verbose annotation column header into a short name.

    The header is split on whitespace. A single-word header is returned
    unchanged; otherwise the first two words are fused with an underscore,
    and a few header families get an extra suffix so that they stay distinct.

    Parameters
    ----------
    column : str
        Original column header.

    Returns
    -------
    str
        The shortened column name.

    Raises
    ------
    AssertionError
        If a "Y haplogroup" or "Xcontam ANGSD" header carries none of the
        recognised keywords.
    """
    fields = column.split()

    # a header with no words at all has nothing to shorten; keep it verbatim
    if (not fields):
        return column
    # just this one
    if (len(fields) == 1):
        return fields[0]

    prefix = f"{fields[0]}_{fields[1]}"
    # keyword lookups consider every word of the header, including the first two
    words = set (fields)
    family = (fields[0], fields[1])

    if (family == ("SNPs", "hit")):
        # this one a bit special: the first matching tag (in this order) wins
        for tag in ('1240k', 'HO', '3.2M', 'non-padded'):
            if (tag in words):
                return f"{prefix}_{tag}"
        # this should be ok for version 50.0?
        return prefix

    if (family == ("Y", "haplogroup")):
        # this one's special too; 'terminal' takes precedence over 'ISOGG'
        if ('terminal' in words):
            return f"{prefix}_terminal"
        if ('ISOGG' in words):
            if ('curation' in words):
                return f"{prefix}_ISOGG_curation"
            return f"{prefix}_ISOGG"
        # raised explicitly (not via `assert`) so the check survives `python -O`
        raise AssertionError (f"unrecognised 'Y haplogroup' column: {column!r}")

    if (family == ("Xcontam", "ANGSD")):
        # many special snowflakes
        if ('SNPs' in words):
            # NOTE: no underscore before "SNPs"; kept as-is so existing
            # lookups of this column name keep working
            return f"{prefix}SNPs"
        if ('MOM' in words):
            for tag in ('point', 'Z-score', 'CI'):
                if (tag in words):
                    return f"{prefix}_{tag}"
            raise AssertionError (f"unrecognised 'Xcontam ANGSD MOM' column: {column!r}")
        raise AssertionError (f"unrecognised 'Xcontam ANGSD' column: {column!r}")

    # fuse first two
    return prefix


def loadAnnotationFile (annoFile):
    """
    Read a tab-separated annotation file and shorten its column names.

    Parameters
    ----------
    annoFile : str, path or file-like
        Passed straight to ``pandas.read_csv`` (with ``sep='\\t'``).

    Returns
    -------
    pandas.DataFrame
        The table as read, with every column renamed by
        ``_shortenColumnName``.

    Raises
    ------
    AssertionError
        If a header of a special family cannot be shortened, or if two
        columns end up with the same shortened name.
    """

    # read it
    anno = pandas.read_csv (annoFile, sep='\t', low_memory=False)

    # reduce the column names
    # fuse first two names, or take the only one
    newColumns = [_shortenColumnName (c) for c in anno.columns]

    # set the new names
    anno.columns = newColumns

    # make sure we don't have duplicates in the new names
    duplicated = [name for (name, count) in collections.Counter (newColumns).items() if count > 1]
    if (duplicated):
        raise AssertionError (f"shortened column names are not unique: {duplicated}")

    return anno

In [58]:
# Load the annotation table for the v54.1 HO data set.
# (alternative input: "AADR_v54.1_1240K/v54.1_1240K_public.anno")
annoFrame = loadAnnotationFile (
    annoFile="AADR_v54.1_1240K_HO/v54.1_HO_public.anno",
)

In [59]:
# Keep only the rows whose 'Date_mean' is zero or negative.
# NOTE(review): presumably these are the present-day samples -- confirm the date convention.
presentFrame = annoFrame.loc[annoFrame['Date_mean'].le (0)]

In [60]:
# for (k,v) in collections.Counter(presentFrame['Political_Entity']).items():
#     if (v > 10):
#         print (k,v)

In [64]:
# Pull the 'Genetic_ID' column out of the filtered frame as an independent numpy array.
presentIndividuals = presentFrame['Genetic_ID'].to_numpy (copy=True)

In [65]:
# Read the whitespace-delimited .ind file; header=None means the columns get integer labels.
# sep=r"\s+" is the documented replacement for delim_whitespace=True, which is deprecated since pandas 2.2.
indFrame = pandas.read_csv ("AADR_v54.1_1240K_HO/v54.1_HO_public.ind", header=None, sep=r"\s+")
# annoFrame = loadAnnotationFile ("AADR_v54.1_1240K/v54.1_1240K_public.anno")


In [74]:
# get only the presentIndividuals
# Series.isin builds a hash lookup once, instead of scanning the whole
# presentIndividuals array for every row of indFrame; .tolist() keeps
# presentMask a plain list of bools, as before.
# NOTE(review): assumes the IDs in column 0 are never missing (NaN) -- confirm.
presentMask = indFrame[0].isin (presentIndividuals).tolist()
# tally the values in column 2 of the selected rows
theCounter = collections.Counter (indFrame.loc[presentMask, 2])

In [75]:
# (label, count) pairs ordered by ascending count; the sort is stable, so ties keep Counter order
sortedCounter = list (theCounter.items())
sortedCounter.sort (key=lambda entry: entry[1])

In [76]:
# show the pairs with the largest count first
list (reversed (sortedCounter))

[('Spanish.HO', 172),
 ('GWD.DG', 112),
 ('GWD.SG', 112),
 ('TSI.SG', 108),
 ('TSI.DG', 107),
 ('Han.HO', 107),
 ('CHS.SG', 106),
 ('GIH.SG', 105),
 ('JPT.DG', 104),
 ('JPT.SG', 104),
 ('CHS.DG', 103),
 ('IBS.DG', 103),
 ('CHB.DG', 103),
 ('ITU.SG', 103),
 ('IBS.SG', 103),
 ('CHB.SG', 103),
 ('GIH.DG', 102),
 ('ITU.DG', 102),
 ('YRI.DG', 101),
 ('YRI.SG', 101),
 ('LWK.SG', 101),
 ('PUR.SG', 100),
 ('ESN.DG', 99),
 ('LWK.DG', 99),
 ('PUR.DG', 99),
 ('CEU.DG', 99),
 ('STU.SG', 99),
 ('ESN.SG', 99),
 ('CEU.SG', 99),
 ('CDX.SG', 99),
 ('STU.DG', 98),
 ('FIN.DG', 97),
 ('KHV.SG', 97),
 ('FIN.SG', 97),
 ('Tibetan.HO', 97),
 ('PJL.DG', 96),
 ('PJL.SG', 96),
 ('KHV.DG', 95),
 ('CLM.DG', 94),
 ('CLM.SG', 94),
 ('CDX.DG', 93),
 ('ACB.DG', 92),
 ('GBR.SG', 92),
 ('ACB.SG', 92),
 ('GBR.DG', 90),
 ('MSL.DG', 85),
 ('BEB.DG', 85),
 ('MSL.SG', 85),
 ('BEB.SG', 85),
 ('Russian.HO', 71),
 ('PEL.SG', 69),
 ('PEL.DG', 68),
 ('MXL.SG', 64),
 ('MXL.DG', 62),
 ('French.HO', 61),
 ('ASW.SG', 60),
 ('ASW.DG',