In [5]:
import pandas
import os
import ftplib
import collections as col

def keep_item(item):
    """Tell whether a remote listing entry should be kept.

    Args:
        item: Name of the entry as a string.

    Returns:
        False if the name contains 'manifest' or 'readme' (compared
        case-insensitively), True otherwise.
    """
    lowered = item.lower()
    return not any(marker in lowered for marker in ('manifest', 'readme'))

def traverse_remote_path(ftp_server, remote_url):
    """List the name prefixes of files found below a remote FTP path.

    The entries returned by ``ftp_server.nlst(remote_url)`` are inspected
    one by one. Entries that contain a dot are taken to be files; all other
    entries are listed once more with ``nlst`` and whatever that returns is
    collected as files. Only one such extra level is listed.

    Args:
        ftp_server: Object providing an ``nlst(path)`` method, e.g. a
            logged-in ``ftplib.FTP`` instance.
        remote_url: Remote path handed to ``nlst``.

    Returns:
        Sorted list of unique strings: for every collected file, the part
        of its basename before the first underscore, minus the names that
        ``keep_item`` rejects.
    """
    collected = []

    for entry in ftp_server.nlst(remote_url):
        # simplistic heuristic: anything with a dot in its name has a
        # file extension and is a file; works for the intended use case
        if '.' in entry:
            collected.append(entry)
            continue
        try:
            children = ftp_server.nlst(entry)
        except Exception:
            # best effort: entries that cannot be listed are skipped
            continue
        else:
            collected.extend(children)

    prefixes = {os.path.basename(path).split('_')[0] for path in collected}

    return sorted(name for name in prefixes if keep_item(name))

# Location of the locally cached FTP listing (absolute path).
cache_folder = '/home/peter/work/code/github/ptrebert/project-diploid-assembly/annotation'
cache_file = 'cache_ftp_sseq_fastq.txt'
store_file = os.path.join(cache_folder, cache_file)

# The FTP server is only contacted if no cached listing exists yet.
if not os.path.isfile(store_file):
    data_source = 'vol1/ftp/data_collections/HGSVC2/working/20200120_Strandseq/fastq'
    server_url = 'ftp.1000genomes.ebi.ac.uk'
    # The context manager closes the connection once the listing is done,
    # also if listing the remote path fails.
    with ftplib.FTP(server_url) as server:
        server.login()
        sseq_fastq = traverse_remote_path(server, data_source)

    print('Writing cache file')
    with open(store_file, 'w') as dump:
        _ = dump.write('\n'.join(sseq_fastq))

# Always read the library names back from the cache file so that a fresh
# download and a cached run go through the same code path.
with open(store_file, 'r') as dump:
    sseq_libs = set(dump.read().strip().split())

print('FTP FASTQ libs: ', len(sseq_libs))

# The sample name is the part of the library name before the first 'x'.
cells_per_sample = col.Counter(lib.split('x')[0] for lib in sseq_libs)
# for HGSVC2, there should be 96 cells per sample
incomplete_samples = {
    sample: count for sample, count in cells_per_sample.items() if count != 96
}
# Report the offending samples; the message used to reference an undefined
# name, which turned a failing check into a NameError.
assert not incomplete_samples, 'Cells per sample missing: {}'.format(incomplete_samples)

# Annotation files: QC selection table, 100-cell controls, WGS cells.
selection = '../annotation/20200128_ASanders_QCselect_HGSVClibs.txt'
controls = '../annotation/20200507_ASanders_100cell_controls.txt'
wgs = '../annotation/20200507_ASanders_wgs_cells.txt'

df = pandas.read_csv(selection, sep='\t', comment='#')

# Ashley's annotation targets inversion analysis and is therefore more
# restrictive than needed here; for the clustering it is only important
# to get rid of the complete garbage. Hence, libraries scored 0 are
# still accepted if they have more than 50000 reads.
scored_one = df['score'] == 1
scored_zero = df['score'] == 0
above_read_cutoff = df['reads'] > 50000
not_above_read_cutoff = df['reads'] <= 50000

high_qual = df.loc[scored_one, :]
ok_qual = df.loc[scored_zero & above_read_cutoff, :]
low_qual = df.loc[scored_zero & not_above_read_cutoff, 'cell']

accepted_libs = set(high_qual['cell'].values) | set(ok_qual['cell'].values)
print('OK libs ', len(accepted_libs))

# Everything present on the FTP server but not accepted is blacklisted.
blacklist_from_ftp = sseq_libs - accepted_libs
print('Blacklist ', len(blacklist_from_ftp))

def _read_cell_names(path):
    """Return the set of whitespace-separated names stored in the file at path."""
    with open(path, 'r') as annotation:
        return set(annotation.read().strip().split())


def _select_accepted(cells):
    """Return the cells that are part of the accepted libraries.

    Cells on the FTP-derived blacklist are skipped silently; cells that
    are neither blacklisted nor accepted are reported as unknown.
    """
    accepted = []
    for cell in cells:
        if cell in blacklist_from_ftp:
            continue
        if cell in accepted_libs:
            accepted.append(cell)
        else:
            print('-- Unknown: {}'.format(cell))
    return accepted


control_cells = _read_cell_names(controls)
print('100c controls: ', len(control_cells))

accept_controls = _select_accepted(control_cells)
print('-- in AS list: ', accept_controls)

wgs_cells = _read_cell_names(wgs)
print('wgs cells: ', len(wgs_cells))

accept_wgs = _select_accepted(wgs_cells)
print('-- in AS list: ', accept_wgs)

# Control and WGS cells are blacklisted in addition to the FTP-derived set.
final_blacklist = blacklist_from_ftp | control_cells | wgs_cells
print('Final blacklist: ', len(final_blacklist))

# Tally blacklisted cells per sample (name part before the first 'x').
cells_blacklisted = col.Counter(c.split('x')[0] for c in final_blacklist)
for sample, num_blacklisted in cells_blacklisted.most_common():
    print(sample, ': ', num_blacklisted)

# A sample with 96 or more blacklisted cells would have no cells left.
fully_blacklisted = [
    (sample, n) for (sample, n) in cells_blacklisted.most_common() if n >= 96
]
assert not fully_blacklisted, 'All cells blacklisted: {}'.format(fully_blacklisted)


# Ashley's annotation and the blacklist are expected to overlap: the
# libraries scored "0" with no more than 50000 reads are annotated, yet
# blacklisted. The overlap should not be too large, though, because the
# complete garbage is not part of Ashley's annotation in the first place.

print('Sanity check')
annotated_cells = set(df['cell'].values)
sanity_check = final_blacklist & annotated_cells
print('Blacklisted "but" annotated: ', len(sanity_check))
print('Thereof, dropped/low qual: ', len(low_qual))

blacklist = '../annotation/hgsvc_blacklist.txt'

# One library name per line in sorted order; no trailing newline is written.
blacklist_content = '\n'.join(sorted(final_blacklist))
with open(blacklist, 'w') as dump:
    _ = dump.write(blacklist_content)

# update: fixed Strand-seq blacklist
# after double-checking with A. Sanders
# important fixes:
# - correct NA20509 blacklist now
# - library HG03009x02PE20386: from WGS to "good/1"
# MD5: b75aa05368ca490a36aafcf44a00b161

# update: Strand-seq data for NA20509 added
# blacklist generated on 2020-05-13
# MD5: ee7ccc87c35c8359b37da8e3f5559124
    
# blacklist generated on 2020-05-07
# MD5: cc6f92724d97034523b68adadec1c9fa


FTP FASTQ libs:  2304
OK libs  1603
Blacklist  701
100c controls:  24
-- in AS list:  []
wgs cells:  124
-- in AS list:  []
Final blacklist:  701
HG02492 :  42
GM18939 :  41
HG00864 :  40
HG02587 :  37
HG02011 :  36
HG03732 :  34
HG01114 :  34
HG01573 :  34
GM20509B :  33
HG03065 :  31
HG02018 :  30
GM19983 :  30
HG03371 :  30
HG00171A :  29
GM12329 :  28
HG01505 :  27
HG01596 :  26
GM20847B :  24
HG03009 :  23
GM19036B :  22
HG03683 :  20
GM19650A :  19
GM18534B :  17
HG00096 :  14
Sanity check
Blacklisted "but" annotated:  122
Thereof, dropped/low qual:  122
