In [20]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib, collections, itertools, os, re, textwrap, logging
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from functools import reduce

from logging.config import dictConfig
from logging import getLogger

# Send every record (DEBUG and above) from the root logger through a single
# StreamHandler that uses a timestamped format.
dictConfig({
    'version': 1,
    'formatters': {
        'f': {'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'},
    },
    'handlers': {
        'h': {
            'class': 'logging.StreamHandler',
            'formatter': 'f',
            'level': logging.DEBUG,
        },
    },
    'root': {'handlers': ['h'], 'level': logging.DEBUG},
})
logger = getLogger('notebook')

# Default matplotlib font settings; 'family' and 'weight' are left unset.
matplotlib_font = {'size': 16}
matplotlib.rc('font', **matplotlib_font)

# Repository root: two levels above the current working directory.
repo_dir = os.path.realpath(os.path.join('..', '..'))

# Private data directory: <repo_dir>/private_data/<name of the working directory>.
data_dir = os.path.realpath(os.path.join(
    repo_dir,
    'private_data',
    os.path.basename(os.path.realpath(os.getcwd())),
))

In [42]:
def read_ids(ids_dir=None, split_classes=('train', 'val', 'test')):
    '''Read the individual IDs (IIDs) of each data split.

    For every split class, the file ``<ids_dir>/<split_class>_ids.csv`` is
    parsed with its first line skipped and its two columns named ``idx`` and
    ``IID``; only the ``IID`` column is kept.

    Parameters
    ----------
    ids_dir : str, optional
        Directory holding the ``*_ids.csv`` files.  When omitted, the
        notebook-level ``data_dir`` is used.
    split_classes : iterable of str, optional
        Names of the splits to read, in the order they should appear in the
        result.  Defaults to the training, validation and test sets.

    Returns
    -------
    collections.OrderedDict
        Maps each split class to the list of IIDs read from its file.
    '''
    if ids_dir is None:
        # Looked up at call time so that a plain ``read_ids()`` keeps reading
        # from the notebook's data directory.
        ids_dir = data_dir

    ids = collections.OrderedDict()
    for split_class in split_classes:
        split_df = pd.read_csv(
            os.path.join(ids_dir, '{}_ids.csv'.format(split_class)),
            names=['idx', 'IID'], skiprows=1,
        )
        ids[split_class] = list(split_df['IID'])
    return ids

In [38]:
def read_337199(phe_file=None):
    '''Read the IIDs of the 337,199 White British individuals.

    The file is parsed as a header-less, tab-delimited table whose two
    columns are named ``IID`` and ``FID`` (in that order); only the ``IID``
    column is returned.

    NOTE(review): the first column is labelled ``IID``.  PLINK-style files
    usually put ``FID`` first; this is harmless if both columns hold the same
    value, but confirm against the file.

    Parameters
    ----------
    phe_file : str, optional
        Path of the file to read.  Defaults to the White British ``.phe``
        file under ``/oak/stanford/groups/mrivas``.

    Returns
    -------
    list
        The values of the first column, in file order.
    '''
    if phe_file is None:
        phe_file = '/oak/stanford/groups/mrivas/private_data/ukbb/24983/sqc/population_stratification/ukb24983_white_british.phe'
    phe_df = pd.read_table(phe_file, names=['IID', 'FID'])
    return list(phe_df['IID'])

### Comparison of population stratification
- Junyang's set has 337,203 individuals while Rivas lab's standard has 337,199 individuals

In [53]:
# Load the per-split ID lists and the reference White British ID list.
ids = read_ids()
white_british = read_337199()

In [54]:
# Union of the IDs over all splits.  ``set().union(*iterables)`` always yields
# a set, whereas the ``reduce`` form hands back the raw list when there is a
# single split and raises TypeError when there is none.
ids_all = set().union(*ids.values())

In [55]:
# Number of distinct IDs across all splits vs. length of the reference list.
(len(ids_all), len(white_british))

(337203, 337199)

In [56]:
# IDs that occur in the splits but not in the reference list.
ids_all.difference(white_british)

{-6, -5, -2, -1}

- The four extra IDs are all negative; individuals with negative IDs need to be removed from the analysis.

### Removal of redacted individuals

In [63]:
# Keep only the strictly positive IDs of every split, preserving split order.
ids_337199 = collections.OrderedDict(
    (split_class, [iid for iid in split_ids if iid > 0])
    for split_class, split_ids in ids.items()
)

In [64]:
# Size of each split as read from disk (tab-separated: name, count).
for split_class, split_ids in ids.items():
    print('{}\t{}'.format(split_class, len(split_ids)))

train	202277
val	67654
test	67272


In [65]:
# Size of each split in ids_337199 (tab-separated: name, count).
for split_class, split_ids in ids_337199.items():
    print('{}\t{}'.format(split_class, len(split_ids)))

train	202276
val	67651
test	67272


### Save as .fam files
- https://www.cog-genomics.org/plink/1.9/formats

In [66]:
# Write one tab-separated, header-less file per split with the ID repeated in
# the FID and IID columns.
# NOTE(review): only these two columns are written; the PLINK .fam format
# linked above lists six columns -- confirm downstream tools accept this.
for split_class in ('train', 'val', 'test'):
    split_ids = ids_337199[split_class]
    fam_path = os.path.join(data_dir, '{}.fam'.format(split_class))
    fam_df = pd.DataFrame({'FID': split_ids, 'IID': split_ids})
    fam_df.to_csv(fam_path, sep='\t', header=False, index=False)