# Generate the phenotypes for the hearing impairment traits from the UKBB

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# 1. Read in the data

## 1.1. Read in database

In [2]:
# Collect the database column names needed for this analysis by reading only
# the header row of the tab-separated export (the full table is loaded later).

with open("/mnt/mfs/statgen/UKBiobank/data/ukbb_databases/ukb47922_updatedAug2021/ukb47922.tab") as fp:
    line = fp.readline() # header

# Drop the line terminator before splitting: otherwise the last header field
# keeps a trailing "\n" and would not match the real column name.
header = line.rstrip("\r\n").split("\t")

# Fields with a single, fixed column name.
indiv = ["f.eid"]
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]

# Fields that may span several columns: keep every header entry containing the
# field prefix, with any surrounding double quotes removed.
ethnicity = [col.strip('"') for col in header if "f.21000." in col]
year_of_birth = [col.strip('"') for col in header if "f.34." in col]
month_of_birth = [col.strip('"') for col in header if "f.52." in col]

In [3]:
# Every column to load from the database, with the individual identifier first.
combined_cols = [
    *indiv,
    *ethnicity,
    *reported_sex,
    *genetic_sex,
    *year_of_birth,
    *month_of_birth,
]

In [4]:
# Timestamp printed right before the database load, to gauge how long the read takes.
print(datetime.now())

2022-01-10 09:14:21.489797


In [5]:
# Load only the selected columns from the tab-separated database; every value
# is read as a string, so the answer codes are handled as text downstream.
df = pd.read_csv(
    "/mnt/mfs/statgen/UKBiobank/data/ukbb_databases/ukb47922_updatedAug2021/ukb47922.tab",
    sep="\t",
    usecols=combined_cols,
    dtype="string",
)
df

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0
0,1000019,0,1960,11,1001,,,0
1,1000022,1,1954,8,1001,,,1
2,1000035,1,1944,5,1001,,,1
3,1000046,0,1946,3,1001,,,0
4,1000054,0,1942,1,1001,,,0
...,...,...,...,...,...,...,...,...
502456,6025409,0,1946,11,1001,1001,,0
502457,6025411,0,1960,11,1001,,,0
502458,6025425,0,1963,8,1001,,,0
502459,6025438,1,1952,9,1001,,,1


In [5]:
# database of all individuals that we are working with and the selected phenotypes
# NOTE(review): this cell rebinds `df`, replacing the frame loaded from ukb47922.tab in the
# previous cell, and it carries the same execution count (In [5]). The saved outputs further
# down (an `f.eid` column, numeric answer codes, ~502k rows) look like they come from the
# .tab load rather than from this file -- confirm which load is intended before running the
# notebook top to bottom.
df = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/082321_UKBB_exomes.csv", quotechar = '"', dtype="string", usecols=combined_cols)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0
0,1000019,1000019,Female,1960,November,British,,,Female
1,1000035,1000035,Male,1944,May,British,,,Male
2,1000078,1000078,Female,1955,June,British,British,British,Female
3,1000081,1000081,Male,1942,February,British,,,Male
4,1000198,1000198,Female,1967,July,British,,,Female
...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,British,,,Male
200615,6025319,6025319,Female,1953,March,British,,,Female
200616,6025346,6025346,Female,1954,October,British,,,Female
200617,6025363,6025363,Male,1944,April,British,,,Male


In [6]:
# Timestamp printed once the load has finished (compare with the one printed before it).
print(datetime.now())

2022-01-10 09:31:29.972446


# 2. Sample QC

## 2.2. Remove non-white individuals

In [7]:
# Set of distinct answers given to the ethnicity question, taken over every
# loaded ethnicity column (however many there are), missing values included.
set().union(*(df[col].to_list() for col in ethnicity))

{'-1',
 '-3',
 '1',
 '1001',
 '1002',
 '1003',
 '2',
 '2001',
 '2002',
 '2003',
 '2004',
 '3',
 '3001',
 '3002',
 '3003',
 '3004',
 '4',
 '4001',
 '4002',
 '4003',
 '5',
 '6',
 <NA>}

```
1	White
1001	British
2001	White and Black Caribbean
3001	Indian
4001	Caribbean
2	Mixed
1002	Irish
2002	White and Black African
3002	Pakistani
4002	African
3	Asian or Asian British
1003	Any other white background
2003	White and Asian
3003	Bangladeshi
4003	Any other Black background
4	Black or Black British
2004	Any other mixed background
3004	Any other Asian background
5	Chinese
6	Other ethnic group
-1	Do not know
-3	Prefer not to answer
```

https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=1001

## Select white individuals

In [8]:
# Answer codes of the ethnicity question, bucketed by ancestry group. Together the
# buckets should cover every possible answer except <NA>, "Do not know" (-1) and
# "Prefer not to answer" (-3).
white = ['1001', '1002', '1', '1003']
african = ['4001', '2001', '4002', '2002', '4', '4003']
asian = ['3001', '3002', '2003', '3004', '3003', '3']
mixed = ['2', '2004']
chinese = ['5']
other = ['6']


def ancestry(row):
    """Collapse one individual's ethnicity answers into a single label (white-focused).

    The answers are read from the columns named in the notebook-level ``ethnicity``
    list; missing values and the "-1" / "-3" answers are ignored.

    Returns
    -------
    str
        * ``"Unknown"`` when no usable answer is left;
        * the answer code itself when all usable answers are one and the same white code;
        * ``"Inconsistent_white"`` when the answers differ but are all white codes;
        * ``"Asian"``, ``"African"``, ``"Mixed"``, ``"Chinese"`` or ``"Other"`` when
          every answer falls in that bucket;
        * ``"Inconsistent"`` when the answers span more than one bucket.
    """
    answers = []
    for code in row[ethnicity]:
        if pd.isna(code) or code == "-3" or code == "-1":
            continue
        answers.append(code)

    if not answers:
        return "Unknown"

    # one distinct white answer: report it as-is (any spaces become underscores)
    if len(set(answers)) == 1 and answers[0] in white:
        return answers[0].replace(" ", "_")

    # otherwise the first bucket that contains *every* answer decides the label
    labelled_buckets = (
        (white, "Inconsistent_white"),
        (asian, "Asian"),
        (african, "African"),
        (mixed, "Mixed"),
        (chinese, "Chinese"),
        (other, "Other"),
    )
    for bucket, label in labelled_buckets:
        if all(code in bucket for code in answers):
            return label
    return "Inconsistent"

In [30]:
# Work on an independent copy so that `df` itself is left untouched.
df_white = df.copy(deep=True)

In [31]:
# Label every individual by applying `ancestry` row by row to the ethnicity columns.
df_white["ethnicity"] = df_white.loc[:, ethnicity].apply(ancestry, axis="columns")

In [33]:
# Per-label count of non-missing values in each column.
df_white.groupby(by=["ethnicity"]).count()

Unnamed: 0_level_0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,553,553,553,553,553,5,1,529
1001,442342,442342,442342,442342,442337,18619,10593,430782
1002,13021,13021,13021,13021,13021,395,229,12575
1003,16149,16149,16149,16149,16148,377,199,15636
African,9096,9096,9096,9096,9096,113,90,8634
Asian,10695,10695,10695,10695,10695,152,140,10258
Chinese,1571,1571,1571,1571,1571,45,34,1501
Inconsistent,134,134,134,134,134,97,60,131
Inconsistent_white,607,607,607,607,607,448,271,590
Mixed,1068,1068,1068,1068,1068,14,7,1028


In [34]:
def find_non_white(row):
    """Tell whether a row's "ethnicity" label marks a known non-white individual.

    Returns False for the white answer codes, for "Unknown" and for
    "Inconsistent_white"; True for any other label.
    """
    label = row["ethnicity"]
    kept = label in white or label == "Unknown" or label == "Inconsistent_white"
    return not kept

In [35]:
# Boolean mask: True for every individual that `find_non_white` flags.
ex_non_white = df_white.loc[:, ["ethnicity"]].apply(find_non_white, axis="columns")

In [36]:
# Keep everyone who was not flagged as non-white.
df_white = df_white.loc[~ex_non_white]

In [37]:
# Preview the individuals that remain after the non-white exclusion.
df_white

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
0,1000019,0,1960,11,1001,,,0,1001
1,1000022,1,1954,8,1001,,,1,1001
2,1000035,1,1944,5,1001,,,1,1001
3,1000046,0,1946,3,1001,,,0,1001
4,1000054,0,1942,1,1001,,,0,1001
...,...,...,...,...,...,...,...,...,...
502456,6025409,0,1946,11,1001,1001,,0,1001
502457,6025411,0,1960,11,1001,,,0,1001
502458,6025425,0,1963,8,1001,,,0,1001
502459,6025438,1,1952,9,1001,,,1,1001


In [38]:
# Number of True entries in the exclusion mask.
print(int(ex_non_white.sum()), "individuals removed for being non-white")

27072 individuals removed for being non-white


In [22]:
df_white.loc[df_white["ethnicity"] == "1001"]  # British (code 1001)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
0,1000019,0,1960,11,1001,,,0,1001
1,1000022,1,1954,8,1001,,,1,1001
2,1000035,1,1944,5,1001,,,1,1001
3,1000046,0,1946,3,1001,,,0,1001
4,1000054,0,1942,1,1001,,,0,1001
...,...,...,...,...,...,...,...,...,...
502455,6025390,0,1942,3,1001,,,0,1001
502456,6025409,0,1946,11,1001,1001,,0,1001
502457,6025411,0,1960,11,1001,,,0,1001
502458,6025425,0,1963,8,1001,,,0,1001


In [23]:
df_white.loc[df_white["ethnicity"] == "1002"]  # Irish (code 1002)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
8,1000090,0,1945,7,1002,,,0,1002
40,1000415,1,1942,12,1002,,,1,1002
130,1001316,1,1964,9,1002,,1002,1,1002
148,1001492,1,1947,8,1002,,,1,1002
202,1002031,0,1946,2,1002,,,0,1002
...,...,...,...,...,...,...,...,...,...
502201,6022857,1,1964,11,1002,,,1,1002
502271,6023551,0,1966,2,1002,,,0,1002
502299,6023832,1,1942,6,1002,,,1,1002
502316,6024002,1,1957,8,1002,,,1,1002


In [24]:
df_white.loc[df_white["ethnicity"] == "1"]  # White (code 1)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
645,1006463,0,1946,6,1,,,,1
883,1008841,0,1964,10,1,,,0,1
1203,1012043,1,1943,6,1,,,1,1
1730,1017312,0,1947,3,1,,,0,1
1954,1019550,0,1942,7,1,,,0,1
...,...,...,...,...,...,...,...,...,...
498297,5983819,0,1943,10,1,,,0,1
498551,5986358,0,1955,5,1,,,0,1
499802,5998866,1,1946,3,1,,,1,1
500021,6001050,0,1939,5,1,,,0,1


In [25]:
# Individuals whose answers differ but are all white codes.
df_white.loc[df_white["ethnicity"] == "Inconsistent_white"]

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
705,1007061,1,1950,5,1002,,1001,1,Inconsistent_white
2498,1024996,0,1939,2,1002,1001,,0,Inconsistent_white
4025,1040261,0,1961,7,1001,,1003,,Inconsistent_white
4147,1041484,1,1951,12,1001,1002,,1,Inconsistent_white
4530,1045313,1,1953,8,1003,1001,,1,Inconsistent_white
...,...,...,...,...,...,...,...,...,...
498134,5982180,1,1967,6,1001,1003,,1,Inconsistent_white
498463,5985476,0,1955,10,1001,1003,,0,Inconsistent_white
498968,5990529,0,1949,8,1001,1003,1003,0,Inconsistent_white
500935,6010193,0,1948,4,1002,1001,1001,0,Inconsistent_white


In [26]:
# Individuals without any usable ethnicity answer; they are kept in df_white.
df_white.loc[df_white["ethnicity"] == "Unknown"]

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
69,1000701,0,1949,10,-3,,,0,Unknown
772,1007738,0,1944,12,-3,,,0,Unknown
889,1008909,1,1944,3,-3,,,1,Unknown
898,1008997,1,1945,8,-3,,,1,Unknown
984,1009852,1,1963,6,-1,,,1,Unknown
...,...,...,...,...,...,...,...,...,...
501783,6018674,0,1947,4,-3,,,0,Unknown
502111,6021951,1,1967,1,,,,1,Unknown
502249,6023331,1,1953,7,,,,,Unknown
502278,6023625,0,1941,12,,,,0,Unknown


In [27]:
df_white.loc[df_white["ethnicity"] == "1003"]  # Any other white background (code 1003)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
32,1000331,0,1956,12,1003,,,0,1003
76,1000776,0,1946,6,1003,,,0,1003
78,1000799,1,1963,7,1003,,,1,1003
84,1000858,1,1947,5,1003,,,1,1003
90,1000914,1,1962,7,1003,,,1,1003
...,...,...,...,...,...,...,...,...,...
502321,6024051,0,1960,2,1003,,,0,1003
502326,6024100,1,1952,6,1003,,,1,1003
502336,6024208,0,1965,11,1003,,,0,1003
502338,6024221,0,1959,9,1003,,,,1003


In [40]:
# Give the identifier and reported-sex columns readable names.
df_white = df_white.rename({"f.eid": "IID", "f.31.0.0": "sex"}, axis="columns")

In [42]:
# FID is a straight copy of IID.
df_white["FID"] = df_white.loc[:, "IID"]

In [43]:
# Write the FID / IID / ethnicity table as a tab-separated file, without the index.
df_white.loc[:, ["FID", "IID", "ethnicity"]].to_csv(
    "/mnt/mfs/statgen/UKBiobank/phenotype_files/white_IID/011022_ukb47922_white_expanded_475389.iid",
    sep="\t",
    index=False,
)

## Select Asians 

In [23]:
# Answer codes of the ethnicity question, bucketed by ancestry group. Together the
# buckets should cover every possible answer except <NA>, "Do not know" (-1) and
# "Prefer not to answer" (-3).
white = ['1001', '1002', '1', '1003']
african = ['4001', '2001', '4002', '2002', '4', '4003']
asian = ['3001', '3002', '2003', '3004', '3003', '3']
mixed = ['2', '2004']
chinese = ['5']
other = ['6']


def ancestry(row):
    """Collapse one individual's ethnicity answers into a single label (asian-focused).

    The answers are read from the columns named in the notebook-level ``ethnicity``
    list; missing values and the "-1" / "-3" answers are ignored.

    Returns
    -------
    str
        * ``"Unknown"`` when no usable answer is left;
        * the answer code itself when all usable answers are one and the same asian code;
        * ``"Inconsistent_asian"`` when the answers differ but are all asian codes;
        * ``"White"``, ``"African"``, ``"Mixed"``, ``"Chinese"`` or ``"Other"`` when
          every answer falls in that bucket;
        * ``"Inconsistent"`` when the answers span more than one bucket.
    """
    answers = []
    for code in row[ethnicity]:
        if pd.isna(code) or code == "-3" or code == "-1":
            continue
        answers.append(code)

    if not answers:
        return "Unknown"

    # one distinct asian answer: report it as-is (any spaces become underscores)
    if len(set(answers)) == 1 and answers[0] in asian:
        return answers[0].replace(" ", "_")

    # otherwise the first bucket that contains *every* answer decides the label
    labelled_buckets = (
        (asian, "Inconsistent_asian"),
        (white, "White"),
        (african, "African"),
        (mixed, "Mixed"),
        (chinese, "Chinese"),
        (other, "Other"),
    )
    for bucket, label in labelled_buckets:
        if all(code in bucket for code in answers):
            return label
    return "Inconsistent"

Add an `ethnicity` column that combines each individual's ancestry answers from the database into a single label.

In [24]:
# Work on an independent copy so that `df` itself is left untouched.
df2 = df.copy(deep=True)

In [25]:
# Label every individual by applying `ancestry` row by row to the ethnicity columns.
df2["ethnicity"] = df2.loc[:, ethnicity].apply(ancestry, axis="columns")

In [26]:
def find_asian(row):
    """Tell whether a row's "ethnicity" label counts as asian.

    True for any asian answer code and for "Inconsistent_asian"; False otherwise.
    """
    label = row["ethnicity"]
    if label in asian:
        return True
    return label == "Inconsistent_asian"

In [27]:
# Boolean mask: True for every individual that `find_asian` accepts.
inc_asian = df2.loc[:, ["ethnicity"]].apply(find_asian, axis="columns")

In [30]:
# Number of True entries in the inclusion mask.
print(int(inc_asian.sum()), "individuals considered asian")

10695 individuals considered asian


In [28]:
# Keep only the individuals flagged as asian.
filtered = df2.loc[inc_asian]

In [29]:
# Preview the individuals selected as asian.
filtered

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
89,1000906,1,1962,5,3003,,,1,3003
186,1001874,0,1947,8,3004,,,0,3004
248,1002497,0,1968,4,3001,,,,3001
270,1002712,0,1965,12,3001,,,0,3001
301,1003025,1,1942,10,3001,,,1,3001
...,...,...,...,...,...,...,...,...,...
502347,6024313,0,1964,5,3004,,,0,3004
502399,6024837,1,1965,5,3001,,,1,3001
502405,6024898,0,1950,9,3001,,,0,3001
502425,6025096,0,1965,9,2003,,,0,2003


In [34]:
filtered.loc[filtered["ethnicity"] == "3001"]  # Indian (code 3001)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
248,1002497,0,1968,4,3001,,,,3001
270,1002712,0,1965,12,3001,,,0,3001
301,1003025,1,1942,10,3001,,,1,3001
307,1003083,1,1961,10,3001,,,1,3001
518,1005191,0,1945,1,3001,,,0,3001
...,...,...,...,...,...,...,...,...,...
502334,6024189,0,1947,6,3001,,,0,3001
502342,6024266,0,1943,2,3001,,,0,3001
502399,6024837,1,1965,5,3001,,,1,3001
502405,6024898,0,1950,9,3001,,,0,3001


In [35]:
filtered.loc[filtered["ethnicity"] == "3002"]  # Pakistani (code 3002)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
419,1004204,1,1959,5,3002,,,1,3002
421,1004223,1,1953,10,3002,,,1,3002
425,1004262,1,1964,9,3002,,,1,3002
669,1006702,0,1957,1,3002,,,0,3002
789,1007904,0,1958,9,3002,,,0,3002
...,...,...,...,...,...,...,...,...,...
501313,6013978,1,1966,6,3002,,,1,3002
501357,6014412,0,1966,6,3002,,,0,3002
501425,6015097,1,1955,8,3002,,,1,3002
501890,6019749,0,1954,4,3002,,,0,3002


In [36]:
filtered.loc[filtered["ethnicity"] == "2003"]  # White and Asian (code 2003)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
887,1008886,0,1955,11,2003,,,0,2003
1105,1011060,0,1966,5,2003,,,0,2003
1310,1013112,1,1946,12,2003,,,1,2003
1332,1013331,1,1958,5,2003,,,1,2003
1783,1017845,1,1964,2,2003,,,1,2003
...,...,...,...,...,...,...,...,...,...
499464,5995487,0,1959,3,2003,,,0,2003
500076,6001605,1,1956,10,2003,,,1,2003
500726,6008102,0,1946,7,2003,,,0,2003
501394,6014783,0,1956,4,2003,,,0,2003


In [37]:
filtered.loc[filtered["ethnicity"] == "3004"]  # Any other Asian background (code 3004)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
186,1001874,0,1947,8,3004,,,0,3004
526,1005270,0,1948,10,3004,,,0,3004
685,1006862,1,1953,4,3004,,,1,3004
902,1009036,0,1954,2,3004,,,0,3004
1356,1013573,1,1947,1,3004,,,1,3004
...,...,...,...,...,...,...,...,...,...
501648,6017324,1,1966,9,3004,,,1,3004
501679,6017630,1,1966,10,3004,,,1,3004
501878,6019629,0,1944,3,3004,,,0,3004
501987,6020711,0,1947,2,3004,,,,3004


In [38]:
filtered.loc[filtered["ethnicity"] == "3003"]  # Bangladeshi (code 3003)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
89,1000906,1,1962,5,3003,,,1,3003
1398,1013992,0,1950,1,3003,,,0,3003
5175,1051760,0,1946,1,3003,,,0,3003
6763,1067648,1,1963,2,3003,,,1,3003
6986,1069878,1,1966,12,3003,,,1,3003
...,...,...,...,...,...,...,...,...,...
491214,5912971,1,1964,12,3003,,,1,3003
491949,5920327,0,1943,2,3003,,,0,3003
493003,5930867,0,1952,6,3003,,,,3003
495715,5957999,1,1957,1,3003,,,1,3003


In [44]:
filtered.loc[filtered["ethnicity"] == "3"]  # Asian or Asian British (code 3)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
6640,1066410,0,1949,12,3,,,0.0,3
13264,1132651,1,1954,3,3,,,1.0,3
13421,1134229,0,1964,1,3,,,0.0,3
18792,1187957,0,1950,10,3,,,0.0,3
21350,1213548,1,1966,4,3,,,1.0,3
36646,1366516,0,1955,4,3,,,0.0,3
37673,1376782,0,1952,7,3,,,0.0,3
42948,1429546,0,1944,1,3,,,0.0,3
46738,1467443,1,1966,1,3,,,1.0,3
63381,1633907,1,1957,3,3,,,1.0,3


In [40]:
# Individuals whose answers differ but are all asian codes.
filtered.loc[filtered["ethnicity"] == "Inconsistent_asian"]

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
18434,1184372,1,1954,4,3001,3002.0,,1,Inconsistent_asian
66460,1664704,1,1962,12,3004,,3001.0,1,Inconsistent_asian
106027,2060423,0,1954,2,3004,2003.0,2003.0,0,Inconsistent_asian
125857,2258760,1,1966,7,3002,,3001.0,1,Inconsistent_asian
146519,2465438,1,1951,10,3001,,3004.0,1,Inconsistent_asian
253232,3532747,1,1958,7,3004,3002.0,,1,Inconsistent_asian
402397,5024641,0,1960,5,3002,,3001.0,0,Inconsistent_asian
438456,5385280,0,1955,3,2003,,3003.0,0,Inconsistent_asian


In [45]:
# Give the identifier and reported-sex columns readable names.
filtered = filtered.rename({"f.eid": "IID", "f.31.0.0": "sex"}, axis="columns")

In [47]:
# FID is a straight copy of IID.
filtered["FID"] = filtered.loc[:, "IID"]

In [48]:
# Preview the asian table after the rename and with the FID column added.
filtered

Unnamed: 0,IID,sex,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity,FID
89,1000906,1,1962,5,3003,,,1,3003,1000906
186,1001874,0,1947,8,3004,,,0,3004,1001874
248,1002497,0,1968,4,3001,,,,3001,1002497
270,1002712,0,1965,12,3001,,,0,3001,1002712
301,1003025,1,1942,10,3001,,,1,3001,1003025
...,...,...,...,...,...,...,...,...,...,...
502347,6024313,0,1964,5,3004,,,0,3004,6024313
502399,6024837,1,1965,5,3001,,,1,3001,6024837
502405,6024898,0,1950,9,3001,,,0,3001,6024898
502425,6025096,0,1965,9,2003,,,0,2003,6025096


In [49]:
# Write the FID / IID / ethnicity table as a tab-separated file, without the index.
filtered.loc[:, ["FID", "IID", "ethnicity"]].to_csv(
    "/mnt/mfs/statgen/UKBiobank/phenotype_files/asian_IID/010622_ukb47922_asian_10695.iid",
    sep="\t",
    index=False,
)

## Select Africans

In [50]:
# Answer codes of the ethnicity question, bucketed by ancestry group. Together the
# buckets should cover every possible answer except <NA>, "Do not know" (-1) and
# "Prefer not to answer" (-3).
white = ['1001', '1002', '1', '1003']
african = ['4001', '2001', '4002', '2002', '4', '4003']
asian = ['3001', '3002', '2003', '3004', '3003', '3']
mixed = ['2', '2004']
chinese = ['5']
other = ['6']


def ancestry(row):
    """Collapse one individual's ethnicity answers into a single label (african-focused).

    The answers are read from the columns named in the notebook-level ``ethnicity``
    list; missing values and the "-1" / "-3" answers are ignored.

    Returns
    -------
    str
        * ``"Unknown"`` when no usable answer is left;
        * the answer code itself when all usable answers are one and the same african code;
        * ``"Inconsistent_african"`` when the answers differ but are all african codes;
        * ``"White"``, ``"Asian"``, ``"Mixed"``, ``"Chinese"`` or ``"Other"`` when
          every answer falls in that bucket;
        * ``"Inconsistent"`` when the answers span more than one bucket.
    """
    answers = []
    for code in row[ethnicity]:
        if pd.isna(code) or code == "-3" or code == "-1":
            continue
        answers.append(code)

    if not answers:
        return "Unknown"

    # one distinct african answer: report it as-is (any spaces become underscores)
    if len(set(answers)) == 1 and answers[0] in african:
        return answers[0].replace(" ", "_")

    # otherwise the first bucket that contains *every* answer decides the label
    labelled_buckets = (
        (african, "Inconsistent_african"),
        (white, "White"),
        (asian, "Asian"),
        (mixed, "Mixed"),
        (chinese, "Chinese"),
        (other, "Other"),
    )
    for bucket, label in labelled_buckets:
        if all(code in bucket for code in answers):
            return label
    return "Inconsistent"

In [51]:
# Work on an independent copy so that `df` itself is left untouched.
df_afr = df.copy(deep=True)

In [52]:
# Label every individual by applying `ancestry` row by row to the ethnicity columns.
df_afr["ethnicity"] = df_afr.loc[:, ethnicity].apply(ancestry, axis="columns")

In [53]:
def find_african(row):
    """Tell whether a row's "ethnicity" label counts as african.

    True for any african answer code and for "Inconsistent_african"; False otherwise.
    """
    label = row["ethnicity"]
    if label in african:
        return True
    return label == "Inconsistent_african"

In [54]:
# Boolean mask: True for every individual that `find_african` accepts.
inc_african = df_afr.loc[:, ["ethnicity"]].apply(find_african, axis="columns")

In [55]:
# Number of True entries in the inclusion mask.
print(int(inc_african.sum()), "individuals considered african")

9096 individuals considered african


In [56]:
# Keep only the individuals flagged as african.
filtered_afr = df_afr.loc[inc_african]

In [57]:
# Preview the individuals selected as african.
filtered_afr

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
68,1000697,0,1965,10,4001,,,0,4001
143,1001447,0,1961,6,4001,,,0,4001
145,1001465,0,1942,6,2001,,,0,2001
199,1002004,1,1957,2,4002,,,1,4002
234,1002354,0,1962,9,4001,,,0,4001
...,...,...,...,...,...,...,...,...,...
502221,6023054,0,1941,9,4001,,,0,4001
502390,6024740,0,1945,12,4001,,,0,4001
502417,6025018,0,1960,2,4001,,,0,4001
502443,6025273,0,1947,7,4001,,,0,4001


In [58]:
filtered_afr.loc[filtered_afr["ethnicity"] == "4001"]  # Caribbean (code 4001)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
68,1000697,0,1965,10,4001,,,0,4001
143,1001447,0,1961,6,4001,,,0,4001
234,1002354,0,1962,9,4001,,,0,4001
259,1002608,0,1963,3,4001,,,0,4001
451,1004525,0,1962,2,4001,,,0,4001
...,...,...,...,...,...,...,...,...,...
502165,6022490,0,1958,7,4001,,,0,4001
502221,6023054,0,1941,9,4001,,,0,4001
502390,6024740,0,1945,12,4001,,,0,4001
502417,6025018,0,1960,2,4001,,,0,4001


In [59]:
filtered_afr.loc[filtered_afr["ethnicity"] == "2001"]  # White and Black Caribbean (code 2001)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
145,1001465,0,1942,6,2001,,,0,2001
312,1003139,0,1967,5,2001,,,0,2001
956,1009579,1,1955,4,2001,,,1,2001
5510,1055119,1,1962,12,2001,,,1,2001
6813,1068145,0,1965,6,2001,2001,2001,0,2001
...,...,...,...,...,...,...,...,...,...
500451,6005357,0,1962,8,2001,,2001,0,2001
500483,6005674,0,1959,2,2001,,,0,2001
501595,6016790,0,1960,9,2001,,,0,2001
501666,6017509,1,1962,10,2001,,,1,2001


In [60]:
filtered_afr.loc[filtered_afr["ethnicity"] == "4002"]  # African (code 4002)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
199,1002004,1,1957,2,4002,,,1,4002
238,1002390,0,1965,8,4002,,,0,4002
427,1004285,1,1966,2,4002,,,1,4002
461,1004627,0,1957,6,4002,,,0,4002
1268,1012698,1,1957,3,4002,,,1,4002
...,...,...,...,...,...,...,...,...,...
501770,6018549,0,1964,5,4002,,,0,4002
501790,6018745,1,1943,8,4002,,,1,4002
501903,6019879,0,1946,11,4002,,,0,4002
502175,6022599,1,1964,1,4002,,,1,4002


In [62]:
filtered_afr.loc[filtered_afr["ethnicity"] == "2002"]  # White and Black African (code 2002)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
674,1006751,0,1954,4,2002,,,0,2002
882,1008837,1,1954,10,2002,,,1,2002
1501,1015023,1,1966,2,2002,,,1,2002
2921,1029220,0,1956,1,2002,,,0,2002
3141,1031422,0,1954,12,2002,,,0,2002
...,...,...,...,...,...,...,...,...,...
498126,5982101,0,1966,5,2002,,,0,2002
500414,6004982,1,1968,2,2002,,2002,1,2002
500446,6005304,0,1963,9,2002,,,0,2002
500725,6008093,0,1945,5,2002,,,0,2002


In [65]:
filtered_afr.loc[filtered_afr["ethnicity"] == "4"]  # Black or Black British (code 4)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
50562,1505691,1,1955,10,4,,,1.0,4
52673,1526806,0,1955,5,4,,,0.0,4
63988,1639972,1,1952,5,4,,,1.0,4
71989,1720015,0,1942,2,4,,,,4
80823,1808366,0,1951,9,4,,,0.0,4
93484,1934998,0,1960,2,4,,,0.0,4
132183,2322064,0,1954,2,4,,,0.0,4
136042,2360659,1,1953,7,4,,,1.0,4
144739,2447634,0,1943,11,4,,,0.0,4
160450,2604757,1,1958,11,4,,,1.0,4


In [66]:
filtered_afr.loc[filtered_afr["ethnicity"] == "4003"]  # Any other Black background (code 4003)

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
3568,1035697,0,1965,4,4003,,,0,4003
19433,1194361,0,1955,9,4003,,,0,4003
20756,1207604,1,1952,9,4003,,,1,4003
25336,1253403,0,1962,9,4003,,,0,4003
28529,1285334,0,1947,9,4003,,,0,4003
...,...,...,...,...,...,...,...,...,...
479088,5791683,0,1942,3,4003,,,0,4003
480121,5802016,0,1962,4,4003,,,0,4003
484116,5841965,1,1959,7,4003,,,1,4003
492252,5923353,0,1949,7,4003,,,0,4003


In [67]:
# Individuals whose answers differ but are all african codes.
filtered_afr.loc[filtered_afr["ethnicity"] == "Inconsistent_african"]

Unnamed: 0,f.eid,f.31.0.0,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity
29801,1298050,0,1962,4,4003,,2002.0,0,Inconsistent_african
131128,2311513,1,1940,11,4001,,2001.0,1,Inconsistent_african
178191,2782201,1,1963,4,4001,4003.0,,1,Inconsistent_african
325896,4259501,0,1961,1,4,4001.0,,0,Inconsistent_african
390300,4903663,0,1965,9,4003,4001.0,,0,Inconsistent_african
487712,5877928,0,1955,6,2002,2001.0,,0,Inconsistent_african


In [68]:
# Give the identifier and reported-sex columns readable names.
filtered_afr = filtered_afr.rename({"f.eid": "IID", "f.31.0.0": "sex"}, axis="columns")

In [69]:
# FID is a straight copy of IID.
filtered_afr["FID"] = filtered_afr.loc[:, "IID"]

In [70]:
# Preview the african table after the rename and with the FID column added.
filtered_afr

Unnamed: 0,IID,sex,f.34.0.0,f.52.0.0,f.21000.0.0,f.21000.1.0,f.21000.2.0,f.22001.0.0,ethnicity,FID
68,1000697,0,1965,10,4001,,,0,4001,1000697
143,1001447,0,1961,6,4001,,,0,4001,1001447
145,1001465,0,1942,6,2001,,,0,2001,1001465
199,1002004,1,1957,2,4002,,,1,4002,1002004
234,1002354,0,1962,9,4001,,,0,4001,1002354
...,...,...,...,...,...,...,...,...,...,...
502221,6023054,0,1941,9,4001,,,0,4001,6023054
502390,6024740,0,1945,12,4001,,,0,4001,6024740
502417,6025018,0,1960,2,4001,,,0,4001,6025018
502443,6025273,0,1947,7,4001,,,0,4001,6025273


In [71]:
# Write the FID / IID / ethnicity table as a tab-separated file, without the index.
filtered_afr.loc[:, ["FID", "IID", "ethnicity"]].to_csv(
    "/mnt/mfs/statgen/UKBiobank/phenotype_files/african_IID/010622_ukb47922_african_9096.iid",
    sep="\t",
    index=False,
)