## Basic Statistics on UKB dataset

Set up global variables and import modules

In [20]:
import os
import sys
import pandas as pd
from matplotlib import pyplot as plt
from pathlib import Path 

# Root of the UKB project directory (absolute path on shared storage).
ukb_root = '/project_freenas/3022017.02/UKB'
# Add the project's scripts folder to the import path; this has to run before the
# ukb_utils import on the next line (presumably ukb_utils lives there -- TODO confirm).
sys.path.append(os.path.join(ukb_root,'scripts'))
from ukb_utils import get_variables_UKB, lookup_UKB

# Directory containing the phenotype CSV files that the cells below read.
ukb_idp_dir = os.path.join(ukb_root,'phenotypes','current')

#### Read dataframe

In [None]:
# Read a single data row from each CSV (nrows=1): the column names are all that is
# needed here, so the full tables are not loaded.
basic_demo = pd.read_csv(os.path.join(ukb_idp_dir,'01_basic_demographics.csv'),nrows=1)
brain_IDPs = pd.read_csv(os.path.join(ukb_idp_dir,'31_brain_IDPs.csv'),nrows=1)

Look up field IDs with the `lookup_UKB` function <br>
This helps figure out which fields are present in the dataframe

In [80]:
def list_field_names(data_frame):
  """Look up the UKB field description for every column of ``data_frame``.

  The first column is skipped (presumably the participant id 'eid' -- TODO
  confirm); each remaining column name is passed on its own to ``lookup_UKB``.

  Parameters
  ----------
  data_frame : pandas.DataFrame
      Frame whose column names are handed to ``lookup_UKB``. Only the column
      names are used, so a one-row frame is enough.

  Returns
  -------
  pandas.DataFrame
      The per-column lookup results stacked row-wise, in column order. When
      there is nothing to look up, an empty frame with 'Field' and 'FieldID'
      columns.
  """
  # NOTE(review): assumes lookup_UKB returns a DataFrame (presumably with
  # 'Field' and 'FieldID' columns) -- verify against ukb_utils.
  lookups = [lookup_UKB(field_ids=[idf]) for idf in data_frame.columns[1:]]
  if not lookups:
    return pd.DataFrame(columns=['Field', 'FieldID'])
  # DataFrame.append was removed in pandas 2.0; a single concat also avoids
  # re-copying the accumulated frame on every iteration.
  return pd.concat(lookups)

Save the resulting dataframes for later use. This may take a while.

In [None]:
#brain_IDPs_fields = list_field_names(brain_IDPs)
#brain_IDPs_fields.to_csv('/home/preclineu/ramcir/Desktop/Diffusion/diffusion_nm/brain_IDPs_fields.csv')
#basic_demo_fields = list_field_names(basic_demo)
#basic_demo_fields.to_csv('/home/preclineu/ramcir/Desktop/Diffusion/diffusion_nm/basic_demo_fields.csv')

Turns out some dataframes contain too many fields to allow for visual inspection. As an alternative, specific metrics (field names) can be looked up on the UK Biobank website and their respective field codes can be used to extract data from the containing dataframe.

#### Identifying and extracting subjects who present diffusion data

Assuming 'Data-Field 25737' (Discrepancy between dMRI brain image and T1 brain image) contains all the subjects who were scanned for dMRI, look up the demographic information in the basic_demo dataframe using the participant IDs

In [None]:
# Look up field ID 25737 with the project helper; the returned value is shown as the cell output.
lookup_UKB(field_ids=[25737])

In [None]:
# First check inside the brain IDP dataframe for the metric we are interested in:
# keep only the columns whose name matches the regex '25737', then display them.
fieldID_check = brain_IDPs.filter(regex='25737')
fieldID_check

In [None]:
# Load only the 'eid' and '34-0.0' columns of the demographics table and keep
# the rows that have no missing values.
basic_demo = pd.read_csv(
    os.path.join(ukb_idp_dir,'01_basic_demographics.csv'),
    usecols=['eid','34-0.0'],
    low_memory=True,
).dropna()

In [87]:
# Load only the 'eid' and '25737-2.0' columns of the brain IDP table and keep
# the rows that have no missing values.
brain_IDPs = pd.read_csv(
    os.path.join(ukb_idp_dir,'31_brain_IDPs.csv'),
    usecols=['eid','25737-2.0'],
    low_memory=True,
).dropna()