In [1]:
import pandas as pd
import datetime as dt
import numpy as np

# Input locations, named once so every loader below reads the same files.
DATA_DICT_PATH = 'dataPrep/HBP_NDA_DataDict.xlsx'
QUALTRICS_CSV_PATH = 'dataPrep/Healthy+Brains+Project+-+Qualtrics+Survey_April+23,+2020_09.22.csv'

# load measures from project data dictionary
measures = ['TEPS', 'MAP-SR', 'CESD', 'COPE', 'CAPE']

# Passing a list of sheet names makes read_excel open the workbook once and
# return a {sheet name: DataFrame} dict, instead of re-parsing the file per sheet.
_sheets = pd.read_excel(DATA_DICT_PATH, sheet_name=measures + ['pseudo-GUIDs'])
dataDict = {m: _sheets[m] for m in measures}

# load pseudo-GUIDs
pGUIDs = _sheets['pseudo-GUIDs']

# load collected data & sort by 'subnum'
# skiprows=[1, 2] drops the two rows that follow the header line in the export.
qltrcs_data = pd.read_csv(QUALTRICS_CSV_PATH, skiprows=[1, 2]).sort_values(by='subnum')
intvw_data = None # update when we get the database, remember to sort by subnum
# pd.concat silently skips None entries, so until intvw_data exists this only
# carries qltrcs_data through.
rawData = pd.concat([qltrcs_data, intvw_data], axis=1, sort=False)

In [2]:
class Subject():
    """One study participant, looked up by subject number.

    Construction reads the module-level ``rawData`` and ``pGUIDs`` frames, so
    both must be loaded before a Subject is created.
    """

    # Mean Gregorian month length in days. A flat 30-day month overstates age
    # by roughly 1.5% (about 4 months too many for a 20-year-old).
    DAYS_PER_MONTH = 365.25 / 12

    SID = None # subject number as it appears in rawData['subnum']
    pGUID = None # value of the 'pGUIDs' column for this subject
    intvw_date = None # raw 'StartDate' value; replaced by a datetime once calcAge has run
    age = None # age at interview, in whole months
    data = None # DataFrame with this subject's row(s) of rawData
    index = None # pandas Index holding the row label(s) of `data`

    def calcAge(self, bday):
        """Return the subject's age at interview, rounded to whole months.

        Side effect: ``self.intvw_date`` is parsed and stored back as a
        ``datetime`` object.

        Parameters
        ----------
        bday : value whose ``str()`` is a date formatted ``MM/DD/YYYY``.

        Raises
        ------
        ValueError
            If either date does not match its expected format.
        """
        self.intvw_date = dt.datetime.strptime(str(self.intvw_date), "%Y-%m-%d %H:%M:%S")
        birth = dt.datetime.strptime(str(bday), "%m/%d/%Y")
        elapsed = self.intvw_date - birth
        return round(elapsed.days / self.DAYS_PER_MONTH)

    def __init__(self, sid):
        """Look up ``sid`` in ``pGUIDs`` and ``rawData`` and populate the fields.

        Raises ``IndexError`` (from ``.iloc[0]``) when ``sid`` has no matching row.
        """
        self.SID = sid
        self.pGUID = pGUIDs.query("study_ids == @sid")['pGUIDs'].iloc[0]
        # Query rawData once and reuse the result for every field below.
        rows = rawData.query("subnum == @sid")
        self.intvw_date = rows['StartDate'].iloc[0]
        self.age = self.calcAge(rows['SD1'].iloc[0])
        self.data = rows
        self.index = rows.index

    def displayInfo(self):
        """Print the subject's ID, pGUID, interview date and age in months."""
        print("SUBJECT ", self.SID)
        print("pGUID: ", self.pGUID)
        print("interview_date: ", str(self.intvw_date))
        print("age in months: ", str(self.age))

    def getVarData(self, var):
        """Return this subject's value for column ``var`` (first matching row).

        Raises ``KeyError`` if ``var`` is not a column of the subject's data.
        """
        return self.data[var].iloc[0]

    
# One Subject object per entry of rawData['subnum'], in row order.
# The objects are collected into a list first so the Series gets a fresh
# 0..n-1 index; rawData['subnum'].apply(Subject) would carry rawData's labels.
subs = pd.Series(list(map(Subject, rawData['subnum'])))

In [24]:
class Measure():
    """One questionnaire measure: its variable mapping and its output table.

    Construction reads the measure's sheet from the module-level ``dataDict``.
    ``addSubnums`` and ``fillColumns`` read the module-level ``subs`` Series.
    """

    name = None # measure / sheet name
    NDA_vars = None # Series: the sheet's 'NDA varname' column
    HBP_vars = None # Series: the sheet's 'HBP varname' column
    varmatches = None # dict {HBP varname: NDA varname}, built per instance
    Data = None # DataFrame whose columns are the NDA variable names

    # remove NDA vars for which we don't collect data & data is not required
    def removeNDAvars(self):
        # NOTE(review): unfinished and never called. self.NDA_vars is a Series,
        # which has no .query(), so this raises AttributeError as written.
        # Blank spreadsheet cells presumably load as NaN rather than '', so the
        # second filter likely needs .isna() instead - TODO confirm when finishing.
        recommended = self.NDA_vars.query("required == False")
        deleteThese = recommended.query("`HBP varname` == ''")
        return None

    # matches the HBP and NDA vars in a dictionary
    def matchVars(self):
        """Return ``{HBP varname: NDA varname}`` for rows where both are present.

        The two columns are paired by position. Rows with a missing (NaN) name
        on either side are skipped: a NaN key can never be looked up in the
        collected data and made ``fillColumns`` fail. If an HBP name repeats,
        its first row wins.
        """
        matches = {}
        for hbp, nda in zip(self.HBP_vars, self.NDA_vars):
            if pd.isna(hbp) or pd.isna(nda):
                continue
            matches.setdefault(hbp, nda)
        return matches

    def __init__(self, name):
        """Load the variable names for sheet ``name`` and create the empty table."""
        self.name = name
        self.NDA_vars = dataDict[name]['NDA varname']
        self.HBP_vars = dataDict[name]['HBP varname']
        self.varmatches = self.matchVars()
        self.Data = pd.DataFrame(columns=self.NDA_vars) # modify to check if we collect data for that var & if it's required

    def addSubnums(self):
        """Fill 'subjectkey' with each subject's pGUID and 'src_subject_id' with its SID."""
        self.Data['subjectkey'] = subs.apply(lambda s: s.pGUID)
        self.Data['src_subject_id'] = subs.apply(lambda s: s.SID)

    def fillColumns(self):
        """Copy every mapped HBP variable into its NDA column, one value per subject.

        Raises ``KeyError`` (from ``Subject.getVarData``) if a mapped HBP
        variable is not a column of the collected data.
        """
        for hbpv, ndaeq in self.varmatches.items():
            # bind the current name as a default so the lambda cannot see a later one
            self.Data[ndaeq] = subs.apply(lambda s, var=hbpv: s.getVarData(var))

    def prepData(self):
        """Populate the table: subject identifiers first, then every mapped variable."""
        self.addSubnums()
        self.fillColumns()
        
        
# Build one Measure per configured measure name, keyed by that name.
Measures = dict((label, Measure(label)) for label in measures)

In [27]:
# Pull the TEPS measure out of the Measures dict for inspection.
TEPS = Measures['TEPS']
# Full population of the table is left disabled here.
#TEPS.prepData()

# Disabled scratch steps for filling a single column by hand:
#vdata = subs.apply(lambda s: s.getVarData('TEPS_1'))
#TEPS.varmatches['TEPS_1']
#TEPS.Data[TEPS.varmatches['TEPS_1']] = vdata
# Show the HBP variable names for TEPS; the NaN entries are presumably blank
# cells in the sheet's 'HBP varname' column - TODO confirm.
TEPS.HBP_vars

0           NaN
1        subnum
2     StartDate
3           SD1
4           SD2
5        TEPS_1
6        TEPS_2
7        TEPS_3
8        TEPS_4
9        TEPS_5
10       TEPS_6
11       TEPS_7
12       TEPS_8
13       TEPS_9
14      TEPS_10
15      TEPS_11
16      TEPS_12
17      TEPS_13
18      TEPS_14
19      TEPS_15
20      TEPS_16
21      TEPS_17
22      TEPS_18
23          NaN
24          NaN
25          NaN
26          NaN
27          NaN
28          NaN
29          NaN
30          NaN
31          NaN
Name: HBP varname, dtype: object

In [None]:
class DataCleaner():
    """Formatting fixes applied to one measure's DataFrame.

    Only the sex recoding is implemented; ``formatDate``, ``fillMissing`` and
    ``setDtypes`` are placeholders that currently do nothing.
    """

    measure = None # a str
    data = None # a df

    def __init__(self):
        """Create an empty cleaner; ``cleanData`` supplies the measure and frame."""

    def formatSexData(self):
        """Recode the 'sex' column of ``self.data`` from 1/2 to 'M'/'F'.

        Values other than 1 and 2 are left as they are.
        """
        # Assign the result back rather than calling replace(inplace=True) on
        # the column selection, which is not guaranteed to update the frame.
        self.data['sex'] = self.data['sex'].replace({1: 'M', 2: 'F'})

    def formatDate(self):
        # **FILL IN LATER**
        return None

    def fillMissing(self):
        # **FILL IN LATER**
        # explicit return: a method body made only of a comment is a syntax error
        return None

    def setDtypes(self):
        # **FILL IN LATER**
        return None

    def cleanData(self, m, d):
        """Store measure name ``m`` and frame ``d``, run each step, return the frame.

        ``d`` itself is modified (no copy is taken). The sex recoding runs only
        when ``d`` has a 'sex' column.
        """
        self.measure = m
        self.data = d
        self.fillMissing()
        if 'sex' in self.data.columns:
            self.formatSexData()
        self.formatDate()
        self.setDtypes()
        return self.data
        

LEFT TO DO:

Subject class:


Measure Class:
1. add pGUIDs from each sub object

DataCleaner class:
(takes in pd.DataFrame obj, makes changes, returns obj)
1. add missing values where necessary (remember convo with Evan)
2. change sex data from 1s & 2s to Ms & Fs
3. set data types for each variable according to codebook
4. Add a function to format the interview_date properly (mm/dd/yyyy)


When writing to CSV, remember to set index=False so that pandas doesn't write the index column.


AFTER WRITING TO CSV:
1. reformat interview_date to MM/DD/YYYY in MS Excel -- first try upload with dates saved as string data and see what happens
2. add first row with label to spreadsheet
3. do a spot check of 5 subs for each measure to make sure everything looks good