In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

%matplotlib inline

import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
#df = pd.read_sas('./data/LLCP2021.XPT')
#df = pd.to_csv('./data/LLCP2021.csv')
df = pd.read_csv('./data/LLCP2021.csv')

In [3]:
# drop s0q1, interview state
# drop s0q8, file month
# drop s0q9, interview date
# drop s0q10, interview month
# drop s0q11, interview day
# drop s0q12, interview year
# drop s0q14, final disposition, can use for dropping partial interviews
# drop s0q15, annual sequence number
# drop s0q16, primary sampling unit
df = df[df.CTELENM1 != 2] # s0q1, dropping incorrect phone numbers

In [4]:
# drop s0q1, correct telephone number
# drop s0q2, private residence
# drop s0q3, college housing
# drop s0q4, resident of state
# drop s0q5, cellular telephone
# drop s0q6, 18 or older
# drop s0q7, college male or female
# drop s0q8, number af adults in household
# drop s0q9, landline sex
# drop s0q10, number of adult men in household
# drop s0q11, number of adult women in household
# drop s0q12, respondent selection
# drop s0q1, safe time to talk
# drop s0q2, correct phone number
# drop s0q3, is this a cell phone
# drop s0q4, 18 years or older cell phone
# drop s0q5, cell phone male or female
# drop s0q6, private residence cell phone
# drop s0q7, cell phone college housing
# drop s0q8, cell phone state confirmation
# drop s0q10, do you have a landline as well
# drop s0q11, number of adults in household
df = df.drop(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'SEQNO', 'DISPCODE', '_PSU', 'CTELENM1', 
                'PVTRESD1', 'COLGHOUS', 'STATERE1', 'CELPHON1', 'LADULT1','COLGSEX', 'NUMADULT', 'LANDSEX', 'NUMMEN', 
              'NUMWOMEN', 'RESPSLCT', 'SAFETIME', 'CTELNUM1', 'CELLFON5',
               'CADULT1','CELLSEX', 'PVTRESD3', 'CCLGHOUS', 'CSTATE1', 'LANDLINE', 'HHADULT'], axis=1)

In [5]:
pd.isna(df["SEXVAR"]).value_counts() # no nans

False    438691
Name: SEXVAR, dtype: int64

In [6]:
# dropping continual missing values from survey respondents leaving calls
df = df.dropna(axis=0, subset=['_VEGESU1'])
df = df.dropna(axis=0, subset=['VEGETAB2'])

In [7]:
# drop s3q1:PRIMINSR, health care access, _EDUCAG is a better metric
df = df.drop(['PRIMINSR'], axis=1)

In [8]:
# drop s8q2, suggested physical activity for arthritis, not relevant for diagnosis
# drop s8q3, taken course for arthritis, not relevant for diagnosis
# drop s8q4, limited because of joint symptoms, not relevant for diagnosis
# drop s8q5, arthritis affects work, not relevant for diagnosis
df = df.drop(['ARTHEXER', 'ARTHEDU', 'LMTJOIN3', 'ARTHDIS2'], axis=1)

In [9]:
df.columns

Index(['Unnamed: 0', 'SEXVAR', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH',
       'PERSDOC3', 'MEDCOST1', 'CHECKUP1', 'EXERANY2',
       ...
       '_FRTRES1', '_VEGRES1', '_FRUTSU1', '_VEGESU1', '_FRTLT1A', '_VEGLT1A',
       '_FRT16A', '_VEG23A', '_FRUITE1', '_VEGETE1'],
      dtype='object', length=268)

In [10]:
# drop s7q12, old when told had diabetes, not relevant for diagnosing
#df = df.drop(['DIABAGE3'], axis=1)

In [11]:
# drop s9q6, education level, _EDUCAG is better
df = df.drop(['EDUCA'], axis=1)

In [12]:
# drop s9q10, household telephones, not relevant
# drop s9q11, residential phones
# drop s9q12, cell phone for personal use
df = df.drop(['NUMHHOL3', 'NUMPHON3', 'CPDEMO1B'], axis=1)

In [13]:
# drop s9q15, number of children in household, not relevant
# drop s9q16, income level, _INCOMG1 is a better metric
df = df.drop(['CHILDREN', 'INCOME3'], axis=1)

In [14]:
# drop s9q18, reported weight pounds, WTKG3 is a better metric
# drop s9q19, reported height in feet and inches, HTM4 is a better metric
df = df.drop(['WEIGHT2', 'HEIGHT3'], axis=1)

In [15]:
# drop s13q3, not relevant for diagnosis
df = df.drop(['IMFVPLA2'], axis=1)

In [16]:
# drop HIVTSTD3 s14q2, makes the model vulnerable to differences in time taking test
df = df.drop(['HIVTSTD3'], axis=1)

In [17]:
# drop FRUIT2 s15q1, in favor of FRUTDA2_ instead
# drop FRUITJU2 s15q2, in favor of FTJUDA2_ instead
# drop FVGREEN1 s15q3, in favor of GRENDA1_ instead
# drop FRENCHF1 s15q4, in favor of FRNCHDA_ instead
# drop POTATOE1 s15q5, in favor of POTADA1_ instead
# drop VEGETAB2 s15q6, in favor of VEGEDA2_ instead
df = df.drop(['FRUIT2', 'FRUITJU2', 'FVGREEN1', 'FRENCHF1', 'POTATOE1', 'VEGETAB2'], axis=1)

In [18]:
# drop PREDIAB1 m1q2, not relevant for diagnosis
df = df.drop(['PREDIAB1'], axis=1)

In [19]:
# drop BLDSUGAR m2q2, introduces bias
# drop FEETCHK3 m2q3, introduces bias
# drop DOCTDIAB m2q4, already confirmed diabetes
# drop CHKHEMO3 m2q5, already confirmed diabetes
# drop FEETCHK m2q6, already confirmed diabetes
# drop EYEEXAM1 m2q7, already confirmed diabetes
# drop DIABEYE m2q8, already confirmed diabetes
# drop DIABEDU m2q9, already confirmed diabetes
# drop TOLDCFS m3q1, no one has it
# drop HAVECFS m3q2, no one has it
# drop WORKCFS m3q2, no one has it
# drop TOLDHEPC m4q1, barely anyone was asked
# drop TRETHEPC m4q2, barely anyone was asked
# drop PRIRHEPC m4q3, barely anyone was asked
# drop HAVEHEPC m4q4, barely anyone was asked
# drop HAVEHEPB m4q5, barely anyone was asked
# drop MEDSHEPB m4q6, barely anyone was asked
# drop HPVADVC4 m5q1, barely anyone was asked
# drop HPVADSHT m5q2, barely anyone was asked
# drop TETANUS1 m6q1, barely anyone was asked
# drop SHINGLE2 m7q1, barely anyone was asked
# drop LCSFIRST m9q1, barely anyone was asked
# drop LCSLAST m9q2, barely anyone was asked
# drop LCSNUMCG m9q3, barely anyone was asked
# drop LCSCTSCN m9q4, barely anyone was asked
# drop HADMAM m10q1, barely anyone was asked
# drop HOWLONG m10q2, barely anyone was asked
# drop CERVSCRN m10q3, barely anyone was asked
# drop CRVCLCNC m10q4, barely anyone was asked
# drop CRVCLPAP m10q5, barely anyone was asked
# drop CRVCLHPV m10q6, barely anyone was asked
# drop HADHYST2 m10q7, barely anyone was asked
# drop PSATEST1 m11q1, barely anyone was asked
# drop PSATIME1 m11q2, barely anyone was asked
# drop PCPSARS2 m11q3, barely anyone was asked
# drop PCSTALK m11q4, barely anyone was asked
# drop HADSIGM4 m12q1, barely anyone was asked
# drop COLNSIGM m12q2, barely anyone was asked
# drop COLNTES1 m12q3, barely anyone was asked
# drop SIGMTES1 m12q4, barely anyone was asked
# drop LASTSIG4 m12q5, barely anyone was asked
# drop COLNCNCR m12q6, barely anyone was asked
# drop VIRCOLO1 m12q7, barely anyone was asked
# drop VCLNTES1 m12q8, barely anyone was asked
# drop SMALSTOL m12q9, barely anyone was asked
# drop STOLTEST m12q10, barely anyone was asked
# drop STOOLDN1 m12q11, barely anyone was asked
# drop BLDSTFIT m12q12, barely anyone was asked
# drop SDNATES1 m12q13, barely anyone was asked
# drop CNCRDIFF m13q1, barely anyone was asked
# drop CNCRAGE m13q2, barely anyone was asked
# drop CNCRTYP1 m13q3, barely anyone was asked
# drop CSRVTRT3 m14q1, barely anyone was asked
# drop CSRVDOC1 m14q2, barely anyone was asked
# drop CSRVSUM m14q3, barely anyone was asked
# drop CSRVRTRN m14q4, barely anyone was asked
# drop CSRVINST m14q5, barely anyone was asked
# drop CSRVINSR m14q6, barely anyone was asked
# drop CSRVDEIN m14q7, barely anyone was asked
# drop CSRVCLIN m14q8, barely anyone was asked
# drop CSRVPAIN m15q1, barely anyone was asked
# drop CSRVCTL2 m15q2, barely anyone was asked
# drop HOMBPCHK m16q1, barely anyone was asked
# drop HOMRGCHK m16q2, barely anyone was asked
# drop WHEREBP m16q3, barely anyone was asked
# drop SHAREBP m16q4, barely anyone was asked
# drop WTCHSALT m17q1, barely anyone was asked
# drop DRADVISE m17q2, barely anyone was asked
# drop CIMEMLOS m18q1, barely anyone was asked
# drop CDHOUSE m18q2, barely anyone was asked
# drop CDASSIST m18q3, barely anyone was asked
# drop CDHELP m18q4, barely anyone was asked
# drop CDSOCIAL m18q5, barely anyone was asked
# drop CDDISCUS m18q6, barely anyone was asked
# drop CAREGIV1 m19q1, barely anyone was asked
# drop CRGVREL4 m19q2, barely anyone was asked
# drop CRGVLNG1 m19q3, barely anyone was asked
# drop CRGVHRS1 m19q4, barely anyone was asked
# ...
# drop SOFEMALE m28q1b, barely anyone was asked
# drop TRNSGNDR m28q2, almost no yeses and outlier
# drop QSTVER m1q1, not relevant
# drop QSTLANG m1q1, not relevant
#QSTVER

df = df.drop(['BLDSUGAR', 'FEETCHK3', 'DOCTDIAB', 'CHKHEMO3', 'FEETCHK', 'EYEEXAM1', 'DIABEYE', 'DIABEDU'], axis=1)
df = df.drop(['TOLDCFS', 'HAVECFS', 'WORKCFS', 'TOLDHEPC', 'TRETHEPC', 'PRIRHEPC', 'HAVEHEPC'], axis=1)
df = df.drop(['HAVEHEPB', 'MEDSHEPB', 'HPVADVC4', 'HPVADSHT', 'TETANUS1', 'SHINGLE2', 'LCSFIRST'], axis=1)
df = df.drop(['LCSLAST', 'LCSNUMCG', 'LCSCTSCN', 'HADMAM', 'HOWLONG', 'CERVSCRN', 'CRVCLCNC'], axis=1)
df = df.drop(['CRVCLPAP', 'CRVCLHPV', 'HADHYST2', 'PSATEST1', 'PSATIME1', 'PCPSARS2', 'PCSTALK'], axis=1)
df = df.drop(['HADSIGM4', 'COLNSIGM', 'COLNTES1', 'SIGMTES1', 'LASTSIG4', 'COLNCNCR', 'VIRCOLO1'], axis=1)
df = df.drop(['VCLNTES1', 'SMALSTOL', 'STOLTEST', 'STOOLDN1', 'BLDSTFIT', 'SDNATES1', 'CNCRDIFF'], axis=1)
df = df.drop(['CNCRAGE', 'CNCRTYP1', 'CSRVTRT3', 'CSRVDOC1', 'CSRVSUM', 'CSRVRTRN', 'CSRVINST'], axis=1)
df = df.drop(['CSRVINSR', 'CSRVDEIN', 'CSRVCLIN', 'CSRVPAIN', 'CSRVCTL2', 'HOMBPCHK', 'HOMRGCHK'], axis=1)
df = df.drop(['WHEREBP', 'SHAREBP', 'WTCHSALT', 'DRADVISE', 'CIMEMLOS', 'CDHOUSE', 'CDASSIST', 'CDHELP'], axis=1)
df = df.drop(['CDSOCIAL', 'CDDISCUS', 'CAREGIV1', 'CRGVREL4', 'CRGVLNG1', 'CRGVHRS1', 'CRGVPRB3'], axis=1)
df = df.drop(['CRGVALZD', 'CRGVPER1', 'CRGVHOU1', 'CRGVEXPT', 'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS'], axis=1)
df = df.drop(['ACEPRISN', 'ACEDIVRC', 'ACEPUNCH', 'ACEHURT1', 'ACESWEAR', 'ACETOUCH', 'ACETTHEM'], axis=1)
df = df.drop(['ACEHVSEX', 'ACEADSAF', 'ACEADNED', 'MARIJAN1', 'RSNMRJN2', 'LASTSMK2', 'STOPSMK2'], axis=1)
df = df.drop(['FIREARM5', 'GUNLOAD', 'LOADULK2', 'RCSGENDR', 'RCSRLTN2', 'CASTHDX2', 'CASTHNO2'], axis=1)
df = df.drop(['BIRTHSEX', 'SOMALE', 'SOFEMALE', 'TRNSGNDR', 'QSTVER', 'QSTLANG'], axis=1)

In [20]:
# drop MSCODE m1q2, not relevant
# drop _STSTR m1q3, not relevant
# drop _STRWT m1q4, not relevant
# drop _RAWRAKE m1q7, not relevant
# drop _WT2RAKE m1q8, not relevant

df = df.drop(['MSCODE', '_STSTR', '_STRWT', '_RAWRAKE'], axis=1)

In [21]:
# drop _CHISPNC m1q01, not relevant
# drop _CRACE1 m1q04, not enough data
# drop _CPRACE1 m1q05, not enough data
# drop CAGEG m1q04, not relevant
# drop _CLLCPWT m1q5, not relevant
# drop _DUALUSE m1q2, not relevant
# drop _DUALCOR m1q3, not relevant
# drop _LLCPWT2 m1q4, not relevant
# drop _LLCPWT m1q21, not relevant
# drop _RFHLTH m1q5, already in data
# drop _PHYS14D m2q1, already in data
# drop _MENT14D m2q2, not relevant
df = df.drop(['_CHISPNC', '_CRACE1', '_CPRACE1', 'CAGEG', '_CLLCPWT', '_DUALUSE', '_DUALCOR', '_LLCPWT2'], axis=1)
df = df.drop(['_LLCPWT', '_RFHLTH', '_MENT14D'], axis=1)

In [22]:
# drop _HCVU652 m3s2, already contains this data
# drop _TOTINDA m4q1, already contains this data
# drop _RFHYPE6 m5q1, already contains this data
# drop _CHOLCH3 m6q1, already contains this data
# ...
df = df.drop(['_HCVU652', '_TOTINDA', '_RFHYPE6', '_CHOLCH3','_RFCHOL3', '_MICHD', '_LTASTH1', '_CASTHM1'], axis=1)
df = df.drop(['_ASTHMS1', '_DRDXAR3', '_LMTACT3', '_LMTWRK3', '_PRACE1', '_MRACE1', '_HISPANC', '_RACE'], axis=1)
df = df.drop(['_RACEG21', '_RACEGR3', '_RACEPRV', '_SEX'], axis=1)

In [23]:
# drop _AGEG5YR m9q12, _AGE80 is better
# drop _AGE65YR m9q13, already contains this data
# ...
df = df.drop(['_AGEG5YR', '_AGE65YR'], axis=1)

In [24]:
# drop _AGE_G m9q15, _AGE80 is better
df = df.drop(['_AGE_G'], axis=1)

In [25]:
# drop HTM4 m9q17, HTIN4 is better
df = df.drop(['HTM4'], axis=1)

In [26]:
# drop _BMI5 m9q19, data already contained
# drop _BMI5CAT m9q20, data already contained
# drop _RFBMI5 m9q21, data already contained
# drop _CHLDCNT m9q22, not relevant
df = df.drop(['_BMI5', '_BMI5CAT', '_RFBMI5', '_CHLDCNT'], axis=1)

In [27]:
# drop _SMOKER3 m11q1, data already contained
# drop _RFSMOK3 m11q2, data already contained
# drop _CURECI1 m12q1, data already contained
# drop DRNKANY5 m11q2, data already contained
# drop DROCDY3_ m12q2, data already contained
# drop _RFBING5 m12q3, data already contained
# ...
df = df.drop(['_SMOKER3', '_RFSMOK3', '_CURECI1', 'DRNKANY5', 'DROCDY3_', '_RFBING5', '_DRNKWK1'], axis=1)
df = df.drop(['_RFDRHV7', '_FLSHOT7', '_PNEUMO3', '_AIDTST4'], axis=1)

In [28]:
# drop _MISVEG1, missing response feature
# drop _FRTRES1, missing response feature
# drop _VEGRES1, missing response feature
# drop _FRUTSU1, Total fruits consumed per day, data already available
# drop _VEGESU1, Total vegetables consumed per day, data already available
# drop _FRTLT1A, Consume Fruit 1 or more times per day, data already available
# drop _VEGLT1A, Consume vegatables 1 or more times per day, data already available
# drop _FRT16A, Reported consuming Fruit >16 per day, data already available
# drop _VEG23A, Reported consuming Vegetables >23 per day, data already available
# drop _FRUITE1, Fruit Exclusion from analyses, data already available
# drop _VEGETE1, Vegetable Exclusion from analyses, data already available

df = df.drop(['_MISVEG1', '_FRTRES1', '_VEGRES1', '_FRUTSU1', '_VEGESU1', '_FRTLT1A', '_VEGLT1A'], axis=1)
df = df.drop(['_FRT16A', '_VEG23A', '_FRUITE1', '_VEGETE1'], axis=1)

In [29]:
df = df.drop(['USEMRJN3'], axis=1)

In [30]:
# drop s7q12, old when told had diabetes, not relevant for diagnosing
df = df.drop(['DIABAGE3'], axis=1)

In [31]:
df.to_csv('./data/brfss_reduced.csv')