In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw CSV once; every later cell derives its frame from orig_df.
orig_df = pd.read_csv(filepath_or_buffer='BRFSS_2020_DATA.csv')

# Cleaning:

In [3]:
# Per-column count of missing values in the raw frame.
orig_df.isnull().sum()

STATE FIPS CODE                                                                                                               0
FILE MONTH                                                                                                                    0
INTERVIEW DATE                                                                                                                0
INTERVIEW MONTH                                                                                                               0
INTERVIEW DAY                                                                                                                 0
                                                                                                                          ...  
RESPONDENTS AGED 50-75 WHO HAVE HAD A STOOL DNA TEST WITHIN THE PAST THREE YEARS                                         225601
RESPONDENTS AGED 50-75 WHO HAVE HAD A VIRTUAL COLONOSCOPY WITHIN THE PAST FIVE YEARS                    

#### Check which columns will be removed by the criterion (a column must have at least 300k non-null values):

In [4]:
# Preview the columns that the "at least 300000 non-null values" rule will drop.
# A column has fewer than `min_non_null` non-null values exactly when its null
# count exceeds (number of rows - min_non_null). The row count is taken from the
# frame itself rather than hard-coded, so the check stays correct if the input
# file changes.
min_non_null = 300000
null_counts = orig_df.isna().sum()
# `temp` is a boolean Series indexed by column name (True = column is too sparse).
temp = null_counts > (len(orig_df) - min_non_null)
list(temp[temp].index)

['CORRECT TELEPHONE NUMBER?',
 'PRIVATE RESIDENCE?',
 'DO YOU LIVE IN COLLEGE HOUSING?',
 'RESIDENT OF STATE',
 'CELLULAR TELEPHONE',
 'ARE YOU 18 YEARS OF AGE OR OLDER?',
 'ARE YOU MALE OR FEMALE?',
 'NUMBER OF ADULTS IN HOUSEHOLD',
 'ARE YOU MALE OR FEMALE?.1',
 'NUMBER OF ADULT MEN IN HOUSEHOLD',
 'NUMBER OF ADULT WOMEN IN HOUSEHOLD',
 'RESPONDENT SELECTION',
 'SAFE TIME TO TALK?',
 'CORRECT PHONE NUMBER?',
 'IS THIS A CELL PHONE?',
 'ARE YOU 18 YEARS OF AGE OR OLDER?.1',
 'ARE YOU MALE OR FEMALE?.2',
 'DO YOU LIVE IN A PRIVATE RESIDENCE?',
 'DO YOU LIVE IN COLLEGE HOUSING?.1',
 'DO YOU CURRENTLY LIVE IN  ____(STATE)____?',
 'DO YOU ALSO HAVE A LANDLINE TELEPHONE?',
 'NUMBER OF ADULTS IN HOUSEHOLD.1',
 'POOR PHYSICAL OR MENTAL HEALTH',
 'STILL HAVE ASTHMA',
 'AGE WHEN TOLD DIABETES',
 'HOUSEHOLD TELEPHONES',
 'RESIDENTIAL PHONES',
 'PREGNANCY STATUS',
 'FREQUENCY OF DAYS NOW SMOKING',
 'STOPPED SMOKING IN PAST 12 MONTHS',
 'INTERVAL SINCE LAST SMOKED',
 'AVG ALCOHOLIC DRINKS PER DAY

#### Check a few potentially useful columns from the list above. Maybe we can work with them?

In [5]:
# Missing-value counts for a handful of columns picked from the list above.
orig_df.loc[:, [
    'EVER BEEN TOLD YOU HAVE PRE-DIABETES OR BORDERLINE DIABETES',
    'TOLD HAD HEPATITIS C',
    'TOLD  HAD HEPATITIS B',
    'HOW OLD WHEN YOU FIRST STARTED SMOKING?',
    'HOW OLD WHEN YOU LAST SMOKED?',
    'ON AVERAGE, HOW MANY CIGARETTES DO YOU SMOKE EACH DAY?',
]].isna().sum()

EVER BEEN TOLD YOU HAVE PRE-DIABETES OR BORDERLINE DIABETES    182965
TOLD HAD HEPATITIS C                                           383151
TOLD  HAD HEPATITIS B                                          383186
HOW OLD WHEN YOU FIRST STARTED SMOKING?                        387914
HOW OLD WHEN YOU LAST SMOKED?                                  388332
ON AVERAGE, HOW MANY CIGARETTES DO YOU SMOKE EACH DAY?         388351
dtype: int64

#### To preserve most of the rows, we'll remove all columns that fall below that threshold:

In [6]:
# Column-wise drop: keep only columns holding at least 300000 non-null values.
# Rows are untouched at this stage.
clean_df_1 = orig_df.dropna(axis='columns', thresh=300000)
clean_df_1.shape

(401958, 112)

In [7]:
# Names of the columns that survived the sparsity filter.
clean_df_1.columns.tolist()

['STATE FIPS CODE',
 'FILE MONTH',
 'INTERVIEW DATE',
 'INTERVIEW MONTH',
 'INTERVIEW DAY',
 'INTERVIEW YEAR',
 'FINAL DISPOSITION',
 'ANNUAL SEQUENCE NUMBER',
 'PRIMARY SAMPLING UNIT',
 'SEX OF RESPONDENT',
 'GENERAL HEALTH',
 'NUMBER OF DAYS PHYSICAL HEALTH NOT GOOD',
 'NUMBER OF DAYS MENTAL HEALTH NOT GOOD',
 'HAVE ANY HEALTH CARE COVERAGE',
 'MULTIPLE HEALTH CARE PROFESSIONALS',
 'COULD NOT SEE DR. BECAUSE OF COST',
 'LENGTH OF TIME SINCE LAST ROUTINE CHECKUP',
 'EXERCISE IN PAST 30 DAYS',
 'HOW MUCH TIME DO YOU SLEEP',
 'EVER DIAGNOSED WITH HEART ATTACK',
 'EVER DIAGNOSED WITH ANGINA OR CORONARY HEART DISEASE',
 'EVER DIAGNOSED WITH A STROKE',
 'EVER TOLD HAD ASTHMA',
 '(EVER TOLD) YOU HAD SKIN CANCER?',
 '(EVER TOLD) YOU HAD ANY OTHER TYPES OF CANCER?',
 '(EVER TOLD) YOU HAD (COPD) CHRONIC OBSTRUCTIVE PULMONARY DISEASE, EMPHYSEMA OR CHRONIC BRONCHITIS?',
 'TOLD HAVE ARTHRITIS',
 '(EVER TOLD) YOU HAD A DEPRESSIVE DISORDER',
 'EVER TOLD YOU HAVE KIDNEY DISEASE?',
 '(EVER TOLD) YOU 

In [8]:
# Per-column count of missing values remaining after the column filter.
clean_df_1.isnull().sum()

STATE FIPS CODE                                       0
FILE MONTH                                            0
INTERVIEW DATE                                        0
INTERVIEW MONTH                                       0
INTERVIEW DAY                                         0
                                                  ...  
HEAVY ALCOHOL CONSUMPTION  CALCULATED VARIABLE        0
ALWAYS OR NEARLY ALWAYS WEAR SEAT BELTS               0
ALWAYS WEAR SEAT BELTS                                0
DRINKING AND DRIVING                                  0
EVER BEEN TESTED FOR HIV CALCULATED VARIABLE      34037
Length: 112, dtype: int64

#### These columns do not look very interesting to us, so we'll remove them too:

In [9]:
# Hand-picked column labels to discard from clean_df_1 in the next cell.
# Labels must match the DataFrame column names exactly, including any
# repeated spaces and trailing punctuation, so do not normalise them.
columns_to_remove = [
 'STATE FIPS CODE',
 'FILE MONTH',
 'INTERVIEW DATE',
 'INTERVIEW MONTH',
 'INTERVIEW DAY',
 'INTERVIEW YEAR',
 'FINAL DISPOSITION',
 'ANNUAL SEQUENCE NUMBER',
 'PRIMARY SAMPLING UNIT',
 'HAVE ANY HEALTH CARE COVERAGE',
 'MULTIPLE HEALTH CARE PROFESSIONALS',
 'COULD NOT SEE DR. BECAUSE OF COST',
 'LAST VISITED DENTIST OR DENTAL CLINIC',
 'NUMBER OF PERMANENT TEETH REMOVED',
 'MARITAL STATUS',
 'EDUCATION LEVEL',
 'OWN OR RENT HOME',
 'DO YOU HAVE A CELL PHONE FOR PERSONAL USE?',
 'ARE YOU A VETERAN',
 'EMPLOYMENT STATUS',
 'NUMBER OF CHILDREN IN HOUSEHOLD',
 'INCOME LEVEL',
 'ARE YOU DEAF OR DO YOU HAVE SERIOUS DIFFICULTY HEARING?',
 'BLIND OR DIFFICULTY SEEING',
 'DIFFICULTY CONCENTRATING OR REMEMBERING',
 'DIFFICULTY WALKING OR CLIMBING STAIRS',
 'DIFFICULTY DRESSING OR BATHING',
 'DIFFICULTY DOING ERRANDS ALONE',
 'ADULT FLU SHOT/SPRAY PAST 12 MOS',
 'PNEUMONIA SHOT EVER',
 'HOW OFTEN USE SEATBELTS IN CAR?',
 'EVER TESTED H.I.V.',
 'DO ANY HIGH RISK SITUATIONS APPLY',
 'QUESTIONNAIRE VERSION IDENTIFIER',
 'LANGUAGE IDENTIFIER',
 'METROPOLITAN STATUS',
 'URBAN/RURAL STATUS',
 'SAMPLE DESIGN STRATIFICATION VARIABLE',
 'STRATUM WEIGHT',
 'RAW WEIGHTING FACTOR USED IN RAKING',
 'DESIGN WEIGHT USED IN RAKING',
 'IMPUTED RACE/ETHNICITY VALUE',
 'DUAL PHONE USE CATEGORIES',
 'TRUNCATED DESIGN WEIGHT USED IN ADULT COMBINED LAN  LINE AND CELL PHONE RAKING',
 'FINAL WEIGHT: LAND-LINE AND CELL-PHONE DATA',
 'RESPONDENTS AGED 18-64 WITH HEALTH CARE COVERAGE',
 'RISK FACTOR FOR HAVING HAD PERMANENT TEETH EXTRACTED',
 'ADULTS WHO HAVE VISITED A DENTIST, DENTAL HYGENIST OR DENTAL CLINIC WITHIN THE PAST YEAR',
 'COMPUTED PREFERRED RACE',
 'CALCULATED NON-HISPANIC RACE INCLUDING MULTIRACIAL',
 'HISPANIC, LATINO/A, OR SPANISH ORIGIN CALCULATED VARIABLE',
 'COMPUTED RACE-ETHNICITY GROUPING',
 'COMPUTED NON-HISPANIC WHITES/ALL OTHERS RACE CATEGORIES RACE/ETHNIC GROUP CODES USED IN POST-STRATIFICATION.',
 'COMPUTED FIVE LEVEL RACE/ETHNICITY CATEGORY.',
 'COMPUTED RACE GROUPS USED FOR INTERNET PREVALENCE TABLES',
 'CALCULATED SEX VARIABLE',
 'COMPUTED HEIGHT IN INCHES',
 'COMPUTED HEIGHT IN METERS',
 'COMPUTED WEIGHT IN KILOGRAMS',
 'COMPUTED NUMBER OF CHILDREN IN HOUSEHOLD',
 'COMPUTED LEVEL OF EDUCATION COMPLETED CATEGORIES',
 'COMPUTED INCOME CATEGORIES',
 'CURRENT SMOKING CALCULATED VARIABLE',
 'ALWAYS OR NEARLY ALWAYS WEAR SEAT BELTS',
 'ALWAYS WEAR SEAT BELTS',
 'DRINKING AND DRIVING'
]

In [10]:
# Discard the hand-picked columns; clean_df_1 itself is left unchanged.
clean_df_2 = clean_df_1.drop(columns=columns_to_remove)
clean_df_2.shape

(401958, 46)

In [11]:
# Per-column count of missing values in the remaining columns.
clean_df_2.isnull().sum()

SEX OF RESPONDENT                                                                                            0
GENERAL HEALTH                                                                                               8
NUMBER OF DAYS PHYSICAL HEALTH NOT GOOD                                                                      5
NUMBER OF DAYS MENTAL HEALTH NOT GOOD                                                                        5
LENGTH OF TIME SINCE LAST ROUTINE CHECKUP                                                                    5
EXERCISE IN PAST 30 DAYS                                                                                     3
HOW MUCH TIME DO YOU SLEEP                                                                                   3
EVER DIAGNOSED WITH HEART ATTACK                                                                             6
EVER DIAGNOSED WITH ANGINA OR CORONARY HEART DISEASE                                                         3
E

#### Remove rows with nulls:

In [12]:
# Row-wise drop: discard every row that has a missing value in any column.
# NOTE(review): the head() preview later in this notebook shows values such as
# 88.0 in the day-count columns. These are presumably coded survey answers
# rather than NaN, and dropna() leaves them in place; confirm against the codebook.
clean_df_3 = clean_df_2.dropna(axis='index', how='any')
clean_df_3.shape

(336836, 46)

#### We are left with these features:

In [13]:
# Names of the features that remain after cleaning.
clean_df_3.columns.tolist()

['SEX OF RESPONDENT',
 'GENERAL HEALTH',
 'NUMBER OF DAYS PHYSICAL HEALTH NOT GOOD',
 'NUMBER OF DAYS MENTAL HEALTH NOT GOOD',
 'LENGTH OF TIME SINCE LAST ROUTINE CHECKUP',
 'EXERCISE IN PAST 30 DAYS',
 'HOW MUCH TIME DO YOU SLEEP',
 'EVER DIAGNOSED WITH HEART ATTACK',
 'EVER DIAGNOSED WITH ANGINA OR CORONARY HEART DISEASE',
 'EVER DIAGNOSED WITH A STROKE',
 'EVER TOLD HAD ASTHMA',
 '(EVER TOLD) YOU HAD SKIN CANCER?',
 '(EVER TOLD) YOU HAD ANY OTHER TYPES OF CANCER?',
 '(EVER TOLD) YOU HAD (COPD) CHRONIC OBSTRUCTIVE PULMONARY DISEASE, EMPHYSEMA OR CHRONIC BRONCHITIS?',
 'TOLD HAVE ARTHRITIS',
 '(EVER TOLD) YOU HAD A DEPRESSIVE DISORDER',
 'EVER TOLD YOU HAVE KIDNEY DISEASE?',
 '(EVER TOLD) YOU HAD DIABETES',
 'REPORTED WEIGHT IN POUNDS',
 'REPORTED HEIGHT IN FEET AND INCHES',
 'SMOKED AT LEAST 100 CIGARETTES',
 'USE OF SMOKELESS TOBACCO PRODUCTS',
 'DAYS IN PAST 30 HAD ALCOHOLIC BEVERAGE',
 'ADULTS WITH GOOD OR BETTER HEALTH',
 'COMPUTED PHYSICAL HEALTH STATUS',
 'COMPUTED MENTAL HEALT

In [14]:
# Preview the first five rows of the cleaned frame (head() defaults to n=5).
clean_df_3.head()

Unnamed: 0,SEX OF RESPONDENT,GENERAL HEALTH,NUMBER OF DAYS PHYSICAL HEALTH NOT GOOD,NUMBER OF DAYS MENTAL HEALTH NOT GOOD,LENGTH OF TIME SINCE LAST ROUTINE CHECKUP,EXERCISE IN PAST 30 DAYS,HOW MUCH TIME DO YOU SLEEP,EVER DIAGNOSED WITH HEART ATTACK,EVER DIAGNOSED WITH ANGINA OR CORONARY HEART DISEASE,EVER DIAGNOSED WITH A STROKE,...,COMPUTED BODY MASS INDEX,COMPUTED BODY MASS INDEX CATEGORIES,OVERWEIGHT OR OBESE CALCULATED VARIABLE,COMPUTED SMOKING STATUS,DRINK ANY ALCOHOLIC BEVERAGES IN PAST 30 DAYS,COMPUTED DRINK-OCCASIONS-PER-DAY,BINGE DRINKING CALCULATED VARIABLE,COMPUTED NUMBER OF DRINKS OF ALCOHOL BEVERAGES PER WEEK,HEAVY ALCOHOL CONSUMPTION CALCULATED VARIABLE,EVER BEEN TESTED FOR HIV CALCULATED VARIABLE
0,2,2.0,3.0,30.0,4.0,1.0,5.0,2.0,2.0,2.0,...,1660.0,1.0,1,1,2,0,1,0,1,1.0
4,2,2.0,88.0,88.0,1.0,1.0,7.0,2.0,2.0,1.0,...,2034.0,2.0,1,4,2,0,1,0,1,9.0
5,1,4.0,20.0,30.0,2.0,1.0,8.0,2.0,2.0,2.0,...,2658.0,3.0,2,3,2,0,1,0,1,1.0
6,2,3.0,88.0,88.0,1.0,2.0,6.0,2.0,2.0,2.0,...,2421.0,2.0,1,4,2,0,1,0,1,2.0
8,2,2.0,28.0,88.0,1.0,1.0,8.0,2.0,2.0,2.0,...,2371.0,2.0,1,4,2,0,1,0,1,2.0
