# Team 6: Data mining project

# Project Title: Youth at Risk: Predicting Suicidal Behavior in Teens
This file contains the mapping details which is explained in file Data mining - YRBSS project - data preparation and mapping details.pdf.
Reference files: 2021_YRBS_Data_Users_Guide_508.pdf and 2021-YRBS-Standard-HS-Questionnaire.pdf
This is survey data for 2021 from Youth Rish Behavior Surveillance System conducted by Centers for Disease Control and Prevention

In [None]:
# Loading the libaries for dataset
import numpy as np
import pandas as pd
import pdfplumber

# Visualization Libraies
import matplotlib.pyplot as plt
import seaborn as sns

## Reading in the survey response data for 2021

In [None]:
# Reading in the the survey response Excel file into a df (Raw Data)
yrbs_survey_response_raw_df = pd.read_excel('XXHq.xlsx', engine='openpyxl')

In [None]:
yrbs_survey_response_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17232 entries, 0 to 17231
Columns: 109 entries, site to psu
dtypes: float64(102), int64(3), object(4)
memory usage: 14.3+ MB


In [None]:
yrbs_survey_response_raw_df.describe

<bound method NDFrame.describe of       site  raceeth q6orig q7orig  record  orig_rec   q1   q2   q3   q4  ...  \
0       XX      3.0    502    155       1       NaN  5.0  1.0  3.0  2.0  ...   
1       XX      7.0    509    NaN       2       NaN  4.0  2.0  NaN  1.0  ...   
2       XX      6.0    507    136       3       NaN  4.0  2.0  2.0  1.0  ...   
3       XX      6.0    509    133       4       NaN  4.0  2.0  2.0  1.0  ...   
4       XX      7.0    603    196       5       NaN  3.0  2.0  2.0  1.0  ...   
...    ...      ...    ...    ...     ...       ...  ...  ...  ...  ...  ...   
17227   XX      5.0    411    150   17502       NaN  5.0  1.0  2.0  2.0  ...   
17228   XX      5.0    600    165   17503       NaN  6.0  2.0  4.0  2.0  ...   
17229   XX      5.0    507    200   17504       NaN  4.0  2.0  1.0  2.0  ...   
17230   XX      5.0    506    190   17505       NaN  4.0  1.0  1.0  2.0  ...   
17231   XX      5.0    502    125   17507       NaN  7.0  1.0  4.0  2.0  ...   

     

In [None]:
yrbs_survey_response_raw_df.columns

Index(['site', 'raceeth', 'q6orig', 'q7orig', 'record', 'orig_rec', 'q1', 'q2',
       'q3', 'q4',
       ...
       'q94', 'q95', 'q96', 'q97', 'q98', 'q99', 'BMIPCT', 'weight', 'stratum',
       'psu'],
      dtype='object', length=109)

In [None]:
# Defining function to get counts of NaN, None, and 0 for all columns and also total missing
def get_missing_counts_by_column(df):
    total_count = len(df)
    # Count of 0s
    zero_count = (df == 0).sum()
    # Count of NaNs
    nan_count = df.isna().sum()
    # Count of None
    none_count = df.apply(lambda col: col.map(lambda x: x is None).sum())
    # Count of all null-like values
    null_count = df.isnull().sum()

    # df to summarize results
    report = pd.DataFrame({
        'column': df.columns,
        'total_row_count': total_count,
        'zero_count': zero_count,
        'nan_count': nan_count,
        'none_count': none_count,
        'null_count': null_count,
    })

    # Total column summing all counts for each missing column
    report['total_missing'] = report[['zero_count', 'nan_count',
                                                 'none_count', 'null_count']].sum(axis=1)

    return report

In [None]:
# Applying function on the data
missing_summary = get_missing_counts_by_column(yrbs_survey_response_raw_df)

# % missing
missing_summary['pecentage_missing'] = (missing_summary['total_missing'] / missing_summary['total_row_count'] * 100).map('{:.2f}%'.format)

# Sorting by descending on % missing
missing_summary = missing_summary.sort_values(by='pecentage_missing', ascending=False)

In [None]:
missing_summary.head(30)

Unnamed: 0,column,total_row_count,zero_count,nan_count,none_count,null_count,total_missing,pecentage_missing
q98,q98,17232,0,8181,0,8181,16362,94.95%
q90,q90,17232,0,8149,0,8149,16298,94.58%
q97,q97,17232,0,8140,0,8140,16280,94.48%
q99,q99,17232,0,8109,0,8109,16218,94.12%
q95,q95,17232,0,8101,0,8101,16202,94.02%
q92,q92,17232,0,7973,0,7973,15946,92.54%
q89,q89,17232,0,7968,0,7968,15936,92.48%
q67,q67,17232,0,7959,0,7959,15918,92.37%
q96,q96,17232,0,7863,0,7863,15726,91.26%
q17,q17,17232,0,814,0,814,1628,9.45%


## Reading in the mapping data (data guide) for 2021

### Answer label mapping: Manual approach using code generation to page-wise df approach
### The page dfs for each page will be appended to a master mapping df in the end

#### Page 1

In [None]:
# Raw input code for page 1 of Appendix C
appendix_c_raw_pg1 = {
    'question_number': ['Q1'] * 8 + ['Q2'] * 3 + ['Q3'] * 6 + ['Q4'] * 3,
    'question': [
        'How old are you?'] * 8 +
        ['What is your sex?'] * 3 +
        ['In what grade are you?'] * 6 +
        ['Are you Hispanic or Latino?'] * 3,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 'NA',
                    1, 2, 3, 4, 5, 'NA',
                    1, 2, 'NA'
                   ],
    'answer_label': [
        '12 years old or younger', '13 years old', '14 years old', '15 years old',
        '16 years old', '17 years old', '18 years old or older', 'Missing',
        'Female', 'Male', 'Missing',
        '9th grade', '10th grade', '11th grade', '12th grade', 'Ungraded or other grade', 'Missing',
        'Yes', 'No', 'Missing'
    ],
    'frequency': [39, 62, 3403, 4427, 4276, 3904, 1023, 98,
                  8152, 8816, 264,
                  4646, 4466, 4118, 3843, 23, 136,
                  3258, 13700, 274
                 ],
    'weighted_percentage': [0.2, 0.4, 20.0, 25.3, 24.7, 23.6, 5.8, 'NA',
                            48.3, 51.7, 'NA',
                            26.6, 25.4, 24.3, 23.5, 0.2, 'NA',
                            25.3, 74.7, 'NA'
                           ]
}


# Converting to df
df_appendix_c_raw_pg1 = pd.DataFrame(appendix_c_raw_pg1)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg1 = df_appendix_c_raw_pg1.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg1

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q1,How old are you?,1.0,12 years old or younger,39,0.2
1,Q1,How old are you?,2.0,13 years old,62,0.4
2,Q1,How old are you?,3.0,14 years old,3403,20.0
3,Q1,How old are you?,4.0,15 years old,4427,25.3
4,Q1,How old are you?,5.0,16 years old,4276,24.7
5,Q1,How old are you?,6.0,17 years old,3904,23.6
6,Q1,How old are you?,7.0,18 years old or older,1023,5.8
7,Q1,How old are you?,,Missing,98,
8,Q2,What is your sex?,1.0,Female,8152,48.3
9,Q2,What is your sex?,2.0,Male,8816,51.7


#### Page 2

In [None]:
# Raw input code for page 2 of Appendix C
# Page 2 is multi-select and hence, will need to be encoded manually as a dummy df
appendix_c_raw_pg2 = {
    'question_number': ['Q5'] * 1,
    'question': ['What is your race?'] * 1,
    'answer_code': ['NA'],
    'answer_label': ['NA'],
    'frequency': ['NA'],
    'weighted_percentage': ['NA']
}

# Converting to df
df_appendix_c_raw_pg2 = pd.DataFrame(appendix_c_raw_pg2)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg2 = df_appendix_c_raw_pg2.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg2

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q5,What is your race?,,,,


#### Page 3

In [None]:
# Raw input code for page 3 of Appendix C
# Questions 6 and 7 are related to height and weight, hence no drop-down options are available
# They have to be encoded manually as dummy values
appendix_c_raw_pg3 = {
    'question_number': ['Q6'] * 1 + ['Q7'] * 1 + ['Q8'] * 6 + ['Q9'] * 6 + ['Q10'] * 7,
    'question': ['How tall are you without your shoes on? (data in meters)'] * 1 +
        ['How much do you weigh without your shoes on? (data in kg)'] * 1 +
        ['How often do you wear a seat belt when riding in a car driven by someone else?'] * 6 +
        ['During the past 30 days, how many times did you ride in a car or other vehicle driven by someone who had been drinking alcohol?'] * 6 +
        ['During the past 30 days, how many times did you drive a car or other vehicle when you had been drinking alcohol?'] * 7,
    'answer_code': [
        'NA',
        'NA',
        1, 2, 3, 4, 5, 'NA',
        1, 2, 3, 4, 5, 'NA',
        1, 2, 3, 4, 5, 6, 'NA'
    ],
    'answer_label': [
        'NA',
        'NA',
        'Never', 'Rarely', 'Sometimes', 'Most of the time', 'Always', 'Missing',
        '0 times', '1 time', '2 or 3 times', '4 or 5 times', '6 or more times', 'Missing',
        'I did not drive a car or other vehicle during the past 30 days', '0 times', '1 time', '2 or 3 times', '4 or 5 times', '6 or more times', 'Missing'
    ],
    'frequency': [
        'NA',
        'NA',
        265, 512, 1109, 3253, 7638, 4455,
        14501, 1031, 827, 187, 427, 259,
        6668, 8291, 184, 107, 30, 61, 1891
    ],
    'weighted_percentage': [
        'NA',
        'NA',
        1.7, 3.8, 8.9, 25.4, 60.1, 'NA',
        85.9, 6.1, 4.6, 1.1, 2.4, 'NA',
        44.3, 53.1, 1.3, 0.7, 0.2, 0.4, 'NA'
    ]
}

# Converting to df
df_appendix_c_raw_pg3 = pd.DataFrame(appendix_c_raw_pg3)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg3 = df_appendix_c_raw_pg3.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg3

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q6,How tall are you without your shoes on? (data ...,,,,
1,Q7,How much do you weigh without your shoes on? (...,,,,
2,Q8,How often do you wear a seat belt when riding ...,1.0,Never,265.0,1.7
3,Q8,How often do you wear a seat belt when riding ...,2.0,Rarely,512.0,3.8
4,Q8,How often do you wear a seat belt when riding ...,3.0,Sometimes,1109.0,8.9
5,Q8,How often do you wear a seat belt when riding ...,4.0,Most of the time,3253.0,25.4
6,Q8,How often do you wear a seat belt when riding ...,5.0,Always,7638.0,60.1
7,Q8,How often do you wear a seat belt when riding ...,,Missing,4455.0,
8,Q9,"During the past 30 days, how many times did yo...",1.0,0 times,14501.0,85.9
9,Q9,"During the past 30 days, how many times did yo...",2.0,1 time,1031.0,6.1


#### Page 4

In [None]:
# Raw input code for page 4 of Appendix C
appendix_c_raw_pg4 = {
    'question_number': ['Q11'] * 9 + ['Q12'] * 6 + ['Q13'] * 6 + ['Q14'] * 6,
    'question': [
        'During the past 30 days, on how many days did you text or e-mail while driving a car or other vehicle?'] * 9 +
        ['During the past 30 days, on how many days did you carry a weapon such as a gun, knife, or club on school property?'] * 6 +
        ['During the past 12 months, on how many days did you carry a gun?'] * 6 +
        ['During the past 30 days, on how many days did you not go to school because you felt you would be unsafe at school or on your way to or from school?'] * 6,
    'answer_code': [
        1, 2, 3, 4, 5, 6, 7, 8, 'NA',
        1, 2, 3, 4, 5, 'NA',
        1, 2, 3, 4, 5, 'NA',
        1, 2, 3, 4, 5, 'NA'
    ],
    'answer_label': [
        'I did not drive a car or other vehicle during the past 30 days', '0 days', '1 or 2 days', '3 to 5 days', '6 to 9 days',
        '10 to 19 days', '20 to 29 days', 'All 30 days', 'Missing',
        '0 days', '1 day', '2 or 3 days', '4 or 5 days', '6 or more days', 'Missing',
        '0 days', '1 day', '2 or 3 days', '4 or 5 days', '6 or more days', 'Missing',
        '0 days', '1 day', '2 or 3 days', '4 or 5 days', '6 or more days', 'Missing'
    ],
    'frequency': [
        6660, 5538, 1045, 493, 299, 365, 279, 726, 1827,
        12333, 164, 93, 34, 168, 4440,
        12488, 179, 159, 59, 191, 4156,
        15716, 691, 407, 106, 190, 122
    ],
    'weighted_percentage': [
        44.4, 35.5, 6.9, 3.0, 2.0, 2.4, 1.7, 4.1, 'NA',
        96.9, 1.0, 0.6, 0.2, 1.3, 'NA',
        96.5, 1.0, 1.0, 0.4, 1.1, 'NA',
        91.4, 4.3, 2.6, 0.6, 1.1, 'NA'
    ]
}

# Converting to df
df_appendix_c_raw_pg4 = pd.DataFrame(appendix_c_raw_pg4)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg4 = df_appendix_c_raw_pg4.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg4

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q11,"During the past 30 days, on how many days did ...",1.0,I did not drive a car or other vehicle during ...,6660,44.4
1,Q11,"During the past 30 days, on how many days did ...",2.0,0 days,5538,35.5
2,Q11,"During the past 30 days, on how many days did ...",3.0,1 or 2 days,1045,6.9
3,Q11,"During the past 30 days, on how many days did ...",4.0,3 to 5 days,493,3.0
4,Q11,"During the past 30 days, on how many days did ...",5.0,6 to 9 days,299,2.0
5,Q11,"During the past 30 days, on how many days did ...",6.0,10 to 19 days,365,2.4
6,Q11,"During the past 30 days, on how many days did ...",7.0,20 to 29 days,279,1.7
7,Q11,"During the past 30 days, on how many days did ...",8.0,All 30 days,726,4.1
8,Q11,"During the past 30 days, on how many days did ...",,Missing,1827,
9,Q12,"During the past 30 days, on how many days did ...",1.0,0 days,12333,96.9


#### Page 5

In [None]:
# Raw input code for page 5 of Appendix C
appendix_c_raw_pg5 = {
    'question_number': ['Q15'] * 9 + ['Q16'] * 9 + ['Q17'] * 9,
    'question': [
        'During the past 12 months, how many times has someone threatened or injured you with a weapon such as a gun, knife, or club on school property?'] * 9 +
        ['During the past 12 months, how many times were you in a physical fight?'] * 9 +
        ['During the past 12 months, how many times were you in a physical fight on school property?'] * 9,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 8, 'NA'] * 3,
    'answer_label': [
        '0 times', '1 time', '2 or 3 times', '4 or 5 times', '6 or 7 times', '8 or 9 times', '10 or 11 times', '12 or more times', 'Missing'
    ] * 3,
    'frequency': [
        15560, 496, 336, 110, 36, 27, 18, 95, 554,
        8572, 902, 683, 202, 70, 36, 26, 122, 6619,
        15388, 644, 194, 63, 34, 8, 8, 79, 814
    ],
    'weighted_percentage': [
        93.4, 3.0, 2.0, 0.6, 0.2, 0.2, 0.1, 0.6, 'NA',
        81.7, 8.1, 6.0, 1.8, 0.6, 0.4, 0.2, 1.3, 'NA',
        94.2, 3.6, 1.0, 0.5, 0.2, 0.0, 0.0, 0.5, 'NA'
    ]
}

# Converting to df
df_appendix_c_raw_pg5 = pd.DataFrame(appendix_c_raw_pg5)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg5 = df_appendix_c_raw_pg5.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg5

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q15,"During the past 12 months, how many times has ...",1.0,0 times,15560,93.4
1,Q15,"During the past 12 months, how many times has ...",2.0,1 time,496,3.0
2,Q15,"During the past 12 months, how many times has ...",3.0,2 or 3 times,336,2.0
3,Q15,"During the past 12 months, how many times has ...",4.0,4 or 5 times,110,0.6
4,Q15,"During the past 12 months, how many times has ...",5.0,6 or 7 times,36,0.2
5,Q15,"During the past 12 months, how many times has ...",6.0,8 or 9 times,27,0.2
6,Q15,"During the past 12 months, how many times has ...",7.0,10 or 11 times,18,0.1
7,Q15,"During the past 12 months, how many times has ...",8.0,12 or more times,95,0.6
8,Q15,"During the past 12 months, how many times has ...",,Missing,554,
9,Q16,"During the past 12 months, how many times were...",1.0,0 times,8572,81.7


#### Page 6

In [None]:
# Raw input code for page 6 of Appendix C
appendix_c_raw_pg6 = {
    'question_number': ['Q18'] * 3 + ['Q19'] * 3 + ['Q20'] * 6 + ['Q21'] * 7 + ['Q22'] * 7,
    'question': [
        'Have you ever seen someone get physically attacked, beaten, stabbed, or shot in your neighborhood?'] * 3 +
        ['Have you ever been physically forced to have sexual intercourse when you did not want to?'] * 3 +
        ['During the past 12 months, how many times did anyone force you to do sexual things that you did not want to do?'] * 6 +
        ['During the past 12 months, how many times did someone you were dating or going out with force you to do sexual things that you did not want to do?'] * 7 +
        ['During the past 12 months, how many times did someone you were dating or going out with physically hurt you on purpose?'] * 7,
    'answer_code': [
        1, 2, 'NA',
        1, 2, 'NA',
        1, 2, 3, 4, 5, 'NA',
        1, 2, 3, 4, 5, 6, 'NA',
        1, 2, 3, 4, 5, 6, 'NA'
    ],
    'answer_label': [
        'Yes', 'No', 'Missing',
        'Yes', 'No', 'Missing',
        '0 times', '1 time', '2 or 3 times', '4 or 5 times', '6 or more times', 'Missing',
        'I did not date or go out with anyone during the past 12 months', '0 times', '1 time', '2 or 3 times', '4 or 5 times', '6 or more times', 'Missing',
        'I did not date or go out with anyone during the past 12 months', '0 times', '1 time', '2 or 3 times', '4 or 5 times', '6 or more times', 'Missing'
    ],
    'frequency': [
        2595, 11241, 3396,
        1193, 12065, 3074,
        11858, 598, 501, 138, 242, 3895,
        5169, 7100, 293, 253, 67, 123, 4227,
        6803, 9004, 322, 262, 87, 166, 588
    ],
    'weighted_percentage': [
        19.9, 80.1, 'NA',
        8.5, 91.5, 'NA',
        89.0, 4.6, 3.6, 0.9, 1.9, 'NA',
        42.0, 52.3, 2.4, 1.7, 0.5, 1.0, 'NA',
        41.8, 53.2, 2.0, 1.5, 0.5, 1.0, 'NA'
    ]
}

# Converting to df
df_appendix_c_raw_pg6 = pd.DataFrame(appendix_c_raw_pg6)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg6 = df_appendix_c_raw_pg6.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg6

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q18,Have you ever seen someone get physically atta...,1.0,Yes,2595,19.9
1,Q18,Have you ever seen someone get physically atta...,2.0,No,11241,80.1
2,Q18,Have you ever seen someone get physically atta...,,Missing,3396,
3,Q19,Have you ever been physically forced to have s...,1.0,Yes,1193,8.5
4,Q19,Have you ever been physically forced to have s...,2.0,No,12065,91.5
5,Q19,Have you ever been physically forced to have s...,,Missing,3074,
6,Q20,"During the past 12 months, how many times did ...",1.0,0 times,11858,89.0
7,Q20,"During the past 12 months, how many times did ...",2.0,1 time,598,4.6
8,Q20,"During the past 12 months, how many times did ...",3.0,2 or 3 times,501,3.6
9,Q20,"During the past 12 months, how many times did ...",4.0,4 or 5 times,138,0.9


#### Page 7

In [None]:
# Raw input code for page 7 of Appendix C
appendix_c_raw_pg7 = {
    'question_number': ['Q23'] * 3 + ['Q24'] * 3 + ['Q25'] * 3 + ['Q26'] * 3 + ['Q27'] * 3 + ['Q28'] * 6,
    'question': [
        'During the past 12 months, have you ever been bullied on school property?'] * 3 +
        ['During the past 12 months, have you ever been electronically bullied?'] * 3 +
        ['During the past 12 months, did you ever feel so sad or hopeless almost every day for two weeks or more in a row that you stopped doing some usual activities?'] * 3 +
        ['During the past 12 months, did you ever seriously consider attempting suicide?'] * 3 +
        ['During the past 12 months, did you make a plan about how you would attempt suicide?'] * 3 +
        ['During the past 12 months, how many times did you actually attempt suicide?'] * 6,
    'answer_code': [1, 2, 'NA',
                    1, 2, 'NA',
                    1, 2, 'NA',
                    1, 2, 'NA',
                    1, 2, 'NA',
                    1, 2, 3, 4, 5, 'NA'
                   ],
    'answer_label': [
        'Yes', 'No', 'Missing',
        'Yes', 'No', 'Missing',
        'Yes', 'No', 'Missing',
        'Yes', 'No', 'Missing',
        'Yes', 'No', 'Missing',
        '0 times', '1 time', '2 or 3 times', '4 or 5 times', '6 or more times', 'Missing'
    ],
    'frequency': [2712, 13994, 526,
                  2765, 14267, 200,
                  6749, 10212, 271,
                  3593, 13334, 305,
                  2801, 13520, 911,
                  13820, 993, 525, 114, 121, 1659
                 ],
    'weighted_percentage': [15.0, 85.0, 'NA',
                            15.9, 84.1, 'NA',
                            42.3, 57.7, 'NA',
                            22.2, 77.8, 'NA',
                            17.6, 82.4, 'NA',
                            89.8, 5.3, 3.2, 0.8, 0.8, 'NA']
}

# Converting to df
df_appendix_c_raw_pg7 = pd.DataFrame(appendix_c_raw_pg7)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg7 = df_appendix_c_raw_pg7.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg7

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q23,"During the past 12 months, have you ever been ...",1.0,Yes,2712,15.0
1,Q23,"During the past 12 months, have you ever been ...",2.0,No,13994,85.0
2,Q23,"During the past 12 months, have you ever been ...",,Missing,526,
3,Q24,"During the past 12 months, have you ever been ...",1.0,Yes,2765,15.9
4,Q24,"During the past 12 months, have you ever been ...",2.0,No,14267,84.1
5,Q24,"During the past 12 months, have you ever been ...",,Missing,200,
6,Q25,"During the past 12 months, did you ever feel s...",1.0,Yes,6749,42.3
7,Q25,"During the past 12 months, did you ever feel s...",2.0,No,10212,57.7
8,Q25,"During the past 12 months, did you ever feel s...",,Missing,271,
9,Q26,"During the past 12 months, did you ever seriou...",1.0,Yes,3593,22.2


#### Page 8

In [None]:
# Raw input code for page 8 of Appendix C
appendix_c_raw_pg8 = {
    'question_number': ['Q29'] * 4 + ['Q30'] * 3 + ['Q31'] * 8 + ['Q32'] * 8,
    'question': [
        'If you attempted suicide during the past 12 months, did any attempt result in an injury, poisoning, or overdose that had to be treated by a doctor or nurse?'] * 4 +
        ['Have you ever tried cigarette smoking, even one or two puffs?'] * 3 +
        ['How old were you when you first tried cigarette smoking, even one or two puffs?'] * 8 +
        ['During the past 30 days, on how many days did you smoke cigarettes?'] * 8,

    'answer_code': [1, 2, 3, 'NA',
                    1, 2, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA'],
    'answer_label': [
        'I did not attempt suicide during the past 12 months', 'Yes', 'No', 'Missing',
        'Yes', 'No', 'Missing',
        'I have never tried cigarette smoking, not even one or two puffs', '8 years old or younger', '9 or 10 years old', '11 or 12 years old', '13 or 14 years old',
        '15 or 16 years old', '17 years old or older', 'Missing',
        '0 days', '1 or 2 days', '3 to 5 days', '6 to 9 days', '10 to 19 days', '20 to 29 days', 'All 30 days', 'Missing'
    ],
    'frequency': [10870, 359, 854, 5149,
                  2398, 11224, 3610,
                  12947, 334, 265, 475, 847, 684, 130, 1550,
                  16253, 302, 111, 45, 72, 25, 104, 320
                 ],
    'weighted_percentage': [90.1, 2.9, 7.0, 'NA',
                            17.8, 82.2, 'NA',
                            82.5, 1.9, 1.5, 2.9, 5.7, 4.5, 1.0, 'NA',
                            96.2, 1.7, 0.6, 0.3, 0.5, 0.2, 0.6, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg8 = pd.DataFrame(appendix_c_raw_pg8)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg8 = df_appendix_c_raw_pg8.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg8


Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q29,If you attempted suicide during the past 12 mo...,1.0,I did not attempt suicide during the past 12 m...,10870,90.1
1,Q29,If you attempted suicide during the past 12 mo...,2.0,Yes,359,2.9
2,Q29,If you attempted suicide during the past 12 mo...,3.0,No,854,7.0
3,Q29,If you attempted suicide during the past 12 mo...,,Missing,5149,
4,Q30,"Have you ever tried cigarette smoking, even on...",1.0,Yes,2398,17.8
5,Q30,"Have you ever tried cigarette smoking, even on...",2.0,No,11224,82.2
6,Q30,"Have you ever tried cigarette smoking, even on...",,Missing,3610,
7,Q31,How old were you when you first tried cigarett...,1.0,"I have never tried cigarette smoking, not even...",12947,82.5
8,Q31,How old were you when you first tried cigarett...,2.0,8 years old or younger,334,1.9
9,Q31,How old were you when you first tried cigarett...,3.0,9 or 10 years old,265,1.5


#### Page 9

In [None]:
# Raw input code for page 9 of Appendix C
appendix_c_raw_pg9 = {
    'question_number': ['Q33'] * 8 + ['Q34'] * 3 + ['Q35'] * 8 + ['Q36'] * 9,
    'question': [
        'During the past 30 days, on the days you smoked, how many cigarettes did you smoke per day?'] * 8 +
        ['Have you ever used an electronic vapor product?'] * 3 +
        ['During the past 30 days, on how many days did you use an electronic vapor product?'] * 8 +
        ['During the past 30 days, how did you usually get your electronic vapor products?'] * 9,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 8, 'NA'
                   ],
    'answer_label': [
        'I did not smoke cigarettes during the past 30 days', 'Less than 1 cigarette per day', '1 cigarette per day', '2 to 5 cigarettes per day',
        '6 to 10 cigarettes per day', '11 to 20 cigarettes per day', 'More than 20 cigarettes per day', 'Missing',
        'Yes', 'No', 'Missing',
        '0 days', '1 or 2 days', '3 to 5 days', '6 to 9 days', '10 to 19 days', '20 to 29 days', 'All 30 days', 'Missing',
        'I did not use any electronic vapor products during the past 30 days', 'I got or bought them from a friend, family member, or someone else',
        'I bought them myself in a vape shop or tobacco shop', 'I bought them myself in a convenience store, supermarket, discount store, or gas station',
        'I bought them myself at a mall or shopping center kiosk or stand', 'I bought them myself on the Internet, such as from a product website, vape store website, or other website like eBay, Amazon, Facebook Marketplace, or Craigslist',
        'I took them from a store or another person', 'I got them in some other way', 'Missing'
    ],
    'frequency': [9426, 151, 94, 115, 22, 10, 31, 7383,
                  6045, 10761, 426,
                  13155, 674, 373, 266, 394, 334, 881, 1155,
                  10129, 1376, 305, 162, 13, 31, 70, 512, 4634],
    'weighted_percentage': [96.0, 1.4, 0.9, 1.0, 0.3, 0.1, 0.3, 'NA',
                            36.2, 63.8, 'NA',
                            82.0, 4.3, 2.2, 1.7, 2.5, 2.3, 5.0, 'NA',
                            82.2, 9.6, 2.3, 1.2, 0.1, 0.3, 0.5, 3.9, 'NA'
                           ]
}


# Converting to df
df_appendix_c_raw_pg9 = pd.DataFrame(appendix_c_raw_pg9)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg9 = df_appendix_c_raw_pg9.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg9

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q33,"During the past 30 days, on the days you smoke...",1.0,I did not smoke cigarettes during the past 30 ...,9426,96.0
1,Q33,"During the past 30 days, on the days you smoke...",2.0,Less than 1 cigarette per day,151,1.4
2,Q33,"During the past 30 days, on the days you smoke...",3.0,1 cigarette per day,94,0.9
3,Q33,"During the past 30 days, on the days you smoke...",4.0,2 to 5 cigarettes per day,115,1.0
4,Q33,"During the past 30 days, on the days you smoke...",5.0,6 to 10 cigarettes per day,22,0.3
5,Q33,"During the past 30 days, on the days you smoke...",6.0,11 to 20 cigarettes per day,10,0.1
6,Q33,"During the past 30 days, on the days you smoke...",7.0,More than 20 cigarettes per day,31,0.3
7,Q33,"During the past 30 days, on the days you smoke...",,Missing,7383,
8,Q34,Have you ever used an electronic vapor product?,1.0,Yes,6045,36.2
9,Q34,Have you ever used an electronic vapor product?,2.0,No,10761,63.8


#### Page 10

In [None]:
# Raw input code for page 10 of Appendix C
appendix_c_raw_pg10 = {
    'question_number': ['Q37'] * 8 + ['Q38'] * 8 + ['Q39'] * 4 + ['Q40'] * 8,
    'question': [
        'During the past 30 days, on how many days did you use chewing tobacco, snuff, dip, snus, or dissolvable tobacco products, such as Copenhagen, Grizzly, Skoal, or Camel Snus?'] * 8 +
        ['During the past 30 days, on how many days did you smoke cigars, cigarillos, or little cigars?'] * 8 +
        ['During the past 12 months, did you ever try to quit using all tobacco products?'] * 4 +
        ['How old were you when you had your first drink of alcohol other than a few sips?'] * 8,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA'
                   ],
    'answer_label': [
        '0 days', '1 or 2 days', '3 to 5 days', '6 to 9 days', '10 to 19 days', '20 to 29 days', 'All 30 days', 'Missing',
        '0 days', '1 or 2 days', '3 to 5 days', '6 to 9 days', '10 to 19 days', '20 to 29 days', 'All 30 days', 'Missing',
        'I did not use cigarettes, electronic vapor products, smokeless tobacco, cigars, shisha or hookah tobacco, or pipe tobacco during the past 12 months', 'Yes', 'No', 'Missing',
        'I have never had a drink of alcohol other than a few sips', '8 years old or younger', '9 or 10 years old', '11 or 12 years old', '13 or 14 years old',
        '15 or 16 years old', '17 years old or older', 'Missing'
    ],
    'frequency': [15826, 160, 74, 52, 29, 12, 100, 979,
                  15515, 193, 109, 50, 46, 19, 98, 1202,
                  9446, 1651, 1433, 4702,
                  8742, 900, 595, 1004, 2536, 2340, 396, 719
                 ],
    'weighted_percentage': [97.5, 0.9, 0.5, 0.3, 0.2, 0.1, 0.5, 'NA',
                            96.9, 1.3, 0.7, 0.3, 0.3, 0.1, 0.5, 'NA',
                            75.1, 13.5, 11.4, 'NA',
                            52.6, 5.4, 3.8, 6.0, 15.5, 14.6, 2.4, 'NA']
}

# Converting to df
df_appendix_c_raw_pg10 = pd.DataFrame(appendix_c_raw_pg10)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg10 = df_appendix_c_raw_pg10.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg10


Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q37,"During the past 30 days, on how many days did ...",1.0,0 days,15826,97.5
1,Q37,"During the past 30 days, on how many days did ...",2.0,1 or 2 days,160,0.9
2,Q37,"During the past 30 days, on how many days did ...",3.0,3 to 5 days,74,0.5
3,Q37,"During the past 30 days, on how many days did ...",4.0,6 to 9 days,52,0.3
4,Q37,"During the past 30 days, on how many days did ...",5.0,10 to 19 days,29,0.2
5,Q37,"During the past 30 days, on how many days did ...",6.0,20 to 29 days,12,0.1
6,Q37,"During the past 30 days, on how many days did ...",7.0,All 30 days,100,0.5
7,Q37,"During the past 30 days, on how many days did ...",,Missing,979,
8,Q38,"During the past 30 days, on how many days did ...",1.0,0 days,15515,96.9
9,Q38,"During the past 30 days, on how many days did ...",2.0,1 or 2 days,193,1.3


#### Page 11

In [None]:
# Raw input code for page 11 of Appendix C
appendix_c_raw_pg11 = {
    'question_number': ['Q41'] * 8 + ['Q42'] * 8 + ['Q43'] * 9,
    'question': [
        'During the past 30 days, on how many days did you have at least one drink of alcohol?'] * 8 +
        ['During the past 30 days, on how many days did you have 4 or more drinks of alcohol in a row, that is, within a couple of hours (if you are female) or 5 or more drinks of alcohol in a row, that is, within a couple of hours (if you are male)?'] * 8 +
        ['During the past 30 days, what is the largest number of alcoholic drinks you had in a row, that is, within a couple of hours?'] * 9,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 8, 'NA'
                   ],
    'answer_label': [
        '0 days', '1 or 2 days', '3 to 5 days', '6 to 9 days', '10 to 19 days', '20 to 29 days', 'All 30 days', 'Missing',
        '0 days', '1 day', '2 days', '3 to 5 days', '6 to 9 days', '10 to 19 days', '20 or more days', 'Missing',
        'I did not drink alcohol during the past 30 days', '1 or 2 drinks', '3 drinks', '4 drinks', '5 drinks', '6 or 7 drinks',
        '8 or 9 drinks', '10 or more drinks', 'Missing'
    ],
    'frequency': [12586, 1991, 850, 441, 220, 54, 92, 998,
                  11185, 515, 332, 314, 132, 61, 57, 4636,
                  9761, 1013, 176, 231, 299, 327, 158, 344, 4923
                 ],
    'weighted_percentage': [77.3, 12.6, 5.4, 2.5, 1.4, 0.2, 0.6, 'NA',
                            89.5, 3.9, 2.5, 2.2, 1.0, 0.4, 0.4, 'NA',
                            79.1, 8.5, 1.5, 1.8, 2.6, 2.9, 1.3, 2.7, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg11 = pd.DataFrame(appendix_c_raw_pg11)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg11 = df_appendix_c_raw_pg11.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg11


Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q41,"During the past 30 days, on how many days did ...",1.0,0 days,12586,77.3
1,Q41,"During the past 30 days, on how many days did ...",2.0,1 or 2 days,1991,12.6
2,Q41,"During the past 30 days, on how many days did ...",3.0,3 to 5 days,850,5.4
3,Q41,"During the past 30 days, on how many days did ...",4.0,6 to 9 days,441,2.5
4,Q41,"During the past 30 days, on how many days did ...",5.0,10 to 19 days,220,1.4
5,Q41,"During the past 30 days, on how many days did ...",6.0,20 to 29 days,54,0.2
6,Q41,"During the past 30 days, on how many days did ...",7.0,All 30 days,92,0.6
7,Q41,"During the past 30 days, on how many days did ...",,Missing,998,
8,Q42,"During the past 30 days, on how many days did ...",1.0,0 days,11185,89.5
9,Q42,"During the past 30 days, on how many days did ...",2.0,1 day,515,3.9


#### Page 12

In [None]:
# Raw input code for page 12 of Appendix C
appendix_c_raw_pg12 = {
    'question_number': ['Q44'] * 9 + ['Q45'] * 8 + ['Q46'] * 8,
    'question': [
        'During the past 30 days, how did you usually get the alcohol you drank?'] * 9 +
        ['During your life, how many times have you used marijuana?'] * 8 +
        ['How old were you when you tried marijuana for the first time?'] * 8,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 8, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA'
                   ],
    'answer_label': [
        'I did not drink alcohol during the past 30 days', 'I bought it in a store such as a liquor store, convenience store, supermarket, discount store, or gas station',
        'I bought it at a restaurant, bar, or club', 'I bought it at a public event such as a concert or sporting event', 'I gave someone else money to buy it for me',
        'Someone gave it to me', 'I took it from a store or family member', 'I got it some other way', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 to 99 times', '100 or more times', 'Missing',
        'I have never tried marijuana', '8 years old or younger', '9 or 10 years old', '11 or 12 years old', '13 or 14 years old',
        '15 or 16 years old', '17 years old or older', 'Missing'
    ],
    'frequency': [7844, 159, 14, 21, 309, 895, 334, 552, 7104,
                  10141, 931, 694, 431, 354, 341, 1007, 3333,
                  12275, 158, 174, 541, 1736, 1700, 226, 422
                 ],
    'weighted_percentage': [77.5, 1.6, 0.1, 0.2, 2.8, 9.0, 3.5, 5.4, 'NA',
                            72.2, 6.7, 5.5, 3.4, 2.8, 2.3, 7.2, 'NA',
                            72.5, 0.7, 1.0, 3.1, 11.1, 10.2, 1.3, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg12 = pd.DataFrame(appendix_c_raw_pg12)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg12 = df_appendix_c_raw_pg12.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg12

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q44,"During the past 30 days, how did you usually g...",1.0,I did not drink alcohol during the past 30 days,7844,77.5
1,Q44,"During the past 30 days, how did you usually g...",2.0,"I bought it in a store such as a liquor store,...",159,1.6
2,Q44,"During the past 30 days, how did you usually g...",3.0,"I bought it at a restaurant, bar, or club",14,0.1
3,Q44,"During the past 30 days, how did you usually g...",4.0,I bought it at a public event such as a concer...,21,0.2
4,Q44,"During the past 30 days, how did you usually g...",5.0,I gave someone else money to buy it for me,309,2.8
5,Q44,"During the past 30 days, how did you usually g...",6.0,Someone gave it to me,895,9.0
6,Q44,"During the past 30 days, how did you usually g...",7.0,I took it from a store or family member,334,3.5
7,Q44,"During the past 30 days, how did you usually g...",8.0,I got it some other way,552,5.4
8,Q44,"During the past 30 days, how did you usually g...",,Missing,7104,
9,Q45,"During your life, how many times have you used...",1.0,0 times,10141,72.2


#### Page 13

In [None]:
# Raw input code for page 13 of Appendix C
appendix_c_raw_pg13 = appendix_c_raw_pg8 = {
    'question_number': ['Q47'] * 7 + ['Q48'] * 7 + ['Q49'] * 7 + ['Q50'] * 7,
    'question': [
        'During the past 30 days, how many times did you use marijuana?'] * 7 +
        ['During your life, how many times have you used synthetic marijuana?'] * 7 +
        ['During your life, how many times have you taken prescription pain medicine without a doctor\'s prescription or differently than how a doctor told you to use it?'] * 7 +
        ['During your life, how many times have you used any form of cocaine, including powder, crack, or freebase?'] * 7,
    'answer_code': [1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA'
                   ],
    'answer_label': [
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing'
    ],
    'frequency': [14250, 875, 529, 353, 315, 575, 335,
                  9229, 278, 138, 76, 51, 129, 7331,
                  14811, 956, 508, 228, 116, 239, 374,
                  12682, 159, 70, 33, 22, 78, 4188
                 ],
    'weighted_percentage': [84.2, 5.2, 3.2, 2.1, 2.0, 3.3, 'NA',
                            93.5, 2.9, 1.2, 0.7, 0.5, 1.1, 'NA',
                            87.8, 5.8, 3.2, 1.3, 0.6, 1.3, 'NA',
                            97.5, 1.2, 0.4, 0.2, 0.1, 0.6, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg13 = pd.DataFrame(appendix_c_raw_pg13)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg13 = df_appendix_c_raw_pg13.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg13


Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q47,"During the past 30 days, how many times did yo...",1.0,0 times,14250,84.2
1,Q47,"During the past 30 days, how many times did yo...",2.0,1 or 2 times,875,5.2
2,Q47,"During the past 30 days, how many times did yo...",3.0,3 to 9 times,529,3.2
3,Q47,"During the past 30 days, how many times did yo...",4.0,10 to 19 times,353,2.1
4,Q47,"During the past 30 days, how many times did yo...",5.0,20 to 39 times,315,2.0
5,Q47,"During the past 30 days, how many times did yo...",6.0,40 or more times,575,3.3
6,Q47,"During the past 30 days, how many times did yo...",,Missing,335,
7,Q48,"During your life, how many times have you used...",1.0,0 times,9229,93.5
8,Q48,"During your life, how many times have you used...",2.0,1 or 2 times,278,2.9
9,Q48,"During your life, how many times have you used...",3.0,3 to 9 times,138,1.2


#### Page 14

In [None]:
# Raw input code for page 14 of Appendix C
appendix_c_raw_pg14 = {
    'question_number': ['Q51'] * 7 + ['Q52'] * 7 + ['Q53'] * 7 + ['Q54'] * 7,
    'question': [
        'During your life, how many times have you sniffed glue, breathed the contents of aerosol spray cans, or inhaled any paints or sprays to get high?'] * 7 +
        ['During your life, how many times have you used heroin (also called smack, junk, or China White)?'] * 7 +
        ['During your life, how many times have you used methamphetamines (also called speed, crystal meth, crank, ice, or meth)?'] * 7 +
        ['During your life, how many times have you used ecstasy (also called MDMA or Molly)?'] * 7,
    'answer_code': [1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA'
                   ],
    'answer_label': [
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing'
    ],
    'frequency': [9086, 424, 190, 71, 40, 100, 7321,
                  16593, 82, 30, 29, 22, 81, 395,
                  16511, 116, 44, 39, 17, 90, 415,
                  12565, 208, 89, 35, 16, 62, 4257
                 ],
    'weighted_percentage': [91.9, 4.3, 1.8, 0.6, 0.4, 1.0, 'NA',
                            98.7, 0.4, 0.1, 0.1, 0.1, 0.5, 'NA',
                            98.2, 0.6, 0.3, 0.2, 0.1, 0.5, 'NA',
                            97.1, 1.6, 0.6, 0.2, 0.1, 0.5, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg14 = pd.DataFrame(appendix_c_raw_pg14)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg14 = df_appendix_c_raw_pg14.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg14

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q51,"During your life, how many times have you snif...",1.0,0 times,9086,91.9
1,Q51,"During your life, how many times have you snif...",2.0,1 or 2 times,424,4.3
2,Q51,"During your life, how many times have you snif...",3.0,3 to 9 times,190,1.8
3,Q51,"During your life, how many times have you snif...",4.0,10 to 19 times,71,0.6
4,Q51,"During your life, how many times have you snif...",5.0,20 to 39 times,40,0.4
5,Q51,"During your life, how many times have you snif...",6.0,40 or more times,100,1.0
6,Q51,"During your life, how many times have you snif...",,Missing,7321,
7,Q52,"During your life, how many times have you used...",1.0,0 times,16593,98.7
8,Q52,"During your life, how many times have you used...",2.0,1 or 2 times,82,0.4
9,Q52,"During your life, how many times have you used...",3.0,3 to 9 times,30,0.1


#### Page 15

In [None]:
# Raw input code for page 15 of Appendix C
appendix_c_raw_pg15 = {
    'question_number': ['Q55'] * 4 + ['Q56'] * 3 + ['Q57'] * 3 + ['Q58'] * 9 + ['Q59'] * 8,
    'question': [
        'During your life, how many times have you used a needle to inject any illegal drug into your body?'] * 4 +
        ['During the past 12 months, has anyone offered, sold, or given you an illegal drug on school property?'] * 3 +
        ['Have you ever had sexual intercourse?'] * 3 +
        ['How old were you when you had sexual intercourse for the first time?'] * 9 +
        ['During your life, with how many people have you had sexual intercourse?'] * 8,
    'answer_code': [1, 2, 3, 'NA',
                    1, 2, 'NA',
                    1, 2, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 8, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA'
                   ],
    'answer_label': [
        '0 times', '1 time', '2 or more times', 'Missing',
        'Yes', 'No', 'Missing',
        'Yes', 'No', 'Missing',
        'I have never had sexual intercourse', '11 years old or younger', '12 years old', '13 years old', '14 years old',
        '15 years old', '16 years old', '17 years old or older', 'Missing',
        'I have never had sexual intercourse', '1 person', '2 people', '3 people', '4 people', '5 people', '6 or more people', 'Missing'
    ],
    'frequency': [12745, 104, 116, 4267,
                  2265, 14101, 866,
                  3920, 8237, 5075,
                  10742, 300, 228, 496, 1102, 1198, 1010, 423, 1733,
                  10739, 2244, 936, 533, 330, 174, 500, 1776
                 ],
    'weighted_percentage': [98.6, 0.5, 0.9, 'NA',
                            13.9, 86.1, 'NA',
                            30.0, 70.0, 'NA',
                            70.2, 1.9, 1.3, 3.1, 6.8, 7.4, 6.2, 3.0, 'NA',
                            70.4, 14.5, 5.7, 3.5, 2.2, 1.1, 2.7, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg15 = pd.DataFrame(appendix_c_raw_pg15)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg15 = df_appendix_c_raw_pg15.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg15


Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q55,"During your life, how many times have you used...",1.0,0 times,12745,98.6
1,Q55,"During your life, how many times have you used...",2.0,1 time,104,0.5
2,Q55,"During your life, how many times have you used...",3.0,2 or more times,116,0.9
3,Q55,"During your life, how many times have you used...",,Missing,4267,
4,Q56,"During the past 12 months, has anyone offered,...",1.0,Yes,2265,13.9
5,Q56,"During the past 12 months, has anyone offered,...",2.0,No,14101,86.1
6,Q56,"During the past 12 months, has anyone offered,...",,Missing,866,
7,Q57,Have you ever had sexual intercourse?,1.0,Yes,3920,30.0
8,Q57,Have you ever had sexual intercourse?,2.0,No,8237,70.0
9,Q57,Have you ever had sexual intercourse?,,Missing,5075,


#### Page 16

In [None]:
# Raw input code for page 16 of Appendix C
appendix_c_raw_pg16 = {
    'question_number': ['Q60'] * 9 + ['Q61'] * 4 + ['Q62'] * 4 + ['Q63'] * 9,
    'question': [
        'During the past 3 months, with how many people did you have sexual intercourse?'] * 9 +
        ['Did you drink alcohol or use drugs before you had sexual intercourse the last time?'] * 4 +
        ['The last time you had sexual intercourse, did you or your partner use a condom?'] * 4 +
        ['The last time you had sexual intercourse with an opposite-sex partner, what one method did you or your partner use to prevent pregnancy?'] * 9,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 8, 'NA',
                    1, 2, 3, 'NA',
                    1, 2, 3, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 8, 'NA'],
    'answer_label': [
        'I have never had sexual intercourse', 'I have had sexual intercourse, but not during the past 3 months', '1 person', '2 people', '3 people',
        '4 people', '5 people', '6 or more people', 'Missing',
        'I have never had sexual intercourse', 'Yes', 'No', 'Missing',
        'I have never had sexual intercourse', 'Yes', 'No', 'Missing',
        'I have never had sexual intercourse with an opposite-sex partner', 'No method was used to prevent pregnancy', 'Birth control pills', 'Condoms',
        'An IUD or implant', 'A shot or birth control ring', 'Withdrawal or some other method', 'Not sure', 'Missing'
    ],
    'frequency': [10739, 1397, 2692, 376, 127, 52, 20, 80, 1749,
                  8049, 742, 3135, 5306,
                  10738, 2462, 2279, 1753,
                  11553, 623, 846, 1779, 325, 132, 409, 205, 1360
                 ],
    'weighted_percentage': [70.2, 9.0, 17.0, 2.3, 0.6, 0.2, 0.1, 0.5, 'NA',
                            69.8, 5.7, 24.5, 'NA',
                            70.3, 16.1, 13.7, 'NA',
                            73.6, 3.7, 4.8, 11.5, 1.9, 0.7, 2.6, 1.2, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg16 = pd.DataFrame(appendix_c_raw_pg16)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg16 = df_appendix_c_raw_pg16.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg16

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q60,"During the past 3 months, with how many people...",1.0,I have never had sexual intercourse,10739,70.2
1,Q60,"During the past 3 months, with how many people...",2.0,"I have had sexual intercourse, but not during ...",1397,9.0
2,Q60,"During the past 3 months, with how many people...",3.0,1 person,2692,17.0
3,Q60,"During the past 3 months, with how many people...",4.0,2 people,376,2.3
4,Q60,"During the past 3 months, with how many people...",5.0,3 people,127,0.6
5,Q60,"During the past 3 months, with how many people...",6.0,4 people,52,0.2
6,Q60,"During the past 3 months, with how many people...",7.0,5 people,20,0.1
7,Q60,"During the past 3 months, with how many people...",8.0,6 or more people,80,0.5
8,Q60,"During the past 3 months, with how many people...",,Missing,1749,
9,Q61,Did you drink alcohol or use drugs before you ...,1.0,I have never had sexual intercourse,8049,69.8


#### Page 17

In [None]:
# Raw input code for page 17 of Appendix C
appendix_c_raw_pg17 = {
    'question_number': ['Q64'] * 5 + ['Q65'] * 7 + ['Q66'] * 6 + ['Q67'] * 5,
    'question': [
        'During your life, with whom have you had sexual contact?'] * 5 +
        ['Which of the following best describes you?'] * 7 +
        ['How do you describe your weight?'] * 6 +
        ['Which of the following are you trying to do about your weight?'] * 5,
    'answer_code': [1, 2, 3, 4, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 'NA',
                    1, 2, 3, 4, 'NA'
                   ],
    'answer_label': [
        'I have never had sexual contact', 'Females', 'Males', 'Females and males', 'Missing',
        'Heterosexual (straight)', 'Gay or lesbian', 'Bisexual', 'I describe my sexual identity some other way',
        'I am not sure about my sexual identity (questioning)', 'I do not know what this question is asking', 'Missing',
        'Very underweight', 'Slightly underweight', 'About the right weight', 'Slightly overweight', 'Very overweight', 'Missing',
        'Lose weight', 'Gain weight', 'Stay the same weight', 'I am not trying to do anything about my weight', 'Missing'
    ],
    'frequency': [7703, 2913, 2220, 783, 3613,
                  12421, 520, 1848, 659, 823, 330, 631,
                  459, 1947, 5888, 3218, 763, 4957,
                  4262, 1933, 1359, 1719, 7959
                 ],
    'weighted_percentage': [57.0, 21.1, 15.7, 6.2, 'NA',
                            74.2, 3.2, 11.9, 3.8, 5.1, 1.8, 'NA',
                            3.3, 16.0, 48.4, 26.2, 6.1, 'NA',
                            45.7, 20.1, 14.9, 19.3, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg17 = pd.DataFrame(appendix_c_raw_pg17)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg17 = df_appendix_c_raw_pg17.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg17

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q64,"During your life, with whom have you had sexua...",1.0,I have never had sexual contact,7703,57.0
1,Q64,"During your life, with whom have you had sexua...",2.0,Females,2913,21.1
2,Q64,"During your life, with whom have you had sexua...",3.0,Males,2220,15.7
3,Q64,"During your life, with whom have you had sexua...",4.0,Females and males,783,6.2
4,Q64,"During your life, with whom have you had sexua...",,Missing,3613,
5,Q65,Which of the following best describes you?,1.0,Heterosexual (straight),12421,74.2
6,Q65,Which of the following best describes you?,2.0,Gay or lesbian,520,3.2
7,Q65,Which of the following best describes you?,3.0,Bisexual,1848,11.9
8,Q65,Which of the following best describes you?,4.0,I describe my sexual identity some other way,659,3.8
9,Q65,Which of the following best describes you?,5.0,I am not sure about my sexual identity (questi...,823,5.1


#### Page 18

In [None]:
# Raw input code for page 18 of Appendix C
appendix_c_raw_pg18 = {
    'question_number': ['Q68'] * 8 + ['Q69'] * 8 + ['Q70'] * 8 + ['Q71'] * 8,
    'question': [
        'During the past 7 days, how many times did you drink 100% fruit juices such as orange juice, apple juice, or grape juice?'] * 8 +
        ['During the past 7 days, how many times did you eat fruit?'] * 8 +
        ['During the past 7 days, how many times did you eat green salad?'] * 8 +
        ['During the past 7 days, how many times did you eat potatoes?'] * 8,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA'
                   ],
    'answer_label': [
        'I did not drink 100% fruit juice during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing',
        'I did not eat fruit during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing',
        'I did not eat green salad during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing',
        'I did not eat potatoes during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing'
    ],
    'frequency': [4368, 4960, 1638, 634, 620, 270, 437, 4305,
                  2324, 5749, 3408, 1760, 1804, 757, 841, 589,
                  6145, 4381, 972, 602, 193, 78, 139, 4722,
                  5103, 5464, 1243, 377, 158, 74, 184, 4884
                 ],
    'weighted_percentage': [34.5, 39.0, 12.1, 4.9, 4.5, 1.9, 3.0, 'NA',
                            13.8, 35.5, 21.2, 10.4, 10.2, 4.5, 4.4, 'NA',
                            47.2, 36.1, 8.3, 5.1, 1.7, 0.6, 1.0, 'NA',
                            41.5, 44.8, 8.4, 2.8, 1.5, 0.6, 1.3, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg18 = pd.DataFrame(appendix_c_raw_pg18)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg18 = df_appendix_c_raw_pg18.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg18


Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q68,"During the past 7 days, how many times did you...",1.0,I did not drink 100% fruit juice during the pa...,4368,34.5
1,Q68,"During the past 7 days, how many times did you...",2.0,1 to 3 times during the past 7 days,4960,39.0
2,Q68,"During the past 7 days, how many times did you...",3.0,4 to 6 times during the past 7 days,1638,12.1
3,Q68,"During the past 7 days, how many times did you...",4.0,1 time per day,634,4.9
4,Q68,"During the past 7 days, how many times did you...",5.0,2 times per day,620,4.5
5,Q68,"During the past 7 days, how many times did you...",6.0,3 times per day,270,1.9
6,Q68,"During the past 7 days, how many times did you...",7.0,4 or more times per day,437,3.0
7,Q68,"During the past 7 days, how many times did you...",,Missing,4305,
8,Q69,"During the past 7 days, how many times did you...",1.0,I did not eat fruit during the past 7 days,2324,13.8
9,Q69,"During the past 7 days, how many times did you...",2.0,1 to 3 times during the past 7 days,5749,35.5


#### Page 19

In [None]:
# Raw input code for page 19 of Appendix C
appendix_c_raw_pg19 = {
    'question_number': ['Q72'] * 8 + ['Q73'] * 8 + ['Q74'] * 8,
    'question': [
        'During the past 7 days, how many times did you eat carrots?'] * 8 +
        ['During the past 7 days, how many times did you eat other vegetables?'] * 8 +
        ['During the past 7 days, how many times did you drink a can, bottle, or glass of soda or pop, such as Coke, Pepsi, or Sprite?'] * 8,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA'
                   ],
    'answer_label': [
        'I did not eat carrots during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing',
        'I did not eat other vegetables during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing',
        'I did not drink soda or pop during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing'
    ],
    'frequency': [7223, 3884, 660, 308, 126, 52, 111, 4868,
                  2691, 4697, 2686, 1108, 645, 258, 253, 4894,
                  3775, 4980, 1916, 830, 584, 250, 364, 4533
                 ],
    'weighted_percentage': [55.7, 33.4, 5.8, 2.6, 1.1, 0.5, 0.9, 'NA',
                            20.5, 37.9, 22.4, 9.3, 5.5, 2.4, 2.1, 'NA',
                            31.0, 39.8, 14.5, 6.2, 4.2, 1.7, 2.6, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg19 = pd.DataFrame(appendix_c_raw_pg19)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg19 = df_appendix_c_raw_pg19.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg19


Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q72,"During the past 7 days, how many times did you...",1.0,I did not eat carrots during the past 7 days,7223,55.7
1,Q72,"During the past 7 days, how many times did you...",2.0,1 to 3 times during the past 7 days,3884,33.4
2,Q72,"During the past 7 days, how many times did you...",3.0,4 to 6 times during the past 7 days,660,5.8
3,Q72,"During the past 7 days, how many times did you...",4.0,1 time per day,308,2.6
4,Q72,"During the past 7 days, how many times did you...",5.0,2 times per day,126,1.1
5,Q72,"During the past 7 days, how many times did you...",6.0,3 times per day,52,0.5
6,Q72,"During the past 7 days, how many times did you...",7.0,4 or more times per day,111,0.9
7,Q72,"During the past 7 days, how many times did you...",,Missing,4868,
8,Q73,"During the past 7 days, how many times did you...",1.0,I did not eat other vegetables during the past...,2691,20.5
9,Q73,"During the past 7 days, how many times did you...",2.0,1 to 3 times during the past 7 days,4697,37.9


#### Page 20

In [None]:
# Raw input code for page 20 of Appendix C
appendix_c_raw_pg20 = {
    'question_number': ['Q75'] * 8 + ['Q76'] * 9 + ['Q77'] * 9,
    'question': [
        'During the past 7 days, how many glasses of milk did you drink?'] * 8 +
        ['During the past 7 days, on how many days did you eat breakfast?'] * 9 +
        ['During the past 7 days, on how many days were you physically active for a total of at least 60 minutes per day?'] * 9,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 8, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 8, 'NA'
                   ],
    'answer_label': [
        'I did not drink milk during the past 7 days', '1 to 3 glasses during the past 7 days', '4 to 6 glasses during the past 7 days',
        '1 glass per day', '2 glasses per day', '3 glasses per day', '4 or more glasses per day', 'Missing',
        '0 days', '1 day', '2 days', '3 days', '4 days', '5 days', '6 days', '7 days', 'Missing',
        '0 days', '1 day', '2 days', '3 days', '4 days', '5 days', '6 days', '7 days', 'Missing'
    ],
    'frequency': [3469, 2705, 1078, 1089, 651, 257, 301, 7682,
                  3570, 1578, 2084, 1595, 1163, 1273, 837, 3976, 1156,
                  2626, 1160, 1626, 1865, 1717, 2200, 1313, 4145, 580
                 ],
    'weighted_percentage': [35.7, 28.4, 11.7, 11.5, 7.2, 2.5, 3.0, 'NA',
                            22.0, 9.7, 12.6, 10.0, 7.1, 8.0, 5.4, 25.0, 'NA',
                            15.8, 6.9, 9.8, 11.7, 10.4, 13.6, 7.8, 23.9, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg20 = pd.DataFrame(appendix_c_raw_pg20)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg20 = df_appendix_c_raw_pg20.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg20

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q75,"During the past 7 days, how many glasses of mi...",1.0,I did not drink milk during the past 7 days,3469,35.7
1,Q75,"During the past 7 days, how many glasses of mi...",2.0,1 to 3 glasses during the past 7 days,2705,28.4
2,Q75,"During the past 7 days, how many glasses of mi...",3.0,4 to 6 glasses during the past 7 days,1078,11.7
3,Q75,"During the past 7 days, how many glasses of mi...",4.0,1 glass per day,1089,11.5
4,Q75,"During the past 7 days, how many glasses of mi...",5.0,2 glasses per day,651,7.2
5,Q75,"During the past 7 days, how many glasses of mi...",6.0,3 glasses per day,257,2.5
6,Q75,"During the past 7 days, how many glasses of mi...",7.0,4 or more glasses per day,301,3.0
7,Q75,"During the past 7 days, how many glasses of mi...",,Missing,7682,
8,Q76,"During the past 7 days, on how many days did y...",1.0,0 days,3570,22.0
9,Q76,"During the past 7 days, on how many days did y...",2.0,1 day,1578,9.7


#### Page 21

In [None]:
# Raw input code for page 21 of Appendix C
appendix_c_raw_pg21 = {
    'question_number': ['Q78'] * 7 + ['Q79'] * 7 + ['Q80'] * 5 + ['Q81'] * 6,
    'question': [
        'On an average school day, how many hours do you spend in front of a TV, computer, smart phone, or other electronic device watching shows or videos, playing games, accessing the Internet, or using social media (also called "screen time")?'] * 7 +
        ['In an average week when you are in school, on how many days do you go to physical education (PE) classes?'] * 7 +
        ['During the past 12 months, on how many sports teams did you play?'] * 5 +
        ['During the past 12 months, how many times did you have a concussion from playing a sport or being physically active?'] * 6,
    'answer_code': [1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 'NA',
                    1, 2, 3, 4, 5, 'NA'
                   ],
    'answer_label': [
        'Less than 1 hour per day', '1 hour per day', '2 hours per day', '3 hours per day', '4 hours per day', '5 or more hours per day', 'Missing',
        '0 days', '1 day', '2 days', '3 days', '4 days', '5 days', 'Missing',
        '0 teams', '1 team', '2 teams', '3 or more teams', 'Missing',
        '0 times', '1 time', '2 times', '3 times', '4 or more times', 'Missing'
    ],
    'frequency': [1034, 806, 2224, 3236, 2843, 6353, 736,
                  7334, 339, 636, 1363, 317, 2495, 4748,
                  5268, 2517, 1487, 1130, 6830,
                  13739, 1111, 398, 144, 288, 1552
                 ],
    'weighted_percentage': [6.1, 4.6, 13.5, 19.2, 17.5, 39.2, 'NA',
                            53.2, 2.8, 5.9, 15.1, 4.0, 19.0, 'NA',
                            50.9, 24.3, 14.7, 10.0, 'NA',
                            88.1, 6.9, 2.6, 0.8, 1.7, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg21 = pd.DataFrame(appendix_c_raw_pg21)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg21 = df_appendix_c_raw_pg21.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg21

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q78,"On an average school day, how many hours do yo...",1.0,Less than 1 hour per day,1034,6.1
1,Q78,"On an average school day, how many hours do yo...",2.0,1 hour per day,806,4.6
2,Q78,"On an average school day, how many hours do yo...",3.0,2 hours per day,2224,13.5
3,Q78,"On an average school day, how many hours do yo...",4.0,3 hours per day,3236,19.2
4,Q78,"On an average school day, how many hours do yo...",5.0,4 hours per day,2843,17.5
5,Q78,"On an average school day, how many hours do yo...",6.0,5 or more hours per day,6353,39.2
6,Q78,"On an average school day, how many hours do yo...",,Missing,736,
7,Q79,"In an average week when you are in school, on ...",1.0,0 days,7334,53.2
8,Q79,"In an average week when you are in school, on ...",2.0,1 day,339,2.8
9,Q79,"In an average week when you are in school, on ...",3.0,2 days,636,5.9


#### Page 22

In [None]:
# Raw input code for page 22 of Appendix C
appendix_c_raw_pg22 = {
    'question_number': ['Q82'] * 4 + ['Q83'] * 4 + ['Q84'] * 6 + ['Q85'] * 6 + ['Q86'] * 8,
    'question': [
        'Have you ever been tested for HIV, the virus that causes AIDS?'] * 4 +
        ['During the past 12 months, have you been tested for a sexually transmitted disease (STD) other than HIV, such as chlamydia or gonorrhea?'] * 4 +
        ['When was the last time you saw a dentist for a check-up, exam, teeth cleaning, or other dental work?'] * 6 +
        ['During the past 30 days, how often was your mental health not good?'] * 6 +
        ['On an average school night, how many hours of sleep do you get?'] * 8,
    'answer_code': [1, 2, 3, 'NA',
                    1, 2, 3, 'NA',
                    1, 2, 3, 4, 5, 'NA',
                    1, 2, 3, 4, 5, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA'
                   ],
    'answer_label': [
        'Yes', 'No', 'Not sure', 'Missing',
        'Yes', 'No', 'Not sure', 'Missing',
        'During the past 12 months', 'Between 12 and 24 months ago', 'More than 24 months ago', 'Never', 'Not sure', 'Missing',
        'Never', 'Rarely', 'Sometimes', 'Most of the time', 'Always', 'Missing',
        '4 or less hours', '5 hours', '6 hours', '7 hours', '8 hours', '9 hours', '10 or more hours', 'Missing'
    ],
    'frequency': [823, 9800, 1684, 4925,
                  542, 8444, 589, 7657,
                  11337, 1729, 823, 258, 1371, 1714,
                  2893, 2648, 3525, 2545, 1184, 4437,
                  1391, 1899, 3275, 3831, 2423, 560, 236, 3617
                 ],
    'weighted_percentage': [5.8, 79.6, 14.6, 'NA',
                            5.2, 88.2, 6.6, 'NA',
                            73.7, 11.1, 5.6, 1.4, 8.1, 'NA',
                            21.5, 21.4, 27.9, 20.0, 9.3, 'NA',
                            10.7, 14.8, 24.4, 27.4, 17.1, 4.0, 1.6, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg22 = pd.DataFrame(appendix_c_raw_pg22)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg22 = df_appendix_c_raw_pg22.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg22

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q82,"Have you ever been tested for HIV, the virus t...",1.0,Yes,823,5.8
1,Q82,"Have you ever been tested for HIV, the virus t...",2.0,No,9800,79.6
2,Q82,"Have you ever been tested for HIV, the virus t...",3.0,Not sure,1684,14.6
3,Q82,"Have you ever been tested for HIV, the virus t...",,Missing,4925,
4,Q83,"During the past 12 months, have you been teste...",1.0,Yes,542,5.2
5,Q83,"During the past 12 months, have you been teste...",2.0,No,8444,88.2
6,Q83,"During the past 12 months, have you been teste...",3.0,Not sure,589,6.6
7,Q83,"During the past 12 months, have you been teste...",,Missing,7657,
8,Q84,When was the last time you saw a dentist for a...,1.0,During the past 12 months,11337,73.7
9,Q84,When was the last time you saw a dentist for a...,2.0,Between 12 and 24 months ago,1729,11.1


#### Page 23

In [None]:
# Raw input code for page 23 of Appendix C
appendix_c_raw_pg23 = {
    'question_number': ['Q87'] * 8 + ['Q88'] * 7 + ['Q89'] * 7,
    'question': [
        'During the past 30 days, where did you usually sleep?'] * 8 +
        ['During the past 30 days, how many times did you take prescription pain medicine without a doctor\'s prescription or differently than how a doctor told you to use it?'] * 7 +
        ['During your life, how many times have you used hallucinogenic drugs, such as LSD, acid, PCP, angel dust, mescaline, or mushrooms?'] * 7,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA'
                   ],
    'answer_label': [
        'In my parent\'s or guardian\'s home', 'In the home of a friend, family member, or other person because I had to leave my home or my parent or guardian cannot afford housing',
        'In a shelter or emergency housing', 'In a motel or hotel', 'In a car, park, campground, or other public place', 'I do not have a usual place to sleep', 'Somewhere else', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing',
        '0 times', '1 or 2 times', '3 to 9 times', '10 to 19 times', '20 to 39 times', '40 or more times', 'Missing'
    ],
    'frequency': [12290, 243, 47, 46, 34, 49, 138, 4385,
                  9223, 341, 160, 61, 25, 56, 7366,
                  8666, 326, 150, 49, 24, 49, 7968
                 ],
    'weighted_percentage': [96.1, 1.7, 0.3, 0.2, 0.2, 0.3, 1.3, 'NA',
                            94.0, 3.3, 1.5, 0.5, 0.2, 0.5, 'NA',
                            93.5, 3.6, 1.6, 0.5, 0.3, 0.5, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg23 = pd.DataFrame(appendix_c_raw_pg23)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg23 = df_appendix_c_raw_pg23.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg23

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q87,"During the past 30 days, where did you usually...",1.0,In my parent's or guardian's home,12290,96.1
1,Q87,"During the past 30 days, where did you usually...",2.0,"In the home of a friend, family member, or oth...",243,1.7
2,Q87,"During the past 30 days, where did you usually...",3.0,In a shelter or emergency housing,47,0.3
3,Q87,"During the past 30 days, where did you usually...",4.0,In a motel or hotel,46,0.2
4,Q87,"During the past 30 days, where did you usually...",5.0,"In a car, park, campground, or other public place",34,0.2
5,Q87,"During the past 30 days, where did you usually...",6.0,I do not have a usual place to sleep,49,0.3
6,Q87,"During the past 30 days, where did you usually...",7.0,Somewhere else,138,1.3
7,Q87,"During the past 30 days, where did you usually...",,Missing,4385,
8,Q88,"During the past 30 days, how many times did yo...",1.0,0 times,9223,94.0
9,Q88,"During the past 30 days, how many times did yo...",2.0,1 or 2 times,341,3.3


#### Page 24

In [None]:
# Raw input code for page 24 of Appendix C
appendix_c_raw_pg24 = {
    'question_number': ['Q90'] * 8 + ['Q91'] * 8 + ['Q92'] * 9,
    'question': [
        'During the past 7 days, how many times did you drink a can, bottle, or glass of a sports drink such as Gatorade or PowerAde?'] * 8 +
        ['During the past 7 days, how many times did you drink a bottle or glass of plain water?'] * 8 +
        ['During the past 7 days, on how many days did you do exercises to strengthen or tone your muscles, such as push-ups, sit-ups, or weight lifting?'] * 9,
    'answer_code': [1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 'NA',
                    1, 2, 3, 4, 5, 6, 7, 8, 'NA'
                   ],
    'answer_label': [
        'I did not drink sports drinks during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing',
        'I did not drink water during the past 7 days', '1 to 3 times during the past 7 days', '4 to 6 times during the past 7 days',
        '1 time per day', '2 times per day', '3 times per day', '4 or more times per day', 'Missing',
        '0 days', '1 day', '2 days', '3 days', '4 days', '5 days', '6 days', '7 days', 'Missing'
    ],
    'frequency': [4223, 2719, 1003, 447, 308, 142, 241, 8149,
                  506, 1055, 1473, 968, 1748, 2037, 4664, 4781,
                  3238, 887, 1047, 1051, 718, 870, 360, 1088, 7973
                 ],
    'weighted_percentage': [48.2, 30.1, 10.5, 4.7, 3.0, 1.2, 2.2, 'NA',
                            3.8, 7.7, 11.6, 7.3, 13.8, 17.0, 38.8, 'NA',
                            34.0, 9.7, 11.4, 11.7, 7.7, 9.8, 4.0, 11.6, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg24 = pd.DataFrame(appendix_c_raw_pg24)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg24 = df_appendix_c_raw_pg24.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg24


Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q90,"During the past 7 days, how many times did you...",1.0,I did not drink sports drinks during the past ...,4223,48.2
1,Q90,"During the past 7 days, how many times did you...",2.0,1 to 3 times during the past 7 days,2719,30.1
2,Q90,"During the past 7 days, how many times did you...",3.0,4 to 6 times during the past 7 days,1003,10.5
3,Q90,"During the past 7 days, how many times did you...",4.0,1 time per day,447,4.7
4,Q90,"During the past 7 days, how many times did you...",5.0,2 times per day,308,3.0
5,Q90,"During the past 7 days, how many times did you...",6.0,3 times per day,142,1.2
6,Q90,"During the past 7 days, how many times did you...",7.0,4 or more times per day,241,2.2
7,Q90,"During the past 7 days, how many times did you...",,Missing,8149,
8,Q91,"During the past 7 days, how many times did you...",1.0,I did not drink water during the past 7 days,506,3.8
9,Q91,"During the past 7 days, how many times did you...",2.0,1 to 3 times during the past 7 days,1055,7.7


#### Page 25

In [None]:
# Raw input code for page 25 of Appendix C
appendix_c_raw_pg25 = {
    'question_number': ['Q93'] * 6 + ['Q94'] * 4 + ['Q95'] * 7 + ['Q96'] * 6,
    'question': [
        'During the COVID-19 pandemic, how often was your mental health not good?'] * 6 +
        ['During the COVID-19 pandemic, did a parent or other adult in your home lose their job even for a short amount of time?'] * 4 +
        ['During the past 12 months, how many times have you had a sunburn?'] * 7 +
        ['Do you agree or disagree that you feel close to people at your school?'] * 6,
    'answer_code': [1, 2, 3, 4, 5, 'NA',
                    1, 2, 3, 'NA',
                    1, 2, 3, 4, 5, 6, 'NA',
                    1, 2, 3, 4, 5, 'NA'
                   ],
    'answer_label': [
        'Never', 'Rarely', 'Sometimes', 'Most of the time', 'Always', 'Missing',
        'My parents and other adults in my home did not have jobs before the COVID-19 pandemic started', 'Yes', 'No', 'Missing',
        '0 times', '1 time', '2 times', '3 times', '4 times', '5 or more times', 'Missing',
        'Strongly agree', 'Agree', 'Not sure', 'Disagree', 'Strongly disagree', 'Missing'
    ],
    'frequency': [2042, 1753, 2395, 2213, 1313, 7516,
                  476, 2765, 9050, 4941,
                  3540, 1157, 1378, 1129, 540, 1387, 8101,
                  1835, 3744, 2038, 1045, 707, 7863
                 ],
    'weighted_percentage': [19.2, 18.7, 25.1, 23.3, 13.6, 'NA',
                            3.9, 24.4, 71.7, 'NA',
                            35.6, 13.4, 16.3, 12.6, 6.3, 15.7, 'NA',
                            20.2, 41.3, 21.1, 10.3, 7.0, 'NA'
                           ]
}

# Converting to df
df_appendix_c_raw_pg25 = pd.DataFrame(appendix_c_raw_pg25)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg25 = df_appendix_c_raw_pg25.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg25

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q93,"During the COVID-19 pandemic, how often was yo...",1.0,Never,2042,19.2
1,Q93,"During the COVID-19 pandemic, how often was yo...",2.0,Rarely,1753,18.7
2,Q93,"During the COVID-19 pandemic, how often was yo...",3.0,Sometimes,2395,25.1
3,Q93,"During the COVID-19 pandemic, how often was yo...",4.0,Most of the time,2213,23.3
4,Q93,"During the COVID-19 pandemic, how often was yo...",5.0,Always,1313,13.6
5,Q93,"During the COVID-19 pandemic, how often was yo...",,Missing,7516,
6,Q94,"During the COVID-19 pandemic, did a parent or ...",1.0,My parents and other adults in my home did not...,476,3.9
7,Q94,"During the COVID-19 pandemic, did a parent or ...",2.0,Yes,2765,24.4
8,Q94,"During the COVID-19 pandemic, did a parent or ...",3.0,No,9050,71.7
9,Q94,"During the COVID-19 pandemic, did a parent or ...",,Missing,4941,


#### Page 26

In [None]:
# Raw input code for page 26 of Appendix C
appendix_c_raw_pg26 = {
    'question_number': ['Q97'] * 6 + ['Q98'] * 3 + ['Q99'] * 5,
    'question': [
        'How often do your parents or other adults in your family know where you are going or with whom you will be?'] * 6 +
        ['Because of a physical, mental, or emotional problem, do you have serious difficulty concentrating, remembering, or making decisions?'] * 3 +
        ['How well do you speak English?'] * 5,
    'answer_code': [1, 2, 3, 4, 5, 'NA',
                    1, 2, 'NA',
                    1, 2, 3, 4, 'NA'
                   ],
    'answer_label': [
        'Never', 'Rarely', 'Sometimes', 'Most of the time', 'Always', 'Missing',
        'Yes', 'No', 'Missing',
        'Very well', 'Well', 'Not well', 'Not at all', 'Missing'
    ],
    'frequency': [284, 324, 726, 2983, 4775, 8140,
                  4130, 4921, 8181,
                  7581, 1342, 121, 79, 8109
                 ],
    'weighted_percentage': [2.7, 3.2, 7.7, 33.8, 52.7, 'NA',
                            45.6, 54.4, 'NA',
                            83.3, 14.8, 1.1, 0.8, 'NA'
                           ]
}


# Converting to df
df_appendix_c_raw_pg26 = pd.DataFrame(appendix_c_raw_pg26)

# Resetting the index to create a sequential row index
df_appendix_c_raw_pg26 = df_appendix_c_raw_pg26.reset_index(drop=True)

# Checking
df_appendix_c_raw_pg26

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q97,How often do your parents or other adults in y...,1.0,Never,284,2.7
1,Q97,How often do your parents or other adults in y...,2.0,Rarely,324,3.2
2,Q97,How often do your parents or other adults in y...,3.0,Sometimes,726,7.7
3,Q97,How often do your parents or other adults in y...,4.0,Most of the time,2983,33.8
4,Q97,How often do your parents or other adults in y...,5.0,Always,4775,52.7
5,Q97,How often do your parents or other adults in y...,,Missing,8140,
6,Q98,"Because of a physical, mental, or emotional pr...",1.0,Yes,4130,45.6
7,Q98,"Because of a physical, mental, or emotional pr...",2.0,No,4921,54.4
8,Q98,"Because of a physical, mental, or emotional pr...",,Missing,8181,
9,Q99,How well do you speak English?,1.0,Very well,7581,83.3


### Combining all 26 dfs into 1 master df

In [None]:
# Dynamically generating the dataframe names and appending them to a list
appendix_c_dfs = [pd.DataFrame(eval(f'appendix_c_raw_pg{i}')) for i in range(1, 27)]

# Appending all dfs into a master df
appendix_c_master_df = pd.concat(appendix_c_dfs, ignore_index=True)

# Checking the master df
appendix_c_master_df

Unnamed: 0,question_number,question,answer_code,answer_label,frequency,weighted_percentage
0,Q1,How old are you?,1,12 years old or younger,39,0.2
1,Q1,How old are you?,2,13 years old,62,0.4
2,Q1,How old are you?,3,14 years old,3403,20.0
3,Q1,How old are you?,4,15 years old,4427,25.3
4,Q1,How old are you?,5,16 years old,4276,24.7
...,...,...,...,...,...,...
623,Q99,How well do you speak English?,1,Very well,7581,83.3
624,Q99,How well do you speak English?,2,Well,1342,14.8
625,Q99,How well do you speak English?,3,Not well,121,1.1
626,Q99,How well do you speak English?,4,Not at all,79,0.8


In [None]:
appendix_c_dfs

[   question_number                     question answer_code  \
 0               Q1             How old are you?           1   
 1               Q1             How old are you?           2   
 2               Q1             How old are you?           3   
 3               Q1             How old are you?           4   
 4               Q1             How old are you?           5   
 5               Q1             How old are you?           6   
 6               Q1             How old are you?           7   
 7               Q1             How old are you?          NA   
 8               Q2            What is your sex?           1   
 9               Q2            What is your sex?           2   
 10              Q2            What is your sex?          NA   
 11              Q3       In what grade are you?           1   
 12              Q3       In what grade are you?           2   
 13              Q3       In what grade are you?           3   
 14              Q3       In what grade 

In [None]:
# Checking if all questions are present
appendix_c_master_df['question_number'].unique()

array(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',
       'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20',
       'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28', 'Q47',
       'Q48', 'Q49', 'Q50', 'Q33', 'Q34', 'Q35', 'Q36', 'Q37', 'Q38',
       'Q39', 'Q40', 'Q41', 'Q42', 'Q43', 'Q44', 'Q45', 'Q46', 'Q51',
       'Q52', 'Q53', 'Q54', 'Q55', 'Q56', 'Q57', 'Q58', 'Q59', 'Q60',
       'Q61', 'Q62', 'Q63', 'Q64', 'Q65', 'Q66', 'Q67', 'Q68', 'Q69',
       'Q70', 'Q71', 'Q72', 'Q73', 'Q74', 'Q75', 'Q76', 'Q77', 'Q78',
       'Q79', 'Q80', 'Q81', 'Q82', 'Q83', 'Q84', 'Q85', 'Q86', 'Q87',
       'Q88', 'Q89', 'Q90', 'Q91', 'Q92', 'Q93', 'Q94', 'Q95', 'Q96',
       'Q97', 'Q98', 'Q99'], dtype=object)

In [None]:
# Checking count of questions present
appendix_c_master_df['question_number'].nunique()

95

### Question label mapping: Manual approach to collate all question labels by question number using Appendix A  
### These can be joined with the answer mapping file on question number as key to create master mapping file

In [None]:
# Defining the question numbers and corresponding labels
# Questions will range from 1 to 99, this can be done dynamically using list comprehension
question_number = [f'Q{j}' for j in range(1, 100)]

# Question labels based on Appendix A
question_label = [
    # Attributes (Q1-Q7)
    'age', 'gender', 'grade', 'hispanic_latino', 'race_select', 'height_m', 'weight_kg',
    # Driving, guns, violence and safety (Q8-Q18)
    'seatbelt_use_non_driving', 'riding_with_drinking_driver_L30d', 'drunk_driving_L30d', 'texting_driving_L30d', 'weapon_carry_school_L30d',
    'gun_carrying_L12m', 'safety_concerns_at_school_L30d', 'threatened_at_school_weapon_L12m', 'physical_fighting_L12m',
    'physical_fight_school_L12m', 'saw_physical_violence_in_neighborhood',
    # Sexual and dating violence (Q19-Q22)
    'forced_sexual_intercourse', 'experienced_sexual_violence_L12m', 'experienced_sexual_dating_violence_L12m', 'experienced_physical_dating_violence_L12m',
    # Bullying in last 12 months (Q23-Q25)
    'bullied_at_school_L12m', 'electronically_bullied_L12m', 'felt_sad_hopeless_L12m',
    # Suicide behavior in last 12 months (Q26-Q29)
    'seriously_considered_attempting_suicide_L12m', 'made_suicide_plan_L12m', 'actually_attempted_suicide_L12m', 'injurious_suicide_attempt_L12m',
    # Smoking and vaping (Q30-Q39)
    'ever_tried_cigarette_smoking', 'first_tried_cigarette_smoking_age', 'currently_smoked_cigarettes_L30d',
    'smoked_cigarettes_per_day_L30d', 'ever_used_electronic_vapor_product', 'current_electronic_vapor_use_L30d',
    'electronic_vapor_product_purchase_place_L30d', 'current_smokeless_tobacco_use_L30d', 'current_cigar_use_L30d', 'tried_quitting_all_tobacco_products_L12m',
    # Alcohol and drugs (Q40-Q56)
    'first_tried_alcohol_age', 'currently_drank_alcohol_L30d', 'current_binge_drinking_L30d', 'largest_number_of_drinks_L30d',
    'source_of_alcohol_L30d', 'ever_tried_marijuana', 'first_tried_marijuana_age', 'current_marijuana_use_L30d',
    'ever_tried_synthetic_marijuana', 'ever_took_prescription_meds_without_doc_consult', 'ever_tried_cocaine', 'ever_tried_inhalants',
    'ever_tried_heroin', 'ever_tried_methamphetamines', 'ever_tried_ecstacy', 'ever_injected_illegal_drugs', 'offered_sold_drugs_school_L12m',
    # Sex (Q57-Q65)
    'ever_had_sexual_intercourse', 'age_first_sexual_intercourse', 'number_sex_partners', 'current_sexual_activity_L3m',
    'alcohol_drugs_before_last_sex', 'condom_use', 'birth_control_pill_use', 'gender_sexual_contact', 'sexual_identity',
    # Weight (Q66-Q67)
    'perception_of_weight', 'weight_loss_attempt',
    # Healthy food and drinks consumption in last 7 days (Q68-Q76)
    'fruit_juice_drinking_L7d', 'fruit_eating_L7d', 'green_salad_eating_L7d', 'potato_eating_L7d', 'carrot_eating_L7d', 'other_vegetable_eating_L7d',
    'soda_drinking_L7d', 'milk_drinking_L7d', 'breakfast_eating_L7d',
    # Physical activity and sports (Q77-Q81)
    'physical_activity_L7d', 'screen_time', 'pe_attendance', 'sports_team_participation_L12m', 'concussion_L12m',
    # Physical and mental health (Q82-Q92)
    'HIV_testing', 'STD_testing_L12m', 'last_dentist_visit', 'current_mental_health_L30d', 'sleep_hours_night', 'sleep_homelessness_L30d',
    'current_prescription_meds_without_doc_consult_L30d', 'ever_tried_hallucinogenic_drugs', 'sports_drink_consumption_L7d', 'water_consumption_L7d',
    'muscle_strengthening_L7d',
    # Pandemic (Q93-Q95)
    'mental_health_during_pandemic', 'parent_job_loss_during_pandemic', 'suburn_L12m',
    # Others (Q96-Q99)
    'feel_close_people_school', 'parental_monitoring', 'difficulty_concentrating', 'english_fluency',
                 ]


In [None]:
print(f'Count of question numbers: {len(question_number)}')
print(f'Count of question labels: {len(question_label)}')

Count of question numbers: 99
Count of question labels: 99


In [None]:
# Creating a dictionary to store these
question_mapping_dict = {
    'question_number': question_number,
    'question_label': question_label
}

# Converting the dictionary to df
question_mapping_df = pd.DataFrame(question_mapping_dict)

In [None]:
# Merging both dfs to create master mapping file
yrbs_master_mapping = pd.merge(question_mapping_df, appendix_c_master_df, on='question_number', how='left')

In [None]:
# Checking
yrbs_master_mapping

Unnamed: 0,question_number,question_label,question,answer_code,answer_label,frequency,weighted_percentage
0,Q1,age,How old are you?,1,12 years old or younger,39,0.2
1,Q1,age,How old are you?,2,13 years old,62,0.4
2,Q1,age,How old are you?,3,14 years old,3403,20.0
3,Q1,age,How old are you?,4,15 years old,4427,25.3
4,Q1,age,How old are you?,5,16 years old,4276,24.7
...,...,...,...,...,...,...,...
627,Q99,english_fluency,How well do you speak English?,1,Very well,7581,83.3
628,Q99,english_fluency,How well do you speak English?,2,Well,1342,14.8
629,Q99,english_fluency,How well do you speak English?,3,Not well,121,1.1
630,Q99,english_fluency,How well do you speak English?,4,Not at all,79,0.8


In [None]:
yrbs_master_mapping.columns

Index(['question_number', 'question_label', 'question', 'answer_code',
       'answer_label', 'frequency', 'weighted_percentage'],
      dtype='object')

In [None]:
# Answer label can be cleaned by removing certain special characters, replacing space with underscore and converting to lower case
# Creating a new cleaned column in yrbs_master_mapping for ease of variable naming
yrbs_master_mapping['answer_label_clean'] = (
    yrbs_master_mapping['answer_label']
    .str.lower()                        # Converting to lowercase
    .str.replace("'", "", regex=False)  # Removing apostrophes
    .str.replace(" ", "_", regex=False) # Replacing spaces with underscores
)

# Checking
print(yrbs_master_mapping[['answer_label', 'answer_label_clean']].head(20))

               answer_label       answer_label_clean
0   12 years old or younger  12_years_old_or_younger
1              13 years old             13_years_old
2              14 years old             14_years_old
3              15 years old             15_years_old
4              16 years old             16_years_old
5              17 years old             17_years_old
6     18 years old or older    18_years_old_or_older
7                   Missing                  missing
8                    Female                   female
9                      Male                     male
10                  Missing                  missing
11                9th grade                9th_grade
12               10th grade               10th_grade
13               11th grade               11th_grade
14               12th grade               12th_grade
15  Ungraded or other grade  ungraded_or_other_grade
16                  Missing                  missing
17                      Yes                   

## Combining raw response data with mapping data

In [None]:
# Example of approach to be used
import pandas as pd

# Example dfs
# A = Survey responses with encoded values
A = pd.DataFrame({
    'id': [1, 2, 3],
    'q1': [1, 2, 1],  # q1: {1 -> "Yes", 2 -> "No"}
    'q2': [2, 1, 2],  # q2: {1 -> "Male", 2 -> "Female"}
    'q3': [3, 2, 1],  # q3: {1 -> "Agree", 2 -> "Neutral", 3 -> "Disagree"}
    'q4': [9, 1, 5],  # q4: 9 unique responses in this case
    'age': [15, 16, 14]  # Extra column not related to questions
})

# B = Mapping of questions and answer codes to labels
B = pd.DataFrame({
    'question_number': ['q1', 'q1', 'q2', 'q2', 'q3', 'q3', 'q3', 'q4', 'q4', 'q4'],
    'answer_code': [1, 2, 1, 2, 1, 2, 3, 1, 5, 9],
    'answer_label': ['Yes', 'No', 'Male', 'Female', 'Agree', 'Neutral', 'Disagree',
                     'Very Bad', 'Average', 'Excellent']
})


# Step 1: Melt A to long format (one row per question-answer pair)
A_melted = A.melt(id_vars=['id', 'age'], var_name='question_number', value_name='answer_code')

# Step 2: Join with B to map answer codes to labels
merged = pd.merge(A_melted, B, on=['question_number', 'answer_code'], how='left')

# Step 3: Optional - Pivot back to wide format if needed
A_transformed = merged.pivot(index=['id', 'age'], columns='question_number', values='answer_label').reset_index()

# Display the transformed DataFrame
print(A_transformed)


question_number  id  age   q1      q2        q3         q4
0                 1   15  Yes  Female  Disagree  Excellent
1                 2   16   No    Male   Neutral   Very Bad
2                 3   14  Yes  Female     Agree    Average


# Data processing and mapping

In [None]:
# Creating a working copy
yrbs_survey_response_working = yrbs_survey_response_raw_df.copy()

In [None]:
yrbs_survey_response_working.columns

Index(['site', 'raceeth', 'q6orig', 'q7orig', 'record', 'orig_rec', 'q1', 'q2',
       'q3', 'q4',
       ...
       'q94', 'q95', 'q96', 'q97', 'q98', 'q99', 'BMIPCT', 'weight', 'stratum',
       'psu'],
      dtype='object', length=109)

In [None]:
# Renaming columns to make them more intuitive

# Custom names for the first 6 columns
custom_names_first6 = ['site', 'race_ethnicity', 'height_raw_ft_in', 'weight_pounds', 'record', 'orig_rec',]

# Unique values from question_label column of master mapping df for survey questions
unique_qn_labels = yrbs_master_mapping['question_label'].unique().tolist()

# Custom names for the remaining 4 columns
custom_names_last4 = ['bmi_percentile', 'survey_weight', 'startum', 'psu_code']

# Combining all parts to create the final list of new column names
new_column_names_yrbs = custom_names_first6 + unique_qn_labels + custom_names_last4

In [None]:
# Checking if new column names count matches the number of columns in the working df
if len(new_column_names_yrbs) != yrbs_survey_response_working.shape[1]:
    raise ValueError(f"New column names count: ({len(new_column_names_yrbs)}), raw df column count ({yrbs_survey_response_working.shape[1]}).")

In [None]:
# Renaming the columns if no issue is detected
yrbs_survey_response_working.columns = new_column_names_yrbs

In [None]:
# Checking df
print(yrbs_survey_response_working.head(5))

# Checking new column names
print(yrbs_survey_response_working.columns)

  site  race_ethnicity height_raw_ft_in weight_pounds  record  orig_rec  age  \
0   XX             3.0              502           155       1       NaN  5.0   
1   XX             7.0              509           NaN       2       NaN  4.0   
2   XX             6.0              507           136       3       NaN  4.0   
3   XX             6.0              509           133       4       NaN  4.0   
4   XX             7.0              603           196       5       NaN  3.0   

   gender  grade  hispanic_latino  ... parent_job_loss_during_pandemic  \
0     1.0    3.0              2.0  ...                             2.0   
1     2.0    NaN              1.0  ...                             3.0   
2     2.0    2.0              1.0  ...                             2.0   
3     2.0    2.0              1.0  ...                             2.0   
4     2.0    2.0              1.0  ...                             2.0   

   suburn_L12m  feel_close_people_school  parental_monitoring  \
0        

In [None]:
# Defining columns to exclude for mapping responses
cols_exclude_mapping = custom_names_first6 + ['race_select', 'height_m', 'weight_kg'] + custom_names_last4

# Selecting columns to melt and excluding specific columns
columns_to_melt = [col for col in yrbs_survey_response_working.columns if col not in cols_exclude_mapping]

# Melting the working df with selected columns based on a unique ID i.e. record
yrbs_df_long = yrbs_survey_response_working.melt(id_vars='record', value_vars=columns_to_melt,
                    var_name='question_label', value_name='answer_code')

In [None]:
# Checking
print(yrbs_df_long)

# Replacing NaN with 'NA' in answer_code column for consistency while joining with mapping df
yrbs_df_long['answer_code'] = yrbs_df_long['answer_code'].fillna('NA')

# Checking
print(yrbs_df_long)

         record   question_label  answer_code
0             1              age          5.0
1             2              age          4.0
2             3              age          4.0
3             4              age          4.0
4             5              age          3.0
...         ...              ...          ...
1654267   17502  english_fluency          NaN
1654268   17503  english_fluency          NaN
1654269   17504  english_fluency          NaN
1654270   17505  english_fluency          NaN
1654271   17507  english_fluency          NaN

[1654272 rows x 3 columns]
         record   question_label answer_code
0             1              age         5.0
1             2              age         4.0
2             3              age         4.0
3             4              age         4.0
4             5              age         3.0
...         ...              ...         ...
1654267   17502  english_fluency          NA
1654268   17503  english_fluency          NA
1654269   17504

In [None]:
# Joining with master mapping df to map answer codes to labels
merged = pd.merge(yrbs_df_long, yrbs_master_mapping, on=['question_label', 'answer_code'], how='left')

# Pivoting back to wide format using pivot_table to handle duplicates by aggregating with 'first'
yrbs_df_mapped = merged.pivot_table(
    index='record',
    columns='question_label',
    values='answer_label_clean',
    aggfunc='first'
).reset_index()

In [None]:
yrbs_df_mapped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17232 entries, 0 to 17231
Data columns (total 93 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   record                                              17232 non-null  int64 
 1   HIV_testing                                         17232 non-null  object
 2   STD_testing_L12m                                    17232 non-null  object
 3   actually_attempted_suicide_L12m                     17232 non-null  object
 4   age                                                 17232 non-null  object
 5   age_first_sexual_intercourse                        17232 non-null  object
 6   alcohol_drugs_before_last_sex                       17232 non-null  object
 7   birth_control_pill_use                              17232 non-null  object
 8   breakfast_eating_L7d                                17232 non-null  object
 9   bullie

In [None]:
yrbs_master_mapping.columns

Index(['question_number', 'question_label', 'question', 'answer_code',
       'answer_label', 'frequency', 'weighted_percentage',
       'answer_label_clean'],
      dtype='object')

In [None]:
# Checking the mapped dataframe
yrbs_df_mapped.head(10)

question_label,record,HIV_testing,STD_testing_L12m,actually_attempted_suicide_L12m,age,age_first_sexual_intercourse,alcohol_drugs_before_last_sex,birth_control_pill_use,breakfast_eating_L7d,bullied_at_school_L12m,...,source_of_alcohol_L30d,sports_drink_consumption_L7d,sports_team_participation_L12m,suburn_L12m,texting_driving_L30d,threatened_at_school_weapon_L12m,tried_quitting_all_tobacco_products_L12m,water_consumption_L7d,weapon_carry_school_L30d,weight_loss_attempt
0,1,yes,no,1_time,16_years_old,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse_with_an_op...,3_days,yes,...,i_took_it_from_a_store_or_family_member,i_did_not_drink_sports_drinks_during_the_past_...,2_teams,0_times,i_did_not_drive_a_car_or_other_vehicle_during_...,0_times,no,4_to_6_times_during_the_past_7_days,0_days,lose_weight
1,2,no,no,0_times,15_years_old,15_years_old,yes,withdrawal_or_some_other_method,5_days,no,...,i_did_not_drink_alcohol_during_the_past_30_days,1_to_3_times_during_the_past_7_days,0_teams,2_times,i_did_not_drive_a_car_or_other_vehicle_during_...,0_times,no,3_times_per_day,0_days,lose_weight
2,3,yes,yes,1_time,15_years_old,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse,missing,5_days,no,...,i_did_not_drink_alcohol_during_the_past_30_days,i_did_not_drink_sports_drinks_during_the_past_...,0_teams,2_times,i_did_not_drive_a_car_or_other_vehicle_during_...,0_times,"i_did_not_use_cigarettes,_electronic_vapor_pro...",2_times_per_day,0_days,i_am_not_trying_to_do_anything_about_my_weight
3,4,no,no,0_times,15_years_old,12_years_old,no,withdrawal_or_some_other_method,1_day,no,...,missing,i_did_not_drink_sports_drinks_during_the_past_...,0_teams,2_times,0_days,0_times,yes,4_or_more_times_per_day,0_days,gain_weight
4,5,not_sure,not_sure,0_times,14_years_old,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse_with_an_op...,7_days,no,...,i_got_it_some_other_way,4_to_6_times_during_the_past_7_days,0_teams,2_times,i_did_not_drive_a_car_or_other_vehicle_during_...,0_times,"i_did_not_use_cigarettes,_electronic_vapor_pro...",4_or_more_times_per_day,0_days,stay_the_same_weight
5,6,no,no,0_times,15_years_old,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse_with_an_op...,3_days,no,...,i_did_not_drink_alcohol_during_the_past_30_days,1_time_per_day,2_teams,1_time,i_did_not_drive_a_car_or_other_vehicle_during_...,0_times,"i_did_not_use_cigarettes,_electronic_vapor_pro...",4_or_more_times_per_day,0_days,stay_the_same_weight
6,7,no,no,0_times,15_years_old,14_years_old,no,condoms,7_days,no,...,i_got_it_some_other_way,1_to_3_times_during_the_past_7_days,0_teams,2_times,i_did_not_drive_a_car_or_other_vehicle_during_...,0_times,yes,1_time_per_day,0_days,lose_weight
7,8,no,no,0_times,14_years_old,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse_with_an_op...,4_days,no,...,i_did_not_drink_alcohol_during_the_past_30_days,i_did_not_drink_sports_drinks_during_the_past_...,0_teams,0_times,i_did_not_drive_a_car_or_other_vehicle_during_...,0_times,"i_did_not_use_cigarettes,_electronic_vapor_pro...",4_or_more_times_per_day,0_days,gain_weight
8,9,no,no,0_times,15_years_old,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse,i_have_never_had_sexual_intercourse_with_an_op...,1_day,no,...,i_did_not_drink_alcohol_during_the_past_30_days,4_to_6_times_during_the_past_7_days,2_teams,3_times,0_days,0_times,"i_did_not_use_cigarettes,_electronic_vapor_pro...",4_or_more_times_per_day,0_days,gain_weight
9,10,no,no,missing,15_years_old,13_years_old,no,not_sure,0_days,no,...,i_did_not_drink_alcohol_during_the_past_30_days,i_did_not_drink_sports_drinks_during_the_past_...,0_teams,2_times,0_days,0_times,yes,4_to_6_times_during_the_past_7_days,0_days,stay_the_same_weight


In [None]:
# Joining with working df to bring back other columns
yrbs_mapped_final = pd.merge(yrbs_survey_response_working[cols_exclude_mapping], yrbs_df_mapped, on=['record'], how='left')

In [None]:
# Checking column counts
print(f'Count of columns in final mapped df: {len(yrbs_mapped_final.columns)}')
print(f'Count of columns in working df:{len(yrbs_survey_response_working.columns)}')

Count of columns in final mapped df: 105
Count of columns in working df:109


In [None]:
# Comparing column names, ignoring order
same_columns_check = set(yrbs_mapped_final.columns) == set(yrbs_survey_response_working.columns)

print(f"Are the columns the same: {same_columns_check}")

# In case of differences
missing_in_mapped = set(yrbs_mapped_final.columns) - set(yrbs_survey_response_working.columns)
missing_in_orig = set(yrbs_survey_response_working.columns) - set(yrbs_mapped_final.columns)

print(f"Columns in original but not in mapped: {missing_in_mapped}")
print(f"Columns in mapped but not in original: {missing_in_orig}")

Are the columns the same: False
Columns in original but not in mapped: set()
Columns in mapped but not in original: {'first_tried_cigarette_smoking_age', 'ever_tried_cigarette_smoking', 'currently_smoked_cigarettes_L30d', 'injurious_suicide_attempt_L12m'}


In [None]:
# Writing out the mapped df to csv with ; separator
yrbs_mapped_final.to_csv('yrbs_mapped_final_clean.csv', index=False, sep=';', encoding='utf-8')

### File - "yrbs_mapped_final_clean.csv" will be used for exploratory data analysis and modeling.