D606 - Data Science Capstone  
EBN2 — EBN2 Task 2: Data Analysis Report  
Logistic Regression Analysis of Adolescent Suicidal Ideation  
John D. Pickering

### Data Import
- Import exported .csv files for YRBS 2023
- Convert multi-point answers to binary
- Data File
  - yrbs2023.csv
- Support Files
    - IndenpendentVariables.xlsx
    - 05_race.csv
    - 04_grade.csv
    - 02_age.csv
   

In [1]:
# Import Dependencies

import pandas as pd
from pathlib import Path
import re
import fitz  # PyMuPDF
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [2]:
import pandas as pd

def load_and_prepare_yrbs(mapping_path, data_path, keep_missing=False):
    """
    Load the variable mapping and the survey data, keep only mapped columns,
    rename them to their ``variable_label`` and dichotomise multi-select items.

    Parameters
    ----------
    mapping_path : str, path-like or file-like
        CSV passed to ``pd.read_csv``. The columns ``column``,
        ``variable_label``, ``type``, ``variable_type`` and ``convert_id``
        are read from it.
    data_path : str, path-like or file-like
        CSV with the raw survey responses, passed to ``pd.read_csv``.
    keep_missing : bool, default False
        Controls how missing / non-numeric answers of thresholded items are
        handled.

        * ``False`` (default, original behaviour): missing answers are
          replaced by 0 *before* the threshold is applied, so they are
          indistinguishable from a real answer below the threshold and the
          column comes back as a plain integer column without nulls.
        * ``True``: missing answers stay missing (``pd.NA``) and the column
          is returned with the nullable ``Int64`` dtype.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The prepared data frame (columns named by ``variable_label``, in
        mapping order) and the mapping rows whose ``column`` exists in the
        data file. ``type`` and ``variable_type`` in the returned mapping are
        lower-cased strings.

    Raises
    ------
    KeyError
        If the mapping file lacks one of the columns listed above.
    """

    # -----------------------------
    # Load files
    # -----------------------------
    mapping = pd.read_csv(mapping_path, low_memory=False)
    df = pd.read_csv(data_path, low_memory=False)

    # Normalize fields for safe comparisons
    mapping["type"] = mapping["type"].astype(str).str.lower()
    mapping["variable_type"] = mapping["variable_type"].astype(str).str.lower()

    # -----------------------------
    # Keep only mapping rows where dataset contains the column
    # -----------------------------
    mapping_valid = mapping[mapping["column"].isin(df.columns)].copy()

    # -----------------------------
    # Create subset dataframe + rename using variable_label
    # -----------------------------
    rename_dict = dict(zip(mapping_valid["column"], mapping_valid["variable_label"]))
    df = df[mapping_valid["column"]].rename(columns=rename_dict)

    # -----------------------------
    # Convert multi-select variables to 0/1
    # -----------------------------
    # Only multi-select, non-demographic rows that carry a threshold qualify.
    needs_threshold = mapping_valid[
        (mapping_valid["type"] == "multi-select")
        & (mapping_valid["variable_type"] != "demographic")
        & mapping_valid["convert_id"].notna()
    ]

    for label, threshold in zip(
        needs_threshold["variable_label"], needs_threshold["convert_id"]
    ):
        # Non-numeric answers become NaN here.
        answers = pd.to_numeric(df[label], errors="coerce")

        if keep_missing:
            # Threshold the observed answers, then restore the gaps so that
            # "no answer" is not reported as "below threshold".
            flags = (answers >= threshold).astype("Int64")
            flags[answers.isna()] = pd.NA
        else:
            # Original rule: a missing answer counts as 0 before thresholding.
            flags = (answers.fillna(0) >= threshold).astype(int)

        df[label] = flags

    return df, mapping_valid

In [3]:
# Input files, given relative to the notebook's working directory
mapping_file = "2023_indenpendent_variables.csv"
data_file = "yrbs2023.csv"   # adjust path if needed

# Build the analysis frame and keep the mapping rows that matched its columns
df_2023, mapping_used = load_and_prepare_yrbs(
    mapping_path=mapping_file,
    data_path=data_file,
)
print("Mapping Complete")

Mapping Complete


In [4]:
# Preview the prepared frame: columns carry their variable_label names and
# thresholded multi-select items are already 0/1.
df_2023.head()

Unnamed: 0,considered suicide,current binge drinking,how old are you,what is your sex,in what grade are you,what is your race,weapon carrying at school,gun carrying,safety concerns at school,threatened at school,...,ever cigarette use,electronic vapor product use,ever marijuana use,ever prescription pain medicine use,ever use cocaine,ever inhalant use,ever heroin use,ever methamphetamine use,ever ecstacy use,ever sexual intercourse
0,2.0,0,3.0,1.0,1.0,,0,0,1,0,...,2.0,2.0,0,1,0,0,0,0,0,2.0
1,2.0,0,4.0,2.0,1.0,5.0,0,0,0,0,...,2.0,2.0,0,0,0,0,0,0,0,2.0
2,2.0,0,5.0,2.0,3.0,5.0,0,0,0,0,...,1.0,1.0,0,0,0,1,0,0,0,1.0
3,2.0,0,6.0,1.0,2.0,5.0,0,0,1,0,...,2.0,1.0,1,0,0,1,0,0,0,1.0
4,2.0,0,3.0,2.0,1.0,5.0,0,0,0,0,...,1.0,1.0,0,0,0,0,0,0,0,1.0


In [5]:
# Preview the mapping rows whose `column` was found in the data file.
mapping_used.head()

Unnamed: 0,column,question,variable_label,type,convert_id,variable_type
0,q27,"During the past 12 months, did you ever seriou...",considered suicide,binary,,dependent
1,q43,38_5_Drinks_30_Days_Boy_Girl,current binge drinking,multi-select,2.0,independent
2,q1,how old are you?,how old are you,multi-select,,demographic
3,q2,what is your sex?,what is your sex,binary,,demographic
4,q3,in what grade are you,in what grade are you,multi-select,,demographic


In [6]:
# --- Labels of binary items, leaving demographic items out ---
is_binary = mapping_used["type"].str.lower().eq("binary")
not_demographic = mapping_used["variable_type"].str.lower().ne("demographic")
binary_vars = mapping_used.loc[is_binary & not_demographic, "variable_label"].tolist()

print("Binary variables to convert:", binary_vars)

# --- Recode answers in df_2023: 1 stays 1, 2 becomes 0 ---
# Any other value (including missing) is left untouched by replace().
recode = {1: 1, 2: 0}
labels_present = [label for label in binary_vars if label in df_2023.columns]
for label in labels_present:
    df_2023[label] = df_2023[label].replace(recode)



Binary variables to convert: ['considered suicide', 'saw physical violence in neighborhood', 'forced sexual intercourse', 'bullying at school', 'electronic bullying', 'sad or hopeless', 'made a suicide plan', 'ever cigarette use', 'electronic vapor product use', 'ever sexual intercourse']


In [7]:
# Re-inspect the frame after the binary items were recoded (2 -> 0).
df_2023.head()

Unnamed: 0,considered suicide,current binge drinking,how old are you,what is your sex,in what grade are you,what is your race,weapon carrying at school,gun carrying,safety concerns at school,threatened at school,...,ever cigarette use,electronic vapor product use,ever marijuana use,ever prescription pain medicine use,ever use cocaine,ever inhalant use,ever heroin use,ever methamphetamine use,ever ecstacy use,ever sexual intercourse
0,0.0,0,3.0,1.0,1.0,,0,0,1,0,...,0.0,0.0,0,1,0,0,0,0,0,0.0
1,0.0,0,4.0,2.0,1.0,5.0,0,0,0,0,...,0.0,0.0,0,0,0,0,0,0,0,0.0
2,0.0,0,5.0,2.0,3.0,5.0,0,0,0,0,...,1.0,1.0,0,0,0,1,0,0,0,1.0
3,0.0,0,6.0,1.0,2.0,5.0,0,0,1,0,...,0.0,1.0,1,0,0,1,0,0,0,1.0
4,0.0,0,3.0,2.0,1.0,5.0,0,0,0,0,...,1.0,1.0,0,0,0,0,0,0,0,1.0


In [8]:
# Cast each recoded binary item to pandas' nullable integer dtype so that
# missing answers can stay missing in an integer column.
for label in binary_vars:
    df_2023[label] = df_2023[label].astype("Int64")

In [9]:
# Print per-column non-null counts and dtypes to check the Int64 cast.
df_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20103 entries, 0 to 20102
Data columns (total 30 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   considered suicide                          19667 non-null  Int64  
 1   current binge drinking                      20103 non-null  int64  
 2   how old are you                             20005 non-null  float64
 3   what is your sex                            19945 non-null  float64
 4   in what grade are you                       19910 non-null  float64
 5   what is your race                           19733 non-null  float64
 6   weapon carrying at school                   20103 non-null  int64  
 7   gun carrying                                20103 non-null  int64  
 8   safety concerns at school                   20103 non-null  int64  
 9   threatened at school                        20103 non-null  int64  
 10  pyhsical F

In [None]:
# Save the prepared frame to CSV for future use.
# NOTE(review): to_csv defaults to index=True, so the row index is written as an
# unnamed first column — confirm that downstream readers expect it.
print('Download Data file for future use')
output_csv = "df_2023.csv"
df_2023.to_csv(output_csv)

In [None]:
# -------------------------------------
# 2. IDENTIFY DEMOGRAPHIC LABELS
# -------------------------------------
# The mapping table at notebook scope is `mapping_used` (returned by
# load_and_prepare_yrbs). `mapping` is only a local inside that function, so
# referencing it here fails with NameError on a fresh kernel.
is_demographic = mapping_used["variable_type"].str.lower() == "demographic"
demo_labels = (
    mapping_used.loc[is_demographic, "variable_label"]
    .str.strip()
    .str.lower()
    .tolist()
)

print("Demographic labels:", demo_labels)

# -------------------------------------
# 3. EXTRACT DEMOGRAPHIC COLUMNS
# -------------------------------------
# Labels are matched against df_2023's column names exactly; a label that only
# matches after stripping/lower-casing is skipped rather than raising.
df_demo = df_2023[[label for label in demo_labels if label in df_2023.columns]].copy()



In [None]:
# -------------------------------------
# Load lookup tables
# -------------------------------------
def _id_to_description(lookup):
    """Return a dict mapping the lookup table's ID values to its Description values."""
    return dict(zip(lookup["ID"], lookup["Description"]))


age_lookup = pd.read_csv("02_age.csv")
grade_lookup = pd.read_csv("04_grade.csv")
# Only the race file is read with an explicit latin1 encoding.
race_lookup = pd.read_csv("05_race.csv", encoding="latin1")

# Code -> text dictionaries
age_map = _id_to_description(age_lookup)
grade_map = _id_to_description(grade_lookup)
race_map = _id_to_description(race_lookup)

# -------------------------------------
# Apply mappings to demographic columns
# -------------------------------------
# Each entry: (coded source column, new text-label column, code -> text dict).
# A source column that is absent from df_demo is skipped.
label_specs = [
    ("how old are you", "age_label", age_map),
    ("in what grade are you", "grade_label", grade_map),
    ("what is your race", "race_label", race_map),
]
for source_col, label_col, code_map in label_specs:
    if source_col in df_demo.columns:
        df_demo[label_col] = df_demo[source_col].map(code_map)



In [None]:

# -------------------------------------
# Create visuals with correct labels
# -------------------------------------
def _plot_label_counts(label_col, color, title, xlabel, sort_by_label, tick_kwargs, tight=False):
    """
    Draw one bar chart of value counts for a text-label column of df_demo.

    Does nothing when `label_col` is not a column of df_demo. Counts are
    ordered by label when `sort_by_label` is true, otherwise in the order
    returned by value_counts(). `tick_kwargs` is forwarded to plt.xticks and
    `tight` toggles plt.tight_layout() before the figure is shown.
    """
    if label_col not in df_demo.columns:
        return
    plt.figure(figsize=(12,6))
    counts = df_demo[label_col].value_counts()
    if sort_by_label:
        counts = counts.sort_index()
    counts.plot(kind="bar", color=color)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.xticks(**tick_kwargs)
    if tight:
        plt.tight_layout()
    plt.show()


# AGE
_plot_label_counts(
    "age_label", "steelblue", "Age Distribution (YRBS 2023)", "Age Group",
    sort_by_label=True, tick_kwargs={"rotation": 0},
)

# GRADE
_plot_label_counts(
    "grade_label", "darkcyan", "Grade Distribution (YRBS 2023)", "Grade",
    sort_by_label=True, tick_kwargs={"rotation": 0},
)

# RACE
_plot_label_counts(
    "race_label", "slateblue", "Race Distribution (YRBS 2023)", "Race",
    sort_by_label=False, tick_kwargs={"rotation": 45, "ha": "right"}, tight=True,
)
