# Capstone Two: Pre-processing & Training Data Development

**Create dummy or indicator features for categorical variables**

**Standardize the magnitude of numeric features using a scaler**

**Split your data into testing and training datasets**

In [1]:
import pandas as pd

In [2]:
# Source CSV; the relative path is resolved against the notebook's working directory.
DATA_PATH = 'mental_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows (head() returns 5 rows by default).
df.head()

Unnamed: 0,YearStart,YearEnd,State,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Age_group,Stratification2,StratificationCategoryID2
0,2022,2022,MD,9.0,6.5,12.3,65+,"Black, non-Hispanic",RACE
1,2022,2022,WI,5.6,4.4,7.2,65+,Male,GENDER
2,2022,2022,OK,21.5,15.4,29.2,Overall,Native Am/Alaskan Native,RACE
3,2022,2022,PA,10.0,8.3,12.1,Overall,"White, non-Hispanic",RACE
4,2016,2016,NH,10.4,8.6,12.6,50-64,Female,GENDER


In [4]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4778 entries, 0 to 4777
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   YearStart                  4778 non-null   int64  
 1   YearEnd                    4778 non-null   int64  
 2   State                      4681 non-null   object 
 3   Data_Value                 4778 non-null   float64
 4   Low_Confidence_Limit       4778 non-null   float64
 5   High_Confidence_Limit      4778 non-null   float64
 6   Age_group                  4778 non-null   object 
 7   Stratification2            4778 non-null   object 
 8   StratificationCategoryID2  4778 non-null   object 
dtypes: float64(3), int64(2), object(4)
memory usage: 336.1+ KB


In [5]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# String-valued columns that get expanded into indicator (dummy) columns.
categorical_cols = ['State', 'Age_group', 'Stratification2', 'StratificationCategoryID2']

# Remove every row containing a missing value, then one-hot encode the
# categorical columns. drop_first=True omits the first level of each category,
# so that level is represented by all of its sibling indicators being off.
df = df.dropna().pipe(pd.get_dummies, columns=categorical_cols, drop_first=True)

In [6]:
# Split first, then standardize.
# The scaler is fit on the training rows only, so no test-set statistics
# (means / variances) leak into the training features. `df` itself is left
# unmodified, which makes this cell safe to re-run.
numeric_cols = ['YearStart', 'YearEnd', 'Data_Value', 'Low_Confidence_Limit', 'High_Confidence_Limit']

# Features and target. The target keeps its original units: only features are
# standardized, and the target is never passed through the feature scaler.
X = df.drop(columns=['Data_Value'])  # Features
y = df['Data_Value']  # Target variable

# NOTE(review): Low_Confidence_Limit / High_Confidence_Limit look like the
# interval bounds of Data_Value; if so they leak the target -- confirm before
# using them as predictors.

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric columns that are actually features (i.e. everything except the target).
numeric_feature_cols = [col for col in numeric_cols if col in X_train.columns]

# Work on copies so the assignments below never write into views of `df`.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numeric features: learn mean/std from the training set, then
# apply that same transformation to the test set.
scaler = StandardScaler()
X_train[numeric_feature_cols] = scaler.fit_transform(X_train[numeric_feature_cols])
X_test[numeric_feature_cols] = scaler.transform(X_test[numeric_feature_cols])

# Display dataset info
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (3744, 62)
Testing data shape: (937, 62)


In [9]:
# List the feature columns present after one-hot encoding and removal of the target.
X_train.columns

Index(['YearStart', 'YearEnd', 'Low_Confidence_Limit', 'High_Confidence_Limit',
       'State_AL', 'State_AR', 'State_AZ', 'State_CA', 'State_CO', 'State_CT',
       'State_DE', 'State_FL', 'State_GA', 'State_HI', 'State_IA', 'State_ID',
       'State_IL', 'State_IN', 'State_KS', 'State_KY', 'State_LA', 'State_MA',
       'State_MD', 'State_ME', 'State_MI', 'State_MN', 'State_MO', 'State_MS',
       'State_MT', 'State_NC', 'State_ND', 'State_NE', 'State_NH', 'State_NJ',
       'State_NM', 'State_NV', 'State_NY', 'State_OH', 'State_OK', 'State_OR',
       'State_PA', 'State_RI', 'State_SC', 'State_SD', 'State_TN', 'State_TX',
       'State_UT', 'State_VA', 'State_VT', 'State_WA', 'State_WI', 'State_WV',
       'State_WY', 'Age_group_65+', 'Age_group_Overall',
       'Stratification2_Black, non-Hispanic', 'Stratification2_Female',
       'Stratification2_Hispanic', 'Stratification2_Male',
       'Stratification2_Native Am/Alaskan Native',
       'Stratification2_White, non-Hispanic',
   