In [31]:
 # Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, r2_score
%matplotlib inline

In [3]:
# Read the raw BRFSS nutrition / physical-activity / obesity export.
# The location is relative to the notebook's working directory.
raw_csv_path = Path("../Resources/Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv")
ob_df = pd.read_csv(raw_csv_path)

# Preview the first rows to confirm the load.
ob_df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2020,2020,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Race/Ethnicity,Hispanic,RACE,RACEHIS
1,2014,2014,GU,Guam,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(13.444304, 144.793731)",OWS,OWS1,Q036,VALUE,66,Education,High school graduate,EDU,EDUHSGRAD
2,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,,OWS,OWS1,Q036,VALUE,59,Income,"$50,000 - $74,999",INC,INC5075
3,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,,OWS,OWS1,Q037,VALUE,59,Income,Data not reported,INC,INCNR
4,2015,2015,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who achieve at least 300 min...,,Value,...,,PA,PA1,Q045,VALUE,59,Income,"Less than $15,000",INC,INCLESS15


In [34]:
# Columns carried forward into the modelling frame.
# The confidence-limit columns ('Low_Confidence_Limit' and
# 'High_Confidence_Limit ') are intentionally left out.
kept_columns = [
    'YearStart',
    'LocationAbbr',
    'Data_Value',
    'Sample_Size',
    'TopicID',
    'QuestionID',
    'StratificationID1',
]

# Take an independent copy so later edits never touch the raw frame.
ob_clean_df = ob_df.loc[:, kept_columns].copy()

In [35]:
# Keep only rows whose stratification is something other than 'OVERALL'.
is_not_overall = ob_clean_df['StratificationID1'] != 'OVERALL'
ob_clean_df2 = ob_clean_df.loc[is_not_overall].copy()

In [36]:
# One-hot encode the categorical columns of the filtered frame;
# numeric columns are passed through unchanged by get_dummies.
ob_dummies_df = pd.get_dummies(data=ob_clean_df2)

# Preview the encoded frame.
ob_dummies_df.head()

Unnamed: 0,YearStart,Data_Value,Sample_Size,LocationAbbr_AK,LocationAbbr_AL,LocationAbbr_AR,LocationAbbr_AZ,LocationAbbr_CA,LocationAbbr_CO,LocationAbbr_CT,...,StratificationID1_INCNR,StratificationID1_MALE,StratificationID1_RACE2PLUS,StratificationID1_RACEASN,StratificationID1_RACEBLK,StratificationID1_RACEHIS,StratificationID1_RACEHPI,StratificationID1_RACENAA,StratificationID1_RACEOTH,StratificationID1_RACEWHT
0,2020,30.6,31255.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2014,29.3,842.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2013,28.8,62562.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013,32.7,60069.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,2015,26.6,30904.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Reference year against which each record's YearStart is measured.
REFERENCE_YEAR = 2023

# Replace the absolute year with "years before REFERENCE_YEAR".
# The guard makes this cell safe to re-run: the previous version deleted
# 'YearStart' in place, so executing the cell a second time raised KeyError.
# The result is built as a new frame (new column appended last, then
# 'YearStart' dropped), which gives the same column layout as before.
if "YearStart" in ob_dummies_df.columns:
    ob_dummies_df = ob_dummies_df.assign(
        Years_since_start=REFERENCE_YEAR - ob_dummies_df["YearStart"]
    ).drop(columns="YearStart")

# Preview the frame with the derived column.
ob_dummies_df.head()

Unnamed: 0,Data_Value,Sample_Size,LocationAbbr_AK,LocationAbbr_AL,LocationAbbr_AR,LocationAbbr_AZ,LocationAbbr_CA,LocationAbbr_CO,LocationAbbr_CT,LocationAbbr_DC,...,StratificationID1_MALE,StratificationID1_RACE2PLUS,StratificationID1_RACEASN,StratificationID1_RACEBLK,StratificationID1_RACEHIS,StratificationID1_RACEHPI,StratificationID1_RACENAA,StratificationID1_RACEOTH,StratificationID1_RACEWHT,Years_since_start
0,30.6,31255.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,3
1,29.3,842.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
2,28.8,62562.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
3,32.7,60069.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
4,26.6,30904.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8


In [38]:
# Show the frame's dimensions before and after discarding every row
# that has at least one missing value.
print(ob_dummies_df.shape)
ob_dummies_df = ob_dummies_df.dropna(axis=0, how="any")
print(ob_dummies_df.shape)

(85464, 97)
(76703, 97)


In [39]:

# Separate the regression target from the predictors.
target_column = "Data_Value"
y = ob_dummies_df[target_column]
X = ob_dummies_df.drop(columns=[target_column])



In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = split_parts


In [41]:
# Standardise the predictors. The scaler learns its statistics from the
# training rows only, and those same statistics are then applied to both
# partitions so nothing from the test set leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [43]:
# Configure the regressor: a 50-tree random forest with a fixed seed.
forest_params = {"n_estimators": 50, "random_state": 42}
model = RandomForestRegressor(**forest_params)


In [44]:
# Fit the forest on the scaled training predictors and their targets.
model.fit(X=X_train_scaled, y=y_train)


In [45]:
# Predict the target for the held-out (scaled) test predictors.
y_pred = model.predict(X=X_test_scaled)


In [46]:
# Score the held-out predictions: mean squared error and the
# coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line.
for label, value in (("Mean Squared Error:", mse), ("R^2 Score:", r2)):
    print(label, value)

Mean Squared Error: 14.550313676292289
R^2 Score: 0.8578693401801027
