In [1]:
 # Initial imports
import pandas as pd
from sklearn.linear_model import LinearRegression
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, r2_score
%matplotlib inline

In [2]:
# Loading data
ob_df = pd.read_csv("Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv")
ob_df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2020,2020,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59.0,Race/Ethnicity,Hispanic,RACE,RACEHIS
1,2014,2014,GU,Guam,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(13.444304, 144.793731)",OWS,OWS1,Q036,VALUE,66.0,Education,High school graduate,EDU,EDUHSGRAD
2,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,,OWS,OWS1,Q036,VALUE,59.0,Income,"$50,000 - $74,999",INC,INC5075
3,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,,OWS,OWS1,Q037,VALUE,59.0,Income,Data not reported,INC,INCNR
4,2015,2015,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who achieve at least 300 min...,,Value,...,,PA,PA1,Q045,VALUE,59.0,Income,"Less than $15,000",INC,INCLESS15


In [3]:
ob_clean_df = ob_df[['YearStart', 'LocationAbbr',       
       'Data_Value',
       #'Low_Confidence_Limit', 'High_Confidence_Limit ',
       #'Sample_Size', 
       #'TopicID',
       'QuestionID',
       'StratificationID1']].copy()

In [4]:
ob_clean_df = ob_clean_df[(ob_clean_df['StratificationID1'] != 'OVERALL') & (ob_clean_df['LocationAbbr'] != 'US') & (ob_clean_df['Data_Value'].notna())].copy()
ob_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25647 entries, 1 to 31209
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   YearStart          25647 non-null  int64  
 1   LocationAbbr       25647 non-null  object 
 2   Data_Value         25647 non-null  float64
 3   QuestionID         25647 non-null  object 
 4   StratificationID1  25647 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 1.2+ MB


In [5]:
ob_clean_df = ob_clean_df.pivot(index=["YearStart", "LocationAbbr", "StratificationID1"], columns=["QuestionID"], values="Data_Value")
ob_clean_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,QuestionID,Q018,Q019,Q036,Q037,Q043,Q044,Q045,Q046,Q047
YearStart,LocationAbbr,StratificationID1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011,AK,AGEYR2534,,,,,57.6,,,,
2011,AK,AGEYR3544,,,,38.9,,,,,
2011,AK,AGEYR4554,,,29.2,,56.2,,,,
2011,AK,AGEYR5564,,,,,58.2,,,,26.0
2011,AK,AGEYR65PLUS,,,,,58.9,,,,


In [6]:
ob_clean_df = ob_clean_df.reset_index()
ob_clean_df

QuestionID,YearStart,LocationAbbr,StratificationID1,Q018,Q019,Q036,Q037,Q043,Q044,Q045,Q046,Q047
0,2011,AK,AGEYR2534,,,,,57.6,,,,
1,2011,AK,AGEYR3544,,,,38.9,,,,,
2,2011,AK,AGEYR4554,,,29.2,,56.2,,,,
3,2011,AK,AGEYR5564,,,,,58.2,,,,26.0
4,2011,AK,AGEYR65PLUS,,,,,58.9,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
8926,2020,WY,MALE,,,30.1,44.0,,,,,20.3
8927,2020,WY,RACE2PLUS,,,24.5,41.2,,,,,23.9
8928,2020,WY,RACEHIS,,,35.2,33.0,,,,,21.1
8929,2020,WY,RACENAA,,,36.4,42.7,,,,,34.4


In [7]:
#Q018, #Q019 = fruit and vegetable questions
ob_clean_df = ob_clean_df.loc[ob_clean_df["YearStart"].isin([2011,2013,2015,2017,2019])]
ob_clean_df = ob_clean_df.drop(columns=["Q018", "Q019"])
ob_clean_df = ob_clean_df.dropna()
ob_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276 entries, 3775 to 6318
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   YearStart          276 non-null    int64  
 1   LocationAbbr       276 non-null    object 
 2   StratificationID1  276 non-null    object 
 3   Q036               276 non-null    float64
 4   Q037               276 non-null    float64
 5   Q043               276 non-null    float64
 6   Q044               276 non-null    float64
 7   Q045               276 non-null    float64
 8   Q046               276 non-null    float64
 9   Q047               276 non-null    float64
dtypes: float64(7), int64(1), object(2)
memory usage: 23.7+ KB


In [8]:
ob_dummies_df = pd.get_dummies(ob_clean_df)
ob_dummies_df.head()

Unnamed: 0,YearStart,Q036,Q037,Q043,Q044,Q045,Q046,Q047,LocationAbbr_AK,LocationAbbr_AL,...,StratificationID1_INCNR,StratificationID1_MALE,StratificationID1_RACE2PLUS,StratificationID1_RACEASN,StratificationID1_RACEBLK,StratificationID1_RACEHIS,StratificationID1_RACEHPI,StratificationID1_RACENAA,StratificationID1_RACEOTH,StratificationID1_RACEWHT
3775,2015,25.0,27.5,54.8,24.6,30.4,41.4,20.2,0,0,...,0,0,0,0,0,0,0,0,0,0
3776,2015,36.5,33.9,48.7,27.2,26.9,40.9,23.8,0,0,...,0,0,0,0,0,0,0,0,0,0
3777,2015,28.0,42.8,47.9,23.0,30.2,33.8,32.6,0,0,...,0,0,0,0,0,0,0,0,0,0
3778,2015,39.9,34.8,50.0,19.4,34.7,27.3,33.6,0,0,...,0,0,0,0,0,0,0,0,0,0
3779,2015,33.7,36.2,51.0,16.6,39.8,21.1,33.7,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
ob_dummies_df["Years_since_start"] = 2023 - ob_dummies_df['YearStart']
del ob_dummies_df['YearStart']
ob_dummies_df.head()

Unnamed: 0,Q036,Q037,Q043,Q044,Q045,Q046,Q047,LocationAbbr_AK,LocationAbbr_AL,LocationAbbr_AR,...,StratificationID1_MALE,StratificationID1_RACE2PLUS,StratificationID1_RACEASN,StratificationID1_RACEBLK,StratificationID1_RACEHIS,StratificationID1_RACEHPI,StratificationID1_RACENAA,StratificationID1_RACEOTH,StratificationID1_RACEWHT,Years_since_start
3775,25.0,27.5,54.8,24.6,30.4,41.4,20.2,0,0,0,...,0,0,0,0,0,0,0,0,0,8
3776,36.5,33.9,48.7,27.2,26.9,40.9,23.8,0,0,0,...,0,0,0,0,0,0,0,0,0,8
3777,28.0,42.8,47.9,23.0,30.2,33.8,32.6,0,0,0,...,0,0,0,0,0,0,0,0,0,8
3778,39.9,34.8,50.0,19.4,34.7,27.3,33.6,0,0,0,...,0,0,0,0,0,0,0,0,0,8
3779,33.7,36.2,51.0,16.6,39.8,21.1,33.7,0,0,0,...,0,0,0,0,0,0,0,0,0,8


In [10]:
print(ob_dummies_df.shape)

ob_dummies_df = ob_dummies_df.dropna()

print(ob_dummies_df.shape)

(276, 87)
(276, 87)


In [11]:
#Q036 = Percent of adults aged 18 years and older who have obesity
# Select features and target variable
X = ob_dummies_df.drop("Q036", axis=1)
y = ob_dummies_df["Q036"]  



In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [13]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
# Create the Random Forest Regressor model
model = LinearRegression()


In [15]:
# Train the model
model.fit(X_train_scaled, y_train)


In [16]:
# Make predictions
y_pred = model.predict(X_test_scaled)


In [17]:
# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Target variable standard deviation is {y_test.std()}")
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R^2 Score:", r2)

Target variable standard deviation is 7.694693735835863
Mean Squared Error: 14.943033819949212
Root Mean Squared Error: 3.865622048254228
R^2 Score: 0.7430305828674417
