# Loading Data

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the raw dataset from the CSV file in the working directory
df = pd.read_csv('physicaldataset2.csv')

# Record the starting dimensions so later cleaning steps can be compared against them
initial_shape = df.shape
print(f"Initial dataset shape: {initial_shape} (rows, columns)")

Initial dataset shape: (93249, 33) (rows, columns)


# Checking for Missing Values

In [2]:
# Tally the null entries per column and show the resulting totals
missing_counts = df.isna().sum()
print(missing_counts)


YearStart                         0
YearEnd                           0
LocationAbbr                      0
LocationDesc                      0
Datasource                        0
Class                             0
Topic                             0
Question                          0
Data_Value_Unit               93249
Data_Value_Type                   0
Data_Value                     9235
Data_Value_Alt                 9235
Data_Value_Footnote_Symbol    84014
Data_Value_Footnote           84014
Low_Confidence_Limit           9235
High_Confidence_Limit          9235
Sample_Size                    9235
Total                         89919
Age(years)                    73269
Education                     79929
Gender                        86589
Income                        69939
Race/Ethnicity                66609
GeoLocation                    1736
ClassID                           0
TopicID                           0
QuestionID                        0
DataValueTypeID             

# Cleaning Data

In [3]:
# Clean the loaded frame: drop unused columns, discard rows lacking the key
# fields, label the remaining gaps in the stratification columns, and drop the
# redundant year column. The cell is written so it can be re-executed safely.

# Remove unnecessary columns ('Data_Value_Unit' is empty for every row and the
# footnote columns are mostly empty; the ID columns are removed as well).
columns_to_drop = [
    'Data_Value_Unit', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
    'DataValueTypeID', 'QuestionID', 'TopicID', 'ClassID',
]
# errors='ignore' keeps this cell re-runnable: on a second execution these
# columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

# Drop rows where 'GeoLocation' or 'Data_Value' is missing
df = df.dropna(subset=['GeoLocation', 'Data_Value'])

# Remove rows where 'Age(years)', 'Gender', 'Race/Ethnicity', are all missing
df = df.dropna(subset=['Age(years)', 'Gender', 'Race/Ethnicity'], how='all')

# Fill in missing data for 'Age(years)', 'Gender', 'Race/Ethnicity', 'Income' with "Missing Data"
for column in ['Age(years)', 'Gender', 'Race/Ethnicity', 'Income']:
    df[column] = df[column].fillna('Missing Data')

# Check if YearStart and YearEnd are always the same. The membership test
# guards against a re-run, where 'YearEnd' has already been removed.
year_comparison = 'YearEnd' in df.columns and (df['YearStart'] == df['YearEnd']).all()
if year_comparison:
    # Drop YearEnd if it's always the same as YearStart
    df = df.drop(columns='YearEnd')

# Sanity check: the key fields must be fully populated after cleaning
assert df[['GeoLocation', 'Data_Value']].notna().all().all(), "unexpected nulls after cleaning"

# Display the shape to see how many rows are left after all clean-up steps
print(df.shape)

# Check the remaining columns after cleanup
print(df.columns.tolist())

# Display the first few rows of the cleaned dataset
print(df.head())

# Print the shape of the DataFrame after cleaning to show the number of rows and columns remaining
print(f"Final dataset shape: {df.shape} (rows, columns)")

print(f"Total number of rows after cleaning: {df.shape[0]}")

(43274, 25)
['YearStart', 'LocationAbbr', 'LocationDesc', 'Datasource', 'Class', 'Topic', 'Question', 'Data_Value_Type', 'Data_Value', 'Data_Value_Alt', 'Low_Confidence_Limit', 'High_Confidence_Limit ', 'Sample_Size', 'Total', 'Age(years)', 'Education', 'Gender', 'Income', 'Race/Ethnicity', 'GeoLocation', 'LocationID', 'StratificationCategory1', 'Stratification1', 'StratificationCategoryId1', 'StratificationID1']
    YearStart LocationAbbr  LocationDesc  \
5        2015           GU          Guam   
6        2012           WY       Wyoming   
9        2011           AL       Alabama   
11       2015           RI  Rhode Island   
14       2020           DE      Delaware   

                                    Datasource                    Class  \
5   Behavioral Risk Factor Surveillance System        Physical Activity   
6   Behavioral Risk Factor Surveillance System  Obesity / Weight Status   
9   Behavioral Risk Factor Surveillance System  Obesity / Weight Status   
11  Behavioral Ris

In [4]:
# Show the distinct values present in each stratification column
for col in ('Age(years)', 'Gender', 'Race/Ethnicity', 'Income'):
    print(f"Unique values in '{col}':", df[col].unique())

# Show every distinct location name that survived cleaning
unique_locations = df['LocationDesc'].unique()
print("Unique locations:", unique_locations)


Unique values in 'Age(years)': ['Missing Data' '25 - 34' '55 - 64' '45 - 54' '35 - 44' '18 - 24'
 '65 or older']
Unique values in 'Gender': ['Missing Data' 'Male' 'Female']
Unique values in 'Race/Ethnicity': ['Hispanic' 'American Indian/Alaska Native' 'Missing Data' 'Asian' 'Other'
 '2 or more races' 'Non-Hispanic White' 'Hawaiian/Pacific Islander'
 'Non-Hispanic Black']
Unique values in 'Income': ['Missing Data']
Unique locations: ['Guam' 'Wyoming' 'Alabama' 'Rhode Island' 'Delaware' 'New Jersey'
 'Puerto Rico' 'Maine' 'Virginia' 'Washington' 'California' 'New York'
 'Massachusetts' 'Arkansas' 'Illinois' 'New Hampshire' 'Maryland' 'Hawaii'
 'Louisiana' 'South Dakota' 'Texas' 'Oklahoma' 'Oregon' 'Kansas' 'Florida'
 'Idaho' 'Virgin Islands' 'Montana' 'District of Columbia' 'Minnesota'
 'Colorado' 'North Carolina' 'North Dakota' 'South Carolina'
 'Pennsylvania' 'Nebraska' 'Michigan' 'Nevada' 'New Mexico' 'Wisconsin'
 'Utah' 'Arizona' 'Mississippi' 'Indiana' 'Georgia' 'Ohio' 'Iowa'
 'Kent

# Feature Analysis for ML

In [5]:
def _print_feature_scores(heading, names, scores):
    """Print a heading, then one 'name: score' line per feature (5 decimals)."""
    print(heading)
    for name, score in zip(names, scores):
        print(f"{name}: {score:.5f}")


# Separate the label ('Class') from the candidate predictors
y = df['Class']  # Target variable
X = df.drop(columns='Class')  # Features

# Keep only the predictors that are not object-dtype
non_numeric_columns = X.select_dtypes(include=['object']).columns
X_numeric = X.drop(columns=non_numeric_columns)

# Pairwise correlation between the numeric predictors
print("Correlation Matrix:")
correlation_matrix = X_numeric.corr()
print(correlation_matrix)

# ANOVA F-test of each numeric predictor against the class label
selector = SelectKBest(score_func=f_classif, k=5).fit(X_numeric, y)  # Select top 5 features
p_values = selector.pvalues_
_print_feature_scores("\nP-values:", X_numeric.columns, p_values)

# Importance of each numeric predictor according to a random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_numeric, y)
importances = rf_model.feature_importances_
_print_feature_scores("\nFeature Importances:", X_numeric.columns, importances)

Correlation Matrix:
                        YearStart  Data_Value  Data_Value_Alt  \
YearStart                1.000000   -0.000904       -0.000904   
Data_Value              -0.000904    1.000000        1.000000   
Data_Value_Alt          -0.000904    1.000000        1.000000   
Low_Confidence_Limit    -0.001535    0.936790        0.936790   
High_Confidence_Limit   -0.001632    0.916696        0.916696   
Sample_Size             -0.053299   -0.000841       -0.000841   
LocationID               0.037441    0.006923        0.006923   

                        Low_Confidence_Limit  High_Confidence_Limit   \
YearStart                          -0.001535               -0.001632   
Data_Value                          0.936790                0.916696   
Data_Value_Alt                      0.936790                0.916696   
Low_Confidence_Limit                1.000000                0.722091   
High_Confidence_Limit               0.722091                1.000000   
Sample_Size                

# Training and Evaluating the ML Models

In [6]:
def _weighted_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) for one set of predictions.

    Precision, recall and F1 use average='weighted'. zero_division=0 is passed
    explicitly so that a class which is never predicted contributes a score of
    0 without emitting UndefinedMetricWarning; the resulting numbers are the
    same as with scikit-learn's default setting.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted', zero_division=0),
        recall_score(y_true, y_pred, average='weighted', zero_division=0),
        f1_score(y_true, y_pred, average='weighted', zero_division=0),
    )


def _report_metrics(title, accuracy, precision, recall, f1):
    """Print the four evaluation metrics for one model under the given title."""
    print(title)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)


X = df.drop('Class', axis=1)  # Features
y = df['Class']  # Target variable

# Identify non-numeric columns
non_numeric_columns = X.select_dtypes(include=['object']).columns

# Remove non-numeric columns from the feature set
X_numeric = X.drop(columns=non_numeric_columns)

# NOTE: 'High_Confidence_Limit ' carries a trailing space because that is how
# the column is named in the loaded data; removing the space raises KeyError.
selected_features = ['YearStart', 'Data_Value', 'Low_Confidence_Limit', 'High_Confidence_Limit ', 'Sample_Size', 'LocationID']
X_selected = X_numeric[selected_features]

# Split the data into training and testing sets
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# classes are imbalanced (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Scale the features (fit on the training split only, then apply to both)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Train a random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Evaluate the models on the held-out split
lr_predictions = lr_model.predict(X_test_scaled)
rf_predictions = rf_model.predict(X_test_scaled)

lr_accuracy, lr_precision, lr_recall, lr_f1 = _weighted_scores(y_test, lr_predictions)
rf_accuracy, rf_precision, rf_recall, rf_f1 = _weighted_scores(y_test, rf_predictions)

_report_metrics("Logistic Regression:", lr_accuracy, lr_precision, lr_recall, lr_f1)
_report_metrics("\nRandom Forest:", rf_accuracy, rf_precision, rf_recall, rf_f1)

Logistic Regression:
Accuracy: 0.5377238590410167
Precision: 0.47530423499374425
Recall: 0.5377238590410167
F1-score: 0.48543787340070627

Random Forest:
Accuracy: 0.6614673599075679
Precision: 0.6519165547082781
Recall: 0.6614673599075679
F1-score: 0.6539979578807535
