In [21]:
import pandas as pd

# Load the primary dataset with activity data
data = pd.read_csv("FitbitActivity(1-30-20).csv")

# Select only the numeric columns (excluding non-numeric columns like 'egoid')
numeric_columns = data.select_dtypes(include=['number']).columns

# Group the data by 'egoid' and calculate the mean of all numeric columns
data_grouped = data.groupby('egoid')[numeric_columns].mean()

# Inspect the first few rows to ensure it worked
print(data_grouped.head())

         egoid  complypercent   meanrate     sdrate         steps     floors  \
egoid                                                                          
10237  10237.0      88.072883  75.225984  18.228836  12415.061093  14.191351   
10469  10469.0      64.736364  89.279848  13.887779   7885.719136   6.809091   
10547  10547.0      79.178571  77.981580  17.558218  16509.214286  24.607143   
11002  11002.0      92.877622  78.571042  16.413416  13308.022145  18.502364   
11128  11128.0      80.448571  76.240607  12.981718   9121.991404  17.751462   

       sedentaryminutes  lightlyactiveminutes  fairlyactiveminutes  \
egoid                                                                
10237        650.421858            244.457923            47.045016   
10469       1003.060606            189.521212             8.757576   
10547        753.428571            226.750000            52.714286   
11002        711.407143            230.586905            52.179487   
11128        828.23

In [22]:
# Load the second CSV with health labels (this file should have health labels for each ego)
labels_data = pd.read_csv("BasicSurvey(3-6-20).csv")

# Inspect the labels data
labels_data.head()

  labels_data = pd.read_csv("BasicSurvey(3-6-20).csv")


Unnamed: 0,egoid,Tier,StudyStatus,RegionUS,US,StartDate_1,EndDate_1,gender_1,hs_1,hssex_1,...,gpa_fa2017,gpa_fa2018,gpa_sp2019,gpa_su2019,reshallyear1,reshallyear2,liveoffcampusyear3,reshallyear3,liveoffcampusyear4,reshallyear4
0,44869,Tier 1,In Study,South Atlantic,US address,05aug2015 17:25:30,05aug2015 19:09:07,Female,Public school,A mixture of boys and girls,...,3.513,,,,299.0,299.0,On Campus,299.0,,
1,43516,Tier 1,In Study,Pacific,US address,10aug2015 18:26:07,10aug2015 19:27:45,Female,Public school,A mixture of boys and girls,...,,,,,283.0,283.0,On Campus,283.0,,
2,89411,Tier 3,In Study,Pacific,US address,,,Female,Private independent college-prep school,A mixture of boys and girls,...,,,,,287.0,287.0,,,,
3,74285,Tier 3,In Study,West South Central,US address,,,Female,Public school,A mixture of boys and girls,...,,,,,279.0,279.0,,,,
4,56527,Tier 3,In Study,Pacific,US address,,,Male,Private religious/parochial school,A mixture of boys and girls,...,,,,,105.0,105.0,,,,


In [23]:
# Merge the averaged Fitbit data with the health labels on 'egoid'
data_grouped.reset_index(inplace=True, drop=True)
labels_data.reset_index(inplace=True, drop=True)
merged_data = pd.merge(data_grouped, labels_data, on='egoid', how='inner')

# Inspect the merged data
print(merged_data.head())

     egoid  complypercent   meanrate     sdrate         steps     floors  \
0  10237.0      88.072883  75.225984  18.228836  12415.061093  14.191351   
1  10469.0      64.736364  89.279848  13.887779   7885.719136   6.809091   
2  10547.0      79.178571  77.981580  17.558218  16509.214286  24.607143   
3  11002.0      92.877622  78.571042  16.413416  13308.022145  18.502364   
4  11128.0      80.448571  76.240607  12.981718   9121.991404  17.751462   

   sedentaryminutes  lightlyactiveminutes  fairlyactiveminutes  \
0        650.421858            244.457923            47.045016   
1       1003.060606            189.521212             8.757576   
2        753.428571            226.750000            52.714286   
3        711.407143            230.586905            52.179487   
4        828.231928            166.234940            16.071429   

   veryactiveminutes  ...  gpa_fa2017  gpa_fa2018  gpa_sp2019  gpa_su2019  \
0          47.510383  ...         3.4         3.6         3.4        

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
# Mapping categorical health ratings to numerical values
health_mapping = {
    'Poor': 1,
    'Fair': 2,
    'Good': 3,
    'Excellent': 4
}

# Apply the mapping to the health columns (health_1 to health_8)
health_columns = ['Health_1', 'Health_2', 'Health_3', 'Health_4', 'Health_5', 'Health_6', 'Health_7', 'Health_8']
for col in health_columns:
    merged_data[col] = merged_data[col].map(health_mapping)

# Calculate the average health score (we can compute the mean of these health columns)
merged_data['health_score'] = merged_data[health_columns].mean(axis=1)

# Now, merged_data['health_score'] will be the target variable
# We will use it as the quantitative target for classification or regression

# Select features (same as before)
features = ['meanrate', 'sdrate', 'steps', 'floors', 'sedentaryminutes', 
            'lightlyactiveminutes', 'fairlyactiveminutes', 'veryactiveminutes', 
            'lowrangemins', 'fatburnmins', 'cardiomins', 'peakmins', 
            'lowrangecal', 'fatburncal', 'cardiocal', 'peakcal']

X = merged_data[features]  # Features
# Discretize the health_score into bins
bins = [0, 1.5, 2.5, 3.5, 4]  # Define bin edges
labels = [1, 2, 3, 4]  # Labels for the bins
merged_data['health_score_category'] = pd.cut(merged_data['health_score'], bins=bins, labels=labels)

# Update the target variable
y = merged_data['health_score_category']
# Scale the data (important for algorithms like SVM)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Check the new target variable
print(merged_data[['health_score']].head())


   health_score
0      2.571429
1      2.400000
2      3.000000
3      3.400000
4      3.000000


In [25]:
print(merged_data[['health_score_category']].head())


  health_score_category
0                     3
1                     2
2                     3
3                     3
4                     3


In [26]:
# Import Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# Initialize and train the model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predictions
y_pred_nb = nb_model.predict(X_test)

# Evaluate the model
print("Naive Bayes Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naive Bayes Classifier Performance:
Accuracy: 0.43315508021390375
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.19      0.54      0.28        26
           3       0.71      0.48      0.57       124
           4       0.30      0.20      0.24        35

    accuracy                           0.43       187
   macro avg       0.30      0.31      0.27       187
weighted avg       0.55      0.43      0.46       187



In [27]:
# Import SVM classifier
from sklearn.svm import SVC

# Initialize and train the model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
print("SVM Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


SVM Classifier Performance:
Accuracy: 0.6577540106951871
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00        26
           3       0.66      0.99      0.79       124
           4       0.00      0.00      0.00        35

    accuracy                           0.66       187
   macro avg       0.17      0.25      0.20       187
weighted avg       0.44      0.66      0.53       187



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
# Import Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the model
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)

# Predictions
y_pred_tree = tree_model.predict(X_test)

# Evaluate the model
print("Classification Tree Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))


Classification Tree Performance:
Accuracy: 0.5721925133689839
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.33      0.27      0.30        26
           3       0.71      0.70      0.70       124
           4       0.31      0.37      0.34        35

    accuracy                           0.57       187
   macro avg       0.34      0.34      0.33       187
weighted avg       0.57      0.57      0.57       187



In [29]:
# Comparing all models
models = ['Naive Bayes', 'SVM', 'Classification Tree']
accuracies = [
    accuracy_score(y_test, y_pred_nb),
    accuracy_score(y_test, y_pred_svm),
    accuracy_score(y_test, y_pred_tree)
]

# Create a DataFrame to summarize the results
performance_df = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracies
})

print("Model Performance Summary:")
print(performance_df)


Model Performance Summary:
                 Model  Accuracy
0          Naive Bayes  0.433155
1                  SVM  0.657754
2  Classification Tree  0.572193
