In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, LeaveOneOut, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from time import process_time

from sklearn.linear_model import Lasso, Ridge

In [2]:
# Load the dataset
df = pd.read_csv("DailyActivitiesPreProcessed.csv")

In [3]:
# Select relevant columns
df2 = df[['Gender', 'Age', 'Occupation', 'Marital status', 'HouseHoldIncome', 'IncomeGroup',
          'Sleeping_duration_weekday', 'Sleeping_duration_weekend', 'University_Class_duration_weekday',
          'University_Class_duration_weekend', 'Study_duration_weekday', 'Study_duration_weekend',
          'Exercise_duration_weekday', 'Exercise_duration_weekend', 'Work_duration_weekday',
          'Work_duration_weekend', 'Housework_duration_weekday', 'Housework_duration_weekend', 'Entertainment_duration_weekday',
          'Entertainment_duration_weekend', 'Personal_Care_duration_weekday', 'Personal_Care_duration_weekend',
          'Eating_duration_weekday', 'Eating_duration_weekend', 'Socializing_duration_weekday',
          'Socializing_duration_weekend', 'Religion_or_Spiritual_Activities_duration_weekday', 'Religion_or_Spiritual_Activities_duration_weekend',
          'Shopping_duration_weekday', 'Shopping_duration_weekend', 'Cooking_duration_weekday',
          'Cooking_duration_weekend', 'Family_Time_duration_weekday', 'Family_Time_duration_weekend',
          'Sports_duration_weekday', 'Sports_duration_weekend', 'Reading_or_Writing_duration_weekday',
          'Reading_or_Writing_duration_weekend', 'Nightlife_duration_weekday', 'Nightlife_duration_weekend',
          'Travelling_duration_weekday', 'Travelling_duration_weekend', 'Volunteering_duration_weekday',
          'Volunteering_duration_weekend', 'Gardening_duration_weekday', 'Gardening_duration_weekend',
          'Entertainment_merged_duration_weekday', 'Entertainment_merged_duration_weekend', 'Personal_care_merged_duration_weekday',
          'Personal_care_merged_duration_weekend', 'Housework_merged_duration_weekday', 'Housework_merged_duration_weekend',
          'Active_duration_weekday', 'Active_duration_weekend', 'Leisure_duration_weekday', 'Leisure_duration_weekend',
          'Neutral_duration_weekday', 'Neutral_duration_weekend', 'Productivity_score_weekday', 'Productivity_score_weekend']]


In [4]:
df.head()

Unnamed: 0,Gender,Age,Occupation,Marital status,HouseHoldIncome,Weekday_05:00-05:30,Weekday_05:30-06:00,Weekday_06:00-06:30,Weekday_06:30-07:00,Weekday_07:00-07:30,...,Neutral_duration_weekday,Neutral_duration_weekend,Weekday_Breakfast_time,Weekend_Breakfast_time,Weekday_Lunch_time,Weekend_Lunch_time,Weekday_Dinner_time,Weekend_Dinner_time,Productivity_score_weekday,Productivity_score_weekend
0,Female,23,Student,Single,0.0,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,...,2.0,3.0,08:30-09:00,,,,18:00-18:30,23:00-23:30,0.4375,0.270833
1,Female,23,Student,Single,100000.0,Sleeping,Sleeping,Personal Care,Personal Care,Eating,...,2.5,0.5,07:00-07:30,07:30-08:00,12:00-12:30,13:00-13:30,20:30-21:00,20:30-21:00,0.3125,0.291667
2,Female,25,Student,Single,200000.0,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,...,0.5,1.0,,,,13:00-13:30,20:30-21:00,21:00-21:30,0.291667,0.020833
3,Male,23,Student,Single,100000.0,Sleeping,Sleeping,Sleeping,Sleeping,Eating,...,6.0,0.5,07:00-07:30,,12:30-13:00,12:00-12:30,21:00-21:30,,0.3125,0.270833
4,Male,23,Student,Single,90000.0,Religion or Spiritual Activities,Sleeping,Sleeping,Sleeping,Sleeping,...,0.0,1.0,08:00-08:30,,13:30-14:00,13:30-14:00,20:30-21:00,20:30-21:00,0.291667,0.333333


In [5]:
df2.head()

Unnamed: 0,Gender,Age,Occupation,Marital status,HouseHoldIncome,IncomeGroup,Sleeping_duration_weekday,Sleeping_duration_weekend,University_Class_duration_weekday,University_Class_duration_weekend,...,Housework_merged_duration_weekday,Housework_merged_duration_weekend,Active_duration_weekday,Active_duration_weekend,Leisure_duration_weekday,Leisure_duration_weekend,Neutral_duration_weekday,Neutral_duration_weekend,Productivity_score_weekday,Productivity_score_weekend
0,Female,23,Student,Single,0.0,Low,7.0,10.5,2.5,0.0,...,3.5,2.5,10.5,6.5,11.5,14.5,2.0,3.0,0.4375,0.270833
1,Female,23,Student,Single,100000.0,Moderate,7.5,9.0,2.5,0.0,...,0.0,0.0,7.5,7.0,14.0,16.5,2.5,0.5,0.3125,0.291667
2,Female,25,Student,Single,200000.0,Moderate,6.5,8.0,7.0,0.0,...,0.0,0.0,7.0,0.5,16.5,22.5,0.5,1.0,0.291667,0.020833
3,Male,23,Student,Single,100000.0,Moderate,7.5,11.0,4.5,0.0,...,0.0,0.0,7.5,6.5,10.5,17.0,6.0,0.5,0.3125,0.270833
4,Male,23,Student,Single,90000.0,Moderate,9.0,8.0,3.0,5.0,...,1.0,0.5,7.0,8.0,17.0,15.0,0.0,1.0,0.291667,0.333333


In [6]:
# Encode categorical variables
le = LabelEncoder()
df2['IncomeGroup'] = le.fit_transform(df2['IncomeGroup'])
df2 = pd.get_dummies(df2, columns=['Gender', 'Occupation', 'Marital status'], drop_first=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['IncomeGroup'] = le.fit_transform(df2['IncomeGroup'])


In [7]:
# Categorize the productivity score into low, moderate, and high
quartiles = df2['Productivity_score_weekday'].quantile([0.25, 0.5, 0.75])
def categorize_productivity(score):
    if score <= quartiles[0.25]:
        return 'low'
    elif score <= quartiles[0.75]:
        return 'moderate'
    else:
        return 'high'

In [8]:
df2['Productivity_group'] = df2['Productivity_score_weekday'].apply(categorize_productivity)
df2['Productivity_group'] = le.fit_transform(df2['Productivity_group'])

In [9]:
# Handle missing values
df2.dropna(inplace=True)


In [10]:
# Prepare features and target
X = df2.drop(columns=['Productivity_score_weekday', 'Productivity_group'])
y = df2['Productivity_group']

In [11]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [12]:
def evaluate_model(clf, X_train, X_test, y_train, y_test, model_type="classification"):
    start_time = process_time()
    clf.fit(X_train, y_train)
    end_time = process_time()
    total_training_time = end_time - start_time
    
    # If the model is regression (Lasso, Ridge), convert the output to a classification category
    if model_type == "regression":
        predictions = np.round(clf.predict(X_test))
        train_predictions = np.round(clf.predict(X_train))
    else:
        predictions = clf.predict(X_test)
        train_predictions = clf.predict(X_train)
    
    print(f"Model: {clf.__class__.__name__}")
    print(f"Training time: {total_training_time:.2f} seconds")
    print(f"Train Accuracy: {accuracy_score(y_train, train_predictions)}")
    print(f"Test Accuracy: {accuracy_score(y_test, predictions)}")
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
    print("-" * 60)


In [13]:
# Cross-Validation function
def perform_cross_validation(model, X, y, cv_splits=10):
    kfold = KFold(n_splits=cv_splits)
    results = cross_val_score(model, X, y, cv=kfold)
    print(f"Cross-Validation Mean Accuracy: {np.mean(results):.4f} +/- {np.std(results):.4f}")

In [14]:
# Hyperparameter Tuning function
def hyperparameter_tuning(X_train, y_train):
    param_dist = {
        'n_estimators': [int(x) for x in np.linspace(200, 2000, 10)],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': [int(x) for x in np.linspace(10, 1000, 10)],
        'min_samples_split': [2, 5, 10, 14],
        'min_samples_leaf': [1, 2, 4, 6, 8],
        'criterion': ['entropy', 'gini']
    }
    
    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)
    
    return rf_random.best_estimator_

In [15]:
# Classifiers to try
classifiers = [
    DummyClassifier(strategy="most_frequent"),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(n_neighbors=5),
    GaussianNB(),
    LogisticRegression(max_iter=1000),  # Increase from the default 100
    Ridge(alpha=1.0),
    Lasso(alpha=0.01)  # Tune alpha based on performance
]

In [16]:
for clf in classifiers:
    if isinstance(clf, (Lasso, Ridge)):
        evaluate_model(clf, X_train, X_test, y_train, y_test, model_type="regression")
    else:
        evaluate_model(clf, X_train, X_test, y_train, y_test)


Model: DummyClassifier
Training time: 0.00 seconds
Train Accuracy: 0.48314606741573035
Test Accuracy: 0.4666666666666667
[[ 0  0 11]
 [ 0  0 13]
 [ 0  0 21]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.00      0.00      0.00        13
           2       0.47      1.00      0.64        21

    accuracy                           0.47        45
   macro avg       0.16      0.33      0.21        45
weighted avg       0.22      0.47      0.30        45

------------------------------------------------------------
Model: DecisionTreeClassifier
Training time: 0.00 seconds
Train Accuracy: 1.0
Test Accuracy: 1.0
[[11  0  0]
 [ 0 13  0]
 [ 0  0 21]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        21

    accuracy                           1.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNeighborsClassifier
Training time: 0.00 seconds
Train Accuracy: 0.6067415730337079
Test Accuracy: 0.4222222222222222
[[ 2  2  7]
 [ 1  5  7]
 [ 5  4 12]]
              precision    recall  f1-score   support

           0       0.25      0.18      0.21        11
           1       0.45      0.38      0.42        13
           2       0.46      0.57      0.51        21

    accuracy                           0.42        45
   macro avg       0.39      0.38      0.38        45
weighted avg       0.41      0.42      0.41        45

------------------------------------------------------------
Model: GaussianNB
Training time: 0.00 seconds
Train Accuracy: 0.5
Test Accuracy: 0.4888888888888889
[[ 0  0 11]
 [ 0  1 12]
 [ 0  0 21]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       1.00      0.08      0.14        13
           2       0.48      1.00      0.65        21

    accuracy                           0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: LogisticRegression
Training time: 0.30 seconds
Train Accuracy: 0.9943820224719101
Test Accuracy: 0.8888888888888888
[[ 9  0  2]
 [ 0 11  2]
 [ 1  0 20]]
              precision    recall  f1-score   support

           0       0.90      0.82      0.86        11
           1       1.00      0.85      0.92        13
           2       0.83      0.95      0.89        21

    accuracy                           0.89        45
   macro avg       0.91      0.87      0.89        45
weighted avg       0.90      0.89      0.89        45

------------------------------------------------------------
Model: Ridge
Training time: 0.02 seconds
Train Accuracy: 0.42696629213483145
Test Accuracy: 0.3333333333333333
[[ 0  0  0  0  0]
 [ 2  3  4  2  0]
 [ 0  0  6  7  0]
 [ 0  0 14  6  1]
 [ 0  0  0  0  0]]
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         0.0       1.00      0.27      0.43        11
         1.0       0.25      0.46

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# # Example of Hyperparameter tuning
# best_rf = hyperparameter_tuning(X_train, y_train)
# evaluate_model(best_rf, X_train, X_test, y_train, y_test)

In [19]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Defining hyperparameter search spaces for each classifier
param_grids = {
    'KNN': {
        'n_neighbors': np.arange(1, 11),
        'metric': ['euclidean', 'manhattan', 'minkowski']  # Added distance metric options
    },
    'Decision Tree': {
        'max_depth': np.arange(1, 21),  # Extended depth range
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 5, 10],  # Added for controlling data split
        'min_samples_leaf': [1, 2, 4]   # Added for controlling leaf node size
    },
    'Random Forest': {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': np.arange(1, 21),
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2']  # Added for feature selection
    },
    'Logistic Regression': {
        'C': np.logspace(-2, 2, 5),  # Adjusted C range for better exploration
        'solver': ['newton-cg', 'lbfgs']  # Removed less common solvers (consider revisiting based on data)
    },
    'Naive Bayes': {
        # No additional parameters commonly tuned for Naive Bayes
    },
    'SVM': {
        'C': np.logspace(-1, 1, 3),  # Adjusted C range for SVM
        'kernel': ['linear', 'poly'],  # Consider adding non-linear kernels if needed later
        'gamma': np.logspace(-2, 2, 3),  # Specific gamma search space
        'degree': [2, 3]  # Added for polynomial kernel (if used)
    },
    'AdaBoost': {
        'n_estimators': [10, 50, 100, 200],
        'learning_rate': [0.001, 0.01, 0.1, 1],
        'algorithm': ['SAMME.R', 'SAMME']  # Added algorithm options
    },
    'Gradient Boosting': {
        'n_estimators': [10, 50, 100, 200],
        'learning_rate': [0.001, 0.01, 0.1, 1],
        'max_depth': np.arange(1, 11),
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'loss': ['log_loss']  # Added loss function options
    }
}


# Performing hyperparameter tuning for each classifier using GridSearchCV
best_classifiers = {}

# Define the corresponding classifier names as a list
classifier_names = [
    'Dummy Classifier',
    'Decision Tree',
    'Random Forest',
    'KNN',
    'Naive Bayes',
    'Logistic Regression',
    'Ridge',
    'Lasso'
]

# Update the classifiers list to a dictionary for pairing
classifiers_dict = {
    name: clf for name, clf in zip(classifier_names, classifiers)
}

for classifier_name, classifier in classifiers_dict.items():
    # Initialize GridSearchCV for the current classifier
    grid_search = GridSearchCV(classifier, param_grids.get(classifier_name, {}), scoring='accuracy', cv=5, n_jobs=-1)

    # Fit the GridSearchCV to perform hyperparameter tuning
    grid_search.fit(X_train, y_train)

    # Store the best estimator for the current classifier
    best_classifiers[classifier_name] = grid_search.best_estimator_

    # Print the best parameters and corresponding score for the current classifier
    print(f"{classifier_name}: Best parameters - {grid_search.best_params_}, Best score - {grid_search.best_score_}")


Dummy Classifier: Best parameters - {}, Best score - 0.48317460317460315
Decision Tree: Best parameters - {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}, Best score - 1.0


7200 fits failed out of a total of 21600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3561 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\leeon\appdata\local\programs\python\python39\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\leeon\appdata\local\programs\python\python39\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\users\leeon\appdata\local\programs\python\python39\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\users\leeon\appdata\local\programs\python\python39\l

Random Forest: Best parameters - {'criterion': 'gini', 'max_depth': 17, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}, Best score - 0.9944444444444445
KNN: Best parameters - {'metric': 'manhattan', 'n_neighbors': 10}, Best score - 0.4385714285714286
Naive Bayes: Best parameters - {}, Best score - 0.5
Logistic Regression: Best parameters - {'C': 0.1, 'solver': 'newton-cg'}, Best score - 0.9046031746031746
Ridge: Best parameters - {}, Best score - nan
Lasso: Best parameters - {}, Best score - nan




In [27]:
from sklearn.metrics import precision_score, recall_score, f1_score

best_classifiers.pop('Ridge')

best_classifiers.pop('Lasso')


# Evaluate best classifiers
for name, clf in best_classifiers.items():
    print(name)
    if (name != 'Ridge' or name != 'Lasso'):
        # Train the best classifier
        clf.fit(X_train, y_train)
    
    # Predict using the trained classifier
        y_pred = clf.predict(X_test)
    
        # Evaluate the classifier's performance
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
    
        # Print evaluation metrics
        print(f"-------------{name}:-------------")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")
    
    #     # Plot confusion matrix with classifier name
    #     skplt.metrics.plot_confusion_matrix(y_test, y_pred, title=f"Confusion Matrix - {name}");
    
    #     # Plot ROC curve with classifier name
    #     y_probas = clf.predict_proba(X_test)
    #     skplt.metrics.plot_roc(y_test, y_probas, figsize=(6,4), plot_micro=False, plot_macro=False, title=f"ROC Curve - {name}");
    
    #     # Plot PRC curve with classifier name
    #     skplt.metrics.plot_precision_recall(y_test, y_probas, figsize=(6,4), plot_micro=False, title=f"Precision-Recall Curve - {name}");
    
        print()


Dummy Classifier
-------------Dummy Classifier:-------------
Accuracy: 0.47
Precision: 0.16
Recall: 0.33
F1 Score: 0.21

Decision Tree
-------------Decision Tree:-------------
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00

Random Forest
-------------Random Forest:-------------
Accuracy: 0.98
Precision: 0.98
Recall: 0.97
F1 Score: 0.98

KNN
-------------KNN:-------------
Accuracy: 0.53
Precision: 0.67
Recall: 0.45
F1 Score: 0.44

Naive Bayes
-------------Naive Bayes:-------------
Accuracy: 0.49
Precision: 0.49
Recall: 0.36
F1 Score: 0.26

Logistic Regression


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


-------------Logistic Regression:-------------
Accuracy: 0.84
Precision: 0.88
Recall: 0.81
F1 Score: 0.83

