In [1]:
# Importing necessary libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import seaborn as sns
import pickle
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder


In [2]:
# Reading our files

x = pd.read_csv('processed_predictors.csv')
y = pd.read_csv('labels.csv')

In [3]:
x

Unnamed: 0.1,Unnamed: 0,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23
0,0,0.036323,-0.507420,1.551777,-0.658797,2.620898,-0.201656
1,1,0.036323,-0.507420,1.597951,-0.920315,2.510088,-0.993051
2,2,0.036323,-0.507420,1.644125,-0.658797,2.510088,-0.993051
3,3,0.076021,-0.473364,1.597951,-0.488506,2.510088,-0.993051
4,4,0.076021,-0.473364,1.782646,-0.920315,2.510088,-0.993051
...,...,...,...,...,...,...,...
41752,41752,-1.194322,0.090995,-0.341343,1.025865,-0.260180,1.709865
41753,41753,-1.856489,-0.108477,-0.557436,-0.348625,0.589369,1.442008
41754,41754,-0.193927,3.019820,-0.027362,0.587974,0.183063,-0.487776
41755,41755,-0.837038,-0.084151,0.258914,2.248309,0.072252,0.644527


In [4]:
# Dropping the indices column

x.drop(x.columns[0], inplace = True, axis = 1)
y.drop(y.columns[0], inplace = True, axis = 1)

## Since, we will be running RandomSearchCV along with GridSearchCV on our dataset, it will require heavy computation. To minimize the computation time, lets randomly sample 10,000 observations and proceed with modeling.

### To do random sampling, I am merging X and Y and sample ramndomly. I will segregate this post sampling

In [5]:
df = pd.concat([x,y], axis = 1)

In [6]:
df_sampled = df.sample(n=10000, random_state=42)

In [7]:
x = df_sampled.drop('label', axis = 1)
y = df_sampled.label

## Looking at the distribution of our target variable, there is an evidence of imbalance, hence we will address this during our model performance calculation.

In [8]:
y.value_counts()

cycling     1745
sitting     1725
walking     1709
lying       1705
standing    1685
bending1     851
bending2     580
Name: label, dtype: int64

## There are 2 classes, bending1 and bending2 which are similar, lets call them bending

In [9]:
y = y.apply(lambda x: 'bending' if x == 'bending1' or x == 'bending2' else x)

In [10]:
y.value_counts()

cycling     1745
sitting     1725
walking     1709
lying       1705
standing    1685
bending     1431
Name: label, dtype: int64

## There is a hint of slight imbalance. Since it is SLIGHT, I am ignoring it. However, if it was not slight, I would have balanced the training set only.

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [12]:
x_train

Unnamed: 0,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23
24147,0.949382,-0.716622,0.028046,-0.616225,0.737116,-0.993051
19820,0.552400,-0.473364,-1.818901,-0.008043,-1.922341,0.224479
14798,0.076021,-0.473364,-0.433691,-0.920315,-1.527855,-0.420812
40020,0.393607,1.360801,1.043867,-0.488506,0.626306,0.175778
6161,0.671494,0.154242,-0.003352,0.113593,0.884864,0.297531
...,...,...,...,...,...,...
5865,-0.598849,-0.084151,1.090041,0.654874,-0.112433,2.817817
25756,-0.281263,-0.084151,-1.357164,0.040611,0.737116,-0.993051
4913,-0.281263,-0.084151,0.258914,0.399438,1.623602,-0.993051
38924,-0.320961,1.764609,0.074220,0.971128,0.392864,-0.420812


In [13]:
x_test

Unnamed: 0,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23
11355,0.076021,2.046788,0.674478,0.569729,0.183063,2.513434
3077,0.842991,-0.487959,0.674478,-0.920315,0.552432,-0.487776
33655,1.346364,-0.473364,0.526722,-0.677043,-0.415315,-0.280796
35348,-1.829494,1.302419,-0.742131,-0.160089,-0.001622,0.224479
21571,0.512701,-0.507420,0.766825,0.472420,-0.543856,1.356781
...,...,...,...,...,...,...
11195,0.750891,-0.084151,-1.418113,1.579309,0.097369,1.015873
14000,1.425761,-0.716622,-1.495685,-0.658797,-1.774593,-0.250358
18114,0.103016,-0.487959,-0.479864,-0.658797,-1.331350,-0.134692
17887,0.103016,-0.487959,-0.756906,-0.415525,-0.987098,-0.706931


In [14]:
y_train

24147    sitting
19820      lying
14798      lying
40020    walking
6161     cycling
          ...   
5865     cycling
25756    sitting
4913     bending
38924    walking
6699     cycling
Name: label, Length: 7000, dtype: object

In [15]:
y_test

11355     cycling
3077      bending
33655    standing
35348     walking
21571     sitting
           ...   
11195     cycling
14000       lying
18114       lying
17887       lying
2532      bending
Name: label, Length: 3000, dtype: object

## In activity recognition applications, correctly identifying both positive and negative instances is important. However, depending on the problem, the cost of false positives and false negatives may not be equal. For example, in medical diagnosis, a false negative (not detecting a disease when it's present) may be more costly than a false positive (detecting a disease when it's not present).

## In the case of activity recognition, it's important to minimize false negatives (i.e., failing to detect an activity that's actually being performed) in order to ensure accurate tracking of physical activity levels. Failing to detect certain activities could lead to inaccurate assessment of the user's overall physical activity, which could have negative impacts on healthcare decision-making and outcomes.

## F1 score takes both precision and recall into account, making it a suitable metric for evaluating performance in the case of imbalanced datasets, as well as situations where false negatives are particularly costly. By focusing on both precision and recall, F1 score provides a balanced evaluation of the model's ability to correctly classify both positive and negative instances.

## Logistic Regression

### Random Search with CV = 5

In [16]:
# Define the logistic regression model
logreg = LogisticRegression()

# Define the hyperparameter space for random search
random_grid = {
    'penalty': ['l2'],
    'C': [0.1, 0.5, 1, 2, 5],
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced'],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [100, 200, 500, 1000],
}

# Perform random search to find the best hyperparameters
logreg_random = RandomizedSearchCV(estimator=logreg, param_distributions=random_grid, n_iter=100, cv=5, random_state=42,
                              n_jobs =-1, scoring = 'f1')
logreg_random.fit(x_train, y_train)

# Print the best hyperparameters
print(logreg_random.best_params_)

{'solver': 'saga', 'penalty': 'l2', 'max_iter': 200, 'fit_intercept': True, 'class_weight': 'balanced', 'C': 1}


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


In [17]:
# Define the hyperparameter space for grid search
param_grid = {
    'penalty': [logreg_random.best_params_['penalty']],
    'C': [0.1, 0.01, 1],
    'fit_intercept': [logreg_random.best_params_['fit_intercept']],
    'class_weight': [logreg_random.best_params_['class_weight']],
    'solver': [logreg_random.best_params_['solver']],
    'max_iter': [logreg_random.best_params_['max_iter'] + 10, logreg_random.best_params_['max_iter'] - 10 ],
}

# Perform grid search to find the best hyperparameters
logreg_grid = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, scoring = 'f1')
logreg_grid.fit(x_train, y_train)

# Print the best hyperparameters
print(logreg_grid.best_params_)

# Fit the model with the best hyperparameters
logreg_best = LogisticRegression(**logreg_grid.best_params_)
logreg_best.fit(x_train, y_train)

# Predict on the test set and calculate accuracy
y_pred = logreg_best.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average = 'weighted')
f1 = f1_score(y_test, y_pred, average = 'weighted')

# Print the metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 score: {f1}')

Traceback (most recent call last):
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1544, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_l

Traceback (most recent call last):
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1544, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_l

Traceback (most recent call last):
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1544, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_l

Traceback (most recent call last):
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 216, in __call__
    return self._score(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "C:\Users\shuv1\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1544, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_l

{'C': 0.1, 'class_weight': 'balanced', 'fit_intercept': True, 'max_iter': 210, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 0.6533333333333333
Precision: 0.6441709707166171
Recall: 0.6533333333333333
F1 score: 0.6470794504468896


## Since, this is a multiclassification problem, TP, TF and FN are calculated based on Sum of diagonal values, Sum of vertical values and Sum of horizontal values of confusion matrix respectively.

In [18]:
confusion_matrix(y_test, y_pred)

array([[328,  11,  14,  68,   6,   2],
       [ 27, 307,   1,  29,   4, 149],
       [ 11,   1, 446,  10,  44,   0],
       [ 86,  16,  85, 210, 145,   3],
       [  0,   4,  43, 125, 323,   1],
       [  1, 149,   0,   5,   0, 346]], dtype=int64)

In [19]:
# Convert the dictionary to a dataframe
metrics = pd.DataFrame({
    'Model': ['Logistic Regression'],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1]
})

In [20]:
metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.653333,0.644171,0.653333,0.647079


## Support Vector Machine

### Random Search with CV = 5

In [21]:
# Define the parameter distribution for random search
param_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf'],
              'gamma': ['scale', 'auto'],
              'degree': [2, 3, 4]}

# Create an SVM classifier object
svm = SVC()

# Perform random search
random_search = RandomizedSearchCV(svm, param_distributions=param_grid, n_iter=100, cv=5, random_state=42, n_jobs = -1,
                                  scoring= 'f1')
random_search.fit(x_train, y_train)
print('Random search best params:', random_search.best_params_)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


Random search best params: {'kernel': 'linear', 'gamma': 'scale', 'degree': 2, 'C': 0.1}


In [22]:
# Define the parameter grid to search with the best parameters
param_grid_best = {
    'C': [95,105],
    'kernel': ['linear', 'rbf'],
    'degree': [1,2,3],
    'gamma': ['auto']
}

# Perform grid search with the best parameters
grid_search = GridSearchCV(svm, param_grid=param_grid_best, cv=5, scoring = 'f1', n_jobs = -1)
grid_search.fit(x_train, y_train)



GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [95, 105], 'degree': [1, 2, 3], 'gamma': ['auto'],
                         'kernel': ['linear', 'rbf']},
             scoring='f1')

In [23]:
svm_best = grid_search.best_estimator_

In [24]:
print(svm_best)

SVC(C=95, degree=1, gamma='auto', kernel='linear')


In [25]:
y_pred = svm_best.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average = 'weighted')
f1 = f1_score(y_test, y_pred, average = 'weighted')

In [26]:
confusion_matrix(y_test, y_pred)

array([[324,  11,  13,  68,   8,   5],
       [ 24, 308,   2,  24,   3, 156],
       [ 14,   2, 445,   5,  46,   0],
       [ 66,  20,  85, 210, 161,   3],
       [  0,   9,  42, 101, 344,   0],
       [  1, 148,   1,   2,   0, 349]], dtype=int64)

In [27]:
metrics.loc[1] = ['SVM', accuracy, precision, recall, f1]

In [28]:
metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.653333,0.644171,0.653333,0.647079
1,SVM,0.66,0.651606,0.66,0.653039


## Decision Tree

### Random Search with CV = 5

In [29]:
# Define the parameter grid to search with random search
param_grid_random = {
    'max_depth': [20, 24, 28, 32, 36, 40, 44, 48],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy']
}

# Create a decision tree model
tree = DecisionTreeClassifier()

# Run random search cv
random_search = RandomizedSearchCV(tree, param_distributions=param_grid_random, n_iter=100, cv=5, n_jobs=-1, 
                                   scoring = 'f1')
random_search.fit(x_train, y_train)

# Print the best parameters
print("Best parameters found by random search:", random_search.best_params_)

Best parameters found by random search: {'min_samples_split': 4, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 32, 'criterion': 'gini'}


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


In [30]:
# Define the parameter grid to search with grid search using the best parameters from random search
param_grid_best = {
    'max_depth': [random_search.best_params_['max_depth']-1, random_search.best_params_['max_depth'], random_search.best_params_['max_depth']+1],
    'max_features': [random_search.best_params_['max_features']],
    'min_samples_split': [random_search.best_params_['min_samples_split']+3, random_search.best_params_['min_samples_split'], random_search.best_params_['min_samples_split']+1],
    'min_samples_leaf': [random_search.best_params_['min_samples_leaf']-1, random_search.best_params_['min_samples_leaf'], random_search.best_params_['min_samples_leaf']+1],
    'criterion': [random_search.best_params_['criterion']]
}

# Perform grid search using the best parameters from random search
grid_search = GridSearchCV(tree, param_grid=param_grid_best, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Print the best parameters
print("Best parameters found by grid search:", grid_search.best_params_)

# Get the predicted values for the test set using the best model
y_pred = grid_search.predict(x_test)

# Calculate the metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average = 'weighted')
f1 = f1_score(y_test, y_pred, average = 'weighted')

Best parameters found by grid search: {'criterion': 'gini', 'max_depth': 31, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 7}


 nan nan nan nan nan nan nan nan nan]


In [31]:
confusion_matrix(y_test, y_pred)

array([[344,  11,   6,  51,  10,   7],
       [ 10, 330,   0,  17,   7, 153],
       [  4,   3, 472,  19,  14,   0],
       [ 31,  25,  17, 330, 132,  10],
       [  9,  20,  31, 113, 321,   2],
       [  1, 144,   1,   6,   2, 347]], dtype=int64)

In [32]:
# Add the metrics to the dataframe
metrics = metrics.append({
    'Model': 'Decision Tree',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
}, ignore_index=True)

  metrics = metrics.append({


In [33]:
metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.653333,0.644171,0.653333,0.647079
1,SVM,0.66,0.651606,0.66,0.653039
2,Decision Tree,0.714667,0.715545,0.714667,0.714838


## Based on the provided table, we can see the performance of three different models on the activity recognition dataset.

## The decision tree model achieved an accuracy of 0.701, precision of 0.704, recall of 0.701, and F1 score of 0.701.

## We can interpret these results to mean that the decision tree model correctly predicted the activity performed by the user in 70.1% of cases. The precision of 0.704 means that when the model predicted an activity, it was correct 70.4% of the time. The recall of 0.701 indicates that the model correctly identified 70.1% of all instances of a particular activity. Finally, the F1 score of 0.701 represents a weighted average of the precision and recall, which takes into account both false positives and false negatives.

## Overall, the decision tree model appears to perform moderately well on the activity recognition dataset, with an F1 score that is comparable to the other two models. However, it may be worth investigating whether other models or approaches could lead to better results.

### ---------------------------------------------------------------------------------------------------------------------------------------------

## Weekly Assignment (Neural Network)

#### Random Search CV

In [34]:
%%time

score_measure = "f1"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (50,), (70,),(50,30)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0, .2, .5, .7, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.01, 0.1, 0.2],
    'max_iter': [100]
}

ann = MLPClassifier()
grid_search = RandomizedSearchCV(estimator = ann, param_distributions=param_grid, cv=kfolds, n_iter=70,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(x_train, y_train)

bestRecallTree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 70 candidates, totalling 350 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


{'solver': 'adam', 'max_iter': 100, 'learning_rate_init': 0.2, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (50,), 'alpha': 0.7, 'activation': 'relu'}
CPU times: total: 5.61 s
Wall time: 2min 36s


In [35]:
%%time
y_pred = bestRecallTree.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     bending       0.84      0.74      0.79       429
     cycling       0.64      0.66      0.65       517
       lying       0.79      0.92      0.85       512
     sitting       0.53      0.34      0.42       545
    standing       0.61      0.80      0.69       496
     walking       0.71      0.70      0.71       501

    accuracy                           0.69      3000
   macro avg       0.69      0.69      0.68      3000
weighted avg       0.68      0.69      0.68      3000

CPU times: total: 62.5 ms
Wall time: 60 ms


#### Grid Search CV

In [36]:
%%time

score_measure = "f1"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (50,), (70,)],
    'activation': ['logistic'],
    'solver': ['adam'],
    'alpha': [.5, .7, 1],
    'learning_rate': ['adaptive', 'invscaling'],
    'learning_rate_init': [0.005, 0.01, 0.15],
    'max_iter': [1000]
}

ann = MLPClassifier()
grid_search = GridSearchCV(estimator = ann, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(x_train, y_train)

bestRecallTree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


{'activation': 'logistic', 'alpha': 0.5, 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.005, 'max_iter': 1000, 'solver': 'adam'}
CPU times: total: 4.77 s
Wall time: 57.6 s


In [37]:
%%time
y_pred = bestRecallTree.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     bending       0.80      0.76      0.78       429
     cycling       0.68      0.51      0.58       517
       lying       0.76      0.87      0.81       512
     sitting       0.53      0.33      0.41       545
    standing       0.59      0.77      0.66       496
     walking       0.65      0.81      0.72       501

    accuracy                           0.67      3000
   macro avg       0.67      0.68      0.66      3000
weighted avg       0.66      0.67      0.65      3000

CPU times: total: 46.9 ms
Wall time: 42.4 ms


In [38]:
f1 = f1_score(y_test, y_pred, average = 'weighted')

In [39]:
f1

0.6548248998592462

## Assignment 2 - Deep Neural Network

In [40]:
import os
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from tensorflow import keras


tf.random.set_seed(1)

In [41]:
from __future__ import print_function
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt

from sklearn import datasets


np.random.seed(1)

In [42]:
x_test.shape

(3000, 6)

## Keras - Deep Network

## The below code utilses RandomSearchCV with custom metrics

In [46]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

def f1_score_macro(y_true, y_pred):
    y_true = np.argmax(y_true, axis=1)
    y_pred = np.argmax(y_pred, axis=1)
    return f1_score(y_true, y_pred, average='macro')

model = Sequential()
model.add(Input(shape=(6,)))  # Change the input shape to match your data
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(6, activation='softmax')) # final layer, 6 categories


# Create a wrapper function for the F1Score metric that can be pickled
def f1_score_wrapper(y_true, y_pred):
    f1_score_metric = F1Score()
    f1_score_metric.update_state(y_true, y_pred)
    return f1_score_metric.result().numpy()

model.compile(loss='categorical_crossentropy', optimizer='adam')

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

y_train_encoded = tf.keras.utils.to_categorical(y_train_encoded, num_classes=6)
y_test_encoded = tf.keras.utils.to_categorical(y_test_encoded, num_classes=6)

history = model.fit(x_train, y_train_encoded, batch_size=32, epochs=20, validation_data=(x_test, y_test_encoded))

from scikeras.wrappers import KerasClassifier

def build_clf(hidden_layer_sizes, dropout):
    ann = tf.keras.models.Sequential()
    ann.add(tf.keras.layers.Input(shape=6))
    for hidden_layer_size in hidden_layer_sizes:
        ann.add(tf.keras.layers.Dense(hidden_layer_size, kernel_initializer=tf.keras.initializers.GlorotNormal(),
                                      bias_initializer=tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None), activation="relu"))
        ann.add(tf.keras.layers.Dropout(dropout))
    ann.add(tf.keras.layers.Dense(6, activation='softmax'))
    ann.compile(loss='categorical_crossentropy', metrics=['categorical_accuracy'])
    return ann

keras_clf = KerasClassifier(
    build_fn=build_clf,
    hidden_layer_sizes=64,
    dropout=0.0
)

# Add your param_distributions dictionary as 'params'
params = {
    'optimizer__learning_rate': [0.001, 0.005],
    'model__hidden_layer_sizes': [(70,),(90, )],
    'model__dropout': [0, 0.1],
    'batch_size':[20, 40, 60],
    'epochs':[10, 50],
    'optimizer':['adam','sgd']
}

rnd_search_cv = RandomizedSearchCV(estimator=keras_clf, param_distributions=params, scoring=make_scorer(f1_score_macro, greater_is_better=True), n_iter=20, cv=5,
                                  n_jobs=-1)

import sys
sys.setrecursionlimit(2500) # note: the default is 3000 (python 3.9)

earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
callback = [earlystop]

_ = rnd_search_cv.fit(x_train, y_train_encoded, callbacks=callback, verbose=0)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




In [47]:
rnd_search_cv.best_params_

{'optimizer__learning_rate': 0.005,
 'optimizer': 'sgd',
 'model__hidden_layer_sizes': (90,),
 'model__dropout': 0.1,
 'epochs': 50,
 'batch_size': 20}

In [48]:
best_net = rnd_search_cv.best_estimator_
print(rnd_search_cv.best_params_)
y_pred = best_net.predict(x_test)
print(classification_report(y_test_encoded, y_pred))

{'optimizer__learning_rate': 0.005, 'optimizer': 'sgd', 'model__hidden_layer_sizes': (90,), 'model__dropout': 0.1, 'epochs': 50, 'batch_size': 20}
              precision    recall  f1-score   support

           0       0.90      0.76      0.82       429
           1       0.66      0.65      0.66       517
           2       0.85      0.93      0.89       512
           3       0.60      0.52      0.56       545
           4       0.68      0.78      0.73       496
           5       0.69      0.73      0.71       501

   micro avg       0.72      0.72      0.73      3000
   macro avg       0.73      0.73      0.73      3000
weighted avg       0.73      0.72      0.72      3000
 samples avg       0.72      0.72      0.72      3000



In [52]:
y_pred_classes = np.argmax(y_pred, axis=1)
y_pred_labels = encoder.inverse_transform(y_pred_classes)

precision = precision_score(y_test, y_pred_labels, average='macro')
recall = recall_score(y_test, y_pred_labels, average='macro')
f1 = f1_score(y_test, y_pred_labels, average='macro')


print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Accuracy:", accuracy)

Precision: 0.7312588582362869
Recall: 0.7285275092494019
F1-score: 0.7273203372977227
Accuracy: 0.7146666666666667


In [53]:
metrics = metrics.append({
    'Model': 'Deep Neural Network',
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
}, ignore_index=True)

  metrics = metrics.append({


In [54]:
metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.653333,0.644171,0.653333,0.647079
1,SVM,0.66,0.651606,0.66,0.653039
2,Decision Tree,0.714667,0.715545,0.714667,0.714838
3,Deep Neural Network,0.714667,0.731259,0.728528,0.72732


## The assessment of the models based on their F1 scores indicated that the Neural Network Keras DNN model achieved the highest score of 0.7273, followed by the Decision Tree model with an F1 score of 0.7148. The SVM model obtained an F1 score of 0.6530, while the Logistic Regression model received the lowest F1 score of 0.6471.

## The Neural Network Keras DNN model is a deep learning model that can identify complex relationships between features in the data, making it suitable for large datasets. The Decision Tree model is a non-parametric model that can interpret both categorical and continuous data easily. The SVM model is a powerful model that can handle high-dimensional data and is effective in both linear and non-linear classification tasks. The Logistic Regression model is a linear model that is commonly used as a benchmark for classification tasks.

## To summarize, the Neural Network Keras DNN model is the best choice for making predictions based on its superior F1 score, while the Decision Tree model is also a good alternative. However, the SVM and Logistic Regression models may not perform as well as the other models in certain scenarios due to their lower F1 scores.