```
1. Ordinary Decision Trees
```

In [16]:
import itertools
import numpy as np
from scipy.stats import norm
from prettytable import PrettyTable

import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
raw_data = pd.read_csv('/content/drive/MyDrive/data/subject101.csv')
raw_data.head()

Unnamed: 0,timestamp,activityID,heartrate,handTemperature,handAcc16_1,handAcc16_2,handAcc16_3,handAcc6_1,handAcc6_2,handAcc6_3,...,ankleGyro1,ankleGyro2,ankleGyro3,ankleMagne1,ankleMagne2,ankleMagne3,ankleOrientation1,ankleOrientation2,ankleOrientation3,ankleOrientation4
0,8.38,0,104.0,30.0,2.37223,8.60074,3.51048,2.43954,8.76165,3.35465,...,0.0083,0.00925,-0.01758,-61.1888,-38.9599,-58.1438,1.0,0.0,0.0,0.0
1,8.39,0,,30.0,2.18837,8.5656,3.66179,2.39494,8.55081,3.64207,...,-0.006577,-0.004638,0.000368,-59.8479,-38.8919,-58.5253,1.0,0.0,0.0,0.0
2,8.4,0,,30.0,2.37357,8.60107,3.54898,2.30514,8.53644,3.7328,...,0.003014,0.000148,0.022495,-60.7361,-39.4138,-58.3999,1.0,0.0,0.0,0.0
3,8.41,0,,30.0,2.07473,8.52853,3.66021,2.33528,8.53622,3.73277,...,0.003175,-0.020301,0.011275,-60.4091,-38.7635,-58.3956,1.0,0.0,0.0,0.0
4,8.42,0,,30.0,2.22936,8.83122,3.7,2.23055,8.59741,3.76295,...,0.012698,-0.014303,-0.002823,-61.5199,-39.3879,-58.2694,1.0,0.0,0.0,0.0


In [4]:
# Define a function to clean data
def clean_data(data):
    data = data.dropna()
    data = data.reset_index(drop=True)

    # Remove certain columns
    data = data.drop(['timestamp'], axis=1)
    # Remove the orientation columns
    data = data.drop(['handOrientation1', 'handOrientation2', 'handOrientation3', 'handOrientation4'], axis=1)
    data = data.drop(['chestOrientation1', 'chestOrientation2', 'chestOrientation3', 'chestOrientation4'], axis=1)
    data = data.drop(['ankleOrientation1', 'ankleOrientation2', 'ankleOrientation3', 'ankleOrientation4'], axis=1)

    # For the heart rate, fill missing values with previous timestamp's heart rate
    data['heartrate'] = data['heartrate'].fillna(method='ffill')

    # For any other missing values, fill them with last value
    data = data.fillna(method='ffill')

    # Normalize the data
    # data = (data - data.mean()) / data.std()
    # discard data with NaN values
    data = data.dropna()
    data = data.reset_index(drop=True)

    # disacrd data with activityID = 0
    data = data[data['activityID'] != 0]
    data = data.reset_index(drop=True)

    # Shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)
    return data

In [5]:
data = clean_data(raw_data)
data.head()

Unnamed: 0,activityID,heartrate,handTemperature,handAcc16_1,handAcc16_2,handAcc16_3,handAcc6_1,handAcc6_2,handAcc6_3,handGyro1,...,ankleAcc16_3,ankleAcc6_1,ankleAcc6_2,ankleAcc6_3,ankleGyro1,ankleGyro2,ankleGyro3,ankleMagne1,ankleMagne2,ankleMagne3
0,17,103.0,33.25,-6.19149,4.66471,1.35565,-6.14019,4.75933,1.4997,-0.845065,...,-2.25985,9.63847,-1.28601,-1.73367,-0.076267,-0.385391,0.167553,-52.1545,-5.86215,52.1747
1,6,122.0,32.0,-1.60835,7.12429,8.84406,-0.919899,7.34267,10.4133,0.060554,...,-1.4197,3.33647,2.8889,-0.245516,0.766419,-0.183654,1.75344,-66.3623,-3.49986,2.8046
2,7,146.0,32.4375,-5.51307,4.72498,0.210984,-5.69861,5.01146,-0.026481,-0.488648,...,-3.08772,9.70797,0.481822,-3.20785,-1.37746,0.493949,0.01404,-61.2106,22.0655,-8.72382
3,4,117.0,33.125,-7.22157,2.08505,0.502197,-7.5409,2.00505,0.782331,0.18755,...,-2.50755,9.32035,0.69434,-2.51025,-0.691283,0.308269,-0.147382,-112.052,-37.6019,17.1729
4,4,120.0,33.25,-5.96354,6.60452,2.08378,-5.90853,6.78177,1.93296,1.03411,...,-6.79281,25.7123,1.14804,-5.55014,2.55048,-1.70282,6.7862,-68.6012,-1.95131,-8.30066


In [6]:
y = data['activityID']
X = data.drop(['activityID'], axis=1)

print("Shape of X:", X.shape)
print("Shape of Y:", y.shape)

Shape of X: (22590, 40)
Shape of Y: (22590,)


In [7]:
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

In [8]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


print("Shape of training data: ", X_train.shape)
print("Shape of test data: ", X_test.shape)
print("Shape of training labels: ", y_train.shape)
print("Shape of test labels: ", y_test.shape)

Shape of training data:  (18072, 40)
Shape of test data:  (4518, 40)
Shape of training labels:  (18072,)
Shape of test labels:  (4518,)


In [13]:
def decision_tree_classifier(X, y, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create a decision tree classifier
    clf = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                                  min_samples_split=min_samples_split,
                                  min_samples_leaf=min_samples_leaf, 
                                  max_features=max_features)

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    # Compute the accuracy of the classifier
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy


In [17]:
# Define the different possible values for each parameter
criterion = ['gini', 'entropy']
max_depth = [None, 5, 10, 15]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 3, 5]
max_features = [None, 3, 5, 10]

# Create a list of all possible combinations of parameter values
parameter_combinations = list(itertools.product(criterion, max_depth, min_samples_split, min_samples_leaf, max_features))

t = PrettyTable(['Criterion', 'Max Depth', 'Min Samples Split', 'Min Samples Leaf', 'Max Features', 'Accuracy'])
# Test the decision tree classifier with different parameter combinations

for params in tqdm.tqdm(parameter_combinations):
    criterion, max_depth, min_samples_split, min_samples_leaf, max_features = params
    accuracy = decision_tree_classifier(X, y, criterion=criterion, max_depth=max_depth, 
                                         min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, 
                                         max_features=max_features)
    
    t.add_row([criterion, max_depth, min_samples_split, min_samples_leaf, max_features, accuracy*100])

print(t)

100%|██████████| 288/288 [01:38<00:00,  2.91it/s]

+-----------+-----------+-------------------+------------------+--------------+--------------------+
| Criterion | Max Depth | Min Samples Split | Min Samples Leaf | Max Features |      Accuracy      |
+-----------+-----------+-------------------+------------------+--------------+--------------------+
|    gini   |    None   |         2         |        1         |     None     | 99.88933156263833  |
|    gini   |    None   |         2         |        1         |      3       |  93.6697653829128  |
|    gini   |    None   |         2         |        1         |      5       | 96.96768481629039  |
|    gini   |    None   |         2         |        1         |      10      | 99.02611775121736  |
|    gini   |    None   |         2         |        3         |     None     |  99.867197875166   |
|    gini   |    None   |         2         |        3         |      3       | 92.87295263390881  |
|    gini   |    None   |         2         |        3         |      5       | 97.94156706




```
Boosting Decision Trees
```

In [32]:
from sklearn.ensemble import AdaBoostClassifier

def boosting_decision_tree_classifier(X, y):
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initisalize the AdaBoost Classifier
    clf = AdaBoostClassifier(random_state=42)

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = clf.predict(X_test)

    # Calculate the accuracy of the classifier
    accuracy = accuracy_score(y_test, y_pred)

    # Print the accuracy
    print('Accuracy:', accuracy*100)

In [33]:
boosting_decision_tree_classifier(X, y)

AdaBoostClassifier(random_state=42)
Accuracy: 46.7020805666224


```
Random Forest
```

In [36]:
from sklearn.ensemble import RandomForestClassifier

def random_forest(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a random forest classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = clf.predict(X_test)

    # Calculate the accuracy of the classifier
    accuracy = accuracy_score(y_test, y_pred)

    # Print the accuracy
    print('Accuracy:', accuracy*100)

In [37]:
random_forest(X, y)

Accuracy: 99.933598937583
