## Hyper-parameter tuning for the classifiers used

In [None]:
class Tee:
    """File-like fan-out that forwards every write and flush to several streams."""

    def __init__(self, *files):
        # The target streams are kept as the tuple of positional arguments, in order.
        self.files = files

    def write(self, obj):
        """Write ``obj`` to each stream in turn, flushing it right after its write."""
        for stream in self.files:
            stream.write(obj)
            # Flush straight away so the output becomes visible immediately.
            stream.flush()

    def flush(self):
        """Flush each stream in turn."""
        for stream in self.files:
            stream.flush()

The variance threshold can be changed using the variable below.

In [None]:
# Threshold handed to VarianceThreshold in the preprocessing cell further down.
vt = 0.07

In [None]:
import pandas as pd
import os, sys

# Ask Python to ignore UserWarning via the PYTHONWARNINGS environment variable.
# NOTE(review): presumably aimed at processes started later (e.g. the grid-search
# workers), since the variable is read at interpreter start-up — confirm.
os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"

# Read the two CSV files and join them on their shared 'Unnamed: 0' column.
df1 = pd.read_csv('data/train_data.csv')
df2 = pd.read_csv('data/train_labels.csv')
df = pd.merge(df1, df2, on='Unnamed: 0')
# Display the (rows, columns) of the merged frame as the cell output.
df.shape

In [None]:
# 'Unnamed: 0' was only needed as the merge key, so remove it from the frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.shape

In [None]:
# Separate the 'Activity' column (target) from all remaining columns (features).
y = df['Activity']
X = df.drop(columns=['Activity'])

## Preprocessing

We now perform variance thresholding on the data and remove features whose variance is below the threshold `vt` set above. The threshold value was determined in an earlier Python script in `trial1`.

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit the variance filter on X, then keep only the columns it selects.
# Note: the filter is fitted on all rows, i.e. before the train/test split below.
selection = VarianceThreshold(threshold=vt).fit(X)
X = selection.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Classification

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


## Build Classification models

In [None]:
# Define the models to be evaluated.
# The estimators that use randomness during fitting get a fixed random_state
# (the same seed as the train/test split) so that repeated runs of the grid
# search produce comparable results.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(random_state=42)
}

# Search grids, keyed by the same names as `models`. Every parameter carries the
# 'model__' prefix because the estimator is wrapped in a Pipeline step called 'model'.

# The two tree-based models are searched over the same grid; the comprehension
# builds an independent copy for each of them.
hyperparameters = {
    tree_model: {
        'model__max_depth': [None, 5, 10, 20],
        'model__max_features': [100, 'sqrt', 'log2'],
        'model__ccp_alpha': [0.1, .01, .001],
        'model__criterion': ['gini', 'entropy', 'log_loss'],
    }
    for tree_model in ('Decision Tree', 'Random Forest')
}

hyperparameters['SVM'] = {
    'model__C': [0.1, 0.5, 1, 5, 10, 50, 100],
    'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale', 'auto'],
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

hyperparameters['KNN'] = {
    # Every neighbourhood size from 1 up to and including 199.
    'model__n_neighbors': list(range(1, 200)),
    'model__weights': ['uniform', 'distance'],
}

hyperparameters['Neural Network'] = {
    'model__max_iter': [500],
    'model__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'model__alpha': [0.0001, 0.001, 0.01],
    'model__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'model__solver': ['lbfgs', 'sgd', 'adam'],
}

In [None]:
def evaluate_model(model_name, model, hyperparams, log_path='tune.tsv'):
    """Tune ``model`` with a 10-fold grid search and report its scores.

    The estimator is wrapped in a one-step Pipeline (step name ``'model'``), so
    the keys of ``hyperparams`` must use the ``model__`` prefix. The search is
    fitted on the notebook-level ``X_train`` / ``y_train`` and the refitted best
    estimator is scored on ``X_test`` / ``y_test``.

    Parameters
    ----------
    model_name : str
        Label used in the printed report.
    model : estimator
        Unfitted classifier to tune.
    hyperparams : dict
        Parameter grid passed to GridSearchCV.
    log_path : str, optional
        File that receives a copy of everything written to stdout/stderr while
        the search is fitting. It is opened in ``'w'`` mode, so each call
        truncates it; pass a distinct path per model to keep all logs.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    print(f"Evaluating {model_name}...")

    pipeline = Pipeline([('model', model)])
    # Perform grid search to tune hyperparameters
    grid_search = GridSearchCV(pipeline, hyperparams, cv=10, n_jobs=-1, scoring='f1_macro', verbose=2)

    original_stdout = sys.stdout
    original_stderr = sys.stderr
    with open(log_path, 'w') as logfile:
        # Mirror both streams into the log file for the duration of the fit.
        sys.stdout = Tee(original_stdout, logfile)
        sys.stderr = sys.stdout
        try:
            grid_search.fit(X_train, y_train)
        finally:
            # Always restore the real streams, even if fitting fails; otherwise
            # later output would be sent to a Tee holding a closed file.
            sys.stdout = original_stdout
            sys.stderr = original_stderr

    print(f"Best hyperparameters: {grid_search.best_params_}")
    # best_score_ is the mean cross-validated macro-F1 of the best candidate.
    print(f"Training f1 score (macro): {grid_search.best_score_:.4f}")

    # The pipeline's score() delegates to the classifier's score(), which is
    # accuracy, so it is reported under that name.
    test_accuracy = grid_search.best_estimator_.score(X_test, y_test)
    print(f"Test accuracy: {test_accuracy:.4f}")

    # Test the model on the test set
    y_pred = grid_search.predict(X_test)
    test_f1 = f1_score(y_test, y_pred, average='macro')
    print(f"Test F1 score (macro): {test_f1:.4f}\n")

    return grid_search

In [None]:
# Tune every model whose name is not listed in done_models and keep the fitted
# search object under the model's name.
# To skip models that were already tuned in an earlier run, list them, e.g.
# done_models = ['Decision Tree', 'Random Forest', 'SVM', 'KNN']
results = {}
done_models = []
for model_name, model in models.items():
    if model_name not in done_models:
        hyperparams = hyperparameters[model_name]
        results[model_name] = evaluate_model(model_name, model, hyperparams)

In [None]:
# Show the object returned by evaluate_model for each tuned model.
print(results)

## Results across various runs of GridSearchCV

## SVM
```
Evaluating SVM...
Best hyperparameters: {'model__C': 100, 'model__gamma': 0.01, 'model__kernel': 'rbf'}
Training F1 (macro): 0.9985
Test F1 (macro): 0.9931
```

## k-NN
```
Evaluating KNN...
Best hyperparameters: {'model__n_neighbors': 4, 'model__weights': 'distance'}
Training f1 score (macro): 0.9691
Test F1 score (macro): 0.9660
```

## MLP
```
Evaluating Neural Network...
Fitting 5 folds for each of 756 candidates, totalling 3780 fits
Best hyperparameters: {'model__activation': 'tanh', 'model__alpha': 0.0001, 'model__max_iter': 500, 'model__solver': 'adam'}
Training f1 score (macro): 0.9870
Test F1 score (macro): 0.9897
```

## DT
```
Evaluating Decision Tree...
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
Best hyperparameters: {'model__ccp_alpha': 0.001, 'model__criterion': 'entropy', 'model__max_depth': None, 'model__max_features': 100}
Training f1 score (macro): 0.9408
Test F1 score (macro): 0.9399
```

## RF
```
Evaluating Random Forest...
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
Best hyperparameters: {'model__ccp_alpha': 0.001, 'model__criterion': 'log_loss', 'model__max_depth': None, 'model__n_estimators': 500}
Training f1 score (macro): 0.9753
Test F1 score (macro): 0.9763
```