In this notebook I try several ML algorithms (Random Forest, boosting, SVM, and a neural network) to build classifier models, then test them and compare their performance.

In [None]:
import pandas as pd 
from sklearn.utils import shuffle
# Read the train/test splits from CSV files.
test = pd.read_csv("../input/test.csv")
train = pd.read_csv("../input/train.csv")

# Shuffle the rows with a fixed seed so that every downstream metric can be
# reproduced between runs (several models in this notebook pin random_state=0;
# an unseeded shuffle would still change their inputs' row order each run).
test = shuffle(test, random_state=0)
train = shuffle(train, random_state=0)

# Separate the inputs from the output labels: the feature matrix is every
# column except 'Activity', the label vector is the 'Activity' column.
# `.values` converts both to NumPy arrays.
trainData = train.drop('Activity', axis=1).values
trainLabel = train.Activity.values

testData = test.drop('Activity', axis=1).values
testLabel = test.Activity.values

In [None]:
# encoding labels 
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()

# Fit the label -> integer mapping once, on the training labels only, and
# reuse that same mapping for the test labels. Fitting the encoder on each
# split separately would silently give the same class different integer
# codes whenever the two splits do not contain an identical set of labels.
encoder.fit(trainLabel)
trainLabelE = encoder.transform(trainLabel)

# transform() raises ValueError for a label that was not seen during fit, so
# a train/test label mismatch is reported instead of being mis-encoded.
testLabelE = encoder.transform(testLabel)

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it into the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes on the plot.
    classes : sequence
        Tick labels for both axes, one per class.
    normalize : bool, optional
        If True, every row is divided by its row sum before the matrix is
        printed and plotted. Rows whose sum is zero are left as zeros.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap used for the image.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing, so that the image, the colorbar, the printed
    # matrix and the per-cell annotations all show the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guarded division: a class with no true samples yields a zero row
        # instead of NaNs and a runtime warning.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractional values are rounded to two
    # decimals so the annotations stay readable inside the cells.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:

#train and test with Random Forest
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# random_state is pinned (as for the SVM and MLP models in this notebook) so
# that the reported scores can be reproduced between runs.
rf = RandomForestClassifier(n_estimators=200, n_jobs=4, min_samples_leaf=10,
                            random_state=0)
# train
rf.fit(trainData, trainLabelE)
# test: per-class probabilities and hard class predictions
p_te = rf.predict_proba(testData)
#auc_te = roc_auc_score(testLabelE, p_te)
y_te_pred = rf.predict(testData)
# accuracy, macro-averaged precision/recall and the confusion matrix
acc = accuracy_score(testLabelE, y_te_pred)
prec = precision_score(testLabelE, y_te_pred, average="macro")
rec = recall_score(testLabelE, y_te_pred, average="macro")
cfs = confusion_matrix(testLabelE, y_te_pred)
print("Acc: %3.5f, P: %3.5f, R: %3.5f" % (acc, prec, rec))
#print("Confusing Matrix:\n", cfs)
# Plot non-normalized confusion matrix
plt.figure()
class_names = encoder.classes_
plot_confusion_matrix(cfs, classes=class_names,
                      title='RF Confusion matrix, without normalization')

In [None]:
# Sanity check: print the shape of the training inputs, the encoded labels,
# and the Random Forest outputs, one per line.
for checked_array in (trainData, trainLabelE, testLabelE, p_te, y_te_pred):
    print(checked_array.shape)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
# One-vs-rest wrapper around an SVC configured with a linear kernel.
# Recorded results:
#   linear kernel gave Acc: 0.96437, P: 0.96695, R: 0.96396
#   RBF kernel gave    Acc: 0.92806, P: 0.93173, R: 0.92578
#   (RBF variant: svm.SVC(kernel='rbf', probability=True, random_state=0))
linear_svc = svm.SVC(kernel='linear', probability=True, random_state=0)
classifier = OneVsRestClassifier(linear_svc)
classifier.fit(trainData, trainLabelE)

# Hard class predictions and per-class probabilities on the test set.
y_te_pred = classifier.predict(testData)
y_score = classifier.predict_proba(testData)

# Confusion matrix, accuracy, and macro-averaged precision / recall.
cfs = confusion_matrix(testLabelE, y_te_pred)
acc = accuracy_score(testLabelE, y_te_pred)
rec = recall_score(testLabelE, y_te_pred, average="macro")
prec = precision_score(testLabelE, y_te_pred, average="macro")
print("Acc: %3.5f, P: %3.5f, R: %3.5f" % (acc, prec, rec))

# Draw the raw-count (non-normalized) confusion matrix in a new figure.
class_names = encoder.classes_
plt.figure()
plot_confusion_matrix(cfs,
                      title='SVM Confusion Matrix, without normalization',
                      classes=class_names)

In [None]:
#perf test with Adaboost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Boosted depth-2 decision trees. random_state is pinned (as for the SVM and
# MLP models in this notebook) so that the reported scores are reproducible.
bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                    n_estimators=200,
                                    learning_rate=1.5,
                                    algorithm="SAMME",
                                    random_state=0)
bdt_discrete.fit(trainData, trainLabelE)
# per-class probabilities and hard class predictions on the test set
y_score = bdt_discrete.predict_proba(testData)
y_te_pred = bdt_discrete.predict(testData)
# accuracy, macro-averaged precision/recall and the confusion matrix
acc = accuracy_score(testLabelE, y_te_pred)
prec = precision_score(testLabelE, y_te_pred, average="macro")
rec = recall_score(testLabelE, y_te_pred, average="macro")
cfs = confusion_matrix(testLabelE, y_te_pred)
print("Acc: %3.5f, P: %3.5f, R: %3.5f" % (acc, prec, rec))
#print("Confusing Matrix:\n", cfs)
# Plot non-normalized confusion matrix
plt.figure()
class_names = encoder.classes_
plot_confusion_matrix(cfs, classes=class_names,
                      title='AdaBoost Confusion Matrix, without normalization')

In [None]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()
# Fit the scaler on the training data only, then apply the very same
# transformation to the test data.
trainData_scaled = scaler.fit_transform(trainData)
testData_scaled = scaler.transform(testData)
#test with NN
import sklearn.neural_network as nn
# NOTE: the names mlpSGD / nnSGDModel are kept unchanged for compatibility,
# but the solver configured here is Adam, not SGD.
mlpSGD = nn.MLPClassifier(hidden_layer_sizes=(50, 50),
                          solver='adam',
                          max_iter=1000,
                          early_stopping=True,
                          random_state=0)
nnSGDModel = mlpSGD.fit(trainData_scaled, trainLabelE)
# per-class probabilities and hard class predictions on the scaled test set
y_score = nnSGDModel.predict_proba(testData_scaled)
y_te_pred = nnSGDModel.predict(testData_scaled)
# accuracy, macro-averaged precision/recall and the confusion matrix
acc = accuracy_score(testLabelE, y_te_pred)
prec = precision_score(testLabelE, y_te_pred, average="macro")
rec = recall_score(testLabelE, y_te_pred, average="macro")
cfs = confusion_matrix(testLabelE, y_te_pred)
print("Acc: %3.5f, P: %3.5f, R: %3.5f" % (acc, prec, rec))
#print("Confusing Matrix:\n", cfs)
# Plot non-normalized confusion matrix
plt.figure()
class_names = encoder.classes_
# The title names the solver that is actually configured above (Adam).
plot_confusion_matrix(cfs, classes=class_names,
                      title='MLP-Adam Confusion Matrix, without normalization')
#test with SGD no scaled HL(30,60) max_iter=1000 => Acc: 0.94673, P: 0.94860, R: 0.94456
#test with Adam no scaled HL(30,60) max_iter=1000 => Acc: 0.94537, P: 0.94720, R: 0.94511
#test with Adam scaled HL(20,10) max_iter=1000, EarlyStop => Acc: 0.94062, P: 0.94493, R: 0.93926
#test with Adam scaled HL(20,20) max_iter=1000, EarlyStop => Acc: 0.94774, P: 0.94912, R: 0.94670
#test with Adam scaled HL(20,30) max_iter=1000, EarlyStop => Acc: 0.94333, P: 0.94470, R: 0.94221
#test with Adam scaled HL(20,40) max_iter=1000, EarlyStop => Acc: 0.94808, P: 0.95020, R: 0.94749
#test with Adam scaled HL(30,30) max_iter=1000, EarlyStop => Acc: 0.94197, P: 0.94488, R: 0.94052
#test with Adam scaled HL(30,60) max_iter=1000, EarlyStop => Acc: 0.94096, P: 0.94463, R: 0.93973
#test with Adam scaled HL(40,20) max_iter=1000, EarlyStop => Acc: 0.93349, P: 0.93825, R: 0.93124
#test with Adam scaled HL(40,30) max_iter=1000, EarlyStop => Acc: 0.94774, P: 0.94872, R: 0.94674
#test with Adam scaled HL(40,40) max_iter=1000, EarlyStop => Acc: 0.94808, P: 0.95087, R: 0.94699
#test with Adam scaled HL(40,60) max_iter=1000, EarlyStop => Acc: 0.94605, P: 0.94655, R: 0.94495
#test with Adam scaled HL(40,80) max_iter=1000, EarlyStop => Acc: 0.94231, P: 0.94519, R: 0.94061
#test with Adam scaled HL(50,50) max_iter=1000, EarlyStop => Acc: 0.95080, P: 0.95176, R: 0.94989

So far, SVM with a linear kernel gives the best result: about 96% accuracy, with similarly high precision and recall.
In the future we can try tuning the parameters of these ML algorithms to find a better model.
 - SVM-linear:             Acc: 0.96437, P: 0.96695, R: 0.96396
 - MLP-Adam(50,50):  Acc: 0.95080, P: 0.95176, R: 0.94989