In [None]:
# ***Training Support Vector Machines for Multiclass Classification ***

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
import pylab as pl
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show what is available under the input directory before loading anything.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

### Load the Train, Validation and Test sets

In [None]:
# Directory holding the train / validation / test CSV files.
# (An earlier revision of this notebook read train.csv and test.csv from
# "../input/human-activity-recognition-with-smartphones" instead.)
DATA_DIR = "../input/motion-recog-data-train-test-validation"

# Fixed seed so that shuffle() yields the same row order on every run;
# without it the row order differs between runs.
SHUFFLE_SEED = 0


def load_shuffled_split(file_name, data_dir=DATA_DIR, seed=SHUFFLE_SEED):
    """Read one CSV split and return its rows in a seeded random order.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str
        Directory containing the CSV file.
    seed : int
        ``random_state`` passed to ``sklearn.utils.shuffle``.

    Returns
    -------
    pandas.DataFrame
        The file's rows, shuffled deterministically.
    """
    return shuffle(pd.read_csv(os.path.join(data_dir, file_name)), random_state=seed)


train = load_shuffled_split("train.csv")
validation = load_shuffled_split("validation.csv")
test = load_shuffled_split("test11.csv")

### Check for missing values in the dataset

In [None]:
# Report, for each split, whether any cell in the frame is null.
# The validation line was previously labelled "test set", so the output
# showed two indistinguishable "test set" lines.
print("Any missing sample in training set:",train.isnull().values.any())
print("Any missing sample in validation set:",validation.isnull().values.any(), "\n")
print("Any missing sample in test set:",test.isnull().values.any(), "\n")



### Frequency Distribution of the Outcome

In [None]:
# Tabulate how many training rows fall into each Activity class.
train_outcome = pd.crosstab(
    index=train["Activity"],
    columns="count",  # label of the single column that holds the counts
)

# Bare expression so the notebook renders the table.
train_outcome


### Visualizing Outcome Distribution 

In [None]:
# Pie chart of the class balance in the training set.

# Number of training rows per activity, put into a two-column frame.
temp = train["Activity"].value_counts()
df = pd.DataFrame({'labels': temp.index, 'values': temp.values})

labels = df['labels']
sizes = df['values']
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral','cyan','lightpink']

# Wedge labels are not drawn on the pie itself; the legend carries the names.
pie_options = dict(
    colors=colors,
    shadow=True,
    startangle=90,
    pctdistance=1.1,
    labeldistance=1.2,
)
patches, texts = plt.pie(sizes, **pie_options)

plt.legend(patches, labels, loc="best")
plt.axis('equal')  # equal aspect ratio on both axes
plt.tight_layout()
plt.show()


### Normalize the Predictors (Feature Set) for SVM training

In [None]:
# Separate the predictors from the outcome for each split.
# 'Activity' is the target column; 'subject' is excluded from the features too.
NON_FEATURE_COLUMNS = ['Activity', 'subject']

X_train = train.drop(columns=NON_FEATURE_COLUMNS)
Y_train_label = train.Activity.values.astype(object)
X_validation = validation.drop(columns=NON_FEATURE_COLUMNS)
Y_validation_label = validation.Activity.values.astype(object)
X_test = test.drop(columns=NON_FEATURE_COLUMNS)
Y_test_label = test.Activity.values.astype(object)

# Dimensions of the train, validation and test feature sets
print("Dimension of Train set",X_train.shape)
print("Dimension of Validation set",X_validation.shape)
print("Dimension of Test set",X_test.shape,"\n")

# Transform the non-numerical labels into numerical labels.
# The encoder is fitted ONCE, on the training labels, and then reused for the
# other splits so that every split shares the same label -> integer mapping.
# Re-fitting per split would give each split its own mapping and would leave
# `encoder` holding the test mapping for later inverse_transform() calls.
# transform() raises ValueError if a split contains a label unseen in training.
encoder = preprocessing.LabelEncoder()
Y_train = encoder.fit_transform(Y_train_label)
Y_validation = encoder.transform(Y_validation_label)
Y_test = encoder.transform(Y_test_label)

# Number of numeric features in the training set
num_cols = X_train.select_dtypes(include="number").columns
print("Number of numeric features:",num_cols.size)

names_of_predictors = list(X_train.columns.values)

# Scale the features. The scaler learns its statistics from the training set
# only; validation and test are transformed with those same statistics so all
# three splits live in the same feature space and no information from the
# held-out data leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)


### Hyperparameter tuning using grid search and cross validation

In [None]:
# Hyper-parameter grid for an SVC grid search: one sub-grid for the RBF kernel
# (gamma x C) and one for the linear kernel (C only).
_C_CANDIDATES = [1, 10, 100, 1000]
_RBF_GAMMAS = [1e-3, 1e-4]

params_grid = [
    {'kernel': ['rbf'], 'gamma': list(_RBF_GAMMAS), 'C': list(_C_CANDIDATES)},
    {'kernel': ['linear'], 'C': list(_C_CANDIDATES)},
]


### Training SVM model using radial kernel

In [None]:
# Performing CV to tune parameters for best SVM fit 
# NOTE: the grid search below is commented out, so this cell currently trains
# nothing and `svm_model` is never defined.
# svm_model = GridSearchCV(SVC(), params_grid, cv=5)
# svm_model.fit(X_train_scaled, Y_train)


# # **Modified/added code--Using a logistic regression linear classifier**

In [None]:
# Logistic regression experiments.
#
# Part 1: sweep the inverse regularisation strength C with the liblinear
#         solver and plot validation / training accuracy against C.
# Part 2: compare the available solvers at the default C.
#
# After this cell `clf` is the liblinear model fitted with default C (the last
# solver in Part 2); later cells evaluate that model on the test set.

plot_C=[]
plot_y_validation=[]
plot_y_training=[]

# C must be strictly positive, so the sweep covers 0.1, 0.2, ..., 2.0.
# The value is now actually passed to the estimator; previously it was only
# recorded for the plot, so every iteration fitted the same default model.
for i in range(1, 21):
    C = 0.1 * i
    clf = LogisticRegression(random_state=0, solver='liblinear', C=C)
    clf.fit(X_train_scaled, Y_train)
    validation_score = clf.score(X_validation_scaled, Y_validation)
    print("validation set score for logistic regression with C=")
    print(C, validation_score)
    plot_C.append(C)
    plot_y_validation.append(validation_score)
    plot_y_training.append(clf.score(X_train_scaled, Y_train))

plt.plot(plot_C, plot_y_validation)
plt.plot(plot_C, plot_y_training)
plt.legend(['validation set', 'training set'], loc='upper left')
plt.title('Accuracy vs C')
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.show()

# Fit one model per solver and report training / validation accuracy.
# 'liblinear' is deliberately last so that `clf`, `Y_validation_pred` and
# `Y_validation_pred_label` end up referring to the liblinear model.
for solver in ('lbfgs', 'newton-cg', 'sag', 'saga', 'liblinear'):
    clf = LogisticRegression(random_state=0, solver=solver)
    clf.fit(X_train_scaled, Y_train)
    Y_validation_pred = clf.predict(X_validation_scaled)
    # Map the integer predictions back to the original activity labels.
    Y_validation_pred_label = list(encoder.inverse_transform(Y_validation_pred))
    print("training set score for logistic regression with " + solver)
    print(clf.score(X_train_scaled, Y_train))
    print("validation set score for logistic regression with " + solver)
    print(clf.score(X_validation_scaled, Y_validation))

    
    
    

### Confusion Matrix  and Accuracy Score 

In [None]:

# Predict on the scaled test features with the most recently fitted `clf`,
# then decode the integer predictions back to labels through `encoder`.
Y_test_pred = clf.predict(X_test_scaled)
decoded_test_predictions = encoder.inverse_transform(Y_test_pred)
Y_test_pred_label = list(decoded_test_predictions)

In [None]:
# Evaluate the logistic regression model on the test set: confusion matrix and
# per-class report (both computed on the decoded string labels), followed by
# the overall training and test accuracy.
print(confusion_matrix(Y_test_label,Y_test_pred_label))
print("\n")
print(classification_report(Y_test_label,Y_test_pred_label))

# Each score is printed once; the previous version repeated both lines verbatim
# and ended with a bare `clf.score`, which only displayed the method's repr.
print("Training set score for logistic regression: %f" % clf.score(X_train_scaled , Y_train))
print("Testing  set score for logistic regression: %f" % clf.score(X_test_scaled  , Y_test ))