**Jonathan Glaser**

**jmg764**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

# Any results you write to the current directory are saved as output.

## Data

In [None]:
# Training split: raw sensor measurements and the matching labels.
X_train_new = pd.read_csv("../input/new-datasets/X_train_new.csv")
y_train_new = pd.read_csv("../input/new-datasets/y_train_new.csv")

# Held-out test split: raw sensor measurements and the matching labels.
X_test_new = pd.read_csv("../input/new-datasets/X_test_new.csv")
y_test_new = pd.read_csv("../input/new-datasets/y_test_new.csv")

In [None]:
# Preview the first rows of the raw training measurements.
X_train_new.head()

In [None]:
# Summary statistics for every numeric column of the raw training measurements.
X_train_new.describe()

## Data Exploration

**Distribution of surface types in y_train_new**

In [None]:
# Bar chart of how many training rows carry each surface label.
y_train_new['surface'].value_counts().plot(kind='bar')

**Distributions of feature values in series_id = 0**

In [None]:
import matplotlib.pyplot as plt
# One panel per column from the fifth onward, showing only the rows of series_id == 0.
# NOTE(review): columns[4:] assumes the first four columns are identifiers — confirm
# against the CSV header; the 3x4 grid holds at most 12 panels.
plt.figure(figsize=(26, 16))
for panel, feature in enumerate(X_train_new.columns[4:], start=1):
    plt.subplot(3, 4, panel)
    series_zero_values = X_train_new.loc[X_train_new['series_id'] == 0, feature]
    plt.plot(series_zero_values)
    plt.title(feature)

## Feature Engineering

In X_train_new, each series_id corresponds to a collection of measurements obtained from a robot driving over a given floor surface. For example, rows 1-128 contain the orientation_X, orientation_Y, orientation_Z, angular_velocity_X, etc. values recorded from a robot driving on fine concrete. It is therefore more useful to create a selection of features that summarize the measurements within each series. For example, we can compute orientation_X_mean to represent the average of the orientation_X values for a particular series_id.

In [None]:
# Sensor channels that are summarized per series by feature_data().
columns = [
    'orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W',
    'angular_velocity_X', 'angular_velocity_Y', 'angular_velocity_Z',
    'linear_acceleration_X', 'linear_acceleration_Y', 'linear_acceleration_Z',
]


def feature_data(X):
    """Collapse per-measurement rows into one row of summary features per series.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``series_id`` column and every column named in the
        module-level ``columns`` list.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``series_id``; for each channel in ``columns`` it holds 14
        features (``_mean``, ``_median``, ``_max``, ``_min``, ``_abs_max``,
        ``_abs_min``, ``_abs_avg``, ``_var``, ``_std``, ``_maxtoMin``,
        ``_range``, ``_mean_abs_chg``, ``_abs_median_chg``, ``_abs_std_chg``),
        in that order.
    """
    # Group once and reuse; the grouping key is the same for every feature.
    grouped = X.groupby('series_id')

    # Collect the features in a dict and build the frame in one go, which avoids
    # inserting 140 columns one at a time into an initially empty DataFrame.
    features = {}
    for col in columns:
        series = grouped[col]

        col_max = series.max()
        col_min = series.min()
        abs_max = series.apply(lambda x: np.max(np.abs(x)))
        abs_min = series.apply(lambda x: np.min(np.abs(x)))

        features[col + '_mean'] = series.mean()
        features[col + '_median'] = series.median()
        features[col + '_max'] = col_max
        features[col + '_min'] = col_min
        features[col + '_abs_max'] = abs_max
        features[col + '_abs_min'] = abs_min
        # Midpoint of the absolute extremes (not the mean of absolute values).
        features[col + '_abs_avg'] = (abs_min + abs_max) / 2
        features[col + '_var'] = series.var()
        features[col + '_std'] = series.std()
        # A series whose minimum is exactly 0 yields inf/NaN here.
        features[col + '_maxtoMin'] = col_max / col_min
        features[col + '_range'] = col_max - col_min

        # "Change" features describe the absolute step between consecutive rows.
        features[col + '_mean_abs_chg'] = series.apply(lambda x: np.mean(np.abs(np.diff(x))))
        features[col + '_abs_median_chg'] = series.apply(lambda x: np.median(np.abs(np.diff(x))))
        # Previously computed on the raw absolute values (no np.diff), which did not
        # match the "_chg" name or its two sibling features above.
        features[col + '_abs_std_chg'] = series.apply(lambda x: np.std(np.abs(np.diff(x))))

    return pd.DataFrame(features)

In [None]:
# Build the per-series summary features for the training measurements and preview them.
X_train_new_2 = feature_data(X_train_new)
X_train_new_2.head()

In [None]:
# Build the per-series summary features for the test measurements and preview them.
X_test_new_2 = feature_data(X_test_new)
X_test_new_2.head()

# Modeling
* Here we evaluate several supervised learning models including multinomial logistic regression, decision tree, random forest, support vector machine, and k-nearest neighbors on their ability to accurately classify a particular surface given the information provided in X_train_new.
* Each model evaluation explores two scenarios: 
    1. Training the model using X_train_new and y_train_new and determining accuracy by testing on X_test_new and comparing results with y_test_new.
    2. Combining X_train_new with X_test_new and y_train_new with y_test_new in order to evaluate the accuracy of the given model using k-fold cross validation. 

## Logistic Regression

**Part 1**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import warnings  
warnings.filterwarnings('ignore')

# Fit a multinomial logistic regression on the engineered training features and
# score it on the engineered test features.
# NOTE(review): the features are passed in unscaled here, while the k-fold
# evaluation in Part 2 standardizes them, so the two accuracies are not measured
# under the same preprocessing.
lr = LogisticRegression(multi_class='multinomial').fit(X_train_new_2, y_train_new['surface'])
y_pred = lr.predict(X_test_new_2)
accuracy = metrics.accuracy_score(y_true=y_test_new['surface'], y_pred=y_pred)
print(accuracy)

**Part 2**

In [None]:
# Stack the train and test splits on top of each other so that k-fold cross
# validation can draw its folds from all available series.
# X is built from the engineered feature frames, y from the raw label frames.
# NOTE(review): this relies on the label rows being in the same order as the
# series_id-indexed feature rows — confirm against the label CSVs.
X_data = [X_train_new_2, X_test_new_2]
y_data = [y_train_new, y_test_new]

X = pd.concat(X_data).to_numpy()
y = pd.concat(y_data).to_numpy()

In [None]:
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn import preprocessing
import statistics

def k_fold_cross_validation_logistic(k, X, y):
    """Estimate multinomial logistic-regression accuracy with shuffled k-fold CV.

    Parameters
    ----------
    k : int
        Number of folds.
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Label array, indexable with the same row indices as ``X``.

    Returns
    -------
    tuple
        ``(avg_accuracy, stdev)``: the mean of the per-fold accuracies and their
        sample standard deviation (``statistics.stdev``).
    """
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    accuracies = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Learn the mean/std on the training fold only and apply the same
        # transform to the test fold. Scaling the test fold with its own
        # statistics puts the two folds on different scales.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Make prediction and determine accuracy for this fold:
        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        accuracies.append(metrics.accuracy_score(y_test, y_pred))

    avg_accuracy = sum(accuracies) / len(accuracies)
    stdev = statistics.stdev(accuracies)
    return avg_accuracy, stdev

# y[:,3] selects the fourth column of the stacked label array as the target.
# NOTE(review): presumably the 'surface' column — confirm against the label CSV layout.
avg_accuracy, stdev = k_fold_cross_validation_logistic(10, X, y[:,3])
print("Average 10-fold CV accuracy = {:10.2f}, standard deviation = {:10.2f}".format(avg_accuracy, stdev))

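The low standard deviation obtained across folds suggests that the logistic regression model's accuracy varies little with how the data are split, which indicates that it is reliable. Additionally, the higher average accuracy obtained using 10-fold cross validation may be due to the fact that each fold is trained on a larger dataset (X_train_new and X_test_new combined), thereby allowing for better label predictions. Note, however, that the cross-validation code also standardizes the features whereas Part 1 fits on unscaled features, so the two accuracies are not directly comparable.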

## Decision Tree

**Part 1**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the engineered training features and score it on the
# engineered test features. The tree is seeded (same value the KFold splitters in
# this notebook use) so that the reported accuracy is reproducible across runs.
tree = DecisionTreeClassifier(random_state=21)

tree.fit(X_train_new_2, y_train_new['surface'])
y_pred_test_tree = tree.predict(X_test_new_2)
accuracy = metrics.accuracy_score(y_test_new['surface'], y_pred_test_tree)
print(accuracy)

**Part 2**

In [None]:
def k_fold_cross_validation_decisiontree(k, X, y):
    """Estimate decision-tree accuracy with shuffled k-fold cross validation.

    Parameters
    ----------
    k : int
        Number of folds.
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Label array, indexable with the same row indices as ``X``.

    Returns
    -------
    tuple
        ``(avg_accuracy, stdev)``: the mean of the per-fold accuracies and their
        sample standard deviation (``statistics.stdev``).
    """
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    accuracies = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Learn the mean/std on the training fold only and apply the same
        # transform to the test fold, so both folds share one scale.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Make prediction and determine accuracy for this fold.
        # Seeded so that repeated runs report the same accuracies.
        tree = DecisionTreeClassifier(random_state=21)
        tree.fit(X_train, y_train)
        y_pred = tree.predict(X_test)
        accuracies.append(metrics.accuracy_score(y_test, y_pred))

    avg_accuracy = sum(accuracies) / len(accuracies)
    stdev = statistics.stdev(accuracies)
    return avg_accuracy, stdev


# y[:,3] selects the fourth column of the stacked label array as the target.
# NOTE(review): presumably the 'surface' column — confirm against the label CSV layout.
avg_accuracy, stdev = k_fold_cross_validation_decisiontree(10, X, y[:,3])
print("Average 10-fold CV accuracy = {:10.2f}, standard deviation = {:10.2f}".format(avg_accuracy, stdev))

The above results support the generalization that decision trees work well with the data used to create them, but are not flexible when it comes to classifying new samples.

The higher accuracy in part 1 compared to part 2 shows that the decision tree classifier has a tendency to overfit. The standard deviation of 10-fold cross validation accuracies is slightly higher than that of logistic regression. Taken together, these indicate low bias and high variance which is generally not favorable when selecting a model.

## Random Forest Classifier

**Part 1**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 600-tree random forest on the engineered training features and score it on
# the engineered test features. The forest is seeded (same value the KFold
# splitters in this notebook use) so that the reported accuracy and the stored
# predictions are reproducible across runs.
forest = RandomForestClassifier(n_estimators=600, random_state=21)

forest.fit(X_train_new_2, y_train_new['surface'])
y_pred_test_forest = forest.predict(X_test_new_2)
accuracy = metrics.accuracy_score(y_test_new['surface'], y_pred_test_forest)
print(accuracy)

**Part 2**

In [None]:
def k_fold_cross_validation_randomforest(k, X, y):
    """Estimate random-forest (600 trees) accuracy with shuffled k-fold CV.

    Parameters
    ----------
    k : int
        Number of folds.
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Label array, indexable with the same row indices as ``X``.

    Returns
    -------
    tuple
        ``(avg_accuracy, stdev)``: the mean of the per-fold accuracies and their
        sample standard deviation (``statistics.stdev``).
    """
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    accuracies = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Learn the mean/std on the training fold only and apply the same
        # transform to the test fold, so both folds share one scale.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Make prediction and determine accuracy for this fold.
        # Seeded so that repeated runs report the same accuracies.
        forest = RandomForestClassifier(n_estimators=600, random_state=21)
        forest.fit(X_train, y_train)
        y_pred = forest.predict(X_test)
        accuracies.append(metrics.accuracy_score(y_test, y_pred))

    avg_accuracy = sum(accuracies) / len(accuracies)
    stdev = statistics.stdev(accuracies)
    return avg_accuracy, stdev


# y[:,3] selects the fourth column of the stacked label array as the target.
# NOTE(review): presumably the 'surface' column — confirm against the label CSV layout.
avg_accuracy, stdev = k_fold_cross_validation_randomforest(10, X, y[:,3])
print("Average 10-fold CV accuracy = {:10.2f}, standard deviation = {:10.2f}".format(avg_accuracy, stdev))

In general, random forests are more flexible than decision trees when it comes to classifying new samples. The above results support this claim, since the random forest achieved higher accuracy and lower variance than the decision tree.

## Support Vector Machine

**Part 1**

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier (one-vs-one decision function) on the engineered
# training features and score it on the engineered test features.
# NOTE(review): the features are passed in unscaled here, while the k-fold
# evaluation in Part 2 standardizes them, so the two accuracies are not measured
# under the same preprocessing.
svm = SVC(decision_function_shape='ovo').fit(X_train_new_2, y_train_new['surface'])
y_pred_test_svm = svm.predict(X_test_new_2)
accuracy = metrics.accuracy_score(y_true=y_test_new['surface'], y_pred=y_pred_test_svm)
print(accuracy)

**Part 2**

In [None]:
def k_fold_cross_validation_svm(k, X, y):
    """Estimate support-vector-classifier accuracy with shuffled k-fold CV.

    Parameters
    ----------
    k : int
        Number of folds.
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Label array, indexable with the same row indices as ``X``.

    Returns
    -------
    tuple
        ``(avg_accuracy, stdev)``: the mean of the per-fold accuracies and their
        sample standard deviation (``statistics.stdev``).
    """
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    accuracies = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Learn the mean/std on the training fold only and apply the same
        # transform to the test fold. Scaling the test fold with its own
        # statistics puts the two folds on different scales.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Make prediction and determine accuracy for this fold:
        svm = SVC(decision_function_shape='ovo')
        svm.fit(X_train, y_train)
        y_pred = svm.predict(X_test)
        accuracies.append(metrics.accuracy_score(y_test, y_pred))

    avg_accuracy = sum(accuracies) / len(accuracies)
    stdev = statistics.stdev(accuracies)
    return avg_accuracy, stdev


# y[:,3] selects the fourth column of the stacked label array as the target.
# NOTE(review): presumably the 'surface' column — confirm against the label CSV layout.
avg_accuracy, stdev = k_fold_cross_validation_svm(10, X, y[:,3])
print("Average 10-fold CV accuracy = {:10.2f}, standard deviation = {:10.2f}".format(avg_accuracy, stdev))

As with logistic regression, average 10-fold cross validation accuracy is higher than when simply training on X_train_new and testing on X_test_new, and standard deviation is relatively low. Here, however, the disparity between accuracies is greater, which suggests that SVM may have a higher dependence on dataset size than logistic regression. Because the cross-validation folds are standardized while Part 1 uses unscaled features, part of this gap may instead reflect SVM's sensitivity to feature scale.

## K-Nearest Neighbors

**Part 1**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Try every neighbourhood size from 1 to 99 and record the test accuracy of each.
# NOTE(review): k is chosen by looking at test-set accuracy, so the maximum
# reported here is an optimistic estimate.
knn_accuracies = []
for i in range(1, 100):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_new_2, y_train_new['surface'])
    y_pred_test_knn = knn.predict(X_test_new_2)
    accuracy = metrics.accuracy_score(y_test_new['surface'], y_pred_test_knn)
    knn_accuracies.append(accuracy)

best_accuracy = max(knn_accuracies)
# knn_accuracies[0] belongs to k = 1, so the list position must be shifted by one
# to recover the neighbourhood size that produced the best accuracy.
best_k = knn_accuracies.index(best_accuracy) + 1
print("Maximum accuracy:", best_accuracy, "obtained at k =", best_k)



**Part 2**

In [None]:
def k_fold_cross_validation_knn(k, X, y, n_neighbors=20):
    """Estimate k-nearest-neighbours accuracy with shuffled k-fold cross validation.

    Parameters
    ----------
    k : int
        Number of folds (not the number of neighbours).
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Label array, indexable with the same row indices as ``X``.
    n_neighbors : int, default 20
        Neighbourhood size passed to ``KNeighborsClassifier``.

    Returns
    -------
    tuple
        ``(avg_accuracy, stdev)``: the mean of the per-fold accuracies and their
        sample standard deviation (``statistics.stdev``).
    """
    kf = KFold(n_splits=k, random_state=21, shuffle=True)
    accuracies = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Learn the mean/std on the training fold only and apply the same
        # transform to the test fold. Scaling the test fold with its own
        # statistics puts the two folds on different scales.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Make prediction and determine accuracy for this fold.
        # y_train / y_test are already label arrays (as in the other k-fold
        # helpers), so they are used directly rather than indexed by 'surface'.
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracies.append(metrics.accuracy_score(y_test, y_pred))

    avg_accuracy = sum(accuracies) / len(accuracies)
    stdev = statistics.stdev(accuracies)
    return avg_accuracy, stdev


# Evaluate the KNN helper defined above (this cell previously called the SVM helper).
# y[:,3] selects the fourth column of the stacked label array as the target.
# NOTE(review): presumably the 'surface' column — confirm against the label CSV layout.
avg_accuracy, stdev = k_fold_cross_validation_knn(10, X, y[:,3])
print("Average 10-fold CV accuracy = {:10.2f}, standard deviation = {:10.2f}".format(avg_accuracy, stdev))

Accuracy is reasonably high in both cases and low variance is achieved, but the random forest classifier seems to achieve an even higher accuracy.

## Final Prediction using Random Forest

In conclusion, the random forest classifier's high accuracy and relatively low variance suggest that it is the best model for surface type classification among the classifiers explored here.

In [None]:
# The final predictions are those of the random forest fitted in Part 1 of the
# Random Forest section (this cell previously showed the decision-tree predictions).
y_pred = pd.DataFrame(y_pred_test_forest)
y_pred

In [None]:
# Accuracy of the random-forest test predictions against the true test labels.
accuracy = metrics.accuracy_score(y_true=y_test_new['surface'], y_pred=y_pred_test_forest)
print("Accuracy = ", accuracy)