# Importing Libraries

In [77]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Joining all the csv files (1.csv to 15.csv) 
## Naming it as train

In [78]:
join=[]
for a,b,c in os.walk('csv'):
    for file in c:
        df=pd.read_csv(os.path.join(a,file),header=None,index_col=0)
        join.append(df)
train=pd.concat(join,axis=0)
train.columns=['x','y','z','target']
train.reset_index(drop=True,inplace=True)
train.head(10)


Unnamed: 0,x,y,z,target
0,1502,2215,2153,1
1,1667,2072,2047,1
2,1611,1957,1906,1
3,1601,1939,1831,1
4,1643,1965,1879,1
5,1604,1959,1921,1
6,1640,1829,1940,1
7,1607,1910,1910,1
8,1546,2045,1910,1
9,1529,2049,1972,1


# Counting number of occurences of target variable 

In [79]:
train['target'].value_counts()


1    608667
7    593563
4    357064
3    216737
5     51498
2     47878
6     47770
0      3719
Name: target, dtype: int64

# Dropping 3 target values (0,2,5)
This is due to the fact that target value 0 has very few samples, target value 2 is "Standing Up, Walking and Going up\down stairs" and target value 5 is "Going Up\Down Stairs". 
There are 3 variables acceleration in X,Y and Z direction, now taking into account the activity of going down, here the Y coordinate will decrease (assuming Y coordinate as height) and while going up it will increase so for target values 2 and 5, The samples will contain both decreasing and increasing value of y which would be difficult to model as they are both different activities and require further seperation. So due to this reason they are removed while creating a model.

In [80]:
#train=train.drop_duplicates()
train=train[train['target'] != 0 ]
train=train[train['target'] != 2 ]
train=train[train['target'] != 5 ]

Y=train.target
X=train.drop(['target'],axis=1)
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)

# Splitting data

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size = 0.20,random_state=0)

# KNeighbors Classifier

In [82]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(15)
fit  = knn.fit(X_train, y_train)
predicted = fit.predict(X_test)
accuracy = knn.score(X_test, y_test)
array=confusion_matrix(y_test, predicted)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))
print(accuracy)

[[111694   1221   3771     49   5042]
 [  2408  21168   9940    587   9463]
 [  5124   4281  53142    222   8489]
 [   389   1352   1903   2205   3668]
 [  4863   5330   8698   1135  98617]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91    121777
           1       0.63      0.49      0.55     43566
           2       0.69      0.75      0.71     71258
           3       0.53      0.23      0.32      9517
           4       0.79      0.83      0.81    118643

    accuracy                           0.79    364761
   macro avg       0.71      0.64      0.66    364761
weighted avg       0.78      0.79      0.78    364761

0.7863395483618041


In [83]:
joblib.dump(knn, 'knn.pkl')

['knn.pkl']

# Feature engineering
The new variable created is a the root mean square value of the 3 variables (X,Y and Z) and it is named as 'xyz'. Added 6 features in total (variance, mean, max, min, sum_man_min, mean square )

In [85]:
new = ((train['x']**2 + train['y']**2 + train['z']**2) / 3)**0.5
train[['x_V','y_V','z_V']]= np.var(train[['x','y','z']], axis=0)
train[['x_mean','y_mean','z_mean']] = np.mean(train[['x','y','z']], axis=0)
train[['x_min','y_min','z_min']] = np.min(train[['x','y','z']], axis = 0)
train[['x_max','y_max','z_max']] = np.max(train[['x','y','z']], axis = 0)
train[['x_sum','y_sum','z_sum']] = np.sum([train[['x_min','y_min','z_min']],train[['x_max','y_max','z_max']]], axis = 0)


In [None]:
train['xyz']=new
Y=train.target
X=train.drop(['target'],axis=1)
print(train.head(5))

# Scaling the input and Label Encoding targets

In [86]:
scaler=StandardScaler()
# transform data
X = scaler.fit_transform(X)
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size = 0.20,random_state=0)

      x     y     z  target           x_V           y_V        z_V  \
0  1502  2215  2153       1  12388.780536  10054.238906  8828.3886   
1  1667  2072  2047       1  12388.780536  10054.238906  8828.3886   
2  1611  1957  1906       1  12388.780536  10054.238906  8828.3886   
3  1601  1939  1831       1  12388.780536  10054.238906  8828.3886   
4  1643  1965  1879       1  12388.780536  10054.238906  8828.3886   

        x_mean       y_mean       z_mean  x_min  y_min  z_min  x_max  y_max  \
0  1987.581921  2382.504686  1970.528338    282      2      1   3828   4095   
1  1987.581921  2382.504686  1970.528338    282      2      1   3828   4095   
2  1987.581921  2382.504686  1970.528338    282      2      1   3828   4095   
3  1987.581921  2382.504686  1970.528338    282      2      1   3828   4095   
4  1987.581921  2382.504686  1970.528338    282      2      1   3828   4095   

   z_max  x_sum  y_sum  z_sum          xyz  
0   4095   4110   4097   4096  1983.064800  
1   4095   411

# Applying KNeighborsClassifier on new features

In [90]:
from sklearn.neighbors import KNeighborsClassifier
clf3 = KNeighborsClassifier(25)
fit  = clf3.fit(X_train, y_train)
predicted = fit.predict(X_test)
accuracy = clf3.score(X_test, y_test)
array1=confusion_matrix(y_test, predicted)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))
print(accuracy)

[[111746   1133   3746     47   5105]
 [  2358  21056  10034    542   9576]
 [  5062   3655  53658    203   8680]
 [   397   1316   1910   2118   3776]
 [  4719   4802   8439   1004  99679]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91    121777
           1       0.66      0.48      0.56     43566
           2       0.69      0.75      0.72     71258
           3       0.54      0.22      0.32      9517
           4       0.79      0.84      0.81    118643

    accuracy                           0.79    364761
   macro avg       0.71      0.64      0.66    364761
weighted avg       0.78      0.79      0.78    364761

0.7902626651423809


# Grid Search for KNeighborsClassifier

In [93]:
from sklearn.model_selection import GridSearchCV
grid_params={
    'n_neighbors':[10,15,20,21,22,25,30,40,50],
    'weights':['uniform','distance'],
    'metric':['euclidian','manhattan']
}
gs=GridSearchCV(KNeighborsClassifier(),grid_params,verbose=1,cv=3,n_jobs=-1)
gs_results=gs.fit(X_train,y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 20.9min finished


In [94]:
gs_results.best_score_

0.7910160106226328

In [97]:
gs_results.best_estimator_


KNeighborsClassifier(metric='manhattan', n_neighbors=50)

In [100]:
joblib.dump(gs.best_estimator_, 'knn.pkl')

['knn.pkl']