In [1]:
# Python 3.5 or later is required.
import sys
assert sys.version_info >= (3,5)

# Is this notebook running on Google Colab? True only when the
# "google.colab" module is already loaded in this interpreter
# (nothing here checks for Kaggle).
IS_COLAB = "google.colab" in sys.modules

# scikit-learn >= 0.20 is required.
# Compare the numeric (major, minor) pair: comparing the version strings
# directly is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
import re
import sklearn
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None, "unrecognized scikit-learn version: " + sklearn.__version__
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20), "scikit-learn >= 0.20 is required"

# Common imports
import numpy as np
import os

# Seed NumPy's global random generator so that NumPy-based randomness
# (e.g. the np.random.permutation shuffle further down) is repeatable
# across runs.
np.random.seed(42)

# To plot pretty figures: render matplotlib output inline and set the font
# sizes of the axis labels and tick labels.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>,
# created here if it does not exist yet.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

# Helper that writes the current matplotlib figure into IMAGES_PATH (PNG by default).
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure as ``IMAGES_PATH/<fig_id>.<fig_extension>``.

    Args:
        fig_id: Base file name (without extension); also echoed in the
            "Saving figure" message.
        tight_layout: When truthy, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    target_file = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target_file, dpi=resolution, format=fig_extension)

In [2]:
import pandas as pd

# Default directory that holds the dataset CSV files.
DATA_PATH = "datasets"

def load_data(data_path=None, filename="train.csv"):
    """Read a CSV file from the dataset directory into a DataFrame.

    Args:
        data_path: Directory containing the file. ``None`` (the default)
            means the module-level ``DATA_PATH``, looked up at call time.
        filename: Name of the CSV file inside ``data_path``.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.
    """
    if data_path is None:
        data_path = DATA_PATH
    return pd.read_csv(os.path.join(data_path, filename))

In [3]:
# Load train.csv from DATA_PATH into a DataFrame.
pd_train = load_data()

In [4]:
# Display the loaded training frame.
pd_train

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.030400,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.123520,-0.998245,-0.975300,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.995380,-0.967187,-0.978944,-0.996520,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.982750,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.016570,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.123320,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,0.299665,-0.057193,-0.181233,-0.195387,0.039905,0.077078,-0.282301,0.043616,0.060410,0.210795,...,-0.880324,-0.190437,0.829718,0.206972,-0.425619,-0.791883,0.238604,0.049819,30,WALKING_UPSTAIRS
7348,0.273853,-0.007749,-0.147468,-0.235309,0.004816,0.059280,-0.322552,-0.029456,0.080585,0.117440,...,-0.680744,0.064907,0.875679,-0.879033,0.400219,-0.771840,0.252676,0.050053,30,WALKING_UPSTAIRS
7349,0.273387,-0.017011,-0.045022,-0.218218,-0.103822,0.274533,-0.304515,-0.098913,0.332584,0.043999,...,-0.304029,0.052806,-0.266724,0.864404,0.701169,-0.779133,0.249145,0.040811,30,WALKING_UPSTAIRS
7350,0.289654,-0.018843,-0.158281,-0.219139,-0.111412,0.268893,-0.310487,-0.068200,0.319473,0.101702,...,-0.344314,-0.101360,0.700740,0.936674,-0.589479,-0.785181,0.246432,0.025339,30,WALKING_UPSTAIRS


In [5]:
# List the column labels of the training frame.
print(pd_train.keys())

Index(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',
       'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',
       'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',
       'tBodyAcc-max()-X',
       ...
       'fBodyBodyGyroJerkMag-kurtosis()', 'angle(tBodyAccMean,gravity)',
       'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'subject', 'Activity'],
      dtype='object', length=563)


In [6]:
# Count the missing values in each column.
print(pd_train.isna().sum())

tBodyAcc-mean()-X       0
tBodyAcc-mean()-Y       0
tBodyAcc-mean()-Z       0
tBodyAcc-std()-X        0
tBodyAcc-std()-Y        0
                       ..
angle(X,gravityMean)    0
angle(Y,gravityMean)    0
angle(Z,gravityMean)    0
subject                 0
Activity                0
Length: 563, dtype: int64


## Train Data, Target 분리

In [7]:
# Target: the 'Activity' label column, as a NumPy array.
y = pd_train['Activity'].values
# Features: every remaining column except 'subject', as a NumPy array.
X = pd_train.drop(columns=['subject', 'Activity']).values

In [8]:
# Sanity-check the shapes of the feature matrix and the target vector.
print(X.shape)
print(y.shape)

# Peek at the raw values of both arrays.
print(X)
print(y)

(7352, 561)
(7352,)
[[ 0.28858451 -0.02029417 -0.13290514 ... -0.84124676  0.17994061
  -0.05862692]
 [ 0.27841883 -0.01641057 -0.12352019 ... -0.8447876   0.18028889
  -0.05431672]
 [ 0.27965306 -0.01946716 -0.11346169 ... -0.84893347  0.18063731
  -0.04911782]
 ...
 [ 0.27338737 -0.01701062 -0.04502183 ... -0.77913261  0.24914484
   0.04081119]
 [ 0.28965416 -0.01884304 -0.15828059 ... -0.78518142  0.24643223
   0.02533948]
 [ 0.35150347 -0.01242312 -0.20386717 ... -0.78326693  0.24680852
   0.03669484]]
['STANDING' 'STANDING' 'STANDING' ... 'WALKING_UPSTAIRS'
 'WALKING_UPSTAIRS' 'WALKING_UPSTAIRS']


## 데이터 순서 섞기

In [9]:
import numpy as np

# Shuffle the rows with a single random permutation applied to both arrays,
# so features and labels stay aligned. The permutation length comes from the
# data itself, so every row is kept whatever the size of the loaded CSV.
assert len(X) == len(y), "X and y must have the same number of rows"
shuffle_index = np.random.permutation(len(X))
X_train, y_train = X[shuffle_index], y[shuffle_index]

## 데이터 스케일링 및 라벨인코딩

In [10]:
from sklearn import preprocessing

# Map the activity-name strings in y_train to integer class ids. The fitted
# encoder is kept so it can be reused on other label arrays.
encoder = preprocessing.LabelEncoder()
Y_train = encoder.fit_transform(y_train)

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardize the features with a scaler fitted on the whole shuffled matrix.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# cells further down split them into folds, so every validation fold
# contributes to the scaling statistics. Wrapping the scaler and the model in
# a Pipeline would keep the fit inside each fold -- consider for a follow-up.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

## Random forest 

10-겹 교차 검증 평균 정확도: 약 98.2%

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; random_state pins the
# forest's own randomness.
forest_clf = RandomForestClassifier(random_state=42)

# Accuracy on each of 10 cross-validation folds.
forest_cv_score = cross_val_score(
    estimator=forest_clf,
    X=X_train_scaled,
    y=Y_train,
    scoring="accuracy",
    cv=10,
)

In [13]:
# Per-fold accuracies of the random forest.
forest_cv_score

array([0.98233696, 0.97690217, 0.98503401, 0.97959184, 0.97959184,
       0.98639456, 0.98367347, 0.97959184, 0.98911565, 0.98231293])

In [14]:
# Mean accuracy over the folds.
np.mean(forest_cv_score)

0.9824545252883763

## SVM

10-겹 교차 검증 평균 정확도: 약 98.9%

In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV

# Candidate SVM settings: an RBF kernel with two gamma values and a linear
# kernel, each combined with four values of C.
params_grid = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Exhaustive search over the grid with 10-fold cross-validation. No scoring
# argument is passed, so GridSearchCV's default scoring applies.
svm_model = GridSearchCV(estimator=SVC(), param_grid=params_grid, cv=10)
svm_model.fit(X_train_scaled, Y_train)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}])

In [16]:
# Best mean cross-validated score found by the grid search.
print('Best score:', svm_model.best_score_)

# Hyper-parameters of the best estimator selected by the search.
print('Best C:',svm_model.best_estimator_.C)
print('Best Kernel:',svm_model.best_estimator_.kernel)
print('Best Gamma:',svm_model.best_estimator_.gamma)

Best score: 0.9889819949719019
Best C: 1000
Best Kernel: rbf
Best Gamma: 0.001


In [18]:
# SVC configured with the best combination printed by the grid search
# (the values are copied by hand from that output).
svm_final_model = SVC(C=1000, kernel="rbf", gamma=0.001)

# Accuracy on each of 10 cross-validation folds.
svm_cv_score = cross_val_score(
    estimator=svm_final_model,
    X=X_train_scaled,
    y=Y_train,
    scoring="accuracy",
    cv=10,
)

SVM 결과

In [19]:
# Per-fold accuracies of the final SVM.
svm_cv_score

array([0.99592391, 0.98641304, 0.98639456, 0.97414966, 0.98911565,
       0.99319728, 0.99047619, 0.99047619, 0.99183673, 0.99183673])

In [20]:
# Mean accuracy over the folds.
np.mean(svm_cv_score)

0.9889819949719019

## ExtraTreesClassifier

10-겹 교차 검증 평균 정확도: 약 98.6%

In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with default hyper-parameters; random_state pins the
# ensemble's own randomness.
extra_clf = ExtraTreesClassifier(random_state=42)

# Accuracy on each of 10 cross-validation folds.
extra_cv_score = cross_val_score(
    estimator=extra_clf,
    X=X_train_scaled,
    y=Y_train,
    scoring="accuracy",
    cv=10,
)

In [22]:
# Per-fold accuracies of the extra-trees model, followed by their mean.
print(extra_cv_score)
print(np.mean(extra_cv_score))

[0.98505435 0.99048913 0.98095238 0.98231293 0.98095238 0.98367347
 0.99183673 0.98639456 0.9877551  0.98911565]
0.985853667553978


## 결론

세 모델 중 10-겹 교차 검증 평균 정확도가 가장 높은 것은 SVM(RBF 커널, C=1000, gamma=0.001)으로, 약 98.9%이다.