# Human Activity Recognition

In [1]:
!gdown https://drive.google.com/uc?id=1ypa5iZ1dLDO-zGRO_yDXrJnMvjRGUG4v

Downloading...
From: https://drive.google.com/uc?id=1ypa5iZ1dLDO-zGRO_yDXrJnMvjRGUG4v
To: C:\workspace\flyai\HumanActivityRecognition.zip

  0%|          | 0.00/25.7M [00:00<?, ?B/s]
  2%|2         | 524k/25.7M [00:00<00:09, 2.69MB/s]
 10%|#         | 2.62M/25.7M [00:00<00:02, 9.43MB/s]
 18%|#8        | 4.72M/25.7M [00:00<00:01, 13.1MB/s]
 29%|##8       | 7.34M/25.7M [00:00<00:01, 17.2MB/s]
 39%|###8      | 9.96M/25.7M [00:00<00:00, 19.5MB/s]
 47%|####6     | 12.1M/25.7M [00:00<00:00, 19.9MB/s]
 55%|#####5    | 14.2M/25.7M [00:00<00:00, 20.1MB/s]
 63%|######3   | 16.3M/25.7M [00:00<00:00, 18.8MB/s]
 71%|#######1  | 18.4M/25.7M [00:01<00:00, 19.4MB/s]
 82%|########1 | 21.0M/25.7M [00:01<00:00, 18.8MB/s]
 94%|#########3| 24.1M/25.7M [00:01<00:00, 20.6MB/s]
100%|##########| 25.7M/25.7M [00:01<00:00, 18.0MB/s]


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [3]:
human = pd.read_csv('./HumanActivityRecognition/train.csv')
human.shape

(7352, 563)

#### 데이터 준비
- 결측치/중복치 제거
- 중복치
- X, y 분리
- y값 인코딩

- 정규화 (표준정규화)
- GridSearchCV : 베스트 모델 찾기
- 베스트 모델로 텍스트파일 읽어서 score 계산하기.

In [4]:
# 파일 잘 들어왔는지 확인
human.shape

(7352, 563)

In [5]:
# 결측지 확인 및 제거

human = human.dropna()
human.isnull().sum().value_counts()

0    563
dtype: int64

In [6]:
# 중복치 확인

human.duplicated().sum()

0

In [7]:
# columns 확인 및 target 확인
human.columns

Index(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',
       'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',
       'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',
       'tBodyAcc-max()-X',
       ...
       'fBodyBodyGyroJerkMag-kurtosis()', 'angle(tBodyAccMean,gravity)',
       'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'subject', 'Activity'],
      dtype='object', length=563)

In [8]:
human['Activity'].unique()

array(['STANDING', 'SITTING', 'LAYING', 'WALKING', 'WALKING_DOWNSTAIRS',
       'WALKING_UPSTAIRS'], dtype=object)

In [9]:
# 'Activity' 수치화
human['Activity'] = human['Activity'].map({
    'STANDING': 0,
    'SITTING': 1,
    'LAYING': 2,
    'WALKING': 3,
    'WALKING_DOWNSTAIRS': 4,
    'WALKING_UPSTAIRS': 5
    })

In [10]:
human['Activity'].isnull().sum()

0

In [11]:
X = human.iloc[:, :-1]
y = human.iloc[:, -1]
# subject = human.iloc[:, -2]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(5881, 562) (1471, 562)
(5881,) (1471,)


In [13]:
# 스케일러 선언(표준화) 및 X_train 표준화
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_train

array([[ 0.05539972,  0.06951267,  0.04958205, ...,  0.50057889,
         0.13391142, -0.71188214],
       [-0.11783822, -0.0968819 ,  0.31284518, ..., -2.03723688,
        -1.31666206, -0.04297491],
       [-0.81177185, -0.6976301 , -0.69121041, ..., -0.13884357,
         0.6734921 , -0.15445945],
       ...,
       [-0.32591332,  0.59175554,  0.86017578, ...,  0.88444093,
         0.40270946, -1.82672752],
       [-0.67731025,  0.5104703 , -0.21169271, ...,  0.60660454,
         0.15183553, -1.26930483],
       [ 0.93767005,  0.40544321,  0.72394589, ...,  0.44080701,
         0.72726848,  0.62593232]])

In [14]:
# y_train 도 표준화
y_train = y_train.values

In [18]:
y_train

array([0, 2, 3, ..., 5, 3, 3], dtype=int64)

In [19]:
# C=1 <- 이 값을 바꿔보면?
param_range = [0.001, 0.01, 0.1, 1, 10, 100]

param = [
    {
        # gridSearch 1
        'C' : param_range,
        'gamma' : param_range, 
        'kernel' : ['rbf']
    },
    {
        # gridSearch 2
        'C' : param_range,
        'kernel' : ['linear']
    },
    {
        # gridSearch 3
        'C' : param_range,
        'degree' : [2, 3],
        'kernel' : ['poly']
    }
]

In [20]:
from sklearn.model_selection import GridSearchCV # Gridsearch와 CV를 같이함

# 어떤 모델을 사용할지 정하자
clf = SVC(random_state = 2022)

gs = GridSearchCV(estimator = clf,
                  param_grid = param,
                 scoring = 'accuracy',
                  cv = 5,
                  n_jobs = -1, # 내가 가진 PC의 성능 최대
                  verbose = 3 # 로그 표시 
                 ) # CV = Cross Validation / estimator : 테스트할 모델

gs.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [23]:
print(gs.best_estimator_)
print(gs.best_score_)
print(gs.best_params_)

SVC(C=100, gamma=0.001, random_state=2022)
0.9858857986695103
{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}


In [21]:
# - 베스트 모델로 텍스트파일 읽어서 score 계산하기. 

In [25]:
clf = SVC(C=100, gamma=0.001, kernel = 'rbf', random_state = 2022)
clf.fit(X_train, y_train)

In [31]:
# X_test도 스케일링을 해준다.

scaler = StandardScaler()
scaler.fit(X_test)
X_test = scaler.transform(X_test)
X_test

array([[-0.45523418, -1.16467464, -2.03231041, ...,  0.62942969,
         1.19486517,  0.60902179],
       [-1.75121047,  0.2876541 , -0.09979937, ...,  0.24311075,
        -0.08408319,  0.94263812],
       [-0.2169866 , -0.45236066,  0.26746492, ..., -0.46279861,
        -3.18090975, -1.17026531],
       ...,
       [-1.89766285, -1.47242222,  1.29549129, ...,  0.85371924,
         0.63490829, -1.83749797],
       [-0.15430522,  0.9663005 ,  0.50105493, ...,  0.37655787,
         0.03863153,  0.94263812],
       [ 2.70408486, -1.27704855, -0.34187428, ...,  0.83383252,
         0.20525554, -1.05905986]])

In [30]:
clf.score(X_test, y_test)

0.9877634262406526