In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

#  1 - Reading and exploring the dataset

In [None]:
# Read the train and test CSV files (in that order) into DataFrames.
train, test = map(
    pd.read_csv,
    (
        '../input/human-activity-recognition-with-smartphones/train.csv',
        '../input/human-activity-recognition-with-smartphones/test.csv',
    ),
)

# Preview the first rows of the training frame.
train.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

As we can see, our train dataset has 7352 instances and 563 attributes. 

Now, let us check whether the dataset contains any null values.

In [None]:
# Total number of missing cells in the training frame: the first sum counts
# nulls per column, the second adds those counts up. A result of 0 means there
# are no nulls at all. (A row-wise sum with axis=1 yields one value per row,
# and its truncated display cannot answer that question.)
train.isnull().sum().sum()

Let's build the feature matrix (X) for the training split by dropping the class column (`Activity`) and the `subject` column.

In [None]:
# Feature frame: every column of `train` except the class label and `subject`.
x_train = train.drop(columns=["Activity", "subject"])
x_train

## 1.1 - Exploring the classes

In [None]:
# Class labels of the training split, followed by their distinct values.
y_train = train["Activity"]
y_train.unique()

We can conclude that we have six label classes.

In [None]:
# Frequency table: number of training rows for each class label.
pd.crosstab(y_train, columns="count")

## 1.2 - Preparing the train and test sets
In the following, I define train and test data sets and class labels.

I also encode the class labels as integers using `LabelEncoder`.

In [None]:
# Reload the raw splits so this cell starts from unmodified data and can be
# re-run on its own.
train = pd.read_csv('../input/human-activity-recognition-with-smartphones/train.csv')
test = pd.read_csv('../input/human-activity-recognition-with-smartphones/test.csv')

# Encode the string class labels as integers. The encoder is fitted on the
# training labels ONLY and then reused for the test labels, so both splits are
# guaranteed to share one label -> integer mapping. Re-fitting on the test
# split could silently produce a different mapping, whereas `transform` raises
# if the test split contains a label that was not seen during fitting.
le = LabelEncoder()
train["activity"] = le.fit_transform(train["Activity"])
train["Activity"] = train["activity"]
test["activity"] = le.transform(test["Activity"])
test["Activity"] = test["activity"]

# Features: everything except the label columns and `subject`.
# Targets: the integer-encoded class label.
x_train = train.drop(["Activity", "subject", "activity"], axis=1)
y_train = train.Activity
x_test = test.drop(["Activity", "subject", "activity"], axis=1)
y_test = test.Activity

# Convert the pandas objects to plain NumPy arrays for scikit-learn.
x_train, x_test, y_train, y_test = (
    x_train.values,
    x_test.values,
    y_train.values,
    y_test.values,
)

#  2 - Applying Gradient Boosting Classifier

In the following, I use `GradientBoostingClassifier` from scikit-learn.

I left the parameters at their default values, except for `min_samples_split` (set to 10) and the three parameters I tune: `learning_rate`, `max_depth`, and `subsample`. To find their optimum values, I used `GridSearchCV`, which cross-validates every combination of the specified parameter values. The number of cross-validation folds is set to 2 so that the search runs faster.



In [None]:
model = GradientBoostingClassifier(
    loss="deviance",
    learning_rate=0.1,
    n_estimators=100,
    subsample=1,
    min_samples_split=10,
)

param_grid = {
    "learning_rate": [0.01,0.05, 0.15, 0.2],
    "max_depth": [3, 5],
    "subsample": [0.5, 1.0],
}

GridSearchCV = GridSearchCV(model, param_grid, cv=2)
GridSearchCV.fit(x_train, y_train)


In [None]:
pred = model.predict(x_test)
accuracy = accuracy_score(y_test, pred)
accuracy