In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: wide figures and the ggplot look.
plt.rcParams.update({"figure.figsize": (15, 7)})
plt.style.use("ggplot")

### Quick EDA

In [None]:
# Directory that contains train.csv and test.csv.
FILE_PATH = "../input/human-activity-recognition-with-smartphones"
# Read both splits with the same call; the order of the tuple fixes the assignment.
df_train, df_test = (pd.read_csv(f"{FILE_PATH}/{split}.csv") for split in ("train", "test"))

In [None]:
# Peek at the first rows of the training frame to see the column layout.
df_train.head()

In [None]:
# Summary of the training frame: row count, column dtypes and memory usage.
df_train.info()

In [None]:
# Horizontal bar chart of how many training rows belong to each activity.
activity_counts = df_train["Activity"].value_counts(ascending=False)
ax = activity_counts.plot.barh()
_ = ax.set_title("Distribution of activities")

In [None]:
from sklearn.decomposition import PCA

# Project the feature columns (everything except the last two columns) onto
# two principal components so the activities can be drawn in a single 2-D plot.
pca = PCA(2, random_state=42)
# Slice the frame first, then convert: taking `.values` of the whole frame would
# produce an object-dtype array because of the string "Activity" column.
X_datavis = pca.fit_transform(df_train.iloc[:, :-2].to_numpy())
# x and y must be passed by keyword: recent seaborn releases no longer accept
# them positionally.
ax = sns.scatterplot(x=X_datavis[:, 0], y=X_datavis[:, 1], hue=df_train["Activity"])
_ = ax.set(xlabel="PC 1", ylabel="PC 2", title="Activities in the first two principal components")

In [None]:
# Fraction of the total variance retained by the two fitted components.
np.sum(pca.explained_variance_ratio_)

In [None]:
# Smallest number of components whose cumulative explained variance reaches 95%:
# a float n_components asks PCA to pick that count itself.
pca_2 = PCA(n_components=0.95).fit(df_train.values[:, :-2])
pca_2.n_components_

### Simple XGBoost classification

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Features are every column except the last two; the target is the last column.
features_all = df_train.iloc[:, :-2]
target_all = df_train.iloc[:, -1]

# 80/20 split, stratified on the target so both parts keep the class proportions.
X_train, X_valid, y_train, y_valid = train_test_split(
    features_all,
    target_all,
    train_size=0.8,
    random_state=42,
    stratify=target_all,
)

# The held-out test file is sliced the same way as the training frame.
X_test = df_test.iloc[:, :-2]
y_test = df_test.iloc[:, -1]

In [None]:
# "logistic" is not a valid XGBoost objective name; for a multi-class target the
# matching built-in objective is "multi:softprob".
# NOTE(review): recent XGBoost releases may require integer-encoded class labels
# rather than strings - confirm against the installed version.
cls = XGBClassifier(objective="multi:softprob")
cls.fit(X_train, y_train)

In [None]:
# Score of the fitted classifier on the validation split.
cls.score(X_valid, y_valid)

In [None]:
# Score of the fitted classifier on the separate test file.
cls.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Fix the class order once so the matrix rows/columns and the tick labels agree.
labels = df_train["Activity"].unique()
# confusion_matrix expects (y_true, y_pred): rows are the truth and columns the
# predictions, and normalize="true" scales each truth row to sum to 1.
cm = confusion_matrix(y_valid, cls.predict(X_valid), normalize="true", labels=labels)
plt.imshow(cm)
plt.grid(False)
plt.ylabel("truth")
plt.xlabel("predicted")
plt.title("Confusion matrix for validation")
plt.xticks(range(len(labels)), labels, rotation=90)
_ = plt.yticks(range(len(labels)), labels)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the truth and columns the
# predictions. Passing labels=labels keeps the matrix in the same class order as
# the tick labels drawn below (the default order is the sorted label values).
cm = confusion_matrix(y_test, cls.predict(X_test), normalize="true", labels=labels)
plt.imshow(cm)
plt.grid(False)
plt.ylabel("truth")
plt.xlabel("predicted")
plt.title("Confusion matrix for test")
plt.xticks(range(len(labels)), labels, rotation=90)
_ = plt.yticks(range(len(labels)), labels)

## Feature Importance

Here I'll use two different methods to determine feature importance.

The first uses the `feature_importances_` vector of a decision-tree-based classifier (here a random forest), which is computed from impurity.

The second works as follows: first, we create, fit and score a baseline model. Then, for each feature in turn, we permute (shuffle) its values and take the difference between the baseline accuracy and the model's score on the permuted dataset. The intuition is that if a feature really is important, shuffling it should cause a large decrease in accuracy. (Of course, you can use any metric other than accuracy.)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Random forest on the training split, using all cores and a fixed seed.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc.fit(X=X_train, y=y_train)

In [None]:
# Top 10 features in descending order of impurity-based importance.
# argsort is ascending, so reverse it and keep the first 10 (the previous
# `[:-10:-1]` slice stopped one short and returned only 9 indices).
most_important = rfc.feature_importances_.argsort()[::-1][:10]
imps = rfc.feature_importances_[most_important]
# Importances are indexed by the columns the forest was fitted on.
vars_ = X_train.columns[most_important]
for var, imp in zip(vars_, imps):
    print(f"Feature: {var}, Importance: {imp}")

In [None]:
# Column positions of the highest-ranked features computed in the previous cell.
most_important

In [None]:
# Method 2: manual permutation importance
from sklearn.linear_model import RidgeClassifier
from tqdm.auto import tqdm

# Ridge Classifier was chosen due to its speed.
base = RidgeClassifier()
base.fit(X_train, y_train)
base_acc = base.score(X_valid, y_valid)
# Derive the feature count from the data instead of hard-coding it.
N_features = X_valid.shape[1]
# Seeded generator so the shuffles, and therefore the ranking, are reproducible.
rng = np.random.default_rng(42)
feature_imp = []

for feature in tqdm(range(N_features)):
    # Fresh copy per feature so exactly one column is shuffled for each score.
    # Keeping it a DataFrame preserves the column names the model was fitted with.
    X_valid_new = X_valid.copy()
    X_valid_new.iloc[:, feature] = rng.permutation(X_valid_new.iloc[:, feature].to_numpy())
    # Importance = drop in accuracy relative to the unshuffled baseline.
    feature_imp.append(base_acc - base.score(X_valid_new, y_valid))

In [None]:
feature_imp = np.array(feature_imp)
# Top 10 features in descending order of accuracy drop.
# argsort is ascending, so reverse it and keep the first 10 (the previous
# `[:-10:-1]` slice stopped one short and returned only 9 indices).
most_important = feature_imp.argsort()[::-1][:10]
imps = feature_imp[most_important]
vars_ = df_train.columns[most_important]
for var, imp in zip(vars_, imps):
    print(f"Feature: {var}, Importance: {imp}")

In [None]:
# It's also provided by sklearn
from sklearn.inspection import permutation_importance

# A single repeat, matching the one shuffle per feature used in the manual loop.
# random_state pins the shuffles so the ranking is reproducible across runs.
result = permutation_importance(base, X_valid, y_valid, n_repeats=1, n_jobs=-1, random_state=42)

In [None]:
# Top 10 features in descending order of mean permutation importance.
# argsort is ascending, so reverse it and keep the first 10 (the previous
# `[:-10:-1]` slice stopped one short and returned only 9 indices).
most_important = result["importances_mean"].argsort()[::-1][:10]
imps = result["importances_mean"][most_important]
vars_ = df_train.columns[most_important]
for var, imp in zip(vars_, imps):
    print(f"Feature: {var}, Importance: {imp}")

### Visualization with only the top two most important features

In [None]:
# Column positions of the two highest-ranked features from the latest ranking.
imp1, imp2 = most_important[:2]

# x and y must be passed by keyword: recent seaborn releases no longer accept
# them positionally.
sns.scatterplot(x=X_train.iloc[:, imp1], y=X_train.iloc[:, imp2], hue=y_train)
_ = plt.title("Scatter plot with the two most important features")

From the graph above, we can see why they are considered important: 'laying' can easily be separated from the other activities using these two features alone.