![RP Tango](./dataset/Tango_Management_logo.png "RP Tango")

***
# Smartphone-Based Recognition of Human Stance 
Objective : Detect Human Stance (sitting or standing)
## Based on Activities and Postural Transitions Data Set
http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions#
***

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6

from sklearn.decomposition import PCA
from sklearn import preprocessing

# Configure Bokeh so that subsequent show() calls render inside the notebook.
output_notebook()

### 1: Data-Import and DataFrame Manipulation

In [20]:
# Data import and DataFrame manipulation.
# The train/test/label files are whitespace-delimited and carry no header row,
# so column names are supplied explicitly. `sep=r'\s+'` is the documented
# equivalent of `delim_whitespace=True`, which pandas deprecated in 2.2.
columnas_features = ["feature" + str(i) for i in range(1, 562)]  # feature1 .. feature561

# Lookup tables shipped with the dataset.
activity_labels = pd.read_csv('./dataset/activity_labels.txt', header=None, sep=r'\s+', names=["id", "label"])
features = pd.read_csv('./dataset/features.txt', header=None)

# Training split: feature matrix and single-column target.
x_train_raw = pd.read_csv('./dataset/Train/X_train.txt', header=None, sep=r'\s+', names=columnas_features)
y_train_raw = pd.read_csv('./dataset/Train/y_train.txt', header=None, sep=r'\s+', names=['target'])

# Test split, read with the same layout as the training split.
x_test_raw = pd.read_csv('./dataset/Test/X_test.txt', header=None, sep=r'\s+', names=columnas_features)
y_test_raw = pd.read_csv('./dataset/Test/y_test.txt', header=None, sep=r'\s+', names=['target'])



In [24]:
# Report the (rows, columns) of the raw feature and target frames for each split.
print("RAW Shape")
print(f"TRAIN : x.shape: {x_train_raw.shape} y.shape: {y_train_raw.shape}")
print(f"TEST : x.shape: {x_test_raw.shape} y.shape: {y_test_raw.shape}")

RAW Shape
TRAIN : x.shape: (7767, 561) y.shape: (7767, 1)
TEST : x.shape: (3162, 561) y.shape: (3162, 1)


In [26]:
# Class identification for the two stances of interest, Sitting and Standing
# (ids taken from activity_labels.head(5)):
#   id  label
#   4   SITTING
#   5   STANDING



In [10]:
# Summary statistics of the raw training features.
# `x_train` is not defined by any visible cell (the import cell creates
# `x_train_raw`), so the original name raised NameError on a fresh kernel.
x_train_raw.describe()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature552,feature553,feature554,feature555,feature556,feature557,feature558,feature559,feature560,feature561
count,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,...,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0,7767.0
mean,0.038759,-0.000647,-0.018155,-0.599017,-0.634424,-0.69127,-0.623886,-0.657884,-0.740154,-0.3602,...,0.161745,-0.316548,-0.625132,0.016774,0.018471,0.009239,-0.005184,-0.485936,0.05031,-0.052888
std,0.101996,0.099974,0.089927,0.441481,0.367558,0.321641,0.418113,0.348005,0.272619,0.499259,...,0.237319,0.313899,0.302581,0.331326,0.44354,0.601208,0.477218,0.509278,0.300866,0.276196
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-0.958535,-1.0,-1.0,-0.97658,-1.0,-1.0,-1.0,-1.0,-1.0,-0.987874
25%,0.032037,-0.011209,-0.028448,-0.99214,-0.98357,-0.984661,-0.992902,-0.984131,-0.986661,-0.795613,...,0.020312,-0.548129,-0.843966,-0.108225,-0.261002,-0.470267,-0.373565,-0.810953,-0.047752,-0.14056
50%,0.038975,-0.002921,-0.019602,-0.914202,-0.82797,-0.827696,-0.924421,-0.838559,-0.852735,-0.717007,...,0.170819,-0.35398,-0.710071,0.017627,0.029079,0.001515,-0.005503,-0.706619,0.176777,0.004583
75%,0.044,0.004303,-0.011676,-0.246026,-0.313069,-0.450478,-0.294903,-0.362671,-0.540521,0.054178,...,0.31624,-0.137462,-0.503837,0.167695,0.314876,0.496871,0.35269,-0.488765,0.246834,0.109507
max,1.0,1.0,1.0,1.0,0.945956,1.0,1.0,0.960341,1.0,1.0,...,1.0,0.938491,0.911653,1.0,1.0,0.998702,0.991288,1.0,0.482229,1.0


In [11]:
# Count missing values across every cell of the raw feature frames.
# The frames are referenced by the names the import cell actually defines
# (`x_train_raw` / `x_test_raw`); `x_train` / `x_test` are not defined by any
# visible cell and raised NameError on a fresh kernel.
print('Total number of NaN in TRAIN dataframe: ', x_train_raw.isnull().sum().sum())
print('Total number of NaN in TEST dataframe: ', x_test_raw.isnull().sum().sum())

Total number of NaN in TRAIN dataframe:  0
Total number of NaN in TEST dataframe:  0


### 2: Principal Components Analysis
To perform PCA we use the `PCA` class from sklearn, which was already imported at the top of the notebook. In the cell below, the features are first standardized and the PCA is then fitted to them; the resulting principal components are loaded into a dataframe. You can view the data by typing `principalComponents` or `principalDataframe` in a cell and running it.

In [33]:
from sklearn.preprocessing import StandardScaler

# Extract the feature matrix and the target column from the raw training
# frames as NumPy arrays.
PCA_x = x_train_raw[columnas_features].to_numpy()
PCA_y = y_train_raw[['target']].to_numpy()

# Standardize the features before PCA.
PCA_x = StandardScaler().fit_transform(PCA_x)

# Fit a PCA that keeps all 561 components and project the training data onto it.
pca = PCA(n_components=561)
principalComponents = pca.fit_transform(PCA_x)

# NOTE: the columns reuse the "featureN" labels, but each column now holds the
# scores of the N-th principal component rather than an original feature.
principalDataframe = pd.DataFrame(principalComponents, columns=columnas_features)

# Combine X (component scores) and Y (target) side by side.
targetDataframe = y_train_raw[['target']]
XY_Dataframe = pd.concat([principalDataframe, targetDataframe], axis=1)

In [91]:
# Percentage of variance explained by each principal component, rounded to
# two decimals for display in the plot.
percent_variance = np.round(pca.explained_variance_ratio_ * 100, decimals=2)
component_index = np.arange(1, len(percent_variance) + 1)

# NOTE(review): newer Bokeh releases renamed `plot_height`/`plot_width` to
# `height`/`width` - confirm against the installed Bokeh version.
p = figure(plot_height=400, plot_width=800, title='PCA Scree Plot', x_axis_label='Features',
           y_axis_label='Percentage of Variance Explained')

# One bar per component. The previous quad() call passed identical left and
# right edges, so every bar had zero width and the red fill was never drawn.
p.vbar(x=component_index, top=percent_variance, bottom=0, width=0.9,
       fill_color='red', line_color='black')
show(p)

# Cumulative explained variance is computed from the unrounded ratios: summing
# the per-component values after rounding accumulates rounding error over
# hundreds of components. Only the final figure is rounded for printing.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_) * 100
for n_components in (100, 150, 200, 300):
    print('PCA variance_ratio Primeros {} Features: '.format(n_components),
          round(float(cumulative_variance[n_components - 1]), 2))

# The first 300 components explain roughly 99.8% of the variance in the original data.

PCA variance_ratio Primeros 100 Features:  94.73999999999998
PCA variance_ratio Primeros 150 Features:  97.96
PCA variance_ratio Primeros 200 Features:  99.28
PCA variance_ratio Primeros 300 Features:  99.8
