In [None]:
# Load Dataset
import pandas as pd
import numpy as np

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the squat dataset from a CSV path that is relative to the working directory.
# Later cells read joint-coordinate columns from this frame (e.g. leftHip_x, leftAnkle_x)
# as well as the 'good' label column.
apex_df = pd.read_csv('data/apex.csv')

In [None]:
# Decision trees tend to overfit small datasets.
# ONE-SHOT LEARNING: learn a differentiator rather than a predictor,
# i.e. maximize the difference between a good score and a bad score.
# Transfer learning
# A combination of both
# Metrics: true positives, true negatives, sensitivity, specificity, recall

### Feature Engineering

In [None]:
'''
Important features to look at for 'good' squats (computed for each body side):
1) angle at hip
2) difference in x-coordinates of hip and ankle
3) difference in y-coordinates of hip and knee
'''


def add_hip_features(frame, side, tag):
    """Append the three hip-derived feature columns for one body side, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns `<side>Hip_x`, `<side>Hip_y`, `<side>Ankle_x`
        and `<side>Knee_y`.
    side : str
        Prefix of the joint columns, e.g. 'left' or 'right'.
    tag : str
        Short side marker used in the new offset column names, e.g. 'l' or 'r'.

    Returns
    -------
    pandas.DataFrame
        The same `frame` object with `xcoord_<tag>hip_ank`, `ycoord_<tag>hip_knee`
        and `<side>_hip_angle` added, in that order.
    """
    x_col = 'xcoord_{}hip_ank'.format(tag)
    y_col = 'ycoord_{}hip_knee'.format(tag)
    frame[x_col] = frame[side + 'Hip_x'] - frame[side + 'Ankle_x']
    frame[y_col] = frame[side + 'Hip_y'] - frame[side + 'Knee_y']
    # NOTE: a zero x offset makes the ratio +/-inf (arctan then gives +/-pi/2),
    # while 0/0 produces NaN in the angle column.
    frame[side + '_hip_angle'] = np.arctan(frame[y_col] / frame[x_col])
    return frame


# Work on a copy so the raw frame loaded from disk stays untouched.
features = apex_df.copy()
add_hip_features(features, 'left', 'l')
add_hip_features(features, 'right', 'r')

# Only the last expression of a cell is displayed, so preview the result once here.
features.head()

### Decision Tree Classifier

In [None]:
'''
A decision tree is fit on the training split, using every column except the label.
Decision Tree Classifiers tend to overfit small datasets, so treat the score
below as a rough baseline.
'''

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Target is the 'good' label; every remaining column is used as a predictor.
y = features['good']
X = features.drop(['good'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Seed the estimator as well as the split: the tree permutes candidate features
# at each split, so without random_state the reported accuracy can change
# between otherwise identical runs.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = clf.score(X_test, y_test)
print('Test Accuracy: ', score)

### Random Forest Classifier

In [None]:
'''
A Random Forest Classifier builds multiple decision trees, each from a randomly
selected set of observations/rows and features/variables, and then averages
their results.
'''
from sklearn.ensemble import RandomForestClassifier

# Target is the 'good' label; every remaining column is used as a predictor.
y = features['good']
X = features.drop(['good'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Row and feature sampling are random, so seed the forest to make the
# reported accuracy reproducible across runs.
clf_rf = RandomForestClassifier(random_state=42)
clf_rf = clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)
score_rf = clf_rf.score(X_test, y_test)
print('Test Accuracy: ', score_rf)

### Logistic Regression

In [None]:
'''
Logistic Regression models a binary outcome.
Here it estimates the probability that a squat is 'good' (label 1).
'''
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the label away from the predictors, then hold out 15% for testing.
y = features['good']
X = features.drop(columns=['good'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# NOTE(review): default solver settings are used and warnings are silenced
# globally at the top of the notebook, so a ConvergenceWarning would not be
# visible here -- consider inspecting clf_log.n_iter_.
clf_log = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_log = clf_log.predict(X_test)
score_log = clf_log.score(X_test, y_test)
print('Test Accuracy: ', score_log)

### Sequential Model

In [None]:
'''
Installation of pip, tensorflow, keras:
run this in terminal to install pip:  curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
run this in terminal to install tensorflow: pip install --upgrade tensorflow
run this in terminal to install keras: pip install keras
'''
import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential
from keras.layers import Dropout
from tensorflow.keras.layers import Dense

In [None]:
# Target is the 'good' label; every remaining column is used as a network input.
y = features['good']
x = features.drop(['good'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True)

# The core data structure in Keras is a model: an object in which we organize layers.
model_k = Sequential()  # instantiate an empty Sequential model

# Model construction (builds the computational graph).
# The input width is taken from the training data rather than hard-coded, so the
# network keeps working when feature columns are added or removed.
model_k.add(Dense(units=64, input_dim=X_train.shape[1], activation='relu'))
model_k.add(Dense(32, activation=tf.nn.relu))
model_k.add(Dense(4, activation='sigmoid'))
# Binary classification head: a single sigmoid unit outputs P(good).
# Softmax over one unit normalizes to a constant 1.0, which makes the
# binary cross-entropy loss impossible to minimize.
model_k.add(Dense(units=1, activation='sigmoid'))

model_k.compile(optimizer='rmsprop',
                loss='binary_crossentropy',
                metrics=['accuracy'])

history = model_k.fit(X_train, y_train, epochs=10, batch_size=10)

# sklearn.externals.joblib was removed from scikit-learn; prefer the standalone
# joblib package and fall back to the vendored copy on old installations only.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import pickle
# NOTE(review): pickling a Keras model depends on the installed Keras version;
# the natively supported persistence path is model_k.save(...). The pickle
# artifact path is kept unchanged in case other code loads './data/model.pkl'.
joblib.dump(model_k, './data/model.pkl')

# Evaluate the model on the held-out test set.
loss, accuracy = model_k.evaluate(X_test, y_test, batch_size=10)
print('Test accuracy:', accuracy)


### K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [None]:
'''
A k-nearest-neighbors model classifies a data point by looking at its 'k' closest
labeled points and assigning the majority label among them. Here the engineered
hip features are used to label a squat as good or bad (1 or 0).
K-fold cross validation estimates how well the model generalizes: the data set is
split into k (5) groups, and each group in turn serves as the test set while the
remaining groups form the training set.
'''

# Restrict the frame to the six engineered hip features plus the label.
feat = features.copy()
feat_new = feat.loc[:, [
    'xcoord_lhip_ank',
    'ycoord_lhip_knee',
    'left_hip_angle',
    'xcoord_rhip_ank',
    'ycoord_rhip_knee',
    'right_hip_angle',
    'good',
]]
y = feat_new['good']
x = feat_new.drop(columns=['good'])

# Stratified 80/20 split so both sets keep the same good/bad proportions.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Hold-out evaluation: fit a 3-neighbor classifier and score it on the test split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn.predict(X_test)[:5]  # result is discarded: not the last expression of the cell
knn_score = knn.score(X_test, y_test)

# Cross-validated evaluation: a fresh, unfitted classifier scored over 5 folds
# of the full feature set.
knn_cv = KNeighborsClassifier(n_neighbors=3)
cv_scores = cross_val_score(knn_cv, x, y, cv=5)

# Report the hold-out accuracy and the mean accuracy across the folds.
print('Accuracy:', knn_score)
print('Cross Validation Mean:{}'.format(cv_scores.mean()))


In [None]:
#feat.shape
#kmeans = KMeans(n_clusters=2).fit(feat_new)
#centroids = kmeans.cluster_centers_
#print(centroids)


#kmeans = KMeans(n_clusters=2, random_state=42).fit(feat_std)
#labels = kmeans.labels_

#Glue back to original data
#feat['clusters'] = labels

#Add the column into our list
#feat(['clusters'])

#Lets analyze the clusters
#feat['good'].groupby(['clusters']).mean()