In [1]:
# Mount Google Drive at /content/drive. Later cells read kaggle.json and the HAR CSV files from this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip -q install --upgrade --force-reinstall --no-deps kaggle > log  # upgrade kaggle package (to avoid a warning)
!mkdir -p ~/.kaggle                                           # .kaggle folder must contain kaggle.json for kaggle executable to properly authenticate you to Kaggle.com
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json >log  # First, download kaggle.json from kaggle.com (in Account page) and place it in the root of mounted Google Drive
!cp kaggle.json ~/.kaggle/kaggle.json > log                   # Alternative location of kaggle.json (without a connection to Google Drive)
!chmod 600 ~/.kaggle/kaggle.json                              # give only the owner full read/write access to kaggle.json
!kaggle config set -n competition -v nov1122har               # set the competition context for the next few kaggle API calls. !kaggle config view - shows current settings
!kaggle competitions download >> log                          # download competition dataset as a zip file
!unzip -o *.zip >> log                                        # Kaggle dataset is copied as a single file and needs to be unzipped.
!kaggle competitions leaderboard --show                       # print public leaderboard

cp: cannot stat '/content/drive/MyDrive/kaggle.json': No such file or directory
- competition is now set to: nov1122har
100% 601M/601M [00:16<00:00, 37.4MB/s]
Using competition: nov1122har
 teamId  teamName        submissionDate       score    
-------  --------------  -------------------  -------  
9496135  🏃-baseline.csv  2021-11-22 05:48:49  0.92939  


In [2]:
%%time
%%capture
%reset -f
!pip -q install -U plotly > log
from IPython.core.interactiveshell import InteractiveShell as IS; IS.ast_node_interactivity = "all" 
import numpy as np, pandas as pd, time, matplotlib.pyplot as plt, os, plotly.express as px
import tensorflow as tf, tensorflow.keras as keras
from sklearn.neural_network import MLPClassifier   # SKLearn's MLP is optimised for CPU (and doesn't use GPU)
from keras.layers import Flatten, Dense
from sklearn.model_selection import train_test_split
np.set_printoptions(linewidth=10000, precision=2, edgeitems=20, suppress=True)
# Use fully qualified option names. Abbreviated patterns such as 'max_columns' and 'max_rows' also match
# the 'styler.render.*' options in newer pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_colwidth', 1000, 'display.max_columns', 100, 'display.width', 1000, 'display.max_rows', 4)

def ToCSV(df, fname):
  """Write `df`, rounded to 2 decimals, to '<fname>.csv' with the index column labelled 'id'.

  df:    DataFrame to save (for example the predicted labels).
  fname: output path WITHOUT the '.csv' extension.
  Returns whatever DataFrame.to_csv returns (None when writing to a path).
  """
  return df.round(2).to_csv(f'{fname}.csv', index_label='id')

class Timer():
  """Wall-clock stopwatch that reports elapsed time against a runtime limit (in seconds)."""

  def __init__(self, lim:'RunTimeLimit'=60):
    """Record the start time and the limit `lim`, then announce that the clock is running."""
    self.t0 = time.time()
    self.lim = lim
    print(f'⏳ started. You have {lim} sec. Good luck!')

  def ShowTime(self):
    """Print the elapsed runtime; print it as a bold red warning once it exceeds the limit.

    The comparison subtracts 1 second from the elapsed time, i.e. it allows one second of slack.
    """
    msg = f'Runtime is {time.time()-self.t0:.0f} sec'
    if (time.time() - self.t0 - 1) > self.lim:
      # ANSI escapes: \033[91m = bright red, \033[1m = bold, \033[0m = reset
      msg = '\033[91m\033[1m' + msg + f' > {self.lim} sec limit!!!\033[0m'
    print(msg)

CPU times: user 4.04 s, sys: 510 ms, total: 4.55 s
Wall time: 40.9 s


In [3]:
%time vX  = pd.read_csv('/content/drive/MyDrive/HAR/testX.csv', index_col='id')  # load testing input features X (only); the 'id' column becomes the index
%time tYX = pd.read_csv('/content/drive/MyDrive/HAR/trainYX.csv')                # load the whole training file (no row limit is passed): labels in column 'y' plus input features X
tYX  # preview the training frame: label column 'y' followed by 561 input features

CPU times: user 455 ms, sys: 56.6 ms, total: 511 ms
Wall time: 1.07 s
CPU times: user 56.9 s, sys: 11.3 s, total: 1min 8s
Wall time: 1min 24s


Unnamed: 0,y,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,...,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560
0,5,0.2778,0.0092,-0.0676,-0.9785,-0.9160,-0.9610,-0.9834,-0.9170,-0.9590,-0.9390,-0.4230,-0.7520,0.8496,0.6226,0.8400,-0.9434,-0.9614,-1.0370,-1.0150,-1.0070,-0.9640,-0.9550,-0.6772,0.0568,0.0192,0.5900,-0.3162,0.1833,0.4440,-0.2622,0.1092,0.4468,-0.4443,-0.1484,0.1718,-0.2727,0.0954,-0.4720,-0.5264,0.2332,0.9640,-0.1309,0.1071,-0.9814,-0.948,-0.9727,-0.9720,-0.9575,-0.9585,...,-0.9126,-0.2037,-0.5300,-0.8164,-0.9170,-0.8850,-0.9033,-0.9120,-0.9750,-0.9326,-1.014,-0.9560,-0.6780,-0.9966,-0.6180,-0.1021,-0.5977,-0.9546,-0.9110,-0.9260,-0.9297,-1.017,-0.9460,-1.022,-0.9570,-0.2930,-1.0100,-0.3455,-0.1411,-0.5215,-0.9585,-0.9160,-0.9434,-0.9414,-0.9750,-0.9414,-0.9890,-0.9610,-0.4453,-1.002,-0.5415,-0.0308,-0.5093,0.0380,-0.0912,-0.1415,-0.1316,-0.8200,0.1721,-0.0535
1,1,0.2454,0.0073,-0.1046,-0.2010,0.1426,-0.2668,-0.2776,0.0648,-0.2605,-0.0572,-0.0364,-0.2830,-0.2830,-0.1448,0.4443,-0.0844,-0.6733,-0.7603,-0.7847,-0.4136,-0.3633,-0.1837,0.2830,0.5100,0.0582,-0.2502,0.3079,-0.1384,0.0822,0.0902,-0.0034,0.1969,0.0538,0.2996,-0.0258,0.0936,-0.3472,-0.1434,-0.4058,0.3690,0.9326,-0.2942,-0.0916,-0.9966,-0.964,-0.9663,-0.9746,-0.9736,-0.9634,...,-0.8115,0.4165,-0.4731,-0.8210,0.2542,0.2410,0.2688,0.0928,-0.7710,0.2430,-0.221,-0.1018,0.7134,-0.8994,-0.0642,-0.0842,-0.4750,-0.1345,-0.3853,-0.2573,-0.5430,-0.757,-0.1365,-0.677,-0.1826,0.6777,-0.7866,0.3240,-0.6206,-0.8530,-0.2500,-0.3025,-0.3176,-0.3198,-0.6426,-0.2488,-0.7236,-0.2512,0.6177,-0.910,0.1069,-0.0397,-0.4220,0.5480,0.6455,0.2296,-0.0335,-0.7000,0.2998,0.0880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499998,4,0.2740,-0.0132,-0.1257,-0.9834,-1.0020,-0.9590,-0.9897,-0.9746,-0.9873,-0.9346,-0.5630,-0.8394,0.8306,0.6846,0.8350,-0.9840,-0.9824,-0.9960,-1.0200,-0.9950,-1.0160,-0.9590,-0.6500,-0.5225,-0.7974,0.5020,-0.2532,0.3723,0.1772,0.2920,-0.2756,0.3179,-0.1398,0.0948,0.0180,-0.1853,0.1871,0.0790,-0.0402,-0.0880,0.9785,-0.0442,-0.0532,-0.9950,-1.028,-0.9790,-1.0010,-0.9697,-1.0060,...,-0.2356,0.4312,-0.6030,-0.8706,-0.9700,-0.9863,-1.0010,-0.9990,-0.9927,-0.9910,-1.020,-1.0210,-0.9920,-0.9736,0.3857,-0.4620,-0.7485,-0.9985,-0.9575,-0.9897,-0.9814,-1.000,-0.9990,-1.008,-0.9660,-0.8574,-0.9210,0.1049,-0.6284,-0.8970,-1.0200,-1.0150,-0.9750,-1.0170,-0.9746,-0.9937,-0.9927,-0.9950,-1.0030,-0.844,0.2454,-0.3782,-0.7183,-0.0227,0.1957,0.1864,0.4556,-0.9326,0.1137,0.0595
499999,5,0.2695,-0.0251,-0.1010,-1.0170,-0.9050,-0.9375,-0.9736,-0.8920,-0.9673,-0.9575,-0.5293,-0.8022,0.8530,0.6714,0.8480,-0.9624,-1.0205,-0.9900,-0.9600,-0.9960,-0.9480,-0.9720,-0.7320,-0.5117,-0.3535,0.3710,-0.2270,0.2700,-0.0636,-0.2438,0.0608,0.2050,-0.0218,-0.1199,0.0678,0.0154,-0.1132,-0.2886,-0.3882,0.6284,0.9966,-0.1277,0.0722,-1.0050,-0.925,-0.9440,-1.0050,-0.9824,-0.9233,...,-0.9500,0.0488,-0.3591,-0.7050,-1.0240,-0.9790,-0.9746,-0.9814,-0.9920,-0.9814,-1.013,-0.9860,-0.9650,-1.0150,-0.1430,-0.1555,-0.5180,-0.9320,-0.9200,-0.9424,-0.9326,-0.932,-0.9170,-0.985,-0.9463,-0.4020,-0.9640,-0.3160,-0.0948,-0.4695,-0.9590,-0.9500,-0.9976,-0.9680,-1.0340,-0.9727,-0.9900,-0.9790,-0.6980,-1.017,-0.4863,0.0084,-0.3293,-0.0127,-0.1399,0.4624,-0.7610,-0.8696,0.1720,-0.0272


In [4]:
tYX['y'].value_counts(sort=False).to_frame().T  # number of training observations per label, shown as a single row

Unnamed: 0,5,1,3,4,2,6
y,93667,83502,66901,87427,72554,95949


In [52]:
tmr = Timer(lim=60) # start the clock with a 60-second runtime limit. Add all of your code after the timer

⏳ started. You have 60 sec. Good luck!


<hr color=red>

<font size=5>⏳</font> <strong><font color=orange size=5>Your Code, Documentation, Ideas and Timer - All Start Here...</font></strong>

Students: Keep all your definitions, code, documentation **between** ⏳ symbols.

## **Task 1. Preprocessing Pipeline**
 
Explain the elements of your preprocessing pipeline, e.g., feature engineering, subsampling, clustering, dimensionality reduction, etc. 
1. Why did you choose these elements? (Something in EDA, prior experience,...? Btw, EDA is not required)
1. How do you evaluate the effectiveness of these elements? 
1. What else have you tried that worked or didn't? 

**Student's answer:**

## **Task 2. Modeling Approach**
Explain your modeling approach, i.e. ideas you tried and why you thought they would be helpful. 

1. How did these decisions guide you in modeling?
1. How do you evaluate the effectiveness of these elements? 
1. What else have you tried that worked or didn't? 

**Student's answer:**

In [53]:
# Use only the first 32,000 training rows. Slice with head() BEFORE drop() so that only the subsample
# is processed, instead of first building a 'y'-less copy of every row (this runs inside the timed section).
tX = tYX.head(32000).drop('y', axis=1)   # input features of the subsample
tY = tYX.head(32000).y-1                 # shift labels by -1 to range {0,1,2,3,4,5}

In [54]:
# Build and compile the baseline network: one 10-unit Dense layer followed by a 6-way softmax output.
tf.random.set_seed(0)   # always seed your experiments
Init = keras.initializers.RandomNormal(seed=0) # seeded initializer, passed below as kernel_initializer only (no bias initializer is set here)

m = keras.models.Sequential([
    Dense(10, kernel_initializer=Init, input_shape=[tX.shape[1]]), # hidden layer with 10 neurons: 561x10 trainable weights + 10 biases = 5,620 params. No activation is passed - NOTE(review): presumably Dense's default (linear); confirm in the Keras docs
    Dense(6,  kernel_initializer=Init, activation='softmax')])     # one output neuron for each label. Softmax for multiclass (single label)
m.summary()
loss = tf.keras.losses.SparseCategoricalCrossentropy()              # takes the 0-based integer labels directly (no manual one-hot encoding)
m.compile(loss=loss, optimizer="sgd", metrics=["accuracy"])         # accuracy is reported as an extra metric; SGD optimizes the loss

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 10)                5620      
                                                                 
 dense_13 (Dense)            (None, 6)                 66        
                                                                 
Total params: 5,686
Trainable params: 5,686
Non-trainable params: 0
_________________________________________________________________


In [55]:
# Train for 22 epochs with 30% of the rows held out for validation; validation loss is decreasing (as it should)
hist = m.fit(x=tX, y=tY, epochs=22, validation_split=0.3)

Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoch 11/22
Epoch 12/22
Epoch 13/22
Epoch 14/22
Epoch 15/22
Epoch 16/22
Epoch 17/22
Epoch 18/22
Epoch 19/22
Epoch 20/22
Epoch 21/22
Epoch 22/22


In [56]:
# Plot validation loss per epoch. Every result is assigned to a name so that no intermediate
# value is echoed as cell output.
print(f"History object contains: {', '.join(hist.history)}")
dfHist = pd.DataFrame(hist.history).assign(epoch=lambda d: d.index + 1)   # epochs numbered from 1
f = px.line(dfHist, x='epoch', y='val_loss', title='', markers=True)
f = f.update_layout(height=200, margin=dict(l=0, r=0, t=0, b=0))          # compact figure without margins
f = f.show()

History object contains: loss, accuracy, val_loss, val_accuracy


In [57]:
pOneHot = m.predict(vX)   # predicted probabilities: one row per test observation, one column per category
# Column labels of the form '<0-based class index>/<activity name>'
YLab = [f'{i}/{s}' for i, s in enumerate(['walking', 'walking_upstairs', 'walking_downstairs', 'sitting', 'standing', 'laying'])]
pd.DataFrame(pOneHot[:3], columns=YLab).style.background_gradient(cmap='coolwarm', axis=1)  # display the first three predictions, colour-graded within each row



Unnamed: 0,0/walking,1/walking_upstairs,2/walking_downstairs,3/sitting,4/standing,5/laying
0,0.996613,0.000873,0.002251,1e-06,0.000262,0.0
1,0.0,0.0,0.0,0.000162,0.0,0.999838
2,0.0,0.0,0.0,0.001662,0.0,0.998338


In [58]:
# Predicted labels (from 1 to 6): argmax over the class probabilities gives a 0-based class, +1 restores the original labels.
# Index the predictions by the test set's own ids (vX was loaded with index_col='id') rather than by row position,
# so the submission stays aligned with the test file even if its ids are not 0..n-1.
pY = pd.DataFrame(np.argmax(pOneHot, axis = 1)+1, index=vX.index, columns=['y'])
pY.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,2897,2898,2899,2900,2901,2902,2903,2904,2905,2906,2907,2908,2909,2910,2911,2912,2913,2914,2915,2916,2917,2918,2919,2920,2921,2922,2923,2924,2925,2926,2927,2928,2929,2930,2931,2932,2933,2934,2935,2936,2937,2938,2939,2940,2941,2942,2943,2944,2945,2946
y,1,6,6,2,1,6,2,6,1,5,6,3,6,5,1,5,3,2,2,2,2,3,4,2,5,1,4,5,5,5,3,3,2,3,3,2,5,5,6,5,3,5,6,6,6,1,2,1,6,3,...,2,5,4,5,2,3,2,6,1,5,1,5,2,3,4,4,2,2,1,3,1,5,5,2,6,3,6,6,6,3,6,5,3,5,1,4,1,3,6,6,2,1,3,2,4,5,6,3,4,2


In [59]:
ToCSV(df=pY, fname='HAR_baseline')  # write the predictions to HAR_baseline.csv, the submission file for Kaggle

# **References:**

1. Remember to cite your sources here as well! At the least, your textbook should be cited. Google Scholar allows you to effortlessly copy/paste an APA citation format for books and publications. Also cite StackOverflow, package documentation, and other meaningful internet resources to help your peers learn from these (and to avoid plagiarism claims).
1. ...
1. ...

<font size=5>⌛</font> <strong><font color=orange size=5>Do not exceed competition's runtime limit!</font></strong>

<hr color=red>


In [60]:
tmr.ShowTime()    # print the runtime elapsed since `tmr` was created and flag it if it is over the limit. Do not remove. Keep as the last cell in your notebook.

Runtime is 53 sec


# 💡**Starter Ideas**

1. Try tuning DNN hyperparameters
1. The training set has 500K observations (2GB), but you really don't need them all. They are all bootstrapped (with noise) from the original sample of 7352 observations. To stay within the Colab runtime limit (CRTL), you can either:
  1. use more observations for a shallow DNN, but risk underfitting due to lower model complexity
  1. use fewer observations for a deeper DNN, but risk overfitting due to higher model complexity
1. Check out the original related papers about feature engineering for this dataset
1. Try engineering features with [`PolynomialFeatures`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) and discarding unimportant features via PCA or alternative technique.
1. Consider KMeans/KMedoids or other clustering methods to identify observations that represent the original 7352 observations. This might require finding 7352 cluster centroids/medoids.
  1. Fast clustering methods: [FAISS](https://github.com/facebookresearch/faiss) (GPU-enabled)
1. For deep NN, consider dropout, batch normalization
1. Try PCA on transposed matrix to find/eliminate highly correlated observations
1. Try [stratified sampling](https://en.wikipedia.org/wiki/Stratified_sampling) to ensure each label is proportionally represented in a subsample