### Purpose:

Calculate how long each trace takes to complete, in minutes, computed as:  

(endTime - startTime)/(1000\*60)  

Compare the distribution between the train and test sets.

### Interesting findings:

* Maximum trace time: **18.2 minutes**
* Many train traces took less than a minute to complete, while test traces always took more than a minute.
    * The distribution of test trace completion times looks suspiciously like the train distribution shifted by 1 minute.
* Roughly 2.0% of train traces took more than **5 minutes** to complete, compared to 4.8% of test traces.
    * Roughly 2.7% of test traces took more than **6 minutes** to complete.
* Minor data quality issue: The file **../input/indoor-location-navigation/train/5cd56b83e2acfd2d33b5cab0/B2/5cf72539e9d9c9000852f45b.txt** does not have an 'endTime' line.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import glob

# Glob patterns selecting the trace files: train traces sit three directory
# levels below train/, test traces sit directly inside test/.
path_train = '../input/indoor-location-navigation/train/*/*/*'
path_test = '../input/indoor-location-navigation/test/*'

def get_time_diffs(path, train=True, n_files=-1):
    r"""Return the duration, in minutes, of every trace file matching ``path``.

    The duration of a trace is ``(end_time - start_time) / (1000 * 60)``,
    where ``start_time`` is parsed from the first line of the file and
    ``end_time`` from the last line.

    Args:
        path: Glob pattern selecting the trace files to read.
        train: If True, each timestamp is the text after the last colon
            (e.g. ``'#\tstartTime:1560913369586\n'``). If False, it is the
            last tab-separated field (e.g. ``'#\tstartTime\t0000000000000\n'``).
        n_files: Maximum number of matched files to read. A negative value
            (the default) or None reads every matched file.

    Returns:
        A list of floats, one per successfully parsed file. Files whose
        first or last line cannot be parsed (including empty files) are
        reported on stdout and skipped.
    """
    all_files = glob.glob(path)
    if n_files is not None and n_files >= 0:
        all_files = all_files[:n_files]

    # The two file formats differ only in what precedes the timestamp.
    separator = ':' if train else '\t'
    time_diffs = []

    for f in all_files:
        with open(f) as f_contents:
            lines = f_contents.readlines()

        try:
            # int() ignores the trailing newline, so no explicit strip needed.
            start_time = int(lines[0].rsplit(separator, 1)[-1])
            end_time = int(lines[-1].rsplit(separator, 1)[-1])
        except (IndexError, ValueError):
            print('Unable to parse for file: {}'.format(f))
            # Skip this file: appending here would either raise NameError or
            # silently reuse the timestamps of the previously parsed file.
            continue

        time_diffs.append((end_time - start_time)/(1000*60))

    return time_diffs
        
# Per-trace durations in minutes for each split. The test files use the
# tab-separated header format, hence train=False.
train_diffs = get_time_diffs(path_train)
test_diffs = get_time_diffs(path_test, train=False)

In [None]:
# Overlay density-normalised histograms of the train and test durations on a
# shared set of 100 evenly spaced bin edges, running from 0 up to the first
# whole minute strictly above the longest trace.
longest_trace = max(train_diffs + test_diffs)
bins = np.linspace(0, int(longest_trace) + 1, 100)
for _diffs, _label in ((train_diffs, 'train'), (test_diffs, 'test')):
    plt.hist(_diffs, bins=bins, alpha=0.5, label=_label, density=True)
plt.xlabel("Trace Completion Time (minutes)")
plt.legend(loc='upper right')
plt.tight_layout()
plt.show()

In [None]:
# Extremes of the trace durations (minutes) in each split.
print('Minimum train trace time: {}'.format(min(train_diffs)))
print('Minimum test trace time: {}'.format(min(test_diffs)))

print('\nMaximum train trace time: {}'.format(max(train_diffs)))
print('Maximum test trace time: {}'.format(max(test_diffs)))

# Fraction of traces longer than a given number of minutes. The train ratio
# uses the same 5-minute threshold as its label and as the test ratio it is
# compared against (it previously counted traces longer than 4 minutes).
# NOTE(review): re-check the train percentage quoted in the findings at the
# top of the notebook after re-running this cell.
tmp = sum(d > 5 for d in train_diffs)/len(train_diffs)
print('\nRatio train trace time > 5 minutes: {}'.format(tmp))
tmp = sum(d > 5 for d in test_diffs)/len(test_diffs)
print('Ratio test trace time > 5 minutes: {}'.format(tmp))
tmp = sum(d > 6 for d in test_diffs)/len(test_diffs)
print('Ratio test trace time > 6 minutes: {}'.format(tmp))