In [1]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [2]:
# Path of the CSV file that holds the input events
path_to_data = "event_sample.csv"

In [3]:
# The file is ';'-delimited; column names are supplied explicitly here
data = pd.read_csv(
    path_to_data,
    sep=';',
    names=['id', 'date', 'x', 'y'],
)

In [4]:
# Parse the textual 'date' column into pandas datetime values (stored back in place)
data['date'] = pd.to_datetime(data.date)

In [5]:
class TrackAnalyzer:
    """Rule-based classifier for one tracker's coordinate history.

    The track is split into three time-of-day windows ("home", "job" and
    "after"). Within each window the coordinates are averaged per hour, the
    hourly averages are clustered, and small clusters are discarded. The track
    is classified as 1 when every window is left with exactly one cluster, the
    home and job centroids are farther apart than ``home_job_distance`` and the
    home and after centroids are closer than ``home_job_distance``; otherwise 0.
    """

    # Hours of day (0-23) that make up each window; end of range is exclusive.
    HOME_HOURS = range(0, 9)
    JOB_HOURS = range(11, 18)
    AFTER_HOURS = range(20, 24)

    def __init__(self, cluster_distance_thd=0.05, home_job_distance=0.2, label_filter_thd=2):
        """Store the thresholds used by :meth:`predict`.

        Args:
            cluster_distance_thd: distance threshold passed to the agglomerative clustering.
            home_job_distance: minimum distance between the home and job centroids
                (also the maximum distance between the home and after centroids).
            label_filter_thd: minimum number of points a cluster needs to be kept.
        """
        # Distance for agglomerative clustering
        self.cluster_distance_thd = cluster_distance_thd
        # Minimum distance between home and job
        self.home_job_distance = home_job_distance
        # Minimum amount of points in clusters
        self.label_filter_thd = label_filter_thd

    @staticmethod
    def euclidean_distance(x1, y1, x2, y2):
        """Return the Euclidean distance between points (x1, y1) and (x2, y2)."""
        # np.hypot computes sqrt(dx**2 + dy**2) without intermediate overflow
        return np.hypot(x1 - x2, y1 - y2)

    def extract_coordinates_by_hours(self, data, hours) -> list:
        """Average the coordinates per hour of day.

        Args:
            data: DataFrame with columns ``date``, ``x`` and ``y``.
            hours: iterable of hours of day to look at.

        Returns:
            A list of ``(mean_x, mean_y)`` tuples, one per requested hour that
            has at least one row, in the order the hours were given.
        """
        # Extract the hour once instead of running a Python lambda over every
        # row for every requested hour. to_datetime is a no-op for a column
        # that is already datetime64.
        hour_of_day = pd.to_datetime(data['date']).dt.hour
        coordinates = []
        for h in hours:
            hour_data = data[hour_of_day == h]
            if not hour_data.empty:
                coordinates.append((hour_data.x.mean(), hour_data.y.mean()))
        return coordinates

    def get_labels(self, data):
        """Cluster the rows of ``data`` and return one cluster label per row.

        Uses Ward-linkage agglomerative clustering cut at
        ``self.cluster_distance_thd`` (the number of clusters is not fixed).
        """
        # ``affinity`` was renamed to ``metric`` in scikit-learn 1.2 and removed
        # in 1.4. Ward linkage only supports Euclidean distance, which is also
        # the default, so the argument is omitted to work on old and new versions.
        clustering = AgglomerativeClustering(
            n_clusters=None,
            compute_full_tree=True,
            linkage='ward',
            distance_threshold=self.cluster_distance_thd,
        )
        return clustering.fit(data).labels_

    def filter_clusters(self, data):
        """Keep only rows whose cluster has at least ``label_filter_thd`` members.

        Args:
            data: DataFrame with a ``label`` column.

        Returns:
            The rows of ``data`` that belong to sufficiently large clusters.
        """
        # Series.iteritems() was removed in pandas 2.0; select from the
        # value_counts result directly instead of iterating over it.
        counts = data['label'].value_counts()
        kept_labels = counts.index[counts >= self.label_filter_thd]
        return data[data['label'].isin(kept_labels)]

    def _single_cluster_centroid(self, points):
        """Return the (x, y) centroid of the only surviving cluster, or None.

        ``points`` is clustered, clusters below ``label_filter_thd`` are dropped,
        and None is returned unless exactly one cluster remains.
        """
        labelled = points.assign(label=self.get_labels(points))
        kept = self.filter_clusters(labelled)
        if kept.label.nunique() != 1:
            return None
        return kept.x.mean(), kept.y.mean()

    def predict(self, data):
        """Use this function to classify a tracker's data.

        Args:
            data: DataFrame with columns ``date``, ``x`` and ``y`` for one tracker.

        Returns:
            1 if each time window has exactly one cluster, home and job are more
            than ``home_job_distance`` apart and home and after are less than
            ``home_job_distance`` apart; 0 otherwise (including when a window has
            fewer than two hourly points).
        """
        windows = [
            pd.DataFrame(self.extract_coordinates_by_hours(data, hours), columns=['x', 'y'])
            for hours in (self.HOME_HOURS, self.JOB_HOURS, self.AFTER_HOURS)
        ]

        # if no data for clustering (it needs at least two points per window)
        if any(len(window) < 2 for window in windows):
            return 0

        # Cluster each window and drop clusters smaller than label_filter_thd
        # (with the default of 2 this removes single-point clusters, which the
        # original author assumed to be points on the way to work).
        centroids = [self._single_cluster_centroid(window) for window in windows]
        if any(centroid is None for centroid in centroids):
            return 0

        home, job, after = centroids
        job_is_far = self.euclidean_distance(*home, *job) > self.home_job_distance
        after_is_near = self.euclidean_distance(*home, *after) < self.home_job_distance
        return 1 if job_is_far and after_is_near else 0

In [6]:
# Create instance
tracker = TrackAnalyzer()

# Make dict from data - id: prediction.
# groupby splits the frame once instead of re-filtering the whole frame for every id,
# and yields the ids in sorted order so the rows of results.csv are reproducible
# (iterating a set gives an arbitrary order). Rows with a missing id are skipped by groupby.
results = {id_ctr: tracker.predict(track) for id_ctr, track in data.groupby('id')}
# Convert to pandas.Series
results_series = pd.Series(results, name='prediction')
# Save as csv with columns 'id' and 'prediction'
results_series.to_csv('results.csv', index_label='id')