In [1]:
import pandas as pd
import numpy as np
import datetime
import pickle

from hmmlearn.hmm import MultinomialHMM

In [2]:
# Column labels for the activity log, which ships without a header row.
head = ["id", "date_time", "flag", "option"]

activity_csv = "./../data/FTDD/temporal_activity.csv"
temporal_dataset = pd.read_csv(activity_csv, header=None)
temporal_dataset.columns = head

# Parse the timestamp strings so they can be compared and subtracted later.
temporal_dataset = temporal_dataset.assign(
    date_time=lambda frame: pd.to_datetime(frame["date_time"])
)

In [3]:
# Preview the first five parsed rows.
temporal_dataset.head(n=5)

Unnamed: 0,id,date_time,flag,option
0,322067,2006-01-01 04:10:56,N,0
1,322067,2007-06-06 15:24:00,W,521
2,322067,2007-06-06 15:24:00,C,521
3,322067,2007-07-04 14:35:00,Z,549
4,322075,2006-01-01 06:08:45,N,0


In [4]:
# Reduce the set: collapse every id's events into one string of flags.
#
# * The flags are joined with a single space. The flag strings are split on
#   whitespace further down, which cannot recover the individual flags from
#   an empty-separator join.
# * Events are ordered by timestamp before grouping so each string is
#   chronological even if the CSV rows are not. mergesort is stable, so events
#   that share a timestamp keep their file order, and groupby preserves the
#   row order inside each group.
compressed_set = (
    temporal_dataset
    .sort_values("date_time", kind="mergesort")
    .groupby(["id"])["flag"]
    .apply(lambda flags: " ".join(flags))
    .reset_index()
)

In [5]:
# Latest and earliest event timestamp per id, each as a frame with the
# columns ["id", "date_time"].
# The built-in groupby reductions replace `.apply(lambda x: max(x))`: they
# avoid a Python-level call per group and skip missing timestamps instead of
# returning an order-dependent result when a NaT is present.
event_times = temporal_dataset.groupby(['id'])['date_time']
t_max = event_times.max().reset_index()
t_min = event_times.min().reset_index()

In [6]:
# Attach each id's first/last event timestamp by looking it up on "id"
# instead of relying on row position. Positional alignment silently assigns
# the wrong timestamps if this cell is re-run after the filter below has
# already removed rows and renumbered the index.
first_seen = t_min.set_index("id")["date_time"]
last_seen = t_max.set_index("id")["date_time"]
compressed_set["start"] = compressed_set["id"].map(first_seen)
compressed_set["end"] = compressed_set["id"].map(last_seen)

# Whole days between the first and the last event. `.dt.days` replaces
# `.astype('timedelta64[D]')`, which pandas 2.x refuses; the float cast keeps
# the column dtype that the old conversion produced.
compressed_set["duration"] = (
    (compressed_set["end"] - compressed_set["start"]).dt.days.astype("float64")
)

# Discard ids whose activity spans less than one full day. A new frame is
# built (no in-place drop) so re-running the cell is harmless.
compressed_set = compressed_set[~(compressed_set["duration"] < 1)].reset_index(drop=True)

compressed_set.head()

Unnamed: 0,id,flag,start,end,duration
0,322067,N W C Z,2006-01-01 04:10:56,2007-07-04 14:35:00,549.0
1,322077,N A V C V V V Y L Z,2006-01-01 07:08:00,2006-10-31 14:56:00,303.0
2,322080,N C W C C Z,2006-01-01 07:52:24,2009-08-26 22:06:00,1333.0
3,322081,N C C Z,2006-01-01 07:54:33,2007-06-06 15:25:00,521.0
4,322094,N C Z,2006-01-01 12:04:36,2006-01-02 18:59:00,1.0


In [7]:
# Integer code assigned to every flag symbol.
observation_dict = {'N':0,'M':1,'E':2,'A':3,'R':4,'C':5,'D':6,'V':7,'Y':8,'S':9,'H':10,'F':11,'W':12,'L':13,'P':14,'Q':15,'Z':16}

# Break each whitespace-separated flag string into a list of symbols ...
compressed_set['flag'] = compressed_set['flag'].apply(lambda text: text.strip().split())

# ... and replace every symbol with its integer code.
compressed_set['flag'] = compressed_set['flag'].apply(
    lambda symbols: [int(observation_dict[symbol]) for symbol in symbols]
)

# Partition the ids by how long their activity lasted (threshold: 60 days).
lasted_over_60_days = compressed_set["duration"] > 60
lasted_up_to_60_days = compressed_set["duration"] <= 60
compressed_set_long = compressed_set[lasted_over_60_days]
compressed_set_short = compressed_set[lasted_up_to_60_days]



In [8]:
# Randomly send roughly 60% of each duration class to training and the rest
# to testing. The draws come from a seeded generator so that a fresh
# "Restart & Run All" reproduces exactly the same split (and therefore the
# same pickled datasets and models); change the seed to get another split.
split_rng = np.random.RandomState(42)

msk = split_rng.rand(len(compressed_set_short)) < 0.6
train_short = compressed_set_short[msk]
test_short = compressed_set_short[~msk]

msk = split_rng.rand(len(compressed_set_long)) < 0.6
train_long = compressed_set_long[msk]
test_long = compressed_set_long[~msk]

In [9]:
# Persist the four splits as pickles, one file per split.
processed_dir = "./../data/processed/"
splits_to_save = (
    ("train_short", train_short),
    ("test_short", test_short),
    ("train_long", train_long),
    ("test_long", test_long),
)
for split_name, split_frame in splits_to_save:
    split_frame.to_pickle(processed_dir + split_name + ".pkl")

In [10]:
# Report the number of ids in each split (short train/test, then long train/test).
for split_frame in (train_short, test_short, train_long, test_long):
    print(len(split_frame))

17842
12087
18571
12111


In [11]:
#Training Sequence

In [12]:
def _column_of_codes(codes):
    """Wrap every code of one sequence in its own single-element list."""
    return [[int(code)] for code in codes]


# Long-duration training data: per-sequence lengths plus all sequences
# stacked into one single-column array.
len_sequence_long_train = list(map(len, train_long.flag))
sequence_long_train = np.concatenate(list(map(_column_of_codes, train_long.flag)))

# Same layout for the short-duration training data.
len_sequence_short_train = list(map(len, train_short.flag))
sequence_short_train = np.concatenate(list(map(_column_of_codes, train_short.flag)))

In [18]:
# Fit one HMM per duration class on all training sequences and pickle each
# fitted model under ./models/.
# A fixed random_state makes the random initialisation, and therefore the
# fitted and pickled models, reproducible from run to run.
# NOTE(review): newer hmmlearn releases reportedly changed what MultinomialHMM
# models and moved the per-symbol behaviour to CategoricalHMM; confirm which
# hmmlearn version this notebook targets.
for n_states in [5]:
    hmm_long = MultinomialHMM(n_components=n_states, random_state=42).fit(
        sequence_long_train, len_sequence_long_train
    )
    hmm_short = MultinomialHMM(n_components=n_states, random_state=42).fit(
        sequence_short_train, len_sequence_short_train
    )

    # Same file names as before: hmm_long_<n_states>.pkl, hmm_short_<n_states>.pkl.
    for label, model in (("long", hmm_long), ("short", hmm_short)):
        fname = "./models/hmm_" + label + "_" + str(n_states) + ".pkl"
        with open(fname, 'wb') as output:
            pickle.dump(model, output, pickle.HIGHEST_PROTOCOL)



In [23]:
# Fit a separate pair of HMMs for every calendar year, using only the
# training sequences whose last event ("end") falls inside that year, and
# pickle each model under ./models/.
# NOTE: this cell reuses, and therefore overwrites, `sequence_*_train`,
# `len_sequence_*_train`, `hmm_long` and `hmm_short`; after it has run they
# describe the last fitted year, not the full training set.
for n_year in range(2006, 2014):
    # The bounds are built from the loop variable `n_year` (previously an
    # undefined `i`). pd.Timestamp is used rather than datetime.date because
    # recent pandas versions refuse to order-compare a datetime64 column with
    # plain date objects.
    start = pd.Timestamp(year=n_year, month=1, day=1)
    end = pd.Timestamp(year=n_year + 1, month=1, day=1)

    # Half-open window [start, end): a sequence ending exactly at midnight on
    # January 1st belongs to the year that starts then, instead of to no year.
    train_long_year = train_long[(train_long['end'] >= start) & (train_long['end'] < end)]
    train_short_year = train_short[(train_short['end'] >= start) & (train_short['end'] < end)]

    # np.concatenate cannot stack an empty list, so a year without data for
    # either duration class is skipped instead of aborting the whole loop.
    if train_long_year.empty or train_short_year.empty:
        print("skipping " + str(n_year) + ": no training sequences for at least one duration class")
        continue

    # Per-sequence lengths plus all sequences stacked into one single-column array.
    len_sequence_long_train = [len(xi) for xi in train_long_year.flag]
    sequence_long_train = np.concatenate([[[int(x)] for x in xi] for xi in train_long_year.flag])

    len_sequence_short_train = [len(xi) for xi in train_short_year.flag]
    sequence_short_train = np.concatenate([[[int(x)] for x in xi] for xi in train_short_year.flag])

    for n_states in [5]:
        # Fixed random_state so the fitted models are reproducible.
        hmm_long = MultinomialHMM(n_components=n_states, random_state=42).fit(
            sequence_long_train, len_sequence_long_train
        )
        hmm_short = MultinomialHMM(n_components=n_states, random_state=42).fit(
            sequence_short_train, len_sequence_short_train
        )

        fname = "./models/hmm_long_" + str(n_year) + "_" + str(n_states) + ".pkl"
        with open(fname, 'wb') as output:
            pickle.dump(hmm_long, output, pickle.HIGHEST_PROTOCOL)
        fname = "./models/hmm_short_" + str(n_year) + "_" + str(n_states) + ".pkl"
        with open(fname, 'wb') as output:
            pickle.dump(hmm_short, output, pickle.HIGHEST_PROTOCOL)




