In [84]:
import pandas as pd
import numpy as np
import datetime
import pickle

In [85]:
head = ["id", "date_time", "flag", "option"]

# Read the activity log with header=None (the first line is treated as data),
# attach the column names afterwards, and parse the timestamp column.
temporal_dataset = pd.read_csv("./../data/FTDD/temporal_activity.csv", header=None)
temporal_dataset.columns = head
temporal_dataset = temporal_dataset.assign(
    date_time=pd.to_datetime(temporal_dataset["date_time"])
)

In [86]:
# Reduce the set: collapse every id's events into one string of flag codes.
# The rows are sorted by timestamp first so the string is in chronological order
# instead of relying on the file happening to arrive in order. The sort is stable
# (mergesort), so events sharing a timestamp keep their original file order, and
# groupby preserves row order within each group.
compressed_set = (
    temporal_dataset
    .sort_values("date_time", kind="mergesort")
    .groupby("id")["flag"]
    .apply("".join)
    .reset_index()
)

In [87]:
# Display the per-id frame: one row per id with its concatenated flag string.
compressed_set

Unnamed: 0,id,flag
0,322067,NWCZ
1,322075,NZ
2,322077,NAVCVVVYLZ
3,322080,NCWCCZ
4,322081,NCCZ
...,...,...
86437,1116604,MPQPCZ
86438,1116618,NCZ
86439,1116656,NCCZ
86440,1116679,NQCQZ


In [88]:
# Latest and earliest event timestamp for every id, each as an (id, date_time) frame.
# The built-in GroupBy reductions are vectorised and skip missing timestamps, unlike
# Python's max()/min() applied group by group.
event_times = temporal_dataset.groupby("id")["date_time"]
t_max = event_times.max().reset_index()
t_min = event_times.min().reset_index()

In [89]:
# Attach each id's first/last event time by *id* rather than by row position, so the
# cell gives the same result when it is re-run after rows have been filtered out.
first_event = t_min.set_index("id")["date_time"]
last_event = t_max.set_index("id")["date_time"]
compressed_set["start"] = compressed_set["id"].map(first_event)
compressed_set["end"] = compressed_set["id"].map(last_event)

# Whole elapsed days, kept as float. `.dt.days` replaces astype('timedelta64[D]'),
# which pandas 2.x no longer accepts.
compressed_set["duration"] = (
    (compressed_set["end"] - compressed_set["start"]).dt.days.astype("float64")
)

# Discard ids whose activity spans less than one full day, then renumber the rows.
compressed_set = compressed_set[~(compressed_set["duration"] < 1)].reset_index(drop=True)

compressed_set.head()

Unnamed: 0,id,flag,start,end,duration
0,322067,NWCZ,2006-01-01 04:10:56,2007-07-04 14:35:00,549.0
1,322077,NAVCVVVYLZ,2006-01-01 07:08:00,2006-10-31 14:56:00,303.0
2,322080,NCWCCZ,2006-01-01 07:52:24,2009-08-26 22:06:00,1333.0
3,322081,NCCZ,2006-01-01 07:54:33,2007-06-06 15:25:00,521.0
4,322094,NCZ,2006-01-01 12:04:36,2006-01-02 18:59:00,1.0


In [90]:
# Binary target: 1 when the first-to-last-event span exceeds 60 days, otherwise 0.
compressed_set["label"] = (compressed_set["duration"] > 60).astype("int64")
compressed_set

Unnamed: 0,id,flag,start,end,duration,label
0,322067,NWCZ,2006-01-01 04:10:56,2007-07-04 14:35:00,549.0,1
1,322077,NAVCVVVYLZ,2006-01-01 07:08:00,2006-10-31 14:56:00,303.0,1
2,322080,NCWCCZ,2006-01-01 07:52:24,2009-08-26 22:06:00,1333.0,1
3,322081,NCCZ,2006-01-01 07:54:33,2007-06-06 15:25:00,521.0,1
4,322094,NCZ,2006-01-01 12:04:36,2006-01-02 18:59:00,1.0,0
...,...,...,...,...,...,...
60606,1116530,NCCZ,2014-12-30 10:39:00,2015-02-05 23:13:07,37.0,0
60607,1116570,NDZ,2014-12-30 12:43:00,2015-01-05 14:50:37,6.0,0
60608,1116656,NCCZ,2014-12-30 18:39:00,2015-01-02 09:20:37,2.0,0
60609,1116679,NQCQZ,2014-12-30 19:47:00,2015-01-13 14:08:07,13.0,0


In [91]:
# Random row-wise split: each row goes to the training set with probability 0.6.
# Seeding NumPy's global generator makes the split repeatable; in a top-to-bottom
# run it also fixes later np.random calls.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

msk = np.random.rand(len(compressed_set)) < 0.6
train_df = compressed_set[msk]
test_df = compressed_set[~msk]

In [92]:
# Number of leading characters kept from every test sequence.
# NOTE(review): only the test texts are truncated; the training texts keep their
# full length. Confirm this train/test asymmetry is intended.
TEST_PREFIX_LEN = 7

# Convert the flag strings to lower case.
train_texts = [s.lower() for s in train_df["flag"].values]

# Lower-case the test strings and keep only their first TEST_PREFIX_LEN characters.
test_texts = [s.lower()[:TEST_PREFIX_LEN] for s in test_df["flag"].values]


In [93]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [94]:


# Tokenizer
# char_level=True treats every character of a text as its own token.
tk = Tokenizer(char_level=True)
tk.fit_on_texts(train_texts)
# If we already have a character list, then replace the tk.word_index

In [95]:
alphabet = "nmearcdvyshfwlpqz"

# Fixed character -> index map; indices start at 1.
char_dict = {char: position for position, char in enumerate(alphabet, start=1)}

# Replace the fitted vocabulary with the fixed map.
tk.word_index = dict(char_dict)

# Encode the texts as lists of character indices.
# Note that `test_texts` is rebound from strings to index sequences here.
train_sequences = tk.texts_to_sequences(train_texts)
test_texts = tk.texts_to_sequences(test_texts)

In [96]:


# Pad at the end of each sequence. The training matrix takes the width chosen by
# pad_sequences, and the test matrix is forced to that same width.
train_data = pad_sequences(train_sequences, padding='post')
test_data = pad_sequences(test_texts, padding='post', maxlen=train_data.shape[1])

# Store both matrices as float32 NumPy arrays.
train_data = train_data.astype('float32')
test_data = test_data.astype('float32')



In [97]:
# ======================= Class labels =======================
from keras.utils import to_categorical

# Integer labels, kept for later use in their original (non-one-hot) form.
train_classes_copy = train_df["label"].values
test_classes_copy = test_df["label"].values

# One-hot encoded versions of the same labels.
train_classes = to_categorical(train_classes_copy)
test_classes = to_categorical(test_classes_copy)

In [100]:
# Number of entries in the tokenizer's character-to-index map.
vocab_size = len(tk.word_index)
vocab_size

17

In [101]:
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

In [102]:
# Shape of the padded test matrix.
test_data.shape

(24413, 157)

In [103]:
# ===================== Char-CNN hyperparameters =====================
input_size = test_data.shape[1]                    # width of the padded matrices
vocab_size = embedding_size = len(tk.word_index)   # one embedding dimension per character

# One row per convolution block: [filters, kernel size, pool size].
# A pool size of -1 means "no pooling after this convolution".
conv_layers = [
    [256, 7, 3],
    [256, 7, 3],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, 3],
]

fully_connected_layers = [1024] * 2
num_of_classes, dropout_p = 2, 0.5
optimizer, loss = 'adam', 'binary_crossentropy'


In [104]:

# Embedding weights: an all-zero first row, followed by one one-hot row per entry of
# tk.word_index (in the dict's iteration order), hot at position `index - 1`.
identity = np.eye(vocab_size)
embedding_weights = np.vstack(
    [np.zeros((1, vocab_size))]
    + [identity[index - 1] for index in tk.word_index.values()]
)



In [105]:
# Embedding layer Initialization
# The table has vocab_size + 1 rows because embedding_weights carries an extra
# all-zero first row in addition to the one-per-character rows. The layer starts
# from those weights instead of a random initialisation.
embedding_layer = Embedding(vocab_size + 1,
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

In [106]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Batch recall: rounded true positives over rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before counting, and the sums
    run over every element of the tensors. K.epsilon() is added to the denominator
    so a batch with no positives does not divide by zero.

    NOTE(review): because every element is summed, one-hot two-column targets make
    this pool both class columns rather than score one positive class. Confirm that
    is what you want to report.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Batch precision: rounded true positives over rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before counting, and the sums
    run over every element of the tensors. K.epsilon() is added to the denominator
    so a batch with no predicted positives does not divide by zero.

    NOTE(review): because every element is summed, one-hot two-column targets make
    this pool both class columns rather than score one positive class. Confirm that
    is what you want to report.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """Batch F1: harmonic mean of precision_m and recall_m.

    K.epsilon() in the denominator keeps the result finite when precision and
    recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

In [107]:


# Model construction: integer character ids in, class probabilities out.
inputs = Input(shape=(input_size,), name='input', dtype='int64')
x = embedding_layer(inputs)

# Convolution stack: Conv1D -> ReLU, with max-pooling only where a pool size is given.
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filters=filter_num, kernel_size=filter_size)(x)
    x = Activation('relu')(x)
    if pooling_size == -1:
        continue  # -1 marks "no pooling after this convolution"
    x = MaxPooling1D(pool_size=pooling_size)(x)

# NOTE(review): the output width of the stack depends on input_size. For the
# 157-wide input displayed earlier in this notebook, and assuming Keras' default
# 'valid' padding, it should end at (None, 2, 256), i.e. 512 features after
# Flatten. Confirm with model.summary().
x = Flatten()(x)

# Fully connected head: each dense layer is followed by dropout.
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x)
    x = Dropout(dropout_p)(x)

# Output layer: one softmax unit per class.
predictions = Dense(num_of_classes, activation='softmax')(x)

# Build and compile; accuracy plus the custom F1 / precision / recall are tracked.
model = Model(inputs=inputs, outputs=predictions)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['acc', f1_m, precision_m, recall_m],
)





In [108]:
# Shuffle the training rows; features and labels are reordered by the same permutation.
indices = np.random.permutation(train_data.shape[0])

x_train, y_train = train_data[indices], train_classes[indices]

# The test set is used in its existing order.
x_test, y_test = test_data, test_classes

In [109]:
# Training
# validation_split=0.1 holds out a tenth of the supplied training arrays; the
# val_* figures in the per-epoch log are computed on that held-out part.
# verbose=2 prints one summary line per epoch.
history = model.fit(x_train, y_train,
          validation_split=0.1,
          batch_size=128,
          epochs=10,
          verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Train on 32578 samples, validate on 3620 samples
Epoch 1/10
 - 47s - loss: 0.5676 - acc: 0.6991 - f1_m: 0.6991 - precision_m: 0.6991 - recall_m: 0.6991 - val_loss: 0.5346 - val_acc: 0.7243 - val_f1_m: 0.7256 - val_precision_m: 0.7256 - val_recall_m: 0.7256
Epoch 2/10
 - 51s - loss: 0.5396 - acc: 0.7182 - f1_m: 0.7182 - precision_m: 0.7182 - recall_m: 0.7182 - val_loss: 0.5262 - val_acc: 0.7251 - val_f1_m: 0.7264 - val_precision_m: 0.7264 - val_recall_m: 0.7264
Epoch 3/10
 - 51s - loss: 0.5309 - acc: 0.7236 - f1_m: 0.7236 - precision_m: 0.7236 - recall_m: 0.7236 - val_loss: 0.5302 - val_acc: 0.7243 - val_f1_m: 0.7256 - val_precision_m: 0.7256 - val_recall_m: 0.7256
Epoch 4/10
 - 51s - loss: 0.5242 - acc: 0.7252 - f1_m: 0.7253 - precision_m: 0.7253 - recall_m: 0.7253 - val_loss: 0.5375 - val_acc: 0.7202 - val_f1_m: 0.7209 - val_precision_m: 0.7209 - val_recall_m: 0.7209
Epoch 5/10
 - 51s - loss: 0.5202 - acc: 0.7282

In [110]:
# Score the test set. The returned list is the loss followed by the compiled
# metrics in order: acc, f1_m, precision_m, recall_m.
model.evaluate(x_test, y_test, batch_size=128)



[0.5692659644791882,
 0.6985212564468384,
 0.6988450288772583,
 0.6988450884819031,
 0.6988450884819031]

In [111]:
from sklearn.metrics import classification_report


In [119]:
# Predicted class probabilities for the test set, one column per class.
y_pred = model.predict(x_test, batch_size=128, verbose=1)

# Pick the most probable class index per row. Despite the name, these are integer
# class ids, not booleans.
y_pred_bool = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 against the integer test labels.
rep = classification_report(test_classes_copy, y_pred_bool, digits=4)




In [120]:
# Print the text report produced by classification_report.
print(rep)

precision    recall  f1-score   support

           0     0.6397    0.8974    0.7470     12106
           1     0.8329    0.5029    0.6271     12307

    accuracy                         0.6985     24413
   macro avg     0.7363    0.7001    0.6870     24413
weighted avg     0.7371    0.6985    0.6866     24413

