In [81]:
import pandas as pd
import numpy as np
import random
from hmmlearn import hmm
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# --- Configuration ----------------------------------------------------------
# Seed Python's `random` module so the train/test split below is repeatable.
random.seed(4242)

# Directory holding the two input CSV files. Change this one constant to point
# at a different local copy instead of editing every read_csv call.
DATA_DIR = "D:/OneDrive/Online/SelfLearn/Ryerson/MSc/PaperReplication"
SLOW_THRESHOLD_DAYS = 60   # a 'Z' row with cumDays above this labels the bug as slow
TRAIN_FRACTION = 0.6       # share of distinct bug IDs sampled into the training set

# --- Load -------------------------------------------------------------------
# Column names are supplied explicitly via `names`.
temporal = pd.read_csv(DATA_DIR + "/temporal_activity.csv",
                       names = ["bugID", "timeStamp", "activity", "cumDays"])
activity = pd.read_csv(DATA_DIR + "/activity_description.csv",
                       names = ["short", "description"])

# --- Label every bug as slow (1) or fast (0) --------------------------------
# The label is derived from the cumDays value on each bug's 'Z' rows.
# .copy() makes `threshold` an independent frame, so adding a column below does
# not write into a slice of `temporal` (avoids SettingWithCopyWarning).
threshold = temporal.loc[temporal['activity'] == 'Z', ['bugID', 'cumDays']].copy()
threshold['isSlow'] = np.where(threshold['cumDays'] > SLOW_THRESHOLD_DAYS, 1, 0)

# --- Encode activities as integer symbols -----------------------------------
# The symbol of an activity is its row position in the description table.
activity['code'] = activity.index
temporal = pd.merge(temporal, activity, left_on='activity', right_on='short')
# Keep the original columns plus the integer code (selected by name, not position).
temporal = temporal[['bugID', 'timeStamp', 'activity', 'cumDays', 'code']]

# NOTE(review): rows inside a bug are ordered by activity symbol here, not by
# timeStamp/cumDays. The later cells treat row order as the observation sequence
# fed to the HMMs -- confirm that this ordering is intended.
temporal = temporal.sort_values(by=['bugID', 'activity']).reset_index(drop=True)

# Attach the label to every row; the inner join drops bugs without a 'Z' row.
temporal = pd.merge(temporal, threshold[['bugID', 'isSlow']].reset_index(drop=True), on='bugID')

# --- Train / test split by bug ID -------------------------------------------
# Sample positions into the array of distinct bug IDs so that all rows of a bug
# land on the same side of the split.
chans = np.unique(temporal.bugID.values)
chanCount = len(chans)
idx = set(range(chanCount))
idxTrain = set(random.sample(range(chanCount), int(chanCount*TRAIN_FRACTION)))
idxTest = idx - idxTrain

train = temporal[temporal.bugID.isin(chans[list(idxTrain)])].reset_index(drop=True)
test = temporal[temporal.bugID.isin(chans[list(idxTest)])].reset_index(drop=True)

# Sanity check: no bug may appear in both partitions.
assert set(train.bugID).isdisjoint(test.bugID), "train and test share bug IDs"

In [85]:
# Training settings shared by every model in this cell.
HMM_SEED = 42    # random_state handed to each model
HMM_N_ITER = 22  # n_iter handed to each model

# Recent hmmlearn releases expose the one-symbol-per-step discrete model as
# CategoricalHMM and give MultinomialHMM count-vector semantics; older releases
# only ship MultinomialHMM (with the categorical behaviour). Prefer the
# categorical class when it exists so integer activity codes are interpreted
# the same way on either kind of install.
_DiscreteHMM = getattr(hmm, 'CategoricalHMM', hmm.MultinomialHMM)


def _hmm_input(frame):
    """Return ``(X, lengths)`` for the rows of ``frame``.

    ``X`` is the ``code`` column as an (n_rows, 1) array and ``lengths`` holds
    the number of rows of each bug, in the same bug order as ``X``. The stable
    sort by bugID makes each bug's rows contiguous and keeps their relative
    order, so the two outputs line up even if ``frame`` is not already grouped
    by bug (for a frame already sorted by bugID it is a no-op).
    """
    ordered = frame.sort_values('bugID', kind='mergesort')
    observations = ordered['code'].values.reshape(-1, 1)
    lengths = ordered.groupby('bugID', sort=False).size().tolist()
    return observations, lengths


def _fit_hmm(n_states, observations, lengths):
    """Fit one discrete HMM with ``n_states`` hidden states on the given sequences."""
    model = _DiscreteHMM(n_components = n_states, random_state = HMM_SEED, n_iter = HMM_N_ITER)
    return model.fit(X = observations, lengths = lengths)


# Observation sequences of the slow (isSlow == 1) and fast (isSlow == 0) training bugs.
X1, lengths1 = _hmm_input(train[train.isSlow == 1])
X2, lengths2 = _hmm_input(train[train.isSlow == 0])

# One slow/fast model pair per hidden-state count.
slowModel_05 = _fit_hmm(5, X1, lengths1)
fastModel_05 = _fit_hmm(5, X2, lengths2)

slowModel_10 = _fit_hmm(10, X1, lengths1)
fastModel_10 = _fit_hmm(10, X2, lengths2)

slowModel_15 = _fit_hmm(15, X1, lengths1)
fastModel_15 = _fit_hmm(15, X2, lengths2)

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  np.log(self.transmat_),
  np.log(self.transmat_),
  np.log(self.transmat_),
  np.log(self.startprob_),
  np.log(self.startprob_),


In [86]:
# Number of leading rows of each test bug that the models get to see.
activities = 2

# One row per distinct (bugID, isSlow) pair of the test set; 'pred' holds the
# placeholder -1 until the predictions are written below.
results1 = (test.loc[:, ['bugID', 'isSlow']]
                .drop_duplicates()
                .reset_index(drop=True)
                .assign(pred=-1))
results2 = results1.copy()
results3 = results1.copy()
testChans = results1.bugID.values

# Cut every bug's observation prefix in one pass over `test` instead of
# re-filtering the whole frame once per bug. Row order inside a bug is kept.
prefixes = {bug: codes.values.reshape(-1, 1)[0:activities]
            for bug, codes in test.groupby('bugID', sort=False)['code']}

# Predict slow (1) when the slow model scores the prefix strictly higher than
# the fast model, otherwise fast (0). One result frame per hidden-state count.
for frame, slowModel, fastModel in ((results1, slowModel_05, fastModel_05),
                                    (results2, slowModel_10, fastModel_10),
                                    (results3, slowModel_15, fastModel_15)):
    frame['pred'] = [int(slowModel.score(X = prefixes[bug]) > fastModel.score(X = prefixes[bug]))
                     for bug in testChans]

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  np.log(self.startprob_),
  np.log(self.transmat_),


In [87]:
# Per-class precision / recall / F-score plus overall accuracy for each
# hidden-state count. `labels` is pinned to [0, 1] so that index 1 of every
# returned array always refers to the isSlow == 1 class, regardless of which
# labels happen to occur in the predictions.
CLASS_LABELS = [0, 1]

yTrue, yPred = results1.isSlow.values, results1.pred.values
precisionA2_H5, recallA2_H5, fscoreA2_H5, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA2_H5 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results2.isSlow.values, results2.pred.values
precisionA2_H10, recallA2_H10, fscoreA2_H10, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA2_H10 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results3.isSlow.values, results3.pred.values
precisionA2_H15, recallA2_H15, fscoreA2_H15, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA2_H15 = accuracy_score(y_true = yTrue, y_pred = yPred)

In [88]:
# Print the scores for each hidden-state count, with a rule between consecutive
# reports. Index 1 of the per-class arrays is reported (the isSlow == 1 class
# when both labels occur in the data).
summaries = [
    ('2 Activity, 5 Hidden state', precisionA2_H5, recallA2_H5, fscoreA2_H5, accuracyA2_H5),
    ('2 Activity, 10 Hidden state', precisionA2_H10, recallA2_H10, fscoreA2_H10, accuracyA2_H10),
    ('2 Activity, 15 Hidden state', precisionA2_H15, recallA2_H15, fscoreA2_H15, accuracyA2_H15),
]
for position, (heading, precision, recall, fscore, accuracy) in enumerate(summaries):
    if position > 0:
        print('============================================')
    print(heading)
    print('Precision {0}'.format(precision[1]))
    print('Recall {0}'.format(recall[1]))
    print('FScore {0}'.format(fscore[1]))
    print('Accuracy {0}'.format(accuracy))

2 Activity, 5 Hidden state
Precision 0.5850839449928288
Recall 0.5679305544181475
FScore 0.5763796542553191
Accuracy 0.7051797437603031
2 Activity, 10 Hidden state
Precision 0.5585639942680444
Recall 0.6065023339611826
FScore 0.5815469179426778
Accuracy 0.6917604187754866
2 Activity, 15 Hidden state
Precision 0.57385819227084
Recall 0.5885676848742937
FScore 0.5811198706286639
Accuracy 0.7003499436041299


In [89]:
# Number of leading rows of each test bug that the models get to see.
activities = 3

# One row per distinct (bugID, isSlow) pair of the test set; 'pred' holds the
# placeholder -1 until the predictions are written below.
results1 = (test.loc[:, ['bugID', 'isSlow']]
                .drop_duplicates()
                .reset_index(drop=True)
                .assign(pred=-1))
results2 = results1.copy()
results3 = results1.copy()
testChans = results1.bugID.values

# Cut every bug's observation prefix in one pass over `test` instead of
# re-filtering the whole frame once per bug. Row order inside a bug is kept.
prefixes = {bug: codes.values.reshape(-1, 1)[0:activities]
            for bug, codes in test.groupby('bugID', sort=False)['code']}

# Predict slow (1) when the slow model scores the prefix strictly higher than
# the fast model, otherwise fast (0). One result frame per hidden-state count.
for frame, slowModel, fastModel in ((results1, slowModel_05, fastModel_05),
                                    (results2, slowModel_10, fastModel_10),
                                    (results3, slowModel_15, fastModel_15)):
    frame['pred'] = [int(slowModel.score(X = prefixes[bug]) > fastModel.score(X = prefixes[bug]))
                     for bug in testChans]

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  np.log(self.startprob_),
  np.log(self.transmat_),


In [90]:
# Per-class precision / recall / F-score plus overall accuracy for each
# hidden-state count. `labels` is pinned to [0, 1] so that index 1 of every
# returned array always refers to the isSlow == 1 class, regardless of which
# labels happen to occur in the predictions.
CLASS_LABELS = [0, 1]

yTrue, yPred = results1.isSlow.values, results1.pred.values
precisionA3_H5, recallA3_H5, fscoreA3_H5, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA3_H5 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results2.isSlow.values, results2.pred.values
precisionA3_H10, recallA3_H10, fscoreA3_H10, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA3_H10 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results3.isSlow.values, results3.pred.values
precisionA3_H15, recallA3_H15, fscoreA3_H15, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA3_H15 = accuracy_score(y_true = yTrue, y_pred = yPred)

In [91]:
# Print the scores for each hidden-state count, with a rule between consecutive
# reports. Index 1 of the per-class arrays is reported (the isSlow == 1 class
# when both labels occur in the data).
summaries = [
    ('3 Activity, 5 Hidden state', precisionA3_H5, recallA3_H5, fscoreA3_H5, accuracyA3_H5),
    ('3 Activity, 10 Hidden state', precisionA3_H10, recallA3_H10, fscoreA3_H10, accuracyA3_H10),
    ('3 Activity, 15 Hidden state', precisionA3_H15, recallA3_H15, fscoreA3_H15, accuracyA3_H15),
]
for position, (heading, precision, recall, fscore, accuracy) in enumerate(summaries):
    if position > 0:
        print('============================================')
    print(heading)
    print('Precision {0}'.format(precision[1]))
    print('Recall {0}'.format(recall[1]))
    print('FScore {0}'.format(fscore[1]))
    print('Accuracy {0}'.format(accuracy))

3 Activity, 5 Hidden state
Precision 0.5569649446494465
Recall 0.7910900008189338
FScore 0.6536964980544746
Accuracy 0.7039939844405241
3 Activity, 10 Hidden state
Precision 0.6126649798263011
Recall 0.7336827450659241
FScore 0.6677349631065068
Accuracy 0.7421407293865865
3 Activity, 15 Hidden state
Precision 0.6233357193987116
Recall 0.713127507984604
FScore 0.6652152324204577
Accuracy 0.7465077941984556


In [92]:
# Number of leading rows of each test bug that the models get to see.
activities = 4

# One row per distinct (bugID, isSlow) pair of the test set; 'pred' holds the
# placeholder -1 until the predictions are written below.
results1 = (test.loc[:, ['bugID', 'isSlow']]
                .drop_duplicates()
                .reset_index(drop=True)
                .assign(pred=-1))
results2 = results1.copy()
results3 = results1.copy()
testChans = results1.bugID.values

# Cut every bug's observation prefix in one pass over `test` instead of
# re-filtering the whole frame once per bug. Row order inside a bug is kept.
prefixes = {bug: codes.values.reshape(-1, 1)[0:activities]
            for bug, codes in test.groupby('bugID', sort=False)['code']}

# Predict slow (1) when the slow model scores the prefix strictly higher than
# the fast model, otherwise fast (0). One result frame per hidden-state count.
for frame, slowModel, fastModel in ((results1, slowModel_05, fastModel_05),
                                    (results2, slowModel_10, fastModel_10),
                                    (results3, slowModel_15, fastModel_15)):
    frame['pred'] = [int(slowModel.score(X = prefixes[bug]) > fastModel.score(X = prefixes[bug]))
                     for bug in testChans]

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  np.log(self.startprob_),
  np.log(self.transmat_),


In [93]:
# Per-class precision / recall / F-score plus overall accuracy for each
# hidden-state count. `labels` is pinned to [0, 1] so that index 1 of every
# returned array always refers to the isSlow == 1 class, regardless of which
# labels happen to occur in the predictions.
CLASS_LABELS = [0, 1]

yTrue, yPred = results1.isSlow.values, results1.pred.values
precisionA4_H5, recallA4_H5, fscoreA4_H5, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA4_H5 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results2.isSlow.values, results2.pred.values
precisionA4_H10, recallA4_H10, fscoreA4_H10, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA4_H10 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results3.isSlow.values, results3.pred.values
precisionA4_H15, recallA4_H15, fscoreA4_H15, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA4_H15 = accuracy_score(y_true = yTrue, y_pred = yPred)

In [94]:
# Print the scores for each hidden-state count, with a rule between consecutive
# reports. Index 1 of the per-class arrays is reported (the isSlow == 1 class
# when both labels occur in the data).
summaries = [
    ('4 Activity, 5 Hidden state', precisionA4_H5, recallA4_H5, fscoreA4_H5, accuracyA4_H5),
    ('4 Activity, 10 Hidden state', precisionA4_H10, recallA4_H10, fscoreA4_H10, accuracyA4_H10),
    ('4 Activity, 15 Hidden state', precisionA4_H15, recallA4_H15, fscoreA4_H15, accuracyA4_H15),
]
for position, (heading, precision, recall, fscore, accuracy) in enumerate(summaries):
    if position > 0:
        print('============================================')
    print(heading)
    print('Precision {0}'.format(precision[1]))
    print('Recall {0}'.format(recall[1]))
    print('FScore {0}'.format(fscore[1]))
    print('Accuracy {0}'.format(accuracy))

4 Activity, 5 Hidden state
Precision 0.6048638684061259
Recall 0.6986323806404062
FScore 0.6483754512635379
Accuracy 0.7323943661971831
4 Activity, 10 Hidden state
Precision 0.6121879888458138
Recall 0.7371222668086151
FScore 0.6688712194396969
Accuracy 0.7422564132226624
4 Activity, 15 Hidden state
Precision 0.6312354988399071
Recall 0.7129637212349521
FScore 0.6696150444179517
Accuracy 0.7515400410677618


In [95]:
# Number of leading rows of each test bug that the models get to see.
activities = 5

# One row per distinct (bugID, isSlow) pair of the test set; 'pred' holds the
# placeholder -1 until the predictions are written below.
results1 = (test.loc[:, ['bugID', 'isSlow']]
                .drop_duplicates()
                .reset_index(drop=True)
                .assign(pred=-1))
results2 = results1.copy()
results3 = results1.copy()
testChans = results1.bugID.values

# Cut every bug's observation prefix in one pass over `test` instead of
# re-filtering the whole frame once per bug. Row order inside a bug is kept.
prefixes = {bug: codes.values.reshape(-1, 1)[0:activities]
            for bug, codes in test.groupby('bugID', sort=False)['code']}

# Predict slow (1) when the slow model scores the prefix strictly higher than
# the fast model, otherwise fast (0). One result frame per hidden-state count.
for frame, slowModel, fastModel in ((results1, slowModel_05, fastModel_05),
                                    (results2, slowModel_10, fastModel_10),
                                    (results3, slowModel_15, fastModel_15)):
    frame['pred'] = [int(slowModel.score(X = prefixes[bug]) > fastModel.score(X = prefixes[bug]))
                     for bug in testChans]

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  np.log(self.startprob_),
  np.log(self.transmat_),


In [96]:
# Per-class precision / recall / F-score plus overall accuracy for each
# hidden-state count. `labels` is pinned to [0, 1] so that index 1 of every
# returned array always refers to the isSlow == 1 class, regardless of which
# labels happen to occur in the predictions.
CLASS_LABELS = [0, 1]

yTrue, yPred = results1.isSlow.values, results1.pred.values
precisionA5_H5, recallA5_H5, fscoreA5_H5, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA5_H5 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results2.isSlow.values, results2.pred.values
precisionA5_H10, recallA5_H10, fscoreA5_H10, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA5_H10 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results3.isSlow.values, results3.pred.values
precisionA5_H15, recallA5_H15, fscoreA5_H15, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA5_H15 = accuracy_score(y_true = yTrue, y_pred = yPred)

In [97]:
# Print the scores for each hidden-state count, with a rule between consecutive
# reports. Index 1 of the per-class arrays is reported (the isSlow == 1 class
# when both labels occur in the data).
summaries = [
    ('5 Activity, 5 Hidden state', precisionA5_H5, recallA5_H5, fscoreA5_H5, accuracyA5_H5),
    ('5 Activity, 10 Hidden state', precisionA5_H10, recallA5_H10, fscoreA5_H10, accuracyA5_H10),
    ('5 Activity, 15 Hidden state', precisionA5_H15, recallA5_H15, fscoreA5_H15, accuracyA5_H15),
]
for position, (heading, precision, recall, fscore, accuracy) in enumerate(summaries):
    if position > 0:
        print('============================================')
    print(heading)
    print('Precision {0}'.format(precision[1]))
    print('Recall {0}'.format(recall[1]))
    print('FScore {0}'.format(fscore[1]))
    print('Accuracy {0}'.format(accuracy))

5 Activity, 5 Hidden state
Precision 0.6151122441753417
Recall 0.7113258537384326
FScore 0.6597296065623576
Accuracy 0.7408682071897504
5 Activity, 10 Hidden state
Precision 0.6088654425870101
Recall 0.7277864220784539
FScore 0.6630357742380723
Accuracy 0.7387569771813634
5 Activity, 15 Hidden state
Precision 0.6313061254289261
Recall 0.7081320121202195
FScore 0.6675158252277289
Accuracy 0.7508748590103248


In [98]:
# Number of leading rows of each test bug that the models get to see.
activities = 6

# One row per distinct (bugID, isSlow) pair of the test set; 'pred' holds the
# placeholder -1 until the predictions are written below.
results1 = (test.loc[:, ['bugID', 'isSlow']]
                .drop_duplicates()
                .reset_index(drop=True)
                .assign(pred=-1))
results2 = results1.copy()
results3 = results1.copy()
testChans = results1.bugID.values

# Cut every bug's observation prefix in one pass over `test` instead of
# re-filtering the whole frame once per bug. Row order inside a bug is kept.
prefixes = {bug: codes.values.reshape(-1, 1)[0:activities]
            for bug, codes in test.groupby('bugID', sort=False)['code']}

# Predict slow (1) when the slow model scores the prefix strictly higher than
# the fast model, otherwise fast (0). One result frame per hidden-state count.
for frame, slowModel, fastModel in ((results1, slowModel_05, fastModel_05),
                                    (results2, slowModel_10, fastModel_10),
                                    (results3, slowModel_15, fastModel_15)):
    frame['pred'] = [int(slowModel.score(X = prefixes[bug]) > fastModel.score(X = prefixes[bug]))
                     for bug in testChans]

  return np.log(self.emissionprob_)[:, np.concatenate(X)].T
  np.log(self.startprob_),
  np.log(self.transmat_),


In [99]:
# Per-class precision / recall / F-score plus overall accuracy for each
# hidden-state count. `labels` is pinned to [0, 1] so that index 1 of every
# returned array always refers to the isSlow == 1 class, regardless of which
# labels happen to occur in the predictions.
CLASS_LABELS = [0, 1]

yTrue, yPred = results1.isSlow.values, results1.pred.values
precisionA6_H5, recallA6_H5, fscoreA6_H5, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA6_H5 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results2.isSlow.values, results2.pred.values
precisionA6_H10, recallA6_H10, fscoreA6_H10, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA6_H10 = accuracy_score(y_true = yTrue, y_pred = yPred)

yTrue, yPred = results3.isSlow.values, results3.pred.values
precisionA6_H15, recallA6_H15, fscoreA6_H15, _ = precision_recall_fscore_support(
    y_true = yTrue, y_pred = yPred, labels = CLASS_LABELS)
accuracyA6_H15 = accuracy_score(y_true = yTrue, y_pred = yPred)

In [100]:
# Print the scores for each hidden-state count, with a rule between consecutive
# reports. Index 1 of the per-class arrays is reported (the isSlow == 1 class
# when both labels occur in the data).
summaries = [
    ('6 Activity, 5 Hidden state', precisionA6_H5, recallA6_H5, fscoreA6_H5, accuracyA6_H5),
    ('6 Activity, 10 Hidden state', precisionA6_H10, recallA6_H10, fscoreA6_H10, accuracyA6_H10),
    ('6 Activity, 15 Hidden state', precisionA6_H15, recallA6_H15, fscoreA6_H15, accuracyA6_H15),
]
for position, (heading, precision, recall, fscore, accuracy) in enumerate(summaries):
    if position > 0:
        print('============================================')
    print(heading)
    print('Precision {0}'.format(precision[1]))
    print('Recall {0}'.format(recall[1]))
    print('FScore {0}'.format(fscore[1]))
    print('Accuracy {0}'.format(accuracy))

6 Activity, 5 Hidden state
Precision 0.6198581560283688
Recall 0.7157480959790353
FScore 0.6643609136862909
Accuracy 0.7445990109032016
6 Activity, 10 Hidden state
Precision 0.6026507979442791
Recall 0.7298337564491033
FScore 0.6601725989851477
Accuracy 0.7346502010006651
6 Activity, 15 Hidden state
Precision 0.6290919648689846
Recall 0.709769879616739
FScore 0.6670001539171925
Accuracy 0.7497180206495647
