In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
import csv
from scipy.io import arff
from sklearn import datasets
from sklearn.cluster import KMeans
import numpy as np

In [2]:
act = pd.read_csv('./UniversityDataset1/test.csv') ## add .csv filename of complete data 
permit = act[act['class'] == "p"]
df1 = pd.DataFrame(permit)
df = df1.drop(columns = 'class')

In [3]:
import math
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk import ngrams
from scipy import cluster
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.sparse import csr_matrix
from scipy.spatial.distance import squareform
from scipy.special import comb
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import adjusted_rand_score, homogeneity_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

# import scipy.spatial.distance as ssd

class nTreeClus:
    def __init__(self, sequences, n, method, ntree=10, C= None, verbose=1):
        """ nTreeClus is a clustering method by Hadi Jahanshahi and Mustafa Gokce Baydogan.
        The method is suitable for clustering categorical time series (sequences). 
        You can always have access to the examples and description in 
        https://github.com/HadiJahanshahi/nTreeClus
        If you have any question about the code, you may email hadijahanshahi [a t] gmail . com
        
        prerequisites:
            numpy
            pandas
            sklearn
            scipy
        
        Args:
            sequences: a list of sequences to be clustered
            n: "the window length" or "n" in nTreeclus. You may provide it or it will be
                calculated automatically if no input has been suggested.
                Currently, the default value of "the square root of average sequences' lengths" is taken.
            method: 
                DT:          Decision Tree
                DT_position: Decision Tree enhanced by position index
                RF:          Random Forest
                RF_position: Random Forest enhanced by position index
                All:         all four methods
            ntree: number of trees to be used in RF method. The default value is 10. 
                (Setting a small value decreases accuracy, and a large value may increase the complexity. 
                 no less than 5 and no greater than 20 is recommended.)
            C: number of clusters. If it is not provided, it will be calculated using silhouette_score.
            verbose [binary]: It indicates whether to print the outputs or not. 

        Returns:
            'C_DT': "the optimal number of clusters for Decision Tree",
            'C_RF': "the optimal number of clusters for Random Forest",
            'Parameter n': the parameter of the nTreeClus (n) - either calculated or manually entered
            'distance_DT': "sparse distance between sequences for Decision Tree",
            'distance_RF': "sparse distance between sequences for Random Forest",
            'labels_DT': "labels based on the optimal number of clusters for DT",
            'labels_RF': "labels based on the optimal number of clusters for RF".
                
                NOTE: in order to convert the distance output to a square distance matrix, 
                    "scipy.spatial.distance.squareform" should be used.
                    
        ## simple example with the output
        sequences = ['evidence','evident','provide','unconventional','convene']
        model     = nTreeClus(sequences, n = None, ntree=5, method = "All")
        model.nTreeClus()
        model.output()
        # {'C_DT': 2,
        # 'distance_DT': array([0.05508882, 0.43305329, 0.68551455, 0.43305329, 0.5       ,
        #        0.7226499 , 0.5       , 0.86132495, 0.75      , 0.4452998 ]),
        # 'labels_DT': array([0, 0, 0, 1, 1]),
        # 'C_RF': 2,
        # 'distance_RF': array([0.10557281, 0.5527864 , 0.58960866, 0.64222912, 0.55      ,
        #       0.72470112, 0.7       , 0.83940899, 0.95      , 0.26586965]),
        # 'labels_RF': array([0, 0, 0, 1, 1]),
        # 'Parameter n': 4}
        """
        self.n                                 = n   # Parameter n
        self.method                            = method
        self.ntree                             = ntree
        self.C_DT                              = C
        self.C_RF                              = C
        self.C_DT_p                            = C
        self.C_RF_p                            = C
        self.sequences                         = sequences
        self.seg_mat                           = None
        self.Dist_tree_terminal_cosine         = None # distance_DT
        self.assignment_tree_terminal_cosine   = None # labels_DT
        self.Dist_tree_terminal_cosine_p       = None # distance_DT + position
        self.assignment_tree_terminal_cosine_p = None # labels_DT   + position
        self.Dist_RF_terminal_cosine           = None # distance_RF
        self.assignment_RF_terminal_cosine     = None # labels_RF
        self.Dist_RF_terminal_cosine_p         = None # distance_RF + position
        self.assignment_RF_terminal_cosine_p   = None # labels_RF   + position
        self.verbose                           = verbose
        self.running_timeSegmentation          = None
        self.running_timeDT                    = None
        self.running_timeDT_p                  = None
        self.running_timeRF                    = None
        self.running_timeRF_p                  = None
    
    @staticmethod
    def purity_score(clusters, classes):
        """
        Calculate the purity score for the given cluster assignments and ground truth classes
        
        :param clusters: the cluster assignments array
        :type clusters: numpy.array
        
        :param classes: the ground truth classes
        :type classes: numpy.array
        
        :returns: the purity score
        :rtype: float
        """
        clusters = np.array(clusters)
        classes = np.array(classes)
        A = np.c_[(clusters,classes)]

        n_accurate = 0.

        for j in np.unique(A[:,0]):
            z = A[A[:,0] == j, 1]
            x = np.argmax(np.bincount(z))
            n_accurate += len(z[z == x])

        return n_accurate / A.shape[0]
    
    @staticmethod
    def rand_index_score(clusters, classes):
        clusters = np.array(clusters)
        classes = np.array(classes)
        tp_plus_fp = comb(np.bincount(clusters), 2).sum()
        tp_plus_fn = comb(np.bincount(classes), 2).sum()
        A = np.c_[(clusters, classes)]
        tp = sum(comb(np.bincount(A[A[:, 0] == i, 1]), 2).sum()
                for i in set(clusters))
        fp = tp_plus_fp - tp
        fn = tp_plus_fn - tp
        tn = comb(len(A), 2) - tp - fp - fn
        return (tp + tn) / (tp + fp + fn + tn)
    
    @staticmethod
    def _1nn(Ground_Truth, distance):
        jj = 0
        distance_sqr = pd.DataFrame(squareform(distance))
        for ii in (range(distance_sqr.shape[0])):
            the_shortest_dist = distance_sqr.iloc[ii].drop(ii).idxmin()
            if Ground_Truth[ii] == Ground_Truth[the_shortest_dist]:
                jj += 1 
        return ((jj)/distance_sqr.shape[0])
      
    def matrix_segmentation(self):
        seg_mat_list = []
        for i in tqdm(range(len(self.sequences)), desc="Matrix Segmentation (Splitting based on window size)", 
                      disable=1-self.verbose):
            sentence = self.sequences[i]
            ngrams_  = ngrams(list(sentence), self.n)
            for idx, gram in enumerate(ngrams_):
                seg_mat_list.append(list(gram + (idx,) + (i,)))
        self.seg_mat         = pd.DataFrame(seg_mat_list)
        # renaming the column indexes
        self.seg_mat.columns = np.append(np.arange(0,self.n-1),('Class', 'Position', 'OriginalMAT_element')) 

    def finding_the_number_of_clusters(self, HC_tree_terminal_cosine, Dist_tree_terminal_cosine, which_one):
        """
        which_one can take the values of either "DT" or "RF".
        """
        max_clusters = min(11, len(self.sequences))
        ress_sil = []
        for i in tqdm(range(2, max_clusters), desc=f"Finding the best number of clusters ({which_one})", disable=1-self.verbose):
            assignment_tree_terminal_cosine = cluster.hierarchy.cut_tree(HC_tree_terminal_cosine,i).ravel() #.ravel makes it 1D array.
            ress_sil.append((silhouette_score(squareform(Dist_tree_terminal_cosine),
                                              assignment_tree_terminal_cosine,metric='cosine').round(3)*1000)/1000)
        if which_one == 'DT':
            self.C_DT = ress_sil.index(max(ress_sil)) + 2
        elif which_one == 'RF':
            self.C_RF = ress_sil.index(max(ress_sil)) + 2
        elif which_one == 'DT_position':
            self.C_DT_p = ress_sil.index(max(ress_sil)) + 2
        elif which_one == 'RF_position':
            self.C_RF_p = ress_sil.index(max(ress_sil)) + 2

    def nTreeClus(self):
        ############# pre processing #################
        if self.n is None:
            if self.verbose: print("Finding the parameter 'n'")
            min_length = min(map(len, self.sequences))
            total_avg  = round(sum( map(len, self.sequences) ) / len(self.sequences)) # average length of strings
            self.n     = min(round(total_avg**0.5)+1, min_length-1)
            if self.verbose: print(f"Parameter 'n' is set to {self.n}")
        if (self.n < 3):
            raise ValueError("""Parameter n could not be less than 3.
                                Remove the sequences with the length shorter than 3 and then re-run the function.""")
        
        ############# matrix segmentation #################
        start_time                    = time.time()
        self.matrix_segmentation()
        self.running_timeSegmentation = round(time.time() - start_time)

        # dummy variable for DT and RF
        if self.verbose: print("one-hot encoding + x/y train")
        le                            = preprocessing.LabelEncoder()
        self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers
        # creating dummy columns for categorical data; one-hot encoding
        self.seg_mat                  = pd.get_dummies(self.seg_mat).reset_index(drop=True)
        
        ############# nTreeClus method using DT #################        
        if (self.method in ["All","DT"]):
            start_time                                   = time.time()
            xtrain                                       = self.seg_mat.drop(labels=['OriginalMAT_element', 'Position', 'Class'],
                                                                             axis=1).copy()
            ytrain                                       = self.seg_mat['Class'].copy()
            dtree                                        = DecisionTreeClassifier()
            if self.verbose: print("Fit DT")
            fitted_tree                                  = dtree.fit(X=xtrain,y=ytrain)
            ### finding the terminal nodes.
            terminal_tree                                = fitted_tree.tree_.apply(xtrain.values.astype('float32')) #terminal output
            if self.verbose: print("DataFrame of terminal nodes")
            terminal_output_tree                         = pd.DataFrame(terminal_tree)
            terminal_output_tree ['OriginalMAT_element'] = self.seg_mat['OriginalMAT_element'].values
            terminal_output_tree.columns                 = ['ter','OriginalMAT_element']
            i, r                                         = pd.factorize(terminal_output_tree['OriginalMAT_element'])
            j, c                                         = pd.factorize(terminal_output_tree['ter'])
            ij, tups                                     = pd.factorize(list(zip(i, j)))
            terminal_output_tree_F                       = csr_matrix((np.bincount(ij), tuple(zip(*tups))))
            if self.verbose: print("Determining the cosine Distance")
            self.Dist_tree_terminal_cosine               = squareform(np.round(1-cosine_similarity(terminal_output_tree_F),
                                                                               8))
            if self.verbose: print("Applying Ward Linkage")
            self.HC_tree_terminal_cosine                 = linkage(self.Dist_tree_terminal_cosine, 'ward')
            #finding the number of clusters
            if self.C_DT is None:
                if self.verbose: print("Finding the optimal number of clusters")
                self.finding_the_number_of_clusters(self.HC_tree_terminal_cosine, 
                                                    self.Dist_tree_terminal_cosine, "DT")
            # assigning the correct label
            if self.verbose: print("Cutting The Tree")
            self.assignment_tree_terminal_cosine = cluster.hierarchy.cut_tree(self.HC_tree_terminal_cosine, 
                                                                              self.C_DT).ravel() #.ravel makes it 1D array.
            self.running_timeDT                          = round(time.time() - start_time)
            
        ############# nTreeClus method using DT + Position #################        
        if (self.method in ["All","DT_position"]):
            start_time                                   = time.time()
            xtrain                                       = self.seg_mat.drop(labels=['OriginalMAT_element', 'Class'],
                                                                             axis=1).copy()
            ytrain                                       = self.seg_mat['Class'].copy()
            dtree                                        = DecisionTreeClassifier()
            if self.verbose: print("Fit DT + POSITION")
            fitted_tree                                  = dtree.fit(X=xtrain,y=ytrain)
            ### finding the terminal nodes.
            terminal_tree                                = fitted_tree.tree_.apply(xtrain.values.astype('float32')) #terminal output
            if self.verbose: print("DataFrame of terminal nodes")
            terminal_output_tree                         = pd.DataFrame(terminal_tree)
            terminal_output_tree ['OriginalMAT_element'] = self.seg_mat['OriginalMAT_element'].values
            terminal_output_tree.columns                 = ['ter','OriginalMAT_element']
            i, r                                         = pd.factorize(terminal_output_tree['OriginalMAT_element'])
            j, c                                         = pd.factorize(terminal_output_tree['ter'])
            ij, tups                                     = pd.factorize(list(zip(i, j)))
            terminal_output_tree_F                       = csr_matrix((np.bincount(ij), tuple(zip(*tups))))
            if self.verbose: print("Determining the cosine Distance")
            self.Dist_tree_terminal_cosine_p               = squareform(np.round(1-cosine_similarity(terminal_output_tree_F),
                                                                               8))
            if self.verbose: print("Applying Ward Linkage")
            self.HC_tree_terminal_cosine_p                 = linkage(self.Dist_tree_terminal_cosine_p, 'ward')
            #finding the number of clusters
            if self.C_DT_p is None:
                if self.verbose: print("Finding the optimal number of clusters")
                self.finding_the_number_of_clusters(self.HC_tree_terminal_cosine_p, 
                                                    self.Dist_tree_terminal_cosine_p, "DT_position")
            # assigning the correct label
            if self.verbose: print("Cutting The Tree")
            self.assignment_tree_terminal_cosine_p = cluster.hierarchy.cut_tree(self.HC_tree_terminal_cosine_p,
                                                                                self.C_DT_p).ravel() #.ravel makes it 1D array.
            self.running_timeDT_p                          = round(time.time() - start_time)
            
        ############# nTreeClus method using RF #################
        if (self.method in ["All","RF"]):
            start_time                                     = time.time()
            xtrain                                         = self.seg_mat.drop(labels=['OriginalMAT_element', 'Position', 'Class'],
                                                                               axis=1).copy()
            ytrain                                         = self.seg_mat['Class'].copy()
            np.random.seed(123)
            forest                                         = RandomForestClassifier(n_estimators=self.ntree, max_features=0.36)
            if self.verbose: print("Fit RF")
            fitted_forest                                  = forest.fit(X=xtrain, y=ytrain)
            ### Finding Terminal Nodes
            terminal_forest                                = fitted_forest.apply(xtrain) #terminal nodes access
            terminal_forest                                = pd.DataFrame(terminal_forest)
            #Adding "columnindex_" to the beginning of all  
            terminal_forest                                = terminal_forest.astype('str')
            if self.verbose: print("DataFrame of terminal nodes")
            for col in terminal_forest:
                terminal_forest[col] = '{}_'.format(col) + terminal_forest[col]
            terminal_forest.head()
            for i in range(terminal_forest.shape[1]):
                if i == 0:
                    temp                  = pd.concat([self.seg_mat['OriginalMAT_element'], 
                                                       terminal_forest[i]], ignore_index=True, axis=1)
                    rbind_terminal_forest = temp
                else:
                    temp                  = pd.concat([self.seg_mat['OriginalMAT_element'], 
                                                       terminal_forest[i]], ignore_index=True, axis=1)
                    rbind_terminal_forest = pd.concat([rbind_terminal_forest, temp], ignore_index=True)
            rbind_terminal_forest.columns                 = ['OriginalMAT_element','ter']
            i, r                                          = pd.factorize(rbind_terminal_forest['OriginalMAT_element'])
            j, c                                          = pd.factorize(rbind_terminal_forest['ter'])
            ij, tups                                      = pd.factorize(list(zip(i, j)))
            terminal_output_forest_F                      = csr_matrix((np.bincount(ij), tuple(zip(*tups))))
            if self.verbose: print("Determining the cosine Distance")
            self.Dist_RF_terminal_cosine                  = squareform(np.round(1-cosine_similarity(terminal_output_forest_F),8))
            if self.verbose: print("Applying Ward Linkage")
            self.HC_RF_terminal_cosine                    = linkage(self.Dist_RF_terminal_cosine, 'ward')
            #finding the number of clusters
            if self.C_RF is None:
                if self.verbose: print("Finding the optimal number of clusters")
                self.finding_the_number_of_clusters(self.HC_RF_terminal_cosine, 
                                                    self.Dist_RF_terminal_cosine, "RF")
            # assigning the correct label
            if self.verbose: print("Cutting The Tree")
            self.assignment_RF_terminal_cosine            = cluster.hierarchy.cut_tree(self.HC_RF_terminal_cosine,
                                                                                       self.C_RF).ravel() #.ravel makes it 1D array.
            self.running_timeRF                           = round(time.time() - start_time)
            
        ############# nTreeClus method using RF + position #################
        if (self.method in ["All","RF_position"]):
            start_time                                     = time.time()
            xtrain                                         = self.seg_mat.drop(labels=['OriginalMAT_element', 'Class'],
                                                                            axis=1).copy()
            ytrain                                         = self.seg_mat['Class'].copy()
            np.random.seed(123)
            forest                                         = RandomForestClassifier(n_estimators=self.ntree, max_features=0.36)
            if self.verbose: print("Fit RF + POSITION")
            fitted_forest                                  = forest.fit(X=xtrain, y=ytrain)
            ### Finding Terminal Nodes
            terminal_forest                                = fitted_forest.apply(xtrain) #terminal nodes access
            terminal_forest                                = pd.DataFrame(terminal_forest)
            #Adding "columnindex_" to the beginning of all  
            terminal_forest                                = terminal_forest.astype('str')
            if self.verbose: print("DataFrame of terminal nodes")
            for col in terminal_forest:
                terminal_forest[col] = '{}_'.format(col) + terminal_forest[col]
            terminal_forest.head()
            for i in range(terminal_forest.shape[1]):
                if i == 0:
                    temp                  = pd.concat([self.seg_mat['OriginalMAT_element'], 
                                                       terminal_forest[i]], ignore_index=True, axis=1)
                    rbind_terminal_forest = temp
                else:
                    temp                  = pd.concat([self.seg_mat['OriginalMAT_element'], 
                                                       terminal_forest[i]], ignore_index=True, axis=1)
                    rbind_terminal_forest = pd.concat([rbind_terminal_forest, temp], ignore_index=True)
            rbind_terminal_forest.columns                 = ['OriginalMAT_element','ter']
            i, r                                          = pd.factorize(rbind_terminal_forest['OriginalMAT_element'])
            j, c                                          = pd.factorize(rbind_terminal_forest['ter'])
            ij, tups                                      = pd.factorize(list(zip(i, j)))
            terminal_output_forest_F                      = csr_matrix((np.bincount(ij), tuple(zip(*tups))))
            if self.verbose: print("Determining the cosine Distance")
            self.Dist_RF_terminal_cosine_p                = squareform(np.round(1-cosine_similarity(terminal_output_forest_F),8))
            if self.verbose: print("Applying Ward Linkage")
            self.HC_RF_terminal_cosine_p                  = linkage(self.Dist_RF_terminal_cosine_p, 'ward')
            #finding the number of clusters
            if self.C_RF_p is None:
                if self.verbose: print("Finding the optimal number of clusters")
                self.finding_the_number_of_clusters(self.HC_RF_terminal_cosine_p, 
                                                    self.Dist_RF_terminal_cosine_p, "RF_position")
            # assigning the correct label
            if self.verbose: print("Cutting The Tree")
            self.assignment_RF_terminal_cosine_p          = cluster.hierarchy.cut_tree(self.HC_RF_terminal_cosine_p, 
                                                                                       self.C_RF_p).ravel() #.ravel makes it 1D array.
            self.running_timeRF_p                         = round(time.time() - start_time)

    def output(self):
        return {"C_DT":self.C_DT, "distance_DT":self.Dist_tree_terminal_cosine, 
                "labels_DT":self.assignment_tree_terminal_cosine, 
                "C_RF":self.C_RF, "distance_RF":self.Dist_RF_terminal_cosine, 
                "labels_RF":self.assignment_RF_terminal_cosine, 
                "C_DT_p":self.C_DT_p, "distance_DT_p":self.Dist_tree_terminal_cosine_p, 
                "labels_DT_p":self.assignment_tree_terminal_cosine_p, 
                "C_RF_p":self.C_RF_p, "distance_RF_p":self.Dist_RF_terminal_cosine_p, 
                "labels_RF_p":self.assignment_RF_terminal_cosine_p, 
                "running_timeSegmentation": self.running_timeSegmentation, 
                "running_timeDT": self.running_timeDT, "running_timeDT_p": self.running_timeDT_p,
                "running_timeRF": self.running_timeRF, "running_timeRF_p": self.running_timeRF_p,
                "Parameter n":self.n}
        
    def performance(self, Ground_Truth):
        """[Reporting the performance]

        Args:
            Ground_Truth ([list]): [list of ground truth labels]

        Returns:
            res [pandas DataFrame]: [A dataframe reporting the performance for different metrics]
        """
        self.res = pd.DataFrame()
        if (self.method in ["All","DT"]):
            predictions_DT           = pd.DataFrame({'labels':Ground_Truth, "labels_DT":self.assignment_tree_terminal_cosine})
            replacement = {}
            for i in predictions_DT.labels_DT.unique():
                replacement[i] = ((predictions_DT[predictions_DT.labels_DT == i].labels.mode()[0]))
            predictions_DT.labels_DT = predictions_DT.labels_DT.map(replacement)
            self.res.loc['DT',"F1S"] = max(score(Ground_Truth, self.assignment_tree_terminal_cosine, average='macro',zero_division=0)[2], 
                                    score(Ground_Truth, predictions_DT.labels_DT, average='macro',zero_division=0)[2]).round(3)
            self.res.loc['DT',"ARS"] = math.ceil((adjusted_rand_score(Ground_Truth, self.assignment_tree_terminal_cosine))*1000)/1000
            self.res.loc['DT',"RS"]  = math.ceil((self.rand_index_score(Ground_Truth, self.assignment_tree_terminal_cosine))*1000)/1000
            self.res.loc['DT',"Pur"] = math.ceil((self.purity_score(Ground_Truth, self.assignment_tree_terminal_cosine))*1000)/1000
            self.res.loc['DT',"Sil"] = math.ceil(silhouette_score(squareform(self.Dist_tree_terminal_cosine),
                                                             self.assignment_tree_terminal_cosine,metric='cosine').round(3)*1000)/1000
            self.res.loc['DT',"1NN"] = math.ceil((self._1nn(Ground_Truth, self.Dist_tree_terminal_cosine))*1000)/1000
        if (self.method in ["All","RF"]):
            predictions_RF = pd.DataFrame({'labels':Ground_Truth, "labels_RF":self.assignment_RF_terminal_cosine})
            # Update cluster names based on the mode of the truth labels
            replacement = {}
            for i in predictions_RF.labels_RF.unique():
                replacement[i] = ((predictions_RF[predictions_RF.labels_RF == i].labels.mode()[0]))
            predictions_RF.labels_RF = predictions_RF.labels_RF.map(replacement)
            self.res.loc['RF',"F1S"] = max(score(Ground_Truth, self.assignment_RF_terminal_cosine, average='macro',zero_division=0)[2], 
                                      score(Ground_Truth, predictions_RF.labels_RF, average='macro',zero_division=0)[2]).round(3)
            self.res.loc['RF',"ARS"] = math.ceil((adjusted_rand_score(Ground_Truth, self.assignment_RF_terminal_cosine))*1000)/1000
            self.res.loc['RF',"RS"]  = math.ceil((self.rand_index_score(Ground_Truth, self.assignment_RF_terminal_cosine))*1000)/1000
            self.res.loc['RF',"Pur"] = math.ceil((self.purity_score(Ground_Truth, self.assignment_RF_terminal_cosine))*1000)/1000
            self.res.loc['RF',"Sil"] = math.ceil(silhouette_score(squareform(self.Dist_RF_terminal_cosine),
                                                             self.assignment_RF_terminal_cosine,metric='cosine').round(3)*1000)/1000
            self.res.loc['RF',"1NN"] = math.ceil((self._1nn(Ground_Truth, self.Dist_RF_terminal_cosine))*1000)/1000
        if (self.method in ["All","DT_position"]):
            predictions_DT = pd.DataFrame({'labels':Ground_Truth, "labels_DT":self.assignment_tree_terminal_cosine_p})
            replacement = {}
            for i in predictions_DT.labels_DT.unique():
                replacement[i] = ((predictions_DT[predictions_DT.labels_DT == i].labels.mode()[0]))
            predictions_DT.labels_DT = predictions_DT.labels_DT.map(replacement)
            self.res.loc['DT_p',"F1S"] = max(score(Ground_Truth, self.assignment_tree_terminal_cosine_p, average='macro',zero_division=0)[2], 
                                    score(Ground_Truth, predictions_DT.labels_DT, average='macro',zero_division=0)[2]).round(3)
            self.res.loc['DT_p',"ARS"] = math.ceil((adjusted_rand_score(Ground_Truth, self.assignment_tree_terminal_cosine_p))*1000)/1000
            self.res.loc['DT_p',"RS"]  = math.ceil((self.rand_index_score(Ground_Truth, self.assignment_tree_terminal_cosine_p))*1000)/1000
            self.res.loc['DT_p',"Pur"] = math.ceil((self.purity_score(Ground_Truth, self.assignment_tree_terminal_cosine_p))*1000)/1000
            self.res.loc['DT_p',"Sil"] = math.ceil(silhouette_score(squareform(self.Dist_tree_terminal_cosine_p),
                                                             self.assignment_tree_terminal_cosine_p,metric='cosine').round(3)*1000)/1000
            self.res.loc['DT_p',"1NN"] = math.ceil((self._1nn(Ground_Truth, self.Dist_tree_terminal_cosine_p))*1000)/1000
        if (self.method in ["All","RF_position"]):
            predictions_RF = pd.DataFrame({'labels':Ground_Truth, "labels_RF":self.assignment_RF_terminal_cosine_p})
            # Update cluster names based on the mode of the truth labels
            replacement = {}
            for i in predictions_RF.labels_RF.unique():
                replacement[i] = ((predictions_RF[predictions_RF.labels_RF == i].labels.mode()[0]))
            predictions_RF.labels_RF = predictions_RF.labels_RF.map(replacement)
            self.res.loc['RF_p',"F1S"] = max(score(Ground_Truth, self.assignment_RF_terminal_cosine_p, average='macro',zero_division=0)[2], 
                                      score(Ground_Truth, predictions_RF.labels_RF, average='macro',zero_division=0)[2]).round(3)
            self.res.loc['RF_p',"ARS"] = math.ceil((adjusted_rand_score(Ground_Truth, self.assignment_RF_terminal_cosine_p))*1000)/1000
            self.res.loc['RF_p',"RS"]  = math.ceil((self.rand_index_score(Ground_Truth, self.assignment_RF_terminal_cosine_p))*1000)/1000
            self.res.loc['RF_p',"Pur"] = math.ceil((self.purity_score(Ground_Truth, self.assignment_RF_terminal_cosine_p))*1000)/1000
            self.res.loc['RF_p',"Sil"] = math.ceil(silhouette_score(squareform(self.Dist_RF_terminal_cosine_p),
                                                             self.assignment_RF_terminal_cosine_p,metric='cosine').round(3)*1000)/1000
            self.res.loc['RF_p',"1NN"] = math.ceil((self._1nn(Ground_Truth, self.Dist_RF_terminal_cosine_p))*1000)/1000            
        return self.res
    
    def plot(self, which_model, labels, save=False, color_threshold=None, linkage_method= 'ward', annotate = False, xy = (0,0)):
        if which_model == 'RF':
            distance = self.Dist_RF_terminal_cosine
        elif which_model == 'RF_position':
            distance = self.Dist_RF_terminal_cosine_p
        elif which_model == 'DT':
            distance = self.Dist_tree_terminal_cosine
        elif which_model == 'DT_position':
            distance = self.Dist_tree_terminal_cosine_p
        else:
            raise Exception(f'Model {which_model} not supported.')
        HC_tree_terminal_cosine = linkage(distance, linkage_method)
        fig = plt.figure(figsize=(25, 10))
        ax = fig.add_subplot(1, 1, 1)
        if color_threshold == None:
            dendrogram(HC_tree_terminal_cosine,labels=labels, ax=ax)
        else:
            dendrogram(HC_tree_terminal_cosine,labels=labels, ax=ax, color_threshold=color_threshold)            
        ax.tick_params(axis='x', which='major', labelsize=18, rotation=90)
        ax.tick_params(axis='y', which='major', labelsize=18)
        if annotate:
            ax.annotate(f"""
                        F1-score = {round(self.res.loc['DT_p', 'F1S'],2)}
                        ARS        = {round(self.res.loc['DT_p', 'ARS'],2)}
                        RS          = {round(self.res.loc['DT_p', 'RS'],2)}
                        Purity     = {round(self.res.loc['DT_p', 'Pur'],2)}
                        ASW       = {round(self.res.loc['DT_p', 'Sil'],2)}
                        1NN       = {round(self.res.loc['DT_p', '1NN'],2)}            
                        """, xy=xy, xytext =(0, 0), fontsize=18, 
                        textcoords='offset points', va='top', ha='left')        
        if save:
            plt.savefig(f"dendrogram_{which_model}.png", dpi=300, bbox_inches='tight')        
        return fig, ax
    
    def __version__(self):
        print('1.2.1')
    
    def updates(self):
        print("""
              - Adding Plotting option
              - Adding Executing time.
              - Adding positional version of nTreeClus 
              - Adding 1NN to the performance metrics
              - Fixing Some bugs in performance calculation
              """)

In [4]:
def concat_strings(row):
    return ''.join(row)

# 使用apply()方法和lambda表达式对每一行的数据进行处理
data = df
data['concat'] = data.apply(lambda row: concat_strings(row), axis=1)
print(data)

     ischair    crs odepartment       action position         type  \
0       TRUE  cs602          ee         read    staff       roster   
1       TRUE  ee601          cs         read  advisor   transcript   
2         na  ee101   registrar    setstatus  advisor  application   
3         na  cs602  admissions     addscore  faculty  application   
4      FALSE  cs602          ce         read  faculty   transcript   
...      ...    ...         ...          ...      ...          ...   
5995    TRUE  cs101          ee  changescore  faculty    gradebook   
5996    TRUE  ee101          ee  changescore  faculty    gradebook   
5997    TRUE  cs602          ee  changescore  faculty    gradebook   
5998    TRUE  cs602          ee  changescore  faculty    gradebook   
5999    TRUE  cs601          ee  changescore  faculty    gradebook   

     udepartment crstaught crstaken  \
0             ee     cs601    cs602   
1      registrar     ee101    cs601   
2     admissions     ee601    ee101   
3  

In [6]:
import time

result = []
for i in range(10):
    start_time = time.time()
    model = nTreeClus(list(data.concat), n=None, ntree=10, method="RF", verbose=1, C=i)
    model.nTreeClus()
    result.append(time.time() - start_time)
    print("--- %s seconds ---" % (time.time() - start_time))
for i in range(10):
    print(result[i])

Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 23296.66it/s]
  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


one-hot encoding + x/y train
Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 20.502262353897095 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 66100.61it/s]

one-hot encoding + x/y train



  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 52.56512784957886 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 6193.11it/s]


one-hot encoding + x/y train


  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 57.20030117034912 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 67961.55it/s]

one-hot encoding + x/y train



  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 19.142507791519165 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 55364.75it/s]


one-hot encoding + x/y train


  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 18.900509357452393 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 71307.65it/s]

one-hot encoding + x/y train



  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 18.683255672454834 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 29988.30it/s]
  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


one-hot encoding + x/y train
Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 19.072103023529053 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 62519.28it/s]

one-hot encoding + x/y train



  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 19.028996467590332 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 24372.33it/s]
  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


one-hot encoding + x/y train
Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 19.248244047164917 seconds ---
Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████| 6000/6000 [00:00<00:00, 30081.11it/s]
  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


one-hot encoding + x/y train
Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree
--- 19.194772243499756 seconds ---
20.502262353897095
52.56512784957886
57.20030117034912
19.142507791519165
18.900509357452393
18.683255672454834
19.072103023529053
19.028996467590332
19.248244047164917
19.194772243499756


In [6]:
data['label'] = model.output()['labels_RF']
print(data)

     ischair    crs odepartment       action position         type  \
0      FALSE  ee601          cs         read  faculty       roster   
1      FALSE  cs101          ee         read  faculty       roster   
2       TRUE  ee601          cs         read  advisor   transcript   
3       TRUE  ee101          cs         read  advisor   transcript   
4       TRUE  cs101   registrar         read    staff  application   
...      ...    ...         ...          ...      ...          ...   
7995   FALSE  ee101  admissions  changescore  faculty    gradebook   
7996   FALSE  ee601          cs  changescore  faculty    gradebook   
7997      na  cs602  admissions  changescore  faculty    gradebook   
7998    TRUE  ee601   registrar  changescore  faculty    gradebook   
7999    TRUE  ee601          ce  changescore  faculty    gradebook   

     udepartment crstaught crstaken  \
0      registrar     ee101    ee601   
1             ce     ee601    cs101   
2      registrar     ee101    cs602   
3  

In [7]:
ndata=data.drop(columns='concat')
ndata=ndata.drop(columns='label')
ndata

Unnamed: 0,ischair,crs,odepartment,action,position,type,udepartment,crstaught,crstaken
0,FALSE,ee601,cs,read,faculty,roster,registrar,ee101,ee601
1,FALSE,cs101,ee,read,faculty,roster,ce,ee601,cs101
2,TRUE,ee601,cs,read,advisor,transcript,registrar,ee101,cs602
3,TRUE,ee101,cs,read,advisor,transcript,ce,cs101,cs101
4,TRUE,cs101,registrar,read,staff,application,admissions,cs101,ee101
...,...,...,...,...,...,...,...,...,...
7995,FALSE,ee101,admissions,changescore,faculty,gradebook,ce,cs602,ee101
7996,FALSE,ee601,cs,changescore,faculty,gradebook,ee,cs602,ee601
7997,na,cs602,admissions,changescore,faculty,gradebook,ce,ee601,cs602
7998,TRUE,ee601,registrar,changescore,faculty,gradebook,ee,cs101,ee601


In [8]:
import time
start_time = time.time()

temp = pd.DataFrame()
#print(datafreq)
clusTemp = pd.DataFrame()

policy ={}
###change FN: p, FP: d or act
datasize= len(act.index)
#print("{")
for i in range(model.output()['C_RF']):
    policy[i] = {}
    
    clusTemp = ndata.loc[data['label'] == i]
    #clusTemp = FN.loc[FN['cluster'] == i]
    clussize= len(clusTemp.index)
    #print(i,":{",sep='',end="")
    #print(clusTemp.describe())
    for col in ndata.columns:
        #actual data:  act[col].value_counts().to_dict()
        # FN data: p[col].value_counts().to_dict()
        # FP data: d[col].value_counts().to_dict()
        datafreq =   act[col].value_counts().to_dict() # act[col].value_counts().to_dict()#
#         print(datafreq)
        #datafreq= pd.DataFrame.from_dict(datafreq, )
        temp=clusTemp[col].value_counts().to_dict()
#         print(temp)
        for key, value in datafreq.items():
            
            
            for key1, value1 in temp.items():
#                 print(key, key1)
                if key == key1:
                    #print(key ,value,value1)
                    x = value1/datasize
                    y = value1/clussize
                    #print((y-x))
                    #if key in centroids[i]:
                    # 在这里调整阈值
                    if (y-x) >= 0.549:
                        if col not in policy[i]:
                            policy[i][col] = key
                    for col2 in ndata.columns:
                                if col !=col2:
                                    clusTemp = ndata.loc[data['label'] == i]
#                                     clus = clusTemp.loc[clusTemp[col] == clusTemp[col2]]
                                    clus = clusTemp.loc[clusTemp[col].astype(str) == clusTemp[col2].astype(str)]
                                    if len(clus) >= len(clusTemp)/2:
                                        #print("'",col,"':'",col2,"',",sep='',end="")
                                        if col not in policy[i] and col2 not in policy[i]: 
                                            policy[i][col] = col2
                                        else:
                                            if col in policy[i]: 
                                                policy[i][col] = col2
                                            if col2 in policy[i] :
                                                del policy[i][col2]
print("--- %s seconds ---" % (time.time() - start_time))    
print(sum(len(v) for v in policy.values()))
print(policy)

--- 0.9396722316741943 seconds ---
34
{0: {'action': 'read', 'position': 'faculty', 'type': 'roster', 'crstaken': 'crs'}, 1: {'ischair': 'TRUE', 'action': 'read', 'type': 'transcript'}, 2: {'action': 'read', 'type': 'application', 'udepartment': 'admissions'}, 3: {'type': 'gradebook', 'udepartment': 'registrar', 'crstaken': 'crstaught'}, 4: {'action': 'checkstatus', 'position': 'student', 'type': 'application'}, 5: {'action': 'addscore', 'type': 'gradebook', 'crstaken': 'crstaught'}, 6: {'action': 'read', 'type': 'roster', 'udepartment': 'registrar'}, 7: {'action': 'changescore', 'position': 'faculty', 'type': 'gradebook', 'crstaken': 'crs'}, 8: {'action': 'readmyscores', 'type': 'gradebook', 'crstaken': 'crs'}, 9: {'odepartment': 'admissions', 'action': 'read', 'position': 'faculty', 'type': 'roster', 'crstaken': 'crs'}}


In [9]:
## clean extracted policy by findsimilar rules and clean them
from itertools import chain
from collections import defaultdict
rules = policy

def jaccard_similarity(list1, list2):
    intersection = (set(list1.values()).intersection(list2.values()))
    #print(list(set(list1).intersection(list2)))
    union = defaultdict(list)
    for k, v in chain(list1.items(), list2.items()):
        #if v not in intersection:
            union[k].append(v)
    #print(union)    
    
    intersectionl = len(intersection)
    unionl = len(union)
    if unionl == 0:
        score = 0.0
        return 
    else:
        score = float(intersectionl / unionl)
    if score > 0.7:
                #print(i,j)
                print(key, ",",key2," score =", score,unionl,intersectionl)
                print( (set(list1).intersection(list2)),"\n" ,union)
                #print(intersectionl)
    return float(intersectionl / unionl)

for key, value in policy.items():
    
    for key2, v  in rules.items():
    #print(key, value)
        
        if key !=key2:
            
            #print(key, key2)
            """for z, j in value.items():
                for y, i in v.items():
                     if z == y:"""
            #score = jaccard_similarity_score(value, v)
            score = jaccard_similarity(value, v)
                        #print(i,j)
            """if score > 0.6:
                #print(i,j)
                print(key, ",",k," score =", score)
    """
print(policy)

0 , 9  score = 0.8 5 4
{'action', 'crstaken', 'type', 'position'} 
 defaultdict(<class 'list'>, {'action': ['read', 'read'], 'position': ['faculty', 'faculty'], 'type': ['roster', 'roster'], 'crstaken': ['crs', 'crs'], 'odepartment': ['admissions']})
9 , 0  score = 0.8 5 4
{'action', 'crstaken', 'type', 'position'} 
 defaultdict(<class 'list'>, {'odepartment': ['admissions'], 'action': ['read', 'read'], 'position': ['faculty', 'faculty'], 'type': ['roster', 'roster'], 'crstaken': ['crs', 'crs']})
{0: {'action': 'read', 'position': 'faculty', 'type': 'roster', 'crstaken': 'crs'}, 1: {'ischair': 'TRUE', 'action': 'read', 'type': 'transcript'}, 2: {'action': 'read', 'type': 'application', 'udepartment': 'admissions'}, 3: {'type': 'gradebook', 'udepartment': 'registrar', 'crstaken': 'crstaught'}, 4: {'action': 'checkstatus', 'position': 'student', 'type': 'application'}, 5: {'action': 'addscore', 'type': 'gradebook', 'crstaken': 'crstaught'}, 6: {'action': 'read', 'type': 'roster', 'udepar

In [10]:
# run the extracted policy over the complete data
import time
def dict_compare(d1, d2):
    d1_keys = set(d1.keys())
    d2_keys = set(d2.keys())

    intersect_keys = d1_keys.intersection(d2_keys)
 
    relation = []
    added = d1_keys - d2_keys
    removed = d2_keys - d1_keys
  
    same = set(o for o in intersect_keys if  d1[o] in d2[o] )
   
    if len(same) == len(intersect_keys):
        return added, removed, same, relation
    for key,  o in d2.items():
        if type(o) == str:
                
 
            temp = set(i for i in intersect_keys if type(d2[i]) == str and d2[i] in d1_keys and d1[i] == d1[d2[i]] )
                #print(temp)
            if len(temp)>0:
                    relation = temp
        
           
    return added, removed, same, relation


def ruleCheck (row):
    lable = 'd'
    drow = row.to_dict()
    
    for key,value in policy.items():
        #print(key)
#         返回策略与本条数据比较的结果，包括有多少值相等，满足多少属性关系
        added, removed, same, relation = dict_compare(drow, value)
#         这行是关键，比较是否满足策略，比较满足的属性值与属性关系是否能比全部值的数量多，如果多，则全覆盖了，通过
        if len(same)+len(relation) >= len(value) or len(same) == len(value):
            #print(len(same)+len(relation), len(value))
            lable = "p"
#             row ['lable'] ='p'
            return lable
    
    return lable
start_time = time.time()

act['lable'] = act.apply(ruleCheck, axis=1)
print(act)
print("--- %s seconds ---" % (time.time() - start_time))    


      ischair    crs odepartment       action position         type  \
0       FALSE  ee601          cs         read  faculty       roster   
1       FALSE  cs101          ee         read  faculty       roster   
2        TRUE  ee601          cs         read  advisor   transcript   
3        TRUE  ee101          cs         read  advisor   transcript   
4        TRUE  cs101   registrar         read    staff  application   
...       ...    ...         ...          ...      ...          ...   
15996      na  ee101          ee    setstatus    staff       roster   
15997    TRUE  ee101   registrar        write  faculty    gradebook   
15998    TRUE  ee101          cs         read  faculty    gradebook   
15999    TRUE  ee101   registrar  changescore    staff   transcript   
16000      na  ee101  admissions    setstatus    staff   transcript   

      udepartment crstaught crstaken class lable  
0       registrar     ee101    ee601     p     p  
1              ce     ee601    cs101     p   

In [11]:
#Evaluation
log = act[(act['class'] == "p") & (act['lable'] == "p")]
log2 = act[(act['class'] == "d") & (act['lable'] == "d")]
FN = act[(act['class'] == "p") & (act['lable'] == "d")]
FP = act[(act['class'] == "d") & (act['lable'] == "p")]

p = act[act['class'] == "p"]
d = act[act['class'] == "d"]
#[(df['column_name'] >= A) & (df['column_name'] <= B)]
#print(len(log),'\n', log.head(10))
#print(FN.describe(),FP.describe(),log2.describe(),d.describe())
#log.describe()
print("FN=", len(FN), ", FP=", len(FP), ", TP=", len(log), ", TN=", len(log2))
recall = (len(log)/(len(log)+len(FN)))*100
precesion = (len(log)/(len(log)+len(FP)))*100
accu = ((len(log)+len(log2))/len(act))*100
f = 2*((recall*precesion)/(recall+precesion))
print("recall =",recall, "\nprecession =",precesion, "\naccuracy = ", accu,"\nf-score = ", f)

FN= 0 , FP= 121 , TP= 8000 , TN= 7880
recall = 100.0 
precession = 98.51003570988794 
accuracy =  99.24379726267108 
f-score =  99.2494262142547


In [23]:
def concat_strings(row):
    return ''.join(row)

# 使用apply()方法和lambda表达式对每一行的数据进行处理
data1 =  FN.drop(columns=['class','lable']).reset_index() #FN.drop(columns=['class','lable']) #FP.drop(columns=['class','lable'])#ReFN.drop(columns=['class','lable','lable1','lable2']) #ReFN.drop(columns=['class','lable','lable1']).reset_index() #FN.drop(columns=['class','lable']) #df
data1 = data1.drop(columns=['index'])
data1['concat'] = data1.apply(lambda row: concat_strings(row), axis=1)
print(data1)

     ischair    crs odepartment        action position         type  \
0       TRUE  cs101   registrar          read    staff  application   
1         na  ee601          ee  readmyscores  advisor    gradebook   
2         na  cs602   registrar          read  faculty  application   
3         na  ee101  admissions          read  advisor  application   
4         na  cs101  admissions          read  student  application   
...      ...    ...         ...           ...      ...          ...   
1986      na  cs101   registrar  readmyscores    staff    gradebook   
1987      na  cs101   registrar  readmyscores  advisor    gradebook   
1988   FALSE  cs602   registrar  readmyscores    staff    gradebook   
1989    TRUE  ee601          ee  readmyscores  faculty    gradebook   
1990   FALSE  cs602          ee  readmyscores  student    gradebook   

     udepartment crstaught crstaken  \
0     admissions     cs101    ee101   
1      registrar     ee601    ee601   
2     admissions     cs101    

In [24]:
# FN
model1 = nTreeClus(list(data1.concat), n=None, ntree=10, method="RF", verbose=1, C=10)
model1.nTreeClus()

Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|██████████████████| 1991/1991 [00:00<00:00, 17770.70it/s]
  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


one-hot encoding + x/y train
Fit RF
DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree


In [25]:
data1['label'] = model1.output()['labels_RF']
print(data1)

     ischair    crs odepartment        action position         type  \
0       TRUE  cs101   registrar          read    staff  application   
1         na  ee601          ee  readmyscores  advisor    gradebook   
2         na  cs602   registrar          read  faculty  application   
3         na  ee101  admissions          read  advisor  application   
4         na  cs101  admissions          read  student  application   
...      ...    ...         ...           ...      ...          ...   
1986      na  cs101   registrar  readmyscores    staff    gradebook   
1987      na  cs101   registrar  readmyscores  advisor    gradebook   
1988   FALSE  cs602   registrar  readmyscores    staff    gradebook   
1989    TRUE  ee601          ee  readmyscores  faculty    gradebook   
1990   FALSE  cs602          ee  readmyscores  student    gradebook   

     udepartment crstaught crstaken  \
0     admissions     cs101    ee101   
1      registrar     ee601    ee601   
2     admissions     cs101    

In [26]:
ndata1=data1.drop(columns='concat')
ndata1=ndata1.drop(columns='label')
ndata1

Unnamed: 0,ischair,crs,odepartment,action,position,type,udepartment,crstaught,crstaken
0,TRUE,cs101,registrar,read,staff,application,admissions,cs101,ee101
1,na,ee601,ee,readmyscores,advisor,gradebook,registrar,ee601,ee601
2,na,cs602,registrar,read,faculty,application,admissions,cs101,cs101
3,na,ee101,admissions,read,advisor,application,admissions,ee601,cs101
4,na,cs101,admissions,read,student,application,admissions,ee601,cs101
...,...,...,...,...,...,...,...,...,...
1986,na,cs101,registrar,readmyscores,staff,gradebook,cs,ee601,cs101
1987,na,cs101,registrar,readmyscores,advisor,gradebook,ce,cs101,cs101
1988,FALSE,cs602,registrar,readmyscores,staff,gradebook,registrar,ee101,cs602
1989,TRUE,ee601,ee,readmyscores,faculty,gradebook,ee,cs601,ee601


In [27]:
# Calculating FN Attributes Effectevness in each cluster
import time
start_time = time.time()

temp = pd.DataFrame()
#print(datafreq)
clusTemp = pd.DataFrame()

FNpolicy ={}
###change FN: p, FP: d or act
datasize= len(p.index)
#print("{")
for i in range(model1.output()['C_RF']):
    FNpolicy[i] = {}
    
    clusTemp = ndata1.loc[data1['label'] == i]
    #clusTemp = FN.loc[FN['cluster'] == i]
    clussize= len(clusTemp.index)
    #print(i,":{",sep='',end="")
    #print(clusTemp.describe())
    for col in ndata1.columns:
        #actual data:  act[col].value_counts().to_dict()
        # FN data: p[col].value_counts().to_dict()
        # FP data: d[col].value_counts().to_dict()
        datafreq =   p[col].value_counts().to_dict() # act[col].value_counts().to_dict()#
#         print(datafreq)
        #datafreq= pd.DataFrame.from_dict(datafreq, )
        temp=clusTemp[col].value_counts().to_dict()
#         print(temp)
        for key, value in datafreq.items():
            
            
            for key1, value1 in temp.items():
#                 print(key, key1)
                if key == key1:
                    #print(key ,value,value1)
                    x = value1/datasize
                    y = value1/clussize
                    #print((y-x))
                    #if key in centroids[i]:
                    if (y-x) >= 0.549:
                        if col not in FNpolicy[i]:
                            FNpolicy[i][col] = key
                    for col2 in ndata1.columns:
                                if col !=col2:
                                    clusTemp = ndata1.loc[data1['label'] == i]
#                                     clus = clusTemp.loc[clusTemp[col] == clusTemp[col2]]
                                    clus = clusTemp.loc[clusTemp[col].astype(str) == clusTemp[col2].astype(str)]
                                    if len(clus) >= len(clusTemp)/2:
                                        #print("'",col,"':'",col2,"',",sep='',end="")
                                        if col not in FNpolicy[i] and col2 not in FNpolicy[i]: 
                                            FNpolicy[i][col] = col2
                                        else:
                                            if col in FNpolicy[i]: 
                                                FNpolicy[i][col] = col2
                                            if col2 in FNpolicy[i] :
                                                del FNpolicy[i][col2]
print("--- %s seconds ---" % (time.time() - start_time))    
print(FNpolicy)

--- 8.893733978271484 seconds ---
{0: {'odepartment': 'registrar', 'action': 'read', 'type': 'application', 'udepartment': 'admissions'}, 1: {'odepartment': 'registrar', 'action': 'readmyscores', 'type': 'gradebook', 'udepartment': 'registrar', 'crstaken': 'crs'}, 2: {'action': 'read', 'position': 'advisor', 'type': 'application', 'udepartment': 'admissions'}, 3: {'action': 'read', 'position': 'student', 'type': 'application', 'udepartment': 'admissions'}, 4: {'action': 'readmyscores', 'type': 'gradebook', 'crstaken': 'crs'}, 5: {'action': 'readmyscores', 'position': 'advisor', 'type': 'gradebook', 'crstaken': 'crs'}, 6: {'action': 'read', 'position': 'staff', 'type': 'application', 'udepartment': 'admissions'}, 7: {'action': 'readmyscores', 'type': 'gradebook', 'udepartment': 'admissions', 'crstaken': 'crs'}, 8: {'action': 'readmyscores', 'position': 'faculty', 'type': 'gradebook', 'crstaken': 'crs'}, 9: {'action': 'read', 'position': 'faculty', 'type': 'application', 'udepartment': '

In [29]:
# refine the extracted policy based on FN policy 

def jaccard_similarity(list1, list2):
        intersection = (set(list1.values()).intersection(list2.values()))
        #print(list(set(list1).intersection(list2)))
        union = defaultdict(list)
        for k, v in chain(list1.items(), list2.items()):
        #if v not in intersection:
            union[k].append(v)
    #print(union)    
    
        intersectionl = len(intersection)
        unionl = len(union)
        score = float(intersectionl / unionl)
        #print(len(policy[key]),len(FNpolicy[key2]))
        if score >= 0.5:
            if len(policy[key]) > len(FNpolicy[key2]) and len(FNpolicy[key2])>=2 :
                #print(len(policy[key]),len(FNpolicy[key2]))
                policy[key] = FNpolicy[key2]
                print(key, ",",key2," score =", score,unionl,intersectionl)
                print( (set(list1).intersection(list2)),"\n" ,union)
                #print(intersectionl)
        
        return float(intersectionl / unionl)
        return float(intersectionl / unionl)
    
#rules = policy
for key, value in policy.items():
    
    for key2, v  in FNpolicy.items():
        #print(key, key2)
        
        #if key !=key2:
            
            #print(key, key2)
            """for z, j in value.items():
                for y, i in v.items():
                     if z == y:"""
            #score = jaccard_similarity_score(value, v)
            score = jaccard_similarity(value, v)
                        #print(i,j)
            """if score > 0.6:
                #print(i,j)
                print(key, ",",k," score =", score)
    """
print(sum(len(v) for v in FNpolicy.values()))
print(policy)

40
{0: {'action': 'read', 'position': 'faculty', 'type': 'roster', 'crstaken': 'crs'}, 1: {'ischair': 'TRUE', 'action': 'read', 'type': 'transcript'}, 2: {'action': 'readmyscores', 'type': 'gradebook', 'crstaken': 'crs'}, 3: {'odepartment': 'admissions', 'type': 'gradebook', 'udepartment': 'registrar', 'crstaken': 'crstaught'}, 4: {'action': 'checkstatus', 'position': 'student', 'type': 'application'}, 5: {'action': 'addscore', 'type': 'gradebook', 'crstaken': 'crstaught'}, 6: {'action': 'read', 'type': 'roster', 'udepartment': 'registrar'}, 7: {'action': 'readmyscores', 'type': 'gradebook', 'crstaken': 'crs'}, 8: {'action': 'readmyscores', 'type': 'gradebook', 'crstaken': 'crs'}, 9: {'action': 'read', 'position': 'faculty', 'type': 'application', 'udepartment': 'admissions'}}


In [30]:
# run the extracted policy over the complete data  (FN)
import time
def dict_compare(d1, d2):
    d1_keys = set(d1.keys())
    d2_keys = set(d2.keys())

    intersect_keys = d1_keys.intersection(d2_keys)
 
    relation = []
    added = d1_keys - d2_keys
    removed = d2_keys - d1_keys
  
    same = set(o for o in intersect_keys if  d1[o] in d2[o] )
   
    if len(same) == len(intersect_keys):
        return added, removed, same, relation
    for key,  o in d2.items():
        if type(o) == str:
                
 
            temp = set(i for i in intersect_keys if type(d2[i]) == str and d2[i] in d1_keys and d1[i] == d1[d2[i]] )
                #print(temp)
            if len(temp)>0:
                    relation = temp
        
           
    return added, removed, same, relation


def ruleCheck (row):
    lable = 'd'
    drow = row.to_dict()
    
    for key,value in policy.items():
        #print(key)
        added, removed, same, relation = dict_compare(drow, value)
        
        if len(same)+len(relation) >= len(value) or len(same) == len(value):
            #print(len(same)+len(relation), len(value))
            lable = "p"
#             row ['lable'] ='p'
            return lable
    
    return lable
start_time = time.time()

act['lable'] = act.apply(ruleCheck, axis=1)
print(act)
print("--- %s seconds ---" % (time.time() - start_time))    


      ischair    crs odepartment       action position         type  \
0       FALSE  ee601          cs         read  faculty       roster   
1       FALSE  cs101          ee         read  faculty       roster   
2        TRUE  ee601          cs         read  advisor   transcript   
3        TRUE  ee101          cs         read  advisor   transcript   
4        TRUE  cs101   registrar         read    staff  application   
...       ...    ...         ...          ...      ...          ...   
15996      na  ee101          ee    setstatus    staff       roster   
15997    TRUE  ee101   registrar        write  faculty    gradebook   
15998    TRUE  ee101          cs         read  faculty    gradebook   
15999    TRUE  ee101   registrar  changescore    staff   transcript   
16000      na  ee101  admissions    setstatus    staff   transcript   

      udepartment crstaught crstaken class lable  
0       registrar     ee101    ee601     p     p  
1              ce     ee601    cs101     p   

In [31]:
#Evaluation
log = act[(act['class'] == "p") & (act['lable'] == "p")]
log2 = act[(act['class'] == "d") & (act['lable'] == "d")]
FN = act[(act['class'] == "p") & (act['lable'] == "d")]
FP = act[(act['class'] == "d") & (act['lable'] == "p")]

p = act[act['class'] == "p"]
d = act[act['class'] == "d"]
#[(df['column_name'] >= A) & (df['column_name'] <= B)]
#print(len(log),'\n', log.head(10))
#print(FN.describe(),FP.describe(),log2.describe(),d.describe())
#log.describe()
print("FN=", len(FN), ", FP=", len(FP), ", TP=", len(log), ", TN=", len(log2))
recall = (len(log)/(len(log)+len(FN)))*100
precesion = (len(log)/(len(log)+len(FP)))*100
accu = ((len(log)+len(log2))/len(act))*100
f = 2*((recall*precesion)/(recall+precesion))
print("recall =",recall, "\nprecession =",precesion, "\naccuracy = ", accu,"\nf-score = ", f)

FN= 1760 , FP= 68 , TP= 6240 , TN= 7933
recall = 78.0 
precession = 98.92200380469245 
accuracy =  88.57571401787389 
f-score =  87.22393066815768


In [15]:
def concat_strings(row):
    return ''.join(row)

# 使用apply()方法和lambda表达式对每一行的数据进行处理
data2 =  FP.drop(columns=['class','lable']).reset_index() #FN.drop(columns=['class','lable']) #FP.drop(columns=['class','lable'])#ReFN.drop(columns=['class','lable','lable1','lable2']) #ReFN.drop(columns=['class','lable','lable1']).reset_index() #FN.drop(columns=['class','lable']) #df
data2 = data2.drop(columns=['index'])
data2['concat'] = data2.apply(lambda row: concat_strings(row), axis=1)
print(data2)

    ischair    crs odepartment       action position       type udepartment  \
0        na  cs602          cs  checkstatus    staff  gradebook   registrar   
1        na  ee101  admissions         read    staff  gradebook   registrar   
2        na  cs101          ce  checkstatus  student  gradebook   registrar   
3      TRUE  cs602   registrar         read    staff  gradebook          cs   
4     FALSE  cs601  admissions        write  student  gradebook   registrar   
..      ...    ...         ...          ...      ...        ...         ...   
116   FALSE  cs602          ee    setstatus  faculty  gradebook   registrar   
117    TRUE  cs601  admissions    setstatus  advisor  gradebook   registrar   
118      na  ee101          ce  changescore  faculty  gradebook   registrar   
119    TRUE  cs602  admissions         read  faculty  gradebook          ce   
120    TRUE  cs602          cs  checkstatus  faculty  gradebook   registrar   

    crstaught crstaken                             

In [16]:
# FP
model2 = nTreeClus(list(data2.concat), n=None, ntree=10, method="RF", verbose=1, C=10)
model2.nTreeClus()

Finding the parameter 'n'
Parameter 'n' is set to 8


Matrix Segmentation (Splitting based on window size): 100%|████████████████████| 121/121 [00:00<00:00, 17274.02it/s]

one-hot encoding + x/y train
Fit RF



  self.seg_mat.loc[:,'Class']   = le.fit_transform(self.seg_mat.loc[:,'Class']) # Convert Y to numbers


DataFrame of terminal nodes
Determining the cosine Distance
Applying Ward Linkage
Cutting The Tree


In [17]:
data2['label'] = model2.output()['labels_RF']
print(data2)

    ischair    crs odepartment       action position       type udepartment  \
0        na  cs602          cs  checkstatus    staff  gradebook   registrar   
1        na  ee101  admissions         read    staff  gradebook   registrar   
2        na  cs101          ce  checkstatus  student  gradebook   registrar   
3      TRUE  cs602   registrar         read    staff  gradebook          cs   
4     FALSE  cs601  admissions        write  student  gradebook   registrar   
..      ...    ...         ...          ...      ...        ...         ...   
116   FALSE  cs602          ee    setstatus  faculty  gradebook   registrar   
117    TRUE  cs601  admissions    setstatus  advisor  gradebook   registrar   
118      na  ee101          ce  changescore  faculty  gradebook   registrar   
119    TRUE  cs602  admissions         read  faculty  gradebook          ce   
120    TRUE  cs602          cs  checkstatus  faculty  gradebook   registrar   

    crstaught crstaken                             

In [18]:
ndata2=data2.drop(columns='concat')
ndata2=ndata2.drop(columns='label')
ndata2

Unnamed: 0,ischair,crs,odepartment,action,position,type,udepartment,crstaught,crstaken
0,na,cs602,cs,checkstatus,staff,gradebook,registrar,ee101,ee101
1,na,ee101,admissions,read,staff,gradebook,registrar,ee601,ee601
2,na,cs101,ce,checkstatus,student,gradebook,registrar,cs101,cs101
3,TRUE,cs602,registrar,read,staff,gradebook,cs,cs602,cs602
4,FALSE,cs601,admissions,write,student,gradebook,registrar,cs601,cs601
...,...,...,...,...,...,...,...,...,...
116,FALSE,cs602,ee,setstatus,faculty,gradebook,registrar,ee101,ee101
117,TRUE,cs601,admissions,setstatus,advisor,gradebook,registrar,ee101,ee101
118,na,ee101,ce,changescore,faculty,gradebook,registrar,ee601,ee601
119,TRUE,cs602,admissions,read,faculty,gradebook,ce,cs101,cs602


In [19]:
# Calculating FP Attributes Effectevness in each cluster
import time
start_time = time.time()

temp = pd.DataFrame()
#print(datafreq)
clusTemp = pd.DataFrame()

FPpolicy ={}
###change FN: p, FP: d or act
datasize= len(d.index)
#print("{")
for i in range(model2.output()['C_RF']):
    FPpolicy[i] = {}
    
    clusTemp = ndata2.loc[data2['label'] == i]
    #clusTemp = FN.loc[FN['cluster'] == i]
    clussize= len(clusTemp.index)
    #print(i,":{",sep='',end="")
    #print(clusTemp.describe())
    for col in ndata2.columns:
        #actual data:  act[col].value_counts().to_dict()
        # FN data: p[col].value_counts().to_dict()
        # FP data: d[col].value_counts().to_dict()
        datafreq =   d[col].value_counts().to_dict() # act[col].value_counts().to_dict()#
#         print(datafreq)
        #datafreq= pd.DataFrame.from_dict(datafreq, )
        temp=clusTemp[col].value_counts().to_dict()
#         print(temp)
        for key, value in datafreq.items():
            
            
            for key1, value1 in temp.items():
#                 print(key, key1)
                if key == key1:
                    #print(key ,value,value1)
                    x = value1/datasize
                    y = value1/clussize
                    #print((y-x))
                    #if key in centroids[i]:
                    if (y-x) >= 0.549:
                        if col not in FPpolicy[i]:
                            FPpolicy[i][col] = key
                    for col2 in ndata2.columns:
                                if col !=col2:
                                    clusTemp = ndata2.loc[data2['label'] == i]
#                                     clus = clusTemp.loc[clusTemp[col] == clusTemp[col2]]
                                    clus = clusTemp.loc[clusTemp[col].astype(str) == clusTemp[col2].astype(str)]
                                    if len(clus) >= len(clusTemp)/2:
                                        #print("'",col,"':'",col2,"',",sep='',end="")
                                        if col not in FPpolicy[i] and col2 not in FPpolicy[i]: 
                                            FPpolicy[i][col] = col2
                                        else:
                                            if col in FPpolicy[i]: 
                                                FPpolicy[i][col] = col2
                                            if col2 in FPpolicy[i] :
                                                del FPpolicy[i][col2]
print("--- %s seconds ---" % (time.time() - start_time))    
print(FPpolicy)

--- 8.073983669281006 seconds ---
{0: {'type': 'gradebook', 'udepartment': 'registrar', 'crstaken': 'crstaught'}, 1: {'action': 'read', 'type': 'gradebook', 'udepartment': 'admissions', 'crstaken': 'crs'}, 2: {'type': 'gradebook', 'udepartment': 'registrar', 'crstaken': 'crstaught'}, 3: {'ischair': 'TRUE', 'odepartment': 'registrar', 'action': 'read', 'position': 'staff', 'type': 'gradebook', 'crstaken': 'crstaught'}, 4: {'odepartment': 'admissions', 'type': 'gradebook', 'udepartment': 'registrar', 'crstaken': 'crstaught'}, 5: {'type': 'gradebook', 'udepartment': 'registrar', 'crstaken': 'crstaught'}, 6: {'ischair': 'na', 'action': 'changescore', 'type': 'gradebook', 'udepartment': 'odepartment', 'crstaken': 'crstaught'}, 7: {'action': 'read', 'type': 'gradebook', 'crstaken': 'crs'}, 8: {'ischair': 'FALSE', 'action': 'read', 'position': 'advisor', 'type': 'gradebook', 'udepartment': 'cs', 'crstaught': 'ee101', 'crstaken': 'crs'}, 9: {'ischair': 'FALSE', 'odepartment': 'registrar', 'act

In [20]:
# refine the extracted policy based on FP policy 

def jaccard_similarity(list1, list2):
        intersection = (set(list1.values()).intersection(list2.values()))
        #print(list(set(list1).intersection(list2)))
        union = defaultdict(list)
        for k, v in chain(list1.items(), list2.items()):
        #if v not in intersection:
            union[k].append(v)
    #print(union)    
    
        intersectionl = len(intersection)
        unionl = len(union)
        score = float(intersectionl / unionl)
        #print(len(policy[key]),len(FNpolicy[key2]))
        if score >= 0.5:
            if len(policy[key]) < len(FPpolicy[key2])  :
                #print(len(policy[key]),len(FPpolicy[key2]))
                
                policy[key] = FPpolicy[key2]
                print(key, ",",key2," score =", score,unionl,intersectionl)
                print( (set(list1).intersection(list2)),"\n" ,union)
                #print(intersectionl)
        
        return float(intersectionl / unionl)
        return float(intersectionl / unionl)
    
#rules = policy
for key, value in policy.items():
    
    for key2, v  in FPpolicy.items():
        #print(key, key2)
        
        #if key !=key2:
            
            #print(key, key2)
            """for z, j in value.items():
                for y, i in v.items():
                     if z == y:"""
            #score = jaccard_similarity_score(value, v)
            score = jaccard_similarity(value, v)
                        #print(i,j)
            """if score > 0.6:
                #print(i,j)
                print(key, ",",k," score =", score)
    """
sum(len(v) for v in policy.values())

2 , 1  score = 0.5 4 2
{'type', 'udepartment', 'action'} 
 defaultdict(<class 'list'>, {'action': ['read', 'read'], 'type': ['application', 'gradebook'], 'udepartment': ['admissions', 'admissions'], 'crstaken': ['crs']})
3 , 4  score = 0.75 4 3
{'type', 'crstaken', 'udepartment'} 
 defaultdict(<class 'list'>, {'type': ['gradebook', 'gradebook'], 'udepartment': ['registrar', 'registrar'], 'crstaken': ['crstaught', 'crstaught'], 'odepartment': ['admissions']})
8 , 1  score = 0.5 4 2
{'type', 'crstaken', 'action'} 
 defaultdict(<class 'list'>, {'action': ['readmyscores', 'read'], 'type': ['gradebook', 'gradebook'], 'crstaken': ['crs', 'crs'], 'udepartment': ['admissions']})


37

In [21]:
# run the extracted policy over the complete data  (FP)
import time
def dict_compare(d1, d2):
    d1_keys = set(d1.keys())
    d2_keys = set(d2.keys())

    intersect_keys = d1_keys.intersection(d2_keys)
 
    relation = []
    added = d1_keys - d2_keys
    removed = d2_keys - d1_keys
  
    same = set(o for o in intersect_keys if  d1[o] in d2[o] )
   
    if len(same) == len(intersect_keys):
        return added, removed, same, relation
    for key,  o in d2.items():
        if type(o) == str:
                
 
            temp = set(i for i in intersect_keys if type(d2[i]) == str and d2[i] in d1_keys and d1[i] == d1[d2[i]] )
                #print(temp)
            if len(temp)>0:
                    relation = temp
        
           
    return added, removed, same, relation


def ruleCheck (row):
    lable = 'd'
    drow = row.to_dict()
    
    for key,value in policy.items():
        #print(key)
        added, removed, same, relation = dict_compare(drow, value)
        
        if len(same)+len(relation) >= len(value) or len(same) == len(value):
            #print(len(same)+len(relation), len(value))
            lable = "p"
#             row ['lable'] ='p'
            return lable
    
    return lable
start_time = time.time()

act['lable'] = act.apply(ruleCheck, axis=1)
print(act)
print("--- %s seconds ---" % (time.time() - start_time))    


      ischair    crs odepartment       action position         type  \
0       FALSE  ee601          cs         read  faculty       roster   
1       FALSE  cs101          ee         read  faculty       roster   
2        TRUE  ee601          cs         read  advisor   transcript   
3        TRUE  ee101          cs         read  advisor   transcript   
4        TRUE  cs101   registrar         read    staff  application   
...       ...    ...         ...          ...      ...          ...   
15996      na  ee101          ee    setstatus    staff       roster   
15997    TRUE  ee101   registrar        write  faculty    gradebook   
15998    TRUE  ee101          cs         read  faculty    gradebook   
15999    TRUE  ee101   registrar  changescore    staff   transcript   
16000      na  ee101  admissions    setstatus    staff   transcript   

      udepartment crstaught crstaken class lable  
0       registrar     ee101    ee601     p     p  
1              ce     ee601    cs101     p   

In [22]:
#Evaluation
log = act[(act['class'] == "p") & (act['lable'] == "p")]
log2 = act[(act['class'] == "d") & (act['lable'] == "d")]
FN = act[(act['class'] == "p") & (act['lable'] == "d")]
FP = act[(act['class'] == "d") & (act['lable'] == "p")]

p = act[act['class'] == "p"]
d = act[act['class'] == "d"]
#[(df['column_name'] >= A) & (df['column_name'] <= B)]
#print(len(log),'\n', log.head(10))
#print(FN.describe(),FP.describe(),log2.describe(),d.describe())
#log.describe()
print("FN=", len(FN), ", FP=", len(FP), ", TP=", len(log), ", TN=", len(log2))
recall = (len(log)/(len(log)+len(FN)))*100
precesion = (len(log)/(len(log)+len(FP)))*100
accu = ((len(log)+len(log2))/len(act))*100
f = 2*((recall*precesion)/(recall+precesion))
print("recall =",recall, "\nprecession =",precesion, "\naccuracy = ", accu,"\nf-score = ", f)

FN= 1991 , FP= 25 , TP= 6009 , TN= 7976
recall = 75.11250000000001 
precession = 99.58568114020551 
accuracy =  87.40078745078432 
f-score =  85.63488670371954
