In [1]:
import spacy
import re

In [2]:
import string

In [3]:
import torch
from torch.nn import CosineSimilarity

In [4]:
import pandas as pd

In [5]:
from numpy import dot
from numpy.linalg import norm

In [6]:
from sklearn.metrics import accuracy_score

In [7]:
from transformers import pipeline, BertTokenizer, BertModel, BertConfig

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [9]:
#from gensim.models import KeyedVectors, TranslationMatrix, fasttext
#from gensim.test.utils import datapath, temporary_file

In [10]:
#import fasttext
from sentence_transformers import SentenceTransformer, util

In [11]:
nlp = spacy.load('en_core_web_lg')

In [12]:
#model = SentenceTransformer('stsb-roberta-large')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

config = BertConfig.from_pretrained("bert-base-uncased",
                                    output_hidden_states=True)

model = BertModel.from_pretrained("bert-base-uncased", config=config)

In [13]:
#nlp = spacy.load('en_core_web_lg')
#model_en = fasttext.load_facebook_vectors("cc.en.300.bin")
#print("en loaded")

In [14]:
def computeCLSBertSimilarity(emb1, emb2):
    #emb1 = model.encode(sentence1)
    #emb2 = model.encode(sentence2)
    
    cos_sim = util.pytorch_cos_sim(emb1, emb2)
    #print("Cosine-Similarity:", cos_sim)
    cos_sim = float(cos_sim)
    return cos_sim

In [15]:
def getPercentageCorrectPredictions(a):
    
    counter = 0
    for x in a:
        if(x[1] == x[3]):
            counter = counter + 1
            
    return counter / len(a)

In [16]:
def getPercentageOfPredicted(a):
    counter = 0
    
    for x in a:
        if(x[4] == "predicted"):
            counter = counter + 1
            
    return counter / (len(unclassifiedSamples4GridSearch) + len(classifiedSamples4GridSearch))

In [17]:
def getNumberOfOracled(a):
    counter = 0
    
    for x in a:
        if(x[4] != "predicted"):
            counter = counter + 1
            
    return counter

In [18]:
def getNumberOfPredicted(a):
    counter = 0
    
    for x in a:
        if(x[4] == "predicted"):
            counter = counter + 1
            
    return counter

In [19]:
def getPercentageCorrectPredictions(a):
    
    global unclassifiedSamples4GridSearch, classifiedSamples4GridSearch
    
    counter = 0
    for x in a:
        if(x[1] == x[3]):
            counter = counter + 1

    counter = counter + len(unclassifiedSamples4GridSearch)
            
    return counter / (len(unclassifiedSamples4GridSearch) + len(classifiedSamples4GridSearch))

def getLocalAutomationRate(a):
    counter = 0
    
    global unclassifiedSamples4GridSearch, classifiedSamples4GridSearch
    
    for x in a:
        if(x[4] == "predicted"):
            counter = counter + 1
            
    return counter / (len(a))

def getLocalAccuracy(a):
    
    counter = 0
    for x in a:
        if(x[1] == x[3]):
            counter = counter + 1

    
            
    return counter / (len(classifiedSamples4GridSearch))

# If beta = 1, then accuracy and automationRate have same importance
# If beta < 1, then accuracy is more important than automation rate
# If beta > 1, then automation rate is more important than accuracy
def getHarmonicScore (accuracy, automationRate, beta):
    
    if(accuracy == 0 and automationRate == 0):
        return 0
    
    score = (1 + beta ** 2) * (accuracy * automationRate) / ((accuracy * (beta ** 2)) + automationRate)
    #s1 = (1 + beta**2)
    #s2 = (accuracy * automationRate)
    #s3 = (accuracy*(beta**2)) + automationRate
    #print(s1)
    #print(s2)
    #print(s3)
    
    return score

In [20]:
#ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted
def getLabelDistribution(a):
    
    predictedDict = {}
    actualDict = {}

    for label in uniqueLabels:
        predictedDict.update({label: 0})
        
    for label in uniqueLabels:
        actualDict.update({label: 0})
        
    print(predictedDict)
    print(actualDict)
    
    for x in a:
        predictedLabel = x[1]
        actualLabel = x[3]
        
        numOfPredicted = predictedDict[predictedLabel]
        numOfActual = actualDict[actualLabel]
        
        numOfPredicted = numOfPredicted + 1
        numOfActual = numOfActual + 1 
        
        predictedDict.update({predictedLabel: numOfPredicted})
        actualDict.update({actualLabel: numOfActual})
        
    print("prediction dict: ", predictedDict)
    print("actual dict: ", actualDict)

In [21]:
def cosineSimilarity(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [25]:
'''
 STRUCTURE OF allSamplesWithLabelThird: ticketText, ticketLabel, TF-IDF, wordVector

 STRUCTURE OF classifiedSamplesInit: ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted, wordVector
'''

'\n STRUCTURE OF allSamplesWithLabelThird: ticketText, ticketLabel, TF-IDF, wordVector\n\n STRUCTURE OF classifiedSamplesInit: ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted, wordVector\n'

In [26]:
def computeCosineSimilarity(sentence1, sentence2):
    doc1 = nlp(sentence1)
    doc2 = nlp(sentence2)
    
    similarity = doc1.similarity(doc2)
    #print(similarity)
    return similarity

In [27]:
def preprocessing(sentence):
    
    doc = nlp(sentence)
    inputSent = ""

    #removing stop words
    for token in doc:
        
        if(token.is_stop == True):
            continue
        else:
            inputSent = inputSent + " " + token.lemma_
        
        #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
    
    #lower case of the sentence
    inputSent = inputSent.lower()
    #remove numbers
    inputSent = re.sub('\d+', '', inputSent)
    #remover punctuation
    inputSent = inputSent.translate(str.maketrans('', '', string.punctuation))
    #remove multiple whitespaces
    inputSent = ' '.join(inputSent.split())
    
    
    
    #print(inputSent)
    return str(inputSent)

In [28]:
#computeCosineSimilarity("The cat chases the dog", "The dog chases the cat")

In [29]:
def getBertEmbedding(text):
    input_sentence = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
    out = model(input_sentence)
    embeddings_of_last_layer = out[0]
    cls_embeddings1 = embeddings_of_last_layer[0][0]
    return cls_embeddings1

In [30]:
df = pd.read_csv('IBM_shuffled.csv', delimiter=",")

In [31]:
uniqueLabels = df.Label.unique()

foundLabels = []
classifiedSamplesInit = []

tfIdfList = []
allSamplesInit = []
allSamplesWithLabelInit = []

allSamplesMed = []
allSamplesWithLabelMed = []

allSamplesThird = []
allSamplesWithLabelThird = []

allSamplesOracled = []

In [32]:
#Save information from df in variables. Hereby, we compute the tf-idf values and eliminate stop words, numbers and more.

for ind in df.index:
    #preprocessedSentence = preprocessing(df['Text'][ind])
    preprocessedSentence = (df['Text'][ind])
    allSamplesWithLabelInit.append([preprocessedSentence, df['Label'][ind]])
    allSamplesInit.append(preprocessedSentence)

#Get TF-IDF Values
analyze = vectorizer.build_analyzer()
X = vectorizer.fit_transform(allSamplesInit)
(vectorizer.get_feature_names())
tfIdfList = X.toarray()

In [33]:
#Set up a list for further work with tuples of the format (text, label, tf-idf-values)
counter = 0
for sample in allSamplesWithLabelInit:
    text = sample[0]
    lable = sample[1]
    tfIdfVal = tfIdfList[counter]
    #embedding = nlp(text).vector 
    #embedding = getTfIdfSentenceEmbedding(text, counter)
    #embedding = getFasttextSentenceEmbedding(text, counter, False)
    embedding = getBertEmbedding(text)
    allSamplesWithLabelMed.append([text, lable, tfIdfVal, embedding])
    
    counter = counter + 1 
    
#The oracle assigns the label for one sample per category    

for x in allSamplesWithLabelMed:

    wholeEntry = [x[0], x[1], x[2], x[1], "oracle", x[3]]
    
    if(not(x[1] in foundLabels)):
        foundLabels.append(x[1])
        classifiedSamplesInit.append(wholeEntry)
    else: 
        allSamplesWithLabelThird.append(x)
        
    allSamplesOracled.append(wholeEntry)
 

sentenceDict = {}
similarityDict = {}

for label in uniqueLabels:
    sentenceDict.update({label: ""})
    
for label in uniqueLabels:
    similarityDict.update({label: 0})
    
#print(similarityDict, '\n \n')

In [34]:
len(allSamplesWithLabelThird)

396

In [35]:
len(classifiedSamplesInit)

4

In [36]:
allUnclassifiedSamples = allSamplesWithLabelThird[:]
allClassifiedSamples = classifiedSamplesInit[:]

In [37]:
def addOracledSamples(listOfSamples, samplesPerCategory):
    
    startDict = {}
    
    for label in uniqueLabels:
        startDict.update({label: 0})
        
    classifiedSamples1 = []
        
    limit = samplesPerCategory * len(uniqueLabels)
    
    unclassifiedList = listOfSamples[:]
    
    unclassifiedListFinal = []
    
    #print(limit, len(unclassifiedList))
    #print(startDict)
    
    for x in unclassifiedList:
        #ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted, embedding
        wholeEntry = [x[0], x[1], x[2], x[1], "oracle", x[5]]
        
        amountOfSamplesPerLabel = startDict.get(x[1])
        
        if(amountOfSamplesPerLabel < samplesPerCategory):
            #print(x[1], amountOfSamplesPerLabel)
            classifiedSamples1.append(wholeEntry)
            amountOfSamplesPerLabel = amountOfSamplesPerLabel + 1
            startDict.update({x[1]: amountOfSamplesPerLabel})
            
            
            
            unclassifiedList.remove(x)
            
            
        if(limit == len(classifiedSamples1)):
            break
            
    for x in unclassifiedList:
        text = x[0]
        lable = x[1]
        tfIdfVal = x[2]
        
        unclassifiedListFinal.append([text, lable, tfIdfVal, x[5]])
    
            
            
    print(len(classifiedSamples1), len(unclassifiedListFinal))
    
    #helper = [i[1] for i in classifiedSamples1]
    
    #print(helper)
    
    #print(startDict) 
    
    return classifiedSamples1, unclassifiedListFinal
            
    

In [49]:
classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)

4 196


In [50]:
def resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2):
    
    global classifiedSamples4GridSearch, unclassifiedSamples4GridSearch
    
    
    consideredSamples = allSamplesOracled[startIndex1:endIndex1] + allSamplesOracled[startIndex2:endIndex2]
    
    print(len(consideredSamples))
    
    classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(consideredSamples, 3)

In [51]:
resetClassifiedSamples(100, 300, 0, 0)

200
12 188


In [52]:
#Hereby, if we have 4 categories, we save the biggest similarity per each category with the new sample 
#If the similarites are more or less the same (DEFINE Threshold), then 
def findAndClassifyEqualSimilaritySamples(similarityHP, gapHP):
    
    maxSimilarityPerCategories = []
    
    for label in uniqueLabels:
        maxSimilarityPerCategories.append((label, 0))
    #print(maxSimilarityPerCategories)
    
    
    for unclassifiedSample in unclassifiedSamples4GridSearch:
    
        mostSimilarSample = 0
        maxCosSim = 0
        maxCosSimLabel = ""
        
        for label in uniqueLabels:
            similarityDict.update({label: 0})
    
        for classifiedSample in classifiedSamples4GridSearch:
            #cosSim = cosineSimilarity(unclassifiedSample[2], classifiedSample[2])
            #print(len(unclassifiedSample), len(classifiedSample))
            cosSim = computeCLSBertSimilarity((unclassifiedSample[3]), (classifiedSample[5]))
            classifiedLabel = classifiedSample[1]
            
            #print("cossim ", cosSim)
            
            if(maxCosSim < cosSim):
                maxCosSim = cosSim
                maxCosSimLabel = classifiedLabel
            
            if(cosSim > similarityDict[classifiedLabel]):
                similarityDict [classifiedLabel] = cosSim
                sentenceDict [classifiedLabel] = unclassifiedSample[0]
        
        gapPassed = False
        
        for i in similarityDict:
            if(i != maxCosSimLabel):
                gap = maxCosSim - similarityDict[i]
                if(gap < gapHP):
                    gapPassed = True
                    break
        
        
        if(maxCosSim > similarityHP and gapPassed):
            # ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted
            entry = [unclassifiedSample[0], unclassifiedSample[1], unclassifiedSample[2], unclassifiedSample[1], "oracle", unclassifiedSample[3]]
            #print("entry ", entry)
            unclassifiedSamples4GridSearch.remove(unclassifiedSample)
            classifiedSamples4GridSearch.append(entry)
            
        elif(maxCosSim > similarityHP):
            #STRUCTURE OF classifiedSamplesInit: 
            #ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted, wordVector
            entry = [unclassifiedSample[0], maxCosSimLabel, unclassifiedSample[2], unclassifiedSample[1], "predicted", unclassifiedSample[3]]
            #print("entry ", entry)
            unclassifiedSamples4GridSearch.remove(unclassifiedSample)
            classifiedSamples4GridSearch.append(entry)
            
 
        #print(similarityDict, unclassifiedSample[1])
        #print()
    #print("FINISHED ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
    #print()
    return len(unclassifiedSamples4GridSearch)

In [53]:
def gridSearchAnd3FoldValidation (minSimilarity, maxSimilarity, similarityStep, minGap, maxGap, gapStep, startIndex1, endIndex1, startIndex2, endIndex2):
    
    similarityGrid = []
    gapGrid = []
    
    allResults = []
    dfColumnName = ["Accuracy", "Automation Rate", "Harmonic Score", "Local Accuracy", "Local Automation Rate", "Local Harmonic Score", "Number Of Oracled", "Number Of Predicted", "Minimal Similarity", "Minimal Gap"]
    
    global classifiedSamples4GridSearch, unclassifiedSamples4GridSearch
    
    
    similarity = minSimilarity
    
    while(similarity <= maxSimilarity):
        similarityGrid.append(similarity)
        similarity = similarity + similarityStep
        
    g = minGap

    while(g <= maxGap):
        gapGrid.append(g)
        g = g + gapStep
        
    print("Gap Grid: ", gapGrid)
    print("Similarity Grid: ", similarityGrid)
    print("\n")
    #print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore," 
    #      "numberOfOracled, numberOfPredicted, sim, gap, currentSim, currentGap")
    print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap")
    
    
    maxHarmonicScore = 0
    bestGap = 0
    bestSim = 0
    bestAccuracy = 0
    bestAutomationRate = 0
        
    for sim in similarityGrid:
    
        
        for gap in gapGrid:
            
            print("NOW NEW COMBINATION: ", sim, gap)
            
            numberOfPredicted = -1
            numberOfOracled = -1

            localAccuracy = -1
            globalAccuracy = -1
            localAutomationRate = -1
            globalAutomationRate = -1

            localHarmonicScore = -1
            globalHarmonicScore = -1

            unlabelledSamples = 1000
            
            currentSim = sim
            currentGap = gap
            
            simCounter = 0
            gapCounter = 0
            counter = 0
            
            resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
            
            while (unlabelledSamples != 0 and counter < 35):
                
                
                #print("sim, gap ", currentSim, currentGap)
                unlabelledSamples = findAndClassifyEqualSimilaritySamples(currentSim, currentGap)      
            
                numberOfNewPredicted = getNumberOfPredicted(classifiedSamples4GridSearch)
                numberOfNewOracled = getNumberOfOracled(classifiedSamples4GridSearch)
                
                if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted):
                    currentSim = currentSim - 0.01
                    simCounter = simCounter + 1
                    
                '''
                if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted and simCounter > 12):
                    #currentGap = currentGap - 0.005
                    gapCounter = gapCounter + 1
                '''
                
                numberOfOracled = numberOfNewOracled
                numberOfPredicted = numberOfNewPredicted

                if(unlabelledSamples == 0):
                    counter = counter + 10000
                   
                '''
                print("Automation Rate ",getPercentageOfPredicted(classifiedSamples4GridSearch))
                print("Accuracy ",getPercentageCorrectPredictions(classifiedSamples4GridSearch))
                print("Number of Predicted ", getNumberOfPredicted(classifiedSamples4GridSearch))
                print("Number of Oracled ",getNumberOfOracled(classifiedSamples4GridSearch))
                '''
                
                #accuracy, automationRate, numberOfOracled, numberOfPredicted, currentSim, currentGap
                counter = counter + 1 
            
            #harmonicScore = getHarmonicScore(localAccuracy, localAutomationRate, 0.5)
            
            localAccuracy = getLocalAccuracy(classifiedSamples4GridSearch)
            localAutomationRate = getLocalAutomationRate(classifiedSamples4GridSearch)
            localHarmonicScore = getHarmonicScore(localAccuracy, localAutomationRate, 0.5)
            
            globalAutomationRate = getPercentageOfPredicted(classifiedSamples4GridSearch)
            globalAccuracy  = getPercentageCorrectPredictions(classifiedSamples4GridSearch)
            globalHarmonicScore = getHarmonicScore(globalAccuracy, globalAutomationRate, 0.5)
            
            
            if(globalHarmonicScore > maxHarmonicScore):
                maxHarmonicScore = localHarmonicScore
                bestAccuracy = localAccuracy
                bestAutomationRate = localAutomationRate
                bestGap = gap
                bestSim = sim
            
                
            #print("")
            #print("before reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
            
            
            
            #classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)
            #resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
            #classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)
            #print("after reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
            #print("")
            
            print(globalAccuracy, globalAutomationRate, globalHarmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap)
            
            tempResults = [globalAccuracy, globalAutomationRate, globalHarmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap]
            allResults.append(tempResults)
            
    print("Best: ", maxHarmonicScore, bestAccuracy, bestAutomationRate, bestGap, bestSim)
    return allResults, dfColumnName
            

In [54]:
def validation (similarityGapGrid, startIndex1, endIndex1, startIndex2, endIndex2):
    
    similarityGrid = []
    gapGrid = []
    
    allResults = []
    dfColumnName = ["Accuracy", "Automation Rate", "Harmonic Score", "Local Accuracy", "Local Automation Rate", "Local Harmonic Score", "Number Of Oracled", "Number Of Predicted", "Minimal Similarity", "Minimal Gap"]
    
    global classifiedSamples4GridSearch, unclassifiedSamples4GridSearch

        
    print("Gap Grid: ", gapGrid)
    print("Similarity Grid: ", similarityGrid)
    print("\n")
    #print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore," 
    #      "numberOfOracled, numberOfPredicted, sim, gap, currentSim, currentGap")
    print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap")
    
    
    maxHarmonicScore = 0
    bestGap = 0
    bestSim = 0
    bestAccuracy = 0
    bestAutomationRate = 0
    
    
    
        
    for simGap in similarityGapGrid:
        
        numberOfPredicted = -1
        numberOfOracled = -1
        accuracy = -1
        automationRate = -1
        unlabelledSamples = 1000

        currentSim = simGap[0]
        currentGap = simGap[1]


        simCounter = 0
        gapCounter = 0
        counter = 0
        
        resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)

        while (unlabelledSamples != 0 and counter < 100):

            #print("during process ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))

            unlabelledSamples = findAndClassifyEqualSimilaritySamples(currentSim, currentGap)


            numberOfNewPredicted = getNumberOfPredicted(classifiedSamples4GridSearch)
            numberOfNewOracled = getNumberOfOracled(classifiedSamples4GridSearch)
            automationRate = getPercentageOfPredicted(classifiedSamples4GridSearch)
            accuracy = getPercentageCorrectPredictions(classifiedSamples4GridSearch)

            if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted):
                currentSim = currentSim - 0.01
                simCounter = simCounter + 1
                #currentGap = currentGap - 1

            if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted and simCounter > 12):
                #currentGap = currentGap - 0.005
                gapCounter = gapCounter + 1

            numberOfOracled = numberOfNewOracled
            numberOfPredicted = numberOfNewPredicted

            #print("unlab samples ", unlabelledSamples)

            if(unlabelledSamples == 0):
                counter = counter + 10000

            #accuracy, automationRate, numberOfOracled, numberOfPredicted, currentSim, currentGap
            counter = counter + 1 

        harmonicScore = getHarmonicScore(accuracy, automationRate, 0.5)

        localAccuracy = getLocalAccuracy(classifiedSamples4GridSearch)
        localAutomationRate = getLocalAutomationRate(classifiedSamples4GridSearch)
        localHarmonicScore = getHarmonicScore(localAccuracy, localAutomationRate, 0.5)

        if(harmonicScore > maxHarmonicScore):
            maxHarmonicScore = harmonicScore
            bestAccuracy = accuracy
            bestAutomationRate = automationRate
            bestSim = simGap[0]
            bestGap = simGap[1]


        #print("")
        #print("before reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))

        resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
        #classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)
        #print("after reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
        #print("")

        print(accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, simGap[0], simGap[1])

        tempResults = [accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, simGap[0], simGap[1]]
        allResults.append(tempResults)
            
    print("Best: ", maxHarmonicScore, bestAccuracy, bestAutomationRate, bestGap, bestSim)
    return allResults, dfColumnName
            

In [57]:
startIndex1, endIndex1, startIndex2, endIndex2 = 0,400,0,0
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults1, tfIdfColumns1 = gridSearchAnd3FoldValidation(minSimilarity = 0.25, maxSimilarity = 0.26, similarityStep = 0.05 , minGap = 0.01, maxGap = 0.01, gapStep = 0.001, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 3000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

400
4 396
Gap Grid:  [0.01]
Similarity Grid:  [0.25]


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
NOW NEW COMBINATION:  0.25 0.01
400
4 396
0.7325 0.515 0.6754476275738586 0.7325 0.515 0.6754476275738586 194 206 0.25 0.01
Best:  0.6754476275738586 0.7325 0.515 0.01 0.25


In [None]:
resetClassifiedSamples(0,400,0,0)

In [None]:
similarityHP = 0.20
gapHP = 0.02
#resetClassifiedSamples(0,200,0,0)
findAndClassifyEqualSimilaritySamples(similarityHP, gapHP)

In [88]:
print("Automation Rate ",getPercentageOfPredicted(classifiedSamples4GridSearch))
print("Accuracy ",getPercentageCorrectPredictions(classifiedSamples4GridSearch))
print("Number of Predicted ", getNumberOfPredicted(classifiedSamples4GridSearch))
print("Number of Oracled ",getNumberOfOracled(classifiedSamples4GridSearch))

Automation Rate  0.0
Accuracy  1.0
Number of Predicted  0
Number of Oracled  376


In [90]:
classifiedSamples4GridSearch[0]

['Hello, I have called Citibank for a total of three times with an unresolved issue. My account has been locked. When I called Citibank, they requested a phone verification in order to verify my identity. I provided the home number of my father, an authorized user, which they called and provided a reference number. When I called back with the reference number, the bank further refused to verify my identity and asked for a fax of my ID as well as social security card. I am currently living and working abroad and unable to as well as uncomfortable with faxing sensitive information to the bank which I can verify over the phone.',
 'Credit_Card',
 array([0., 0., 0., ..., 0., 0., 0.]),
 'Credit_Card',
 'oracle',
 tensor([ 5.2957e-03,  6.9682e-02, -1.5333e-01, -3.0119e-01, -3.0599e-01,
         -1.8398e-01,  6.8643e-02,  8.9585e-01, -2.3598e-01,  1.1756e-01,
         -5.7280e-02, -2.2067e-01,  2.0718e-01,  4.5724e-01,  1.1637e-01,
         -1.2168e-01,  1.6088e-01,  1.0032e+00,  4.3449e-01, 

In [55]:
startIndex1, endIndex1, startIndex2, endIndex2 = 0,200,0,0
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults1, tfIdfColumns1 = gridSearchAnd3FoldValidation(minSimilarity = 0.30, maxSimilarity = 0.90, similarityStep = 0.05 , minGap = 0.01, maxGap = 0.05, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

startIndex1, endIndex1, startIndex2, endIndex2 = 100,300,0,0
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults2, tfIdfColumns2 = gridSearchAnd3FoldValidation(minSimilarity = 0.30, maxSimilarity = 0.91, similarityStep = 0.05 , minGap = 0.01, maxGap = 0.05, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

startIndex1, endIndex1, startIndex2, endIndex2 = 0,100,200,300
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults3, tfIdfColumns3 = gridSearchAnd3FoldValidation(minSimilarity = 0.30, maxSimilarity = 0.91, similarityStep = 0.05 , minGap = 0.01, maxGap = 0.05, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)


#resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
#tfIdfResults, tfIdfColumns = gridSearchAnd3FoldValidation(minSimilarity = 0.10, maxSimilarity = 0.3, similarityStep = 0.06 , minGap = 0.01, maxGap = 0.05, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 5000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

200
12 188
Gap Grid:  [0.01, 0.015, 0.02, 0.025, 0.030000000000000002, 0.035, 0.04, 0.045, 0.049999999999999996]
Similarity Grid:  [0.3, 0.35, 0.39999999999999997, 0.44999999999999996, 0.49999999999999994, 0.5499999999999999, 0.6, 0.65, 0.7000000000000001, 0.7500000000000001, 0.8000000000000002, 0.8500000000000002]


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
NOW NEW COMBINATION:  0.3 0.01
200
12 188
0.65 0.6 0.639344262295082 0.65 0.6 0.639344262295082 80 120 0.3 0.01
NOW NEW COMBINATION:  0.3 0.015
200
12 188
0.83 0.36 0.6581497797356827 0.83 0.36 0.6581497797356827 128 72 0.3 0.015
NOW NEW COMBINATION:  0.3 0.02
200
12 188
0.885 0.26 0.5976623376623377 0.885 0.26 0.5976623376623377 148 52 0.3 0.02
NOW NEW COMBINATION:  0.3 0.025
200
12 188
0.945 0.155 0.4679712460063898 0.945 0.155 0.4679712460063898 169 31 0.3 0.025
NOW NEW COMBINATION:  0.3 0.030000000000000002
200
12 188
0.965 0.13 

0.99 0.06 0.24146341463414633 0.99 0.06 0.24146341463414633 188 12 0.5499999999999999 0.049999999999999996
NOW NEW COMBINATION:  0.6 0.01
200
12 188
0.65 0.6 0.639344262295082 0.65 0.6 0.639344262295082 80 120 0.6 0.01
NOW NEW COMBINATION:  0.6 0.015
200
12 188
0.83 0.36 0.6581497797356827 0.83 0.36 0.6581497797356827 128 72 0.6 0.015
NOW NEW COMBINATION:  0.6 0.02
200
12 188
0.885 0.26 0.5976623376623377 0.885 0.26 0.5976623376623377 148 52 0.6 0.02
NOW NEW COMBINATION:  0.6 0.025
200
12 188
0.945 0.155 0.4679712460063898 0.945 0.155 0.4679712460063898 169 31 0.6 0.025
NOW NEW COMBINATION:  0.6 0.030000000000000002
200
12 188
0.965 0.13 0.4223905723905725 0.965 0.13 0.4223905723905725 174 26 0.6 0.030000000000000002
NOW NEW COMBINATION:  0.6 0.035
200
12 188
0.99 0.09 0.32999999999999996 0.99 0.09 0.32999999999999996 182 18 0.6 0.035
NOW NEW COMBINATION:  0.6 0.04
200
12 188
0.99 0.09 0.32999999999999996 0.99 0.09 0.32999999999999996 182 18 0.6 0.04
NOW NEW COMBINATION:  0.6 0.045
200

0.715 0.5 0.6583793738489871 0.715 0.5 0.6583793738489871 100 100 0.3 0.01
NOW NEW COMBINATION:  0.3 0.015
200
12 188
0.805 0.385 0.660820895522388 0.805 0.385 0.660820895522388 123 77 0.3 0.015
NOW NEW COMBINATION:  0.3 0.02
200
12 188
0.885 0.27 0.6080152671755726 0.885 0.27 0.6080152671755726 146 54 0.3 0.02
NOW NEW COMBINATION:  0.3 0.025
200
12 188
0.92 0.215 0.5556179775280898 0.92 0.215 0.5556179775280898 157 43 0.3 0.025
NOW NEW COMBINATION:  0.3 0.030000000000000002
200
12 188
0.94 0.185 0.5175595238095237 0.94 0.185 0.5175595238095237 163 37 0.3 0.030000000000000002
NOW NEW COMBINATION:  0.3 0.035
200
12 188
0.96 0.15 0.4615384615384615 0.96 0.15 0.4615384615384615 170 30 0.3 0.035
NOW NEW COMBINATION:  0.3 0.04
200
12 188
0.98 0.1 0.35507246376811596 0.98 0.1 0.35507246376811596 180 20 0.3 0.04
NOW NEW COMBINATION:  0.3 0.045
200
12 188
0.985 0.075 0.2874513618677043 0.985 0.075 0.2874513618677043 185 15 0.3 0.045
NOW NEW COMBINATION:  0.3 0.049999999999999996
200
12 188
0.9

0.92 0.215 0.5556179775280898 0.92 0.215 0.5556179775280898 157 43 0.6 0.025
NOW NEW COMBINATION:  0.6 0.030000000000000002
200
12 188
0.94 0.185 0.5175595238095237 0.94 0.185 0.5175595238095237 163 37 0.6 0.030000000000000002
NOW NEW COMBINATION:  0.6 0.035
200
12 188
0.96 0.15 0.4615384615384615 0.96 0.15 0.4615384615384615 170 30 0.6 0.035
NOW NEW COMBINATION:  0.6 0.04
200
12 188
0.98 0.1 0.35507246376811596 0.98 0.1 0.35507246376811596 180 20 0.6 0.04
NOW NEW COMBINATION:  0.6 0.045
200
12 188
0.985 0.075 0.2874513618677043 0.985 0.075 0.2874513618677043 185 15 0.6 0.045
NOW NEW COMBINATION:  0.6 0.049999999999999996
200
12 188
0.985 0.075 0.2874513618677043 0.985 0.075 0.2874513618677043 185 15 0.6 0.049999999999999996
NOW NEW COMBINATION:  0.65 0.01
200
12 188
0.715 0.5 0.6583793738489871 0.715 0.5 0.6583793738489871 100 100 0.65 0.01
NOW NEW COMBINATION:  0.65 0.015
200
12 188
0.805 0.385 0.660820895522388 0.805 0.385 0.660820895522388 123 77 0.65 0.015
NOW NEW COMBINATION:  0.

0.965 0.125 0.4116894197952219 0.965 0.125 0.4116894197952219 175 25 0.9000000000000002 0.035
NOW NEW COMBINATION:  0.9000000000000002 0.04
200
12 188
0.98 0.095 0.34227941176470594 0.98 0.095 0.34227941176470594 181 19 0.9000000000000002 0.04
NOW NEW COMBINATION:  0.9000000000000002 0.045
200
12 188
0.985 0.08 0.30191570881226054 0.985 0.08 0.30191570881226054 184 16 0.9000000000000002 0.045
NOW NEW COMBINATION:  0.9000000000000002 0.049999999999999996
200
12 188
0.99 0.075 0.28779069767441856 0.99 0.075 0.28779069767441856 185 15 0.9000000000000002 0.049999999999999996
Best:  0.6750000000000002 0.81 0.405 0.015 0.9000000000000002
200
12 188
Gap Grid:  [0.01, 0.015, 0.02, 0.025, 0.030000000000000002, 0.035, 0.04, 0.045, 0.049999999999999996]
Similarity Grid:  [0.3, 0.35, 0.39999999999999997, 0.44999999999999996, 0.49999999999999994, 0.5499999999999999, 0.6, 0.65, 0.7000000000000001, 0.7500000000000001, 0.8000000000000002, 0.8500000000000002, 0.9000000000000002]


accuracy, automationR

0.965 0.2 0.546742209631728 0.965 0.2 0.546742209631728 160 40 0.5499999999999999 0.025
NOW NEW COMBINATION:  0.5499999999999999 0.030000000000000002
200
12 188
0.975 0.17 0.5007552870090635 0.975 0.17 0.5007552870090635 166 34 0.5499999999999999 0.030000000000000002
NOW NEW COMBINATION:  0.5499999999999999 0.035
200
12 188
0.995 0.125 0.41596989966555176 0.995 0.125 0.41596989966555176 175 25 0.5499999999999999 0.035
NOW NEW COMBINATION:  0.5499999999999999 0.04
200
12 188
0.995 0.11 0.3813588850174216 0.995 0.11 0.3813588850174216 178 22 0.5499999999999999 0.04
NOW NEW COMBINATION:  0.5499999999999999 0.045
200
12 188
0.995 0.095 0.3437272727272727 0.995 0.095 0.3437272727272727 181 19 0.5499999999999999 0.045
NOW NEW COMBINATION:  0.5499999999999999 0.049999999999999996
200
12 188
0.995 0.075 0.28812741312741313 0.995 0.075 0.28812741312741313 185 15 0.5499999999999999 0.049999999999999996
NOW NEW COMBINATION:  0.6 0.01
200
12 188
0.695 0.54 0.6572679509632224 0.695 0.54 0.657267950

0.995 0.095 0.3437272727272727 0.995 0.095 0.3437272727272727 181 19 0.8500000000000002 0.035
NOW NEW COMBINATION:  0.8500000000000002 0.04
200
12 188
0.995 0.085 0.3167602996254682 0.995 0.085 0.3167602996254682 183 17 0.8500000000000002 0.04
NOW NEW COMBINATION:  0.8500000000000002 0.045
200
12 188
0.995 0.075 0.28812741312741313 0.995 0.075 0.28812741312741313 185 15 0.8500000000000002 0.045
NOW NEW COMBINATION:  0.8500000000000002 0.049999999999999996
200
12 188
0.995 0.065 0.2576693227091633 0.995 0.065 0.2576693227091633 187 13 0.8500000000000002 0.049999999999999996
NOW NEW COMBINATION:  0.9000000000000002 0.01
200
12 188
0.755 0.505 0.686981981981982 0.7537688442211056 0.507537688442211 0.6870997587214049 98 101 0.9000000000000002 0.01
NOW NEW COMBINATION:  0.9000000000000002 0.015
200
12 188
0.835 0.39 0.6798538622129436 0.8341708542713567 0.39195979899497485 0.6805996509745379 121 78 0.9000000000000002 0.015
NOW NEW COMBINATION:  0.9000000000000002 0.02
200
12 188
0.92 0.26 0

In [56]:
from pandas import DataFrame

In [57]:
df1 = DataFrame (tfIdfResults1, columns= tfIdfColumns1)
df2 = DataFrame (tfIdfResults2, columns= tfIdfColumns2)
df3 = DataFrame (tfIdfResults3, columns= tfIdfColumns3)
df1

Unnamed: 0,Accuracy,Automation Rate,Harmonic Score,Local Accuracy,Local Automation Rate,Local Harmonic Score,Number Of Oracled,Number Of Predicted,Minimal Similarity,Minimal Gap
0,0.650,0.600,0.639344,0.650,0.600,0.639344,80,120,0.30,0.010
1,0.830,0.360,0.658150,0.830,0.360,0.658150,128,72,0.30,0.015
2,0.885,0.260,0.597662,0.885,0.260,0.597662,148,52,0.30,0.020
3,0.945,0.155,0.467971,0.945,0.155,0.467971,169,31,0.30,0.025
4,0.965,0.130,0.422391,0.965,0.130,0.422391,174,26,0.30,0.030
...,...,...,...,...,...,...,...,...,...,...
103,0.955,0.125,0.410223,0.955,0.125,0.410223,175,25,0.85,0.030
104,0.985,0.080,0.301916,0.985,0.080,0.301916,184,16,0.85,0.035
105,0.990,0.075,0.287791,0.990,0.075,0.287791,185,15,0.85,0.040
106,0.990,0.070,0.272835,0.990,0.070,0.272835,186,14,0.85,0.045


In [58]:
with pd.ExcelWriter("CLS-Bert-3PerCat-CFV.xlsx") as writer:
    df1.to_excel(writer, sheet_name='1', index=False)
    df2.to_excel(writer, sheet_name='2', index=False)
    df3.to_excel(writer, sheet_name='3', index=False)

In [65]:

startIndex1, endIndex1, startIndex2, endIndex2 = 200,300,0,0
#to be changed
simGapGrid1 = [[0.7,0.015], [0.75,0.015], [0.8,0.015]]
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults11, tfIdfColumns11 = validation(simGapGrid1, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

startIndex1, endIndex1, startIndex2, endIndex2 = 0,100,0,0
#to be changed
simGapGrid2 = [[0.3,0.015], [0.35,0.015],[0.4,0.015],[0.45,0.015],[0.5,0.015],[0.55,0.015],[0.6,0.015],[0.65,0.015],[0.7,0.015],[0.75,0.015],[0.8,0.015]]
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults22, tfIdfColumns22 = validation(simGapGrid2, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

startIndex1, endIndex1, startIndex2, endIndex2 = 100,200,0,0
#to be changed
simGapGrid3 = [[0.8,0.015],[0.3,0.015],[0.35,0.015],[0.4,0.015],[0.45,0.015],[0.5,0.015],[0.55,0.015],[0.6,0.015]    ]
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults33, tfIdfColumns33 = validation(simGapGrid3, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)
#resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
#tfIdfResults, tfIdfColumns = gridSearchAnd3FoldValidation(minSimilarity = 0.10, maxSimilarity = 0.3, similarityStep = 0.06 , minGap = 0.01, maxGap = 0.05, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 5000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

100
4 96
Gap Grid:  []
Similarity Grid:  []


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
100
4 96
100
4 96
0.5 0.66 0.5254777070063694 0.5 0.66 0.5254777070063694 34 66 0.7 0.015
100
4 96
100
4 96
0.5 0.66 0.5254777070063694 0.5 0.66 0.5254777070063694 34 66 0.75 0.015
100
4 96
100
4 96
0.5 0.66 0.5254777070063694 0.5 0.66 0.5254777070063694 34 66 0.8 0.015
Best:  0.5254777070063694 0.5 0.66 0.015 0.7
100
4 96
Gap Grid:  []
Similarity Grid:  []


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
100
4 96
100
4 96
0.65 0.61 0.6415857605177995 0.65 0.61 0.6415857605177995 39 61 0.3 0.015
100
4 96
100
4 96
0.65 0.61 0.6415857605177995 0.65 0.61 0.6415857605177995 39 61 0.35 0.015
100
4 96
100
4 96
0.65 0.61 0.6415857605177995 0.65 0.61 0.6415857605177995 39 61 0.4 0.015
100
4 96
100
4 96
0.65 0.61 0.6

In [66]:
df11 = DataFrame (tfIdfResults11, columns= tfIdfColumns11)
df22 = DataFrame (tfIdfResults22, columns= tfIdfColumns22)
df33 = DataFrame (tfIdfResults33, columns= tfIdfColumns33)

In [67]:
with pd.ExcelWriter("CLS-Bert-Validation.xlsx") as writer:
    df11.to_excel(writer, sheet_name='1', index=False)
    df22.to_excel(writer, sheet_name='2', index=False)
    df33.to_excel(writer, sheet_name='3', index=False)

In [68]:

startIndex1, endIndex1, startIndex2, endIndex2 = 300,400,0,0
#to be changed
simGapGrid1 = [[0.8,0.015]]
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults111, tfIdfColumns111 = validation(simGapGrid1, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)


import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 3500  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

100
4 96
Gap Grid:  []
Similarity Grid:  []


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
100
4 96
100
4 96
0.72 0.46 0.646875 0.72 0.46 0.646875 54 46 0.8 0.015
Best:  0.646875 0.72 0.46 0.015 0.8


In [69]:
df111 = DataFrame (tfIdfResults111, columns= tfIdfColumns11)

In [71]:
with pd.ExcelWriter("CLS-Bert-Final.xlsx") as writer:
    df111.to_excel(writer, sheet_name='1', index=False)

In [445]:
findAndClassifyEqualSimilaritySamples()

FINISHED  0 400



In [446]:
getPercentageOfPredicted(allClassifiedSamples)

0.34

In [447]:
getPercentageCorrectPredictions(allClassifiedSamples)

0.855

In [448]:
getNumberOfPredicted(allClassifiedSamples)

136

In [449]:
getNumberOfOracled(allClassifiedSamples)

264

In [450]:
getLabelDistribution(allClassifiedSamples)

{'Credit_Card': 0, 'Bank_Account': 0, 'Loans': 0, 'Mortgage': 0}
{'Credit_Card': 0, 'Bank_Account': 0, 'Loans': 0, 'Mortgage': 0}
prediction dict:  {'Credit_Card': 102, 'Bank_Account': 122, 'Loans': 92, 'Mortgage': 84}
actual dict:  {'Credit_Card': 100, 'Bank_Account': 100, 'Loans': 100, 'Mortgage': 100}
