Importing the libraries

In [1]:
import spacy
import re

In [2]:
import string

In [3]:
import pandas as pd

In [4]:
from numpy import dot
from numpy.linalg import norm

In [5]:
from sklearn.metrics import accuracy_score

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

The spacy library is used for the preprocessing of the ticket text

In [7]:
nlp = spacy.load('en_core_web_lg')

In [8]:
def cosineSimilarity(a, b):
    #aa = float(a)
    #bb = float(b)
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

def computeCosineSimilarity(sentence1, sentence2):
    doc1 = nlp(sentence1)
    doc2 = nlp(sentence2)
    
    similarity = doc1.similarity(doc2)
    #print(similarity)
    return similarity

This function is used to compute the accuracy of the active learning system

In [10]:
def getPercentageCorrectPredictions(a):
    
    counter = 0
    for x in a:
        if(x[1] == x[3]):
            counter = counter + 1
            
    return counter / (len(unclassifiedSamples4GridSearch) + len(classifiedSamples4GridSearch))

This function is used to compute the automation rate of the active learning system


In [11]:
def getPercentageOfPredicted(a):
    counter = 0
    
    for x in a:
        if(x[4] == "predicted"):
            counter = counter + 1
            
    return counter / (len(unclassifiedSamples4GridSearch) + len(classifiedSamples4GridSearch))

In [12]:
def getNumberOfOracled(a):
    counter = 0
    
    for x in a:
        if(x[4] != "predicted"):
            counter = counter + 1
            
    return counter

In [13]:
def getNumberOfPredicted(a):
    counter = 0
    
    for x in a:
        if(x[4] == "predicted"):
            counter = counter + 1
            
    return counter

In [14]:
def getPercentageCorrectPredictions(a):
    
    global unclassifiedSamples4GridSearch, classifiedSamples4GridSearch
    
    counter = 0
    for x in a:
        if(x[1] == x[3]):
            counter = counter + 1

    counter = counter + len(unclassifiedSamples4GridSearch)
            
    return counter / (len(unclassifiedSamples4GridSearch) + len(classifiedSamples4GridSearch))

def getLocalAutomationRate(a):
    counter = 0
    
    global unclassifiedSamples4GridSearch, classifiedSamples4GridSearch
    
    for x in a:
        if(x[4] == "predicted"):
            counter = counter + 1
            
    return counter / (len(a))

def getLocalAccuracy(a):
    
    counter = 0
    for x in a:
        if(x[1] == x[3]):
            counter = counter + 1

    
            
    return counter / (len(classifiedSamples4GridSearch))

# If beta = 1, then accuracy and automationRate have same importance
# If beta < 1, then accuracy is more important than automation rate
# If beta > 1, then automation rate is more important than accuracy
def getHarmonicScore (accuracy, automationRate, beta):
    
    if(accuracy == 0 and automationRate == 0):
        return 0
    
    score = (1 + beta ** 2) * (accuracy * automationRate) / ((accuracy * (beta ** 2)) + automationRate)
    #s1 = (1 + beta**2)
    #s2 = (accuracy * automationRate)
    #s3 = (accuracy*(beta**2)) + automationRate
    #print(s1)
    #print(s2)
    #print(s3)
    
    return score





 STRUCTURE OF allSamplesWithLabelThird: ticketText, ticketLabel, TF-IDF, wordVector

 STRUCTURE OF classifiedSamplesInit: ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted, wordVector


In this step, depending on the model, we do the preprocessing to delete stop words, numbers and punctuation.
Furthermore, we use the lemmatized word and set the text to lower case.

In [19]:
def preprocessing(sentence):
    
    doc = nlp(sentence)
    inputSent = ""

    #removing stop words
    for token in doc:
        
        if(token.is_stop == True):
            continue
        else:
            inputSent = inputSent + " " + token.lemma_
        
        #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)
    
    #lower case of the sentence
    inputSent = inputSent.lower()
    #remove numbers
    inputSent = re.sub('\d+', '', inputSent)
    #remover punctuation
    inputSent = inputSent.translate(str.maketrans('', '', string.punctuation))
    #remove multiple whitespaces
    inputSent = ' '.join(inputSent.split())
    
    
    
    #print(inputSent)
    return str(inputSent)

The dataset is loaded. 

In [21]:
df = pd.read_csv('IBM_shuffled.csv', delimiter=",")


In [22]:
df = pd.read_csv('HR_IT_FM_shuffled.csv', delimiter = ";")

In [51]:
uniqueLabels = df.Label.unique()

foundLabels = []
classifiedSamplesInit = []

tfIdfList = []
allSamplesInit = []
allSamplesWithLabelInit = []

allSamplesMed = []
allSamplesWithLabelMed = []

allSamplesThird = []
allSamplesWithLabelThird = []

allSamplesOracled = []

The ticket data is loaded and preprocessed. The TF-IDF values are computed.

In [52]:
#Save information from df in variables. Hereby, we compute the tf-idf values and eliminate stop words, numbers and more.

for ind in df.index:
    preprocessedSentence = preprocessing(df['Text'][ind])
    allSamplesWithLabelInit.append([preprocessedSentence, df['Label'][ind]])
    allSamplesInit.append(preprocessedSentence)

#Get TF-IDF Values
analyze = vectorizer.build_analyzer()
X = vectorizer.fit_transform(allSamplesInit)
(vectorizer.get_feature_names())
tfIdfList = X.toarray()

For Word2Vec and fastText: We compute the text embedding using the TF-IDF values for the weighting.

In [53]:
def getTfIdfSentenceEmbedding(sentence, sentenceCounter):
    
    doc = nlp(str(sentence))

    embedding = 0
    
    tfIdfForSentence = dict(zip(vectorizer.get_feature_names(), X.toarray()[sentenceCounter]))
    
    #print(tfIdfForSentence)

    
    #removing stop words
    counter = 0
    for token in doc:
        
        #print(token.text)
        
        tfIdfValue = 0
        
        try:
            tfIdfValue = tfIdfForSentence[token.text]
            counter = counter + 1
        except:
            pass
            #print("Key not found: ", token.text)
        
        embedding = embedding + (tfIdfValue * nlp(token.text).vector)
    
    embedding = embedding/counter
        
    return embedding
    
    

Further preprocessing and init. as we stated in the thesis.

In [56]:
#Set up a list for further work with tuples of the format (text, label, tf-idf-values)
counter = 0
for sample in allSamplesWithLabelInit:
    text = sample[0]
    lable = sample[1]
    tfIdfVal = tfIdfList[counter]
    #embedding = nlp(text).vector 
    embedding = getTfIdfSentenceEmbedding(text, counter)
    allSamplesWithLabelMed.append([text, lable, tfIdfVal, embedding])
    
    counter = counter + 1 
    
#The oracle assigns the label for one sample per category    

for x in allSamplesWithLabelMed:

    #text, label, tfIdfVal, label, embedding
    wholeEntry = [x[0], x[1], x[2], x[1], "oracle", x[3]]
    
    if(not(x[1] in foundLabels)):
        foundLabels.append(x[1])
        classifiedSamplesInit.append(wholeEntry)
    else: 
        allSamplesWithLabelThird.append(x)
 
    allSamplesOracled.append(wholeEntry)

sentenceDict = {}
similarityDict = {}

for label in uniqueLabels:
    sentenceDict.update({label: ""})
    
for label in uniqueLabels:
    similarityDict.update({label: 0})
    
#print(similarityDict, '\n \n')

Based on the initialization of h1 (according to the thesis), the list for classified und unclassified samples is prepared 

In [2]:
def addOracledSamples(listOfSamples, samplesPerCategory):
    
    startDict = {}
    
    for label in uniqueLabels:
        startDict.update({label: 0})
        
    classifiedSamples1 = []
        
    limit = samplesPerCategory * len(uniqueLabels)
    
    unclassifiedList = listOfSamples[:]
    
    unclassifiedListFinal = []
    
    #print(limit, len(unclassifiedList))
    #print(startDict)
    
    for x in unclassifiedList:
        #ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted, embedding
        wholeEntry = [x[0], x[1], x[2], x[1], "oracle", x[5]]
        
        #print(wholeEntry)
        #print(x)
        #break
        
        
        amountOfSamplesPerLabel = startDict.get(x[1])
        
        if(amountOfSamplesPerLabel < samplesPerCategory):
            #print(x[1], amountOfSamplesPerLabel)
            classifiedSamples1.append(wholeEntry)
            amountOfSamplesPerLabel = amountOfSamplesPerLabel + 1
            startDict.update({x[1]: amountOfSamplesPerLabel})
            
    
            
            unclassifiedList.remove(x)
            
            
        if(limit == len(classifiedSamples1)):
            break
            
    for x in unclassifiedList:
        text = x[0]
        lable = x[1]
        tfIdfVal = x[2]
        
        
        unclassifiedListFinal.append([text, lable, tfIdfVal, x[5]])
    
            
            
    print(len(classifiedSamples1), len(unclassifiedListFinal))
    
    #helper = [i[1] for i in classifiedSamples1]
    
    #print(helper)
    
    #print(startDict) 
    
    return classifiedSamples1, unclassifiedListFinal
            
    

Each time, a new h2 and h3 are selected, the list of unclassified and classified samples is resetted 

In [58]:
def resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2):
    
    global classifiedSamples4GridSearch, unclassifiedSamples4GridSearch
    
    
    consideredSamples = allSamplesOracled[startIndex1:endIndex1] + allSamplesOracled[startIndex2:endIndex2]
    #print(consideredSamples[1])
    
    print(len(consideredSamples))
    
    classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(consideredSamples, 1)

In [63]:
classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)

3 197


In [64]:
allUnclassifiedSamples = allSamplesWithLabelThird[:]
allClassifiedSamples = classifiedSamplesInit[:]

This function is the core of our active learning system. similarityHP is h2 and gapHP is h3. 

In [65]:
#Hereby, if we have 4 categories, we save the biggest similarity per each category with the new sample 
#If the similarites are more or less the same (DEFINE Threshold), then 
def findAndClassifyEqualSimilaritySamples(similarityHP, gapHP):
    
    maxSimilarityPerCategories = []
    
    for label in uniqueLabels:
        maxSimilarityPerCategories.append((label, 0))
    #print(maxSimilarityPerCategories)
    
    
    for unclassifiedSample in unclassifiedSamples4GridSearch:
    
        mostSimilarSample = 0
        maxCosSim = 0
        maxCosSimLabel = ""
        
        for label in uniqueLabels:
            similarityDict.update({label: 0})
    
        for classifiedSample in classifiedSamples4GridSearch:
            #tf-idf pure
            #cosSim = cosineSimilarity(unclassifiedSample[2], classifiedSample[2])
            #print(len(unclassifiedSample), len(classifiedSample))
            
            cosSim = cosineSimilarity((unclassifiedSample[3]), (classifiedSample[5]))
            classifiedLabel = classifiedSample[1]
            
            if(maxCosSim < cosSim):
                maxCosSim = cosSim
                maxCosSimLabel = classifiedLabel
            
            if(cosSim > similarityDict[classifiedLabel]):
                similarityDict [classifiedLabel] = cosSim
                sentenceDict [classifiedLabel] = unclassifiedSample[0]
        
        gapPassed = False
        
        for i in similarityDict:
            if(i != maxCosSimLabel):
                gap = maxCosSim - similarityDict[i]
                if(gap < gapHP):
                    gapPassed = True
                    break
        
        if(maxCosSim > similarityHP and gapPassed):
            # ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted
            entry = [unclassifiedSample[0], unclassifiedSample[1], unclassifiedSample[2], unclassifiedSample[1], "oracle", unclassifiedSample[3]]
            #print("entry ", entry)
            unclassifiedSamples4GridSearch.remove(unclassifiedSample)
            classifiedSamples4GridSearch.append(entry)
            
        elif(maxCosSim > similarityHP):
            #STRUCTURE OF classifiedSamplesInit: 
            #ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted, wordVector
            entry = [unclassifiedSample[0], maxCosSimLabel, unclassifiedSample[2], unclassifiedSample[1], "predicted", unclassifiedSample[3]]
            #print("entry ", entry)
            unclassifiedSamples4GridSearch.remove(unclassifiedSample)
            classifiedSamples4GridSearch.append(entry)
            
 
        #print(similarityDict, unclassifiedSample[1])
        #print()
    #print("FINISHED ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
    #print()
    #print("Automation Rate ",getPercentageOfPredicted(classifiedSamples4GridSearch))
    #print("Accuracy ",getPercentageCorrectPredictions(classifiedSamples4GridSearch))
    #print("Number of Predicted ", getNumberOfPredicted(classifiedSamples4GridSearch))
    #print("Number of Oracled ",getNumberOfOracled(classifiedSamples4GridSearch))
    
    
    #print()
    return len(unclassifiedSamples4GridSearch)

This function automates the grid search process in the active learning system.

minSimilarity = the smallest h2 value
maxSimilarity = the highest h2 value
similarityStep = by how much the h2-value is increasing in every new iteration

minGap = the smallest h2 value
maxGap = the highest h2 value
gapStep = by how much the h3-value is increasing in every new iteration

startIndex1, endIndex1, startIndex2, endIndex2 helps the algorithm to pick the right samples for the iteration

In [66]:
def gridSearchAnd3FoldValidation (minSimilarity, maxSimilarity, similarityStep, minGap, maxGap, gapStep, startIndex1, endIndex1, startIndex2, endIndex2):
    
    similarityGrid = []
    gapGrid = []
    
    allResults = []
    dfColumnName = ["Accuracy", "Automation Rate", "Harmonic Score", "Local Accuracy", "Local Automation Rate", "Local Harmonic Score", "Number Of Oracled", "Number Of Predicted", "Minimal Similarity", "Minimal Gap"]
    
    global classifiedSamples4GridSearch, unclassifiedSamples4GridSearch
    
    
    similarity = minSimilarity
    
    while(similarity <= maxSimilarity):
        similarityGrid.append(similarity)
        similarity = similarity + similarityStep
        
    g = minGap

    while(g <= maxGap):
        gapGrid.append(g)
        g = g + gapStep
        
    print("Gap Grid: ", gapGrid)
    print("Similarity Grid: ", similarityGrid)
    print("\n")
    #print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore," 
    #      "numberOfOracled, numberOfPredicted, sim, gap, currentSim, currentGap")
    print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap")
    
    
    maxHarmonicScore = 0
    bestGap = 0
    bestSim = 0
    bestAccuracy = 0
    bestAutomationRate = 0
        
    for sim in similarityGrid:
    
        
        for gap in gapGrid:
            
            print("NOW NEW COMBINATION: ", sim, gap)
            
            numberOfPredicted = -1
            numberOfOracled = -1

            localAccuracy = -1
            globalAccuracy = -1
            localAutomationRate = -1
            globalAutomationRate = -1

            localHarmonicScore = -1
            globalHarmonicScore = -1

            unlabelledSamples = 1000
            
            currentSim = sim
            currentGap = gap
            
            simCounter = 0
            gapCounter = 0
            counter = 0
            
            resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
            
            while (unlabelledSamples != 0 and counter < 35):
                
                
                #print("sim, gap ", currentSim, currentGap)
                unlabelledSamples = findAndClassifyEqualSimilaritySamples(currentSim, currentGap)      
            
                numberOfNewPredicted = getNumberOfPredicted(classifiedSamples4GridSearch)
                numberOfNewOracled = getNumberOfOracled(classifiedSamples4GridSearch)
                
                if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted):
                    currentSim = currentSim - 0.01
                    simCounter = simCounter + 1
                    
                '''
                if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted and simCounter > 12):
                    #currentGap = currentGap - 0.005
                    gapCounter = gapCounter + 1
                '''
                
                numberOfOracled = numberOfNewOracled
                numberOfPredicted = numberOfNewPredicted

                if(unlabelledSamples == 0):
                    counter = counter + 10000
                    
                #print("Automation Rate ",getPercentageOfPredicted(classifiedSamples4GridSearch))
                #print("Accuracy ",getPercentageCorrectPredictions(classifiedSamples4GridSearch))
                #print("Number of Predicted ", getNumberOfPredicted(classifiedSamples4GridSearch))
                #print("Number of Oracled ",getNumberOfOracled(classifiedSamples4GridSearch))
                
                #accuracy, automationRate, numberOfOracled, numberOfPredicted, currentSim, currentGap
                counter = counter + 1 
            
            #harmonicScore = getHarmonicScore(localAccuracy, localAutomationRate, 0.5)
            
            localAccuracy = getLocalAccuracy(classifiedSamples4GridSearch)
            localAutomationRate = getLocalAutomationRate(classifiedSamples4GridSearch)
            localHarmonicScore = getHarmonicScore(localAccuracy, localAutomationRate, 0.5)
            
            globalAutomationRate = getPercentageOfPredicted(classifiedSamples4GridSearch)
            globalAccuracy  = getPercentageCorrectPredictions(classifiedSamples4GridSearch)
            globalHarmonicScore = getHarmonicScore(globalAccuracy, globalAutomationRate, 0.5)
            
            
            if(globalHarmonicScore > maxHarmonicScore):
                maxHarmonicScore = globalHarmonicScore
                bestAccuracy = globalAccuracy
                bestAutomationRate = globalAutomationRate
                bestGap = gap
                bestSim = sim
            
                
            #print("")
            #print("before reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
            
            
            
            #classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)
            #resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
            #classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)
            #print("after reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
            #print("")
            
            print(globalAccuracy, globalAutomationRate, globalHarmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap)
            
            tempResults = [globalAccuracy, globalAutomationRate, globalHarmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap]
            allResults.append(tempResults)
            
    print("Best: ", maxHarmonicScore, bestAccuracy, bestAutomationRate, bestGap, bestSim)
    return allResults, dfColumnName
            

With this function, we do the testing.
The best models obtained in every training iteration will be used to

In [67]:
def validation (similarityGapGrid, startIndex1, endIndex1, startIndex2, endIndex2):
    
    similarityGrid = []
    gapGrid = []
    
    allResults = []
    dfColumnName = ["Accuracy", "Automation Rate", "Harmonic Score", "Local Accuracy", "Local Automation Rate", "Local Harmonic Score", "Number Of Oracled", "Number Of Predicted", "Minimal Similarity", "Minimal Gap"]
    
    global classifiedSamples4GridSearch, unclassifiedSamples4GridSearch

        
    print("Gap Grid: ", gapGrid)
    print("Similarity Grid: ", similarityGrid)
    print("\n")
    #print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore," 
    #      "numberOfOracled, numberOfPredicted, sim, gap, currentSim, currentGap")
    print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap")
    
    
    maxHarmonicScore = 0
    bestGap = 0
    bestSim = 0
    bestAccuracy = 0
    bestAutomationRate = 0
    
    
    
        
    for simGap in similarityGapGrid:
        
        numberOfPredicted = -1
        numberOfOracled = -1
        accuracy = -1
        automationRate = -1
        unlabelledSamples = 1000

        currentSim = simGap[0]
        currentGap = simGap[1]


        simCounter = 0
        gapCounter = 0
        counter = 0

        while (unlabelledSamples != 0 and counter < 100):

            #print("during process ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))

            unlabelledSamples = findAndClassifyEqualSimilaritySamples(currentSim, currentGap)


            numberOfNewPredicted = getNumberOfPredicted(classifiedSamples4GridSearch)
            numberOfNewOracled = getNumberOfOracled(classifiedSamples4GridSearch)
            automationRate = getPercentageOfPredicted(classifiedSamples4GridSearch)
            accuracy = getPercentageCorrectPredictions(classifiedSamples4GridSearch)

            if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted):
                currentSim = currentSim - 0.01
                simCounter = simCounter + 1
                #currentGap = currentGap - 1

            if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted and simCounter > 12):
                currentGap = currentGap - 0.005
                gapCounter = gapCounter + 1

            numberOfOracled = numberOfNewOracled
            numberOfPredicted = numberOfNewPredicted

            #print("unlab samples ", unlabelledSamples)

            if(unlabelledSamples == 0):
                break

            #accuracy, automationRate, numberOfOracled, numberOfPredicted, currentSim, currentGap
            counter = counter + 1 

        harmonicScore = getHarmonicScore(accuracy, automationRate, 0.5)

        localAccuracy = getLocalAccuracy(classifiedSamples4GridSearch)
        localAutomationRate = getLocalAutomationRate(classifiedSamples4GridSearch)
        localHarmonicScore = getHarmonicScore(localAccuracy, localAutomationRate, 0.5)

        if(harmonicScore > maxHarmonicScore):
            maxHarmonicScore = harmonicScore
            bestAccuracy = accuracy
            bestAutomationRate = automationRate
            bestSim = simGap[0]
            bestGap = simGap[1]


        #print("")
        print("before reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))

        resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
        #classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)
        print("after reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
        #print("")

        print(accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, simGap[0], simGap[1])

        tempResults = [accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, simGap[0], simGap[1]]
        allResults.append(tempResults)
            
    print("Best: ", maxHarmonicScore, bestAccuracy, bestAutomationRate, bestGap, bestSim)
    return allResults, dfColumnName
            

In [60]:
startIndex1, endIndex1, startIndex2, endIndex2 = 0,400,0,0
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults1, tfIdfColumns1 = gridSearchAnd3FoldValidation(minSimilarity = 0.70, maxSimilarity = 0.71, similarityStep = 0.05 , minGap = 0.01, maxGap = 0.01, gapStep = 0.01, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 3000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

400
8 392
Gap Grid:  [0.01]
Similarity Grid:  [0.7]


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
NOW NEW COMBINATION:  0.7 0.01
400
8 392
0.68 0.725 0.6885474860335195 0.68 0.725 0.6885474860335195 110 290 0.7 0.01
Best:  0.6885474860335195 0.68 0.725 0.01 0.7


In [127]:
resetClassifiedSamples(0,200,0,0)
startIndex1, endIndex1, startIndex2, endIndex2 = 0,200,0,0
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults1, tfIdfColumns1 = gridSearchAnd3FoldValidation(minSimilarity = 0.35, maxSimilarity = 0.96, similarityStep = 0.05 , minGap = 0.01, maxGap = 0.08, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

startIndex1, endIndex1, startIndex2, endIndex2 = 100,300,0,0
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults2, tfIdfColumns2 = gridSearchAnd3FoldValidation(minSimilarity = 0.35, maxSimilarity = 0.96, similarityStep = 0.05 , minGap = 0.01, maxGap = 0.08, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

startIndex1, endIndex1, startIndex2, endIndex2 = 0,100,200,300
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults3, tfIdfColumns3 = gridSearchAnd3FoldValidation(minSimilarity = 0.35, maxSimilarity = 0.96, similarityStep = 0.05 , minGap = 0.01, maxGap = 0.08, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)


#resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
#tfIdfResults, tfIdfColumns = gridSearchAnd3FoldValidation(minSimilarity = 0.10, maxSimilarity = 0.3, similarityStep = 0.06 , minGap = 0.01, maxGap = 0.05, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 3000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

200
12 188
200
12 188
Gap Grid:  [0.01, 0.015, 0.02, 0.025, 0.030000000000000002, 0.035, 0.04, 0.045, 0.049999999999999996, 0.05499999999999999, 0.05999999999999999, 0.06499999999999999, 0.06999999999999999, 0.075, 0.08]
Similarity Grid:  [0.35, 0.39999999999999997, 0.44999999999999996, 0.49999999999999994, 0.5499999999999999, 0.6, 0.65, 0.7000000000000001, 0.7500000000000001, 0.8000000000000002, 0.8500000000000002, 0.9000000000000002, 0.9500000000000003]


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
NOW NEW COMBINATION:  0.35 0.01
200
12 188
0.88 0.45 0.7388059701492536 0.88 0.45 0.7388059701492536 110 90 0.35 0.01
NOW NEW COMBINATION:  0.35 0.015
200
12 188
0.93 0.305 0.6596511627906977 0.93 0.305 0.6596511627906977 139 61 0.35 0.015
NOW NEW COMBINATION:  0.35 0.02
200
12 188
0.97 0.18 0.516568047337278 0.97 0.18 0.516568047337278 164 36 0.35 0.02
NOW NEW COMBINATION:  0.35 0.025
200
12 

1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.49999999999999994 0.05499999999999999
NOW NEW COMBINATION:  0.49999999999999994 0.05999999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.49999999999999994 0.05999999999999999
NOW NEW COMBINATION:  0.49999999999999994 0.06499999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.49999999999999994 0.06499999999999999
NOW NEW COMBINATION:  0.49999999999999994 0.06999999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.49999999999999994 0.06999999999999999
NOW NEW COMBINATION:  0.49999999999999994 0.075
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.49999999999999994 0.075
NOW NEW COMBINATION:  0.49999999999999994 0.08
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.49999999999999994 0.08
NOW NEW COMBINATION:  0.5499999999999999 0.01
200
12 188
0.88 0.45 0.7388059701492536 0.88 0.45 0.7388059701492536 110 90 0.5499999999999999 0.01
NOW NEW COMBINATION:  0.5499999999999999 0.015
200
12 188
0.93 0.305 0.6596511627906977 0.93 0.305 0.6596511627906977 139 61 0.

NOW NEW COMBINATION:  0.7000000000000001 0.06999999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.7000000000000001 0.06999999999999999
NOW NEW COMBINATION:  0.7000000000000001 0.075
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.7000000000000001 0.075
NOW NEW COMBINATION:  0.7000000000000001 0.08
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.7000000000000001 0.08
NOW NEW COMBINATION:  0.7500000000000001 0.01
200
12 188
0.88 0.45 0.7388059701492536 0.88 0.45 0.7388059701492536 110 90 0.7500000000000001 0.01
NOW NEW COMBINATION:  0.7500000000000001 0.015
200
12 188
0.93 0.305 0.6596511627906977 0.93 0.305 0.6596511627906977 139 61 0.7500000000000001 0.015
NOW NEW COMBINATION:  0.7500000000000001 0.02
200
12 188
0.97 0.18 0.516568047337278 0.97 0.18 0.516568047337278 164 36 0.7500000000000001 0.02
NOW NEW COMBINATION:  0.7500000000000001 0.025
200
12 188
0.975 0.115 0.39067944250871084 0.975 0.115 0.39067944250871084 177 23 0.7500000000000001 0.025
NOW NEW COMBINATION:  0.7500000000000001 

1.0 0.01 0.04807692307692308 1.0 0.01 0.04807692307692308 198 2 0.9000000000000002 0.049999999999999996
NOW NEW COMBINATION:  0.9000000000000002 0.05499999999999999
200
12 188
1.0 0.005 0.024509803921568627 1.0 0.005 0.024509803921568627 199 1 0.9000000000000002 0.05499999999999999
NOW NEW COMBINATION:  0.9000000000000002 0.05999999999999999
200
12 188
1.0 0.005 0.024509803921568627 1.0 0.005 0.024509803921568627 199 1 0.9000000000000002 0.05999999999999999
NOW NEW COMBINATION:  0.9000000000000002 0.06499999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.9000000000000002 0.06499999999999999
NOW NEW COMBINATION:  0.9000000000000002 0.06999999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.9000000000000002 0.06999999999999999
NOW NEW COMBINATION:  0.9000000000000002 0.075
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.9000000000000002 0.075
NOW NEW COMBINATION:  0.9000000000000002 0.08
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.9000000000000002 0.08
NOW NEW COMBINATION:  0.95000000

0.88 0.32 0.6518518518518519 0.88 0.32 0.6518518518518519 136 64 0.44999999999999996 0.015
NOW NEW COMBINATION:  0.44999999999999996 0.02
200
12 188
0.935 0.17 0.4921052631578947 0.935 0.17 0.4921052631578947 166 34 0.44999999999999996 0.02
NOW NEW COMBINATION:  0.44999999999999996 0.025
200
12 188
0.97 0.1 0.35401459854014594 0.97 0.1 0.35401459854014594 180 20 0.44999999999999996 0.025
NOW NEW COMBINATION:  0.44999999999999996 0.030000000000000002
200
12 188
0.99 0.055 0.225 0.99 0.055 0.225 189 11 0.44999999999999996 0.030000000000000002
NOW NEW COMBINATION:  0.44999999999999996 0.035
200
12 188
0.99 0.04 0.1721739130434783 0.99 0.04 0.1721739130434783 192 8 0.44999999999999996 0.035
NOW NEW COMBINATION:  0.44999999999999996 0.04
200
12 188
0.995 0.02 0.09255813953488373 0.995 0.02 0.09255813953488373 196 4 0.44999999999999996 0.04
NOW NEW COMBINATION:  0.44999999999999996 0.045
200
12 188
1.0 0.005 0.024509803921568627 1.0 0.005 0.024509803921568627 199 1 0.44999999999999996 0.045


0.97 0.1 0.35401459854014594 0.97 0.1 0.35401459854014594 180 20 0.65 0.025
NOW NEW COMBINATION:  0.65 0.030000000000000002
200
12 188
0.99 0.055 0.225 0.99 0.055 0.225 189 11 0.65 0.030000000000000002
NOW NEW COMBINATION:  0.65 0.035
200
12 188
0.99 0.04 0.1721739130434783 0.99 0.04 0.1721739130434783 192 8 0.65 0.035
NOW NEW COMBINATION:  0.65 0.04
200
12 188
0.995 0.02 0.09255813953488373 0.995 0.02 0.09255813953488373 196 4 0.65 0.04
NOW NEW COMBINATION:  0.65 0.045
200
12 188
1.0 0.005 0.024509803921568627 1.0 0.005 0.024509803921568627 199 1 0.65 0.045
NOW NEW COMBINATION:  0.65 0.049999999999999996
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.65 0.049999999999999996
NOW NEW COMBINATION:  0.65 0.05499999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.65 0.05499999999999999
NOW NEW COMBINATION:  0.65 0.05999999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.65 0.05999999999999999
NOW NEW COMBINATION:  0.65 0.06499999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.65 0

NOW NEW COMBINATION:  0.8500000000000002 0.035
200
12 188
0.99 0.035 0.1533185840707965 0.99 0.035 0.1533185840707965 193 7 0.8500000000000002 0.035
NOW NEW COMBINATION:  0.8500000000000002 0.04
200
12 188
0.995 0.02 0.09255813953488373 0.995 0.02 0.09255813953488373 196 4 0.8500000000000002 0.04
NOW NEW COMBINATION:  0.8500000000000002 0.045
200
12 188
0.995 0.01 0.04806763285024155 0.995 0.01 0.04806763285024155 198 2 0.8500000000000002 0.045
NOW NEW COMBINATION:  0.8500000000000002 0.049999999999999996
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.8500000000000002 0.049999999999999996
NOW NEW COMBINATION:  0.8500000000000002 0.05499999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.8500000000000002 0.05499999999999999
NOW NEW COMBINATION:  0.8500000000000002 0.05999999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.8500000000000002 0.05999999999999999
NOW NEW COMBINATION:  0.8500000000000002 0.06499999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.8500000000000002 0.064

0.835 0.445 0.7104684512428298 0.835 0.445 0.7104684512428298 111 89 0.39999999999999997 0.01
NOW NEW COMBINATION:  0.39999999999999997 0.015
200
12 188
0.94 0.26 0.6171717171717171 0.94 0.26 0.6171717171717171 148 52 0.39999999999999997 0.015
NOW NEW COMBINATION:  0.39999999999999997 0.02
200
12 188
0.98 0.16 0.4839506172839506 0.98 0.16 0.4839506172839506 168 32 0.39999999999999997 0.02
NOW NEW COMBINATION:  0.39999999999999997 0.025
200
12 188
0.985 0.09 0.3295539033457249 0.985 0.09 0.3295539033457249 182 18 0.39999999999999997 0.025
NOW NEW COMBINATION:  0.39999999999999997 0.030000000000000002
200
12 188
0.995 0.055 0.22520576131687242 0.995 0.055 0.22520576131687242 189 11 0.39999999999999997 0.030000000000000002
NOW NEW COMBINATION:  0.39999999999999997 0.035
200
12 188
1.0 0.02 0.09259259259259259 1.0 0.02 0.09259259259259259 196 4 0.39999999999999997 0.035
NOW NEW COMBINATION:  0.39999999999999997 0.04
200
12 188
1.0 0.015 0.07075471698113207 1.0 0.015 0.07075471698113207 197

NOW NEW COMBINATION:  0.5499999999999999 0.06499999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.5499999999999999 0.06499999999999999
NOW NEW COMBINATION:  0.5499999999999999 0.06999999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.5499999999999999 0.06999999999999999
NOW NEW COMBINATION:  0.5499999999999999 0.075
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.5499999999999999 0.075
NOW NEW COMBINATION:  0.5499999999999999 0.08
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.5499999999999999 0.08
NOW NEW COMBINATION:  0.6 0.01
200
12 188
0.835 0.445 0.7104684512428298 0.835 0.445 0.7104684512428298 111 89 0.6 0.01
NOW NEW COMBINATION:  0.6 0.015
200
12 188
0.94 0.26 0.6171717171717171 0.94 0.26 0.6171717171717171 148 52 0.6 0.015
NOW NEW COMBINATION:  0.6 0.02
200
12 188
0.98 0.16 0.4839506172839506 0.98 0.16 0.4839506172839506 168 32 0.6 0.02
NOW NEW COMBINATION:  0.6 0.025
200
12 188
0.985 0.09 0.3295539033457249 0.985 0.09 0.3295539033457249 182 18 0.6 0.025
NOW NEW COMBINATION:

NOW NEW COMBINATION:  0.7500000000000001 0.08
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.7500000000000001 0.08
NOW NEW COMBINATION:  0.8000000000000002 0.01
200
12 188
0.835 0.445 0.7104684512428298 0.835 0.445 0.7104684512428298 111 89 0.8000000000000002 0.01
NOW NEW COMBINATION:  0.8000000000000002 0.015
200
12 188
0.94 0.26 0.6171717171717171 0.94 0.26 0.6171717171717171 148 52 0.8000000000000002 0.015
NOW NEW COMBINATION:  0.8000000000000002 0.02
200
12 188
0.98 0.16 0.4839506172839506 0.98 0.16 0.4839506172839506 168 32 0.8000000000000002 0.02
NOW NEW COMBINATION:  0.8000000000000002 0.025
200
12 188
0.985 0.09 0.3295539033457249 0.985 0.09 0.3295539033457249 182 18 0.8000000000000002 0.025
NOW NEW COMBINATION:  0.8000000000000002 0.030000000000000002
200
12 188
0.995 0.055 0.22520576131687242 0.995 0.055 0.22520576131687242 189 11 0.8000000000000002 0.030000000000000002
NOW NEW COMBINATION:  0.8000000000000002 0.035
200
12 188
1.0 0.02 0.09259259259259259 1.0 0.02 0.0925925925925

1.0 0.005 0.024509803921568627 1.0 0.005 0.024509803921568627 199 1 0.9500000000000003 0.05499999999999999
NOW NEW COMBINATION:  0.9500000000000003 0.05999999999999999
200
12 188
1.0 0.005 0.024509803921568627 1.0 0.005 0.024509803921568627 199 1 0.9500000000000003 0.05999999999999999
NOW NEW COMBINATION:  0.9500000000000003 0.06499999999999999
200
12 188
1.0 0.005 0.024509803921568627 1.0 0.005 0.024509803921568627 199 1 0.9500000000000003 0.06499999999999999
NOW NEW COMBINATION:  0.9500000000000003 0.06999999999999999
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.9500000000000003 0.06999999999999999
NOW NEW COMBINATION:  0.9500000000000003 0.075
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.9500000000000003 0.075
NOW NEW COMBINATION:  0.9500000000000003 0.08
200
12 188
1.0 0.0 0.0 1.0 0.0 0.0 200 0 0.9500000000000003 0.08
Best:  0.723767258382643 0.895 0.41 0.01 0.9500000000000003


In [47]:
from pandas import DataFrame

In [129]:
df1 = DataFrame (tfIdfResults1, columns= tfIdfColumns1)
df2 = DataFrame (tfIdfResults2, columns= tfIdfColumns2)
df3 = DataFrame (tfIdfResults3, columns= tfIdfColumns3)
df1

Unnamed: 0,Accuracy,Automation Rate,Harmonic Score,Local Accuracy,Local Automation Rate,Local Harmonic Score,Number Of Oracled,Number Of Predicted,Minimal Similarity,Minimal Gap
0,0.880,0.450,0.738806,0.880,0.450000,0.738806,110,90,0.35,0.010
1,0.930,0.305,0.659651,0.930,0.305000,0.659651,139,61,0.35,0.015
2,0.970,0.180,0.516568,0.970,0.180000,0.516568,164,36,0.35,0.020
3,0.975,0.115,0.390679,0.975,0.115000,0.390679,177,23,0.35,0.025
4,0.985,0.075,0.287451,0.985,0.075000,0.287451,185,15,0.35,0.030
...,...,...,...,...,...,...,...,...,...,...
190,1.000,0.005,0.024510,1.000,0.005025,0.024631,198,1,0.95,0.060
191,1.000,0.005,0.024510,1.000,0.005025,0.024631,198,1,0.95,0.065
192,1.000,0.000,0.000000,1.000,0.000000,0.000000,199,0,0.95,0.070
193,1.000,0.000,0.000000,1.000,0.000000,0.000000,199,0,0.95,0.075


In [130]:
with pd.ExcelWriter("word2vec-3PerCat-WITHOUT_tf_idf-CFV.xlsx") as writer:
    df1.to_excel(writer, sheet_name='1', index=False)
    df2.to_excel(writer, sheet_name='2', index=False)
    df3.to_excel(writer, sheet_name='3', index=False)

In [None]:
findAndClassifyEqualSimilaritySamples()

In [None]:
resetClassifiedSamples(0,200,0,0)

In [None]:
similarityHP = 0.4
gapHP = 0.01
#resetClassifiedSamples(0,200,0,0)
findAndClassifyEqualSimilaritySamples(similarityHP, gapHP)

In [91]:

startIndex1, endIndex1, startIndex2, endIndex2 = 200,300,0,0
#to be changed
simGapGrid1 = [[0.85, 0.01],[0.35, 0.01],[0.4, 0.01],[0.45, 0.01],[0.5, 0.01],[0.55, 0.01],[0.6, 0.01],[0.65, 0.01],[0.7, 0.01],[0.75, 0.01],[0.8, 0.01]]
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults11, tfIdfColumns11 = validation(simGapGrid1, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

startIndex1, endIndex1, startIndex2, endIndex2 = 0,100,0,0
#to be changed
simGapGrid2 = [[0.85, 0.01],[0.35, 0.01],[0.4, 0.01],[0.45, 0.01],[0.5, 0.01],[0.55, 0.01],[0.6, 0.01],[0.65, 0.01],[0.7, 0.01],[0.75, 0.01],[0.8, 0.01]]
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults22, tfIdfColumns22 = validation(simGapGrid2, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

startIndex1, endIndex1, startIndex2, endIndex2 = 100,200,0,0
#to be changed
simGapGrid3 = [[0.85, 0.01],[0.35, 0.01],[0.4, 0.01],[0.45, 0.01],[0.5, 0.01],[0.55, 0.01],[0.6, 0.01],[0.65, 0.01],[0.7, 0.01],[0.75, 0.01],[0.8, 0.01]]
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults33, tfIdfColumns33 = validation(simGapGrid3, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)
#resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
#tfIdfResults, tfIdfColumns = gridSearchAnd3FoldValidation(minSimilarity = 0.10, maxSimilarity = 0.3, similarityStep = 0.06 , minGap = 0.01, maxGap = 0.05, gapStep = 0.005, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 5000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

100
4 96
Gap Grid:  []
Similarity Grid:  []


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
before reset  0 100
100
4 96
after reset  96 4
0.58 0.7 0.6005917159763313 0.58 0.7 0.6005917159763313 30 70 0.85 0.01
before reset  0 100
100
4 96
after reset  96 4
0.58 0.7 0.6005917159763313 0.58 0.7 0.6005917159763313 30 70 0.35 0.01
before reset  0 100
100
4 96
after reset  96 4
0.58 0.7 0.6005917159763313 0.58 0.7 0.6005917159763313 30 70 0.4 0.01
before reset  0 100
100
4 96
after reset  96 4
0.58 0.7 0.6005917159763313 0.58 0.7 0.6005917159763313 30 70 0.45 0.01
before reset  0 100
100
4 96
after reset  96 4
0.58 0.7 0.6005917159763313 0.58 0.7 0.6005917159763313 30 70 0.5 0.01
before reset  0 100
100
4 96
after reset  96 4
0.58 0.7 0.6005917159763313 0.58 0.7 0.6005917159763313 30 70 0.55 0.01
before reset  0 100
100
4 96
after reset  96 4
0.58 0.7 0.6005917159763313 0.58 0.7 0.60059171597633

In [92]:
df11 = DataFrame (tfIdfResults11, columns= tfIdfColumns11)
df22 = DataFrame (tfIdfResults22, columns= tfIdfColumns22)
df33 = DataFrame (tfIdfResults33, columns= tfIdfColumns33)

In [93]:
with pd.ExcelWriter("word2vec-WITHOUT_tf_idf-Validation.xlsx") as writer:
    df11.to_excel(writer, sheet_name='1', index=False)
    df22.to_excel(writer, sheet_name='2', index=False)
    df33.to_excel(writer, sheet_name='3', index=False)

In [94]:

startIndex1, endIndex1, startIndex2, endIndex2 = 300,400,0,0
#to be changed
simGapGrid1 = [[0.85, 0.01] ]
resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults111, tfIdfColumns111 = validation(simGapGrid1, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)


import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 5000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

100
4 96
Gap Grid:  []
Similarity Grid:  []


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
before reset  0 100
100
4 96
after reset  96 4
0.83 0.47 0.719741697416974 0.83 0.47 0.719741697416974 53 47 0.85 0.01
Best:  0.719741697416974 0.83 0.47 0.01 0.85


In [95]:
df111 = DataFrame (tfIdfResults111, columns= tfIdfColumns111)

In [96]:
with pd.ExcelWriter("word2vec-WITHOUT_tf_idf-Final.xlsx") as writer:
    df111.to_excel(writer, sheet_name='1', index=False)
    

In [78]:
#Hereby, if we have 4 categories, we save the biggest similarity per each category with the new sample 
#If the similarites are more or less the same (DEFINE Threshold), then 
def findAndClassifyEqualSimilaritySamples(similarityHP, gapHP):
    
    maxSimilarityPerCategories = []
    
    for label in uniqueLabels:
        maxSimilarityPerCategories.append((label, 0))
    #print(maxSimilarityPerCategories)
    
    
    for unclassifiedSample in unclassifiedSamples4GridSearch:
    
        mostSimilarSample = 0
        maxCosSim = 0
        maxCosSimLabel = ""
        
        for label in uniqueLabels:
            similarityDict.update({label: 0})
    
        for classifiedSample in classifiedSamples4GridSearch:
            cosSim = cosineSimilarity(unclassifiedSample[2], classifiedSample[2])
            #print(len(unclassifiedSample), len(classifiedSample))
            #cosSim = computeSBertSimilarity((unclassifiedSample[3]), (classifiedSample[5]))
            classifiedLabel = classifiedSample[1]
            
            if(maxCosSim < cosSim):
                maxCosSim = cosSim
                maxCosSimLabel = classifiedLabel
            
            if(cosSim > similarityDict[classifiedLabel]):
                similarityDict [classifiedLabel] = cosSim
                sentenceDict [classifiedLabel] = unclassifiedSample[0]
        
        gapPassed = False
        
        for i in similarityDict:
            if(i != maxCosSimLabel):
                gap = maxCosSim - similarityDict[i]
                if(gap < gapHP):
                    gapPassed = True
                    break
        
        if(maxCosSim > similarityHP and gapPassed):
            # ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted
            entry = [unclassifiedSample[0], unclassifiedSample[1], unclassifiedSample[2], unclassifiedSample[1], "oracle", unclassifiedSample[3]]
            #print("entry ", entry)
        
            try:
                unclassifiedSamples4GridSearch.remove(unclassifiedSample)
                classifiedSamples4GridSearch.append(entry)
            except:
                print("entry ", entry)
        
        elif(maxCosSim > similarityHP):
            #STRUCTURE OF classifiedSamplesInit: 
            #ticketText, predictedLabel, TF-IDF, actualLabel, Oracle/Predicted, wordVector
            entry = [unclassifiedSample[0], maxCosSimLabel, unclassifiedSample[2], unclassifiedSample[1], "predicted", unclassifiedSample[3]]
            #print("entry ", entry)
            
            try:
                unclassifiedSamples4GridSearch.remove(unclassifiedSample)
                classifiedSamples4GridSearch.append(entry)
            except:
                print("entry ", entry)
            
            
            
 
        #print(similarityDict, unclassifiedSample[1])
        #print()
    #print("FINISHED ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
    #print()
    return len(unclassifiedSamples4GridSearch)

In [79]:
def validation (similarityGapGrid, startIndex1, endIndex1, startIndex2, endIndex2):
    
    similarityGrid = []
    gapGrid = []
    
    allResults = []
    dfColumnName = ["Accuracy", "Automation Rate", "Harmonic Score", "Local Accuracy", "Local Automation Rate", "Local Harmonic Score", "Number Of Oracled", "Number Of Predicted", "Minimal Similarity", "Minimal Gap"]
    
    global classifiedSamples4GridSearch, unclassifiedSamples4GridSearch

        
    print("Gap Grid: ", gapGrid)
    print("Similarity Grid: ", similarityGrid)
    print("\n")
    #print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore," 
    #      "numberOfOracled, numberOfPredicted, sim, gap, currentSim, currentGap")
    print("accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap")
    
    
    maxHarmonicScore = 0
    bestGap = 0
    bestSim = 0
    bestAccuracy = 0
    bestAutomationRate = 0
    
    
    
        
    for simGap in similarityGapGrid:
        
        numberOfPredicted = -1
        numberOfOracled = -1
        accuracy = -1
        automationRate = -1
        unlabelledSamples = 1000

        currentSim = simGap[0]
        currentGap = simGap[1]


        simCounter = 0
        gapCounter = 0
        counter = 0

        while (unlabelledSamples != 0 and counter < 100):

            #print("during process ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))

            unlabelledSamples = findAndClassifyEqualSimilaritySamples(currentSim, currentGap)


            numberOfNewPredicted = getNumberOfPredicted(classifiedSamples4GridSearch)
            numberOfNewOracled = getNumberOfOracled(classifiedSamples4GridSearch)
            automationRate = getPercentageOfPredicted(classifiedSamples4GridSearch)
            accuracy = getPercentageCorrectPredictions(classifiedSamples4GridSearch)

            if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted):
                currentSim = currentSim - 0.01
                simCounter = simCounter + 1
                #currentGap = currentGap - 1

            if (numberOfNewOracled == numberOfOracled and numberOfNewPredicted == numberOfPredicted and simCounter > 12):
                currentGap = currentGap - 0.005
                gapCounter = gapCounter + 1

            numberOfOracled = numberOfNewOracled
            numberOfPredicted = numberOfNewPredicted

            #print("unlab samples ", unlabelledSamples)

            if(unlabelledSamples == 0):
                break

            #accuracy, automationRate, numberOfOracled, numberOfPredicted, currentSim, currentGap
            counter = counter + 1 

        harmonicScore = getHarmonicScore(accuracy, automationRate, 0.5)

        localAccuracy = getLocalAccuracy(classifiedSamples4GridSearch)
        localAutomationRate = getLocalAutomationRate(classifiedSamples4GridSearch)
        localHarmonicScore = getHarmonicScore(localAccuracy, localAutomationRate, 0.5)

        if(harmonicScore > maxHarmonicScore):
            maxHarmonicScore = harmonicScore
            bestAccuracy = accuracy
            bestAutomationRate = automationRate
            bestSim = simGap[0]
            bestGap = simGap[1]


        #print("")
        print("before reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))

        resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
        #classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:200], 1)
        print("after reset ", len(unclassifiedSamples4GridSearch), len(classifiedSamples4GridSearch))
        #print("")

        print(accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, simGap[0], simGap[1])

        tempResults = [accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, simGap[0], simGap[1]]
        allResults.append(tempResults)
            
    print("Best: ", maxHarmonicScore, bestAccuracy, bestAutomationRate, bestGap, bestSim)
    return allResults, dfColumnName
            

In [85]:
classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(allSamplesOracled[:], 1)

3 599


In [86]:
def resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2):
    
    global classifiedSamples4GridSearch, unclassifiedSamples4GridSearch
    
    
    consideredSamples = allSamplesOracled[startIndex1:endIndex1] + allSamplesOracled[startIndex2:endIndex2]
    
    print(len(consideredSamples))
    
    classifiedSamples4GridSearch, unclassifiedSamples4GridSearch = addOracledSamples(consideredSamples, 1)

In [87]:
#1 per Cat

startIndex1, endIndex1, startIndex2, endIndex2 = 0, 600,0,0
simGapGrid1 = [[0.95, 0.03], [0.95, 0.035], [0.95, 0.04]]
#resetClassifiedSamples(startIndex1, endIndex1, startIndex2, endIndex2)
tfIdfResults1, tfIdfColumns1 = validation(simGapGrid1, startIndex1 = startIndex1, endIndex1 = endIndex1, startIndex2 = startIndex2, endIndex2 = endIndex2)

import winsound
frequency = 1500  # Set Frequency To 2500 Hertz
duration = 3000  # Set Duration To 1000 ms == 1 second
winsound.Beep(frequency, duration)

Gap Grid:  []
Similarity Grid:  []


accuracy, automationRate, harmonicScore, localAccuracy, localAutomationRate, localHarmonicScore, numberOfOracled, numberOfPredicted, sim, gap
before reset  212 390
600
3 597
after reset  597 3
0.6212624584717608 0.6428571428571429 0.6254645458599672 0.4153846153846154 0.9923076923076923 0.4700404858299595 3 387 0.95 0.03
before reset  211 389
600
3 597
after reset  597 3
0.6216666666666667 0.6433333333333333 0.6258824552251783 0.41645244215938304 0.9922879177377892 0.47113318485791866 3 386 0.95 0.035
before reset  211 389
600
3 597
after reset  597 3
0.6216666666666667 0.6433333333333333 0.6258824552251783 0.41645244215938304 0.9922879177377892 0.47113318485791866 3 386 0.95 0.04
Best:  0.6258824552251783 0.6216666666666667 0.6433333333333333 0.035 0.95


In [88]:
d1111 = DataFrame (tfIdfResults1, columns= tfIdfColumns1)


In [89]:
with pd.ExcelWriter("Word2Vec-With-TF-IDF-OwnDataSet-h1-1-3.xlsx") as writer:
    d1111.to_excel(writer, sheet_name='1', index=False)