In [1]:
import sys
from collections import namedtuple
import csv
import glob
import os
import pycrfsuite

def get_utterances_from_file(dialog_csv_file, dialog_csv_filename):
    """Parse an open dialog CSV file into a list of utterances.

    Args:
        dialog_csv_file: Open text file (or any iterable of lines) holding
            the CSV data, with the header row first.
        dialog_csv_filename: Path the data was read from. Only its final
            component is kept; it is passed on to be stored with every
            utterance.

    Returns:
        list: One converted utterance per CSV data row, in file order.
    """
    reader = csv.DictReader(dialog_csv_file)
    # Accept both "\\" and "/" as separators so the stored name is the bare
    # file name on every platform, not only for Windows-style paths.
    base_name = os.path.basename(dialog_csv_filename.replace("\\", "/"))
    return [_dict_to_dialog_utterance(du_dict, base_name) for du_dict in reader]

def get_utterances_from_filename(dialog_csv_filename):
    """Read one dialog CSV file from disk and return its utterances.

    Args:
        dialog_csv_filename: Path of the CSV file to read.

    Returns:
        list: Whatever ``get_utterances_from_file`` returns for the file.
    """
    # newline="" hands raw line endings to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # fields are rewritten before the parser sees them.
    with open(dialog_csv_filename, "r", newline="") as dialog_csv_file:
        return get_utterances_from_file(dialog_csv_file, dialog_csv_filename)

def get_data(data_dir):
    """Lazily yield the parsed utterances of every ``*.csv`` file in a directory.

    Args:
        data_dir: Directory that is searched (non-recursively) for CSV files.

    Yields:
        The result of ``get_utterances_from_filename`` for each matching
        file, visiting the files in sorted path order.
    """
    csv_pattern = os.path.join(data_dir, "*.csv")
    csv_paths = glob.glob(csv_pattern)
    csv_paths.sort()
    for csv_path in csv_paths:
        yield get_utterances_from_filename(csv_path)

# One dialog utterance: the CSV columns act_tag/speaker/pos/text plus the
# name of the file the row came from.
DialogUtterance = namedtuple(
    "DialogUtterance",
    ["act_tag", "speaker", "pos", "text", "fileName"],
)

# One token together with its part-of-speech tag.
PosTag = namedtuple("PosTag", ["token", "pos"])

def _dict_to_dialog_utterance(du_dict, dialog_csv_filename):
    """Convert one CSV row dict into a ``DialogUtterance``.

    Args:
        du_dict: Mapping of column name to raw string value for one row.
            The mapping itself is left unmodified.
        dialog_csv_filename: Value stored in the utterance's ``fileName``
            field.

    Returns:
        DialogUtterance: The row with blank values replaced by ``None`` and
        a non-empty ``pos`` value parsed into a list of ``PosTag``.

    Raises:
        KeyError: If the row has no ``pos`` column.
        TypeError: If a ``token/POS`` pair has no ``/`` or the row's keys do
            not match the ``DialogUtterance`` fields.
    """
    # Replace blank (empty or whitespace-only) values with None. A value
    # that is already None (csv.DictReader fills short rows with None) is
    # kept as None instead of crashing on ``.strip()``.
    fields = {}
    for key, value in du_dict.items():
        if value is None or len(value.strip()) == 0:
            fields[key] = None
        else:
            fields[key] = value

    # Extract tokens and POS tags. Split on the LAST "/" only, so a token
    # that itself contains "/" (e.g. "1/2/CD") stays intact.
    if fields["pos"]:
        fields["pos"] = [
            PosTag(*token_pos_pair.rsplit("/", 1))
            for token_pos_pair in fields["pos"].split()]
    fields["fileName"] = dialog_csv_filename
    return DialogUtterance(**fields)

def createFeatureList(files):
    """Build per-dialog feature sequences, label sequences and file names.

    Each utterance becomes a list of string features:
      * ``'1'`` if it is the first utterance of its dialog, else ``'0'``;
      * ``'1'`` if its speaker differs from the previous utterance's
        speaker, else ``'0'`` (always ``'0'`` for the first utterance);
      * ``"TOKEN_<token>"`` for every entry of ``pos``, followed by
        ``"POS_<pos>"`` for every entry of ``pos`` (both omitted when
        ``pos`` is empty or ``None``).

    Args:
        files: Iterable of dialogs. Each dialog is an iterable of objects
            with ``act_tag``, ``speaker``, ``pos`` and ``fileName``
            attributes; entries of ``pos`` have ``token`` and ``pos``.

    Returns:
        tuple: ``(xTrain, yTrain, fileNames)`` — three parallel lists with
        one entry per non-empty dialog: the feature sequence, the
        ``act_tag`` sequence, and the ``fileName`` of the dialog's last
        utterance.
    """
    xTrain = []
    yTrain = []
    fileNames = []
    for utterances in files:
        utterances = list(utterances)
        if not utterances:
            # An empty dialog has nothing to label and no utterance to take
            # a file name from; skip it rather than reusing the previous
            # dialog's name (or failing on an undefined one).
            continue

        sequence = []
        labels = []
        previous_speaker = None
        for index, dialogUtterance in enumerate(utterances):
            is_first = index == 0
            speaker_changed = (
                not is_first and dialogUtterance.speaker != previous_speaker)
            previous_speaker = dialogUtterance.speaker

            feature = ['1' if is_first else '0',
                       '1' if speaker_changed else '0']
            if dialogUtterance.pos:
                # All token features first, then all POS features.
                feature.extend(
                    "TOKEN_" + posTag.token for posTag in dialogUtterance.pos)
                feature.extend(
                    "POS_" + posTag.pos for posTag in dialogUtterance.pos)

            sequence.append(feature)
            labels.append(dialogUtterance.act_tag)

        xTrain.append(sequence)
        yTrain.append(labels)
        fileNames.append(utterances[-1].fileName)
    return xTrain, yTrain, fileNames


# NOTE(review): these are absolute paths on one specific machine; point them
# at the local copy of the data before running.
trainDir = 'C:/Users/91908/Desktop/Dream Hokage/Sem-6/Mini Project/SequenceLabelingWithCRF-master/train'
devDir = 'C:/Users/91908/Desktop/Dream Hokage/Sem-6/Mini Project/SequenceLabelingWithCRF-master/dev'
outputFile = 'C:/Users/91908/Desktop/Dream Hokage/Sem-6/Mini Project/SequenceLabelingWithCRF-master/output.txt'


# get all utterances (generators: nothing is read until they are consumed)
files_train = get_data(trainDir)
files_test = get_data(devDir)

# create feature list: one feature sequence, label sequence and file name
# per dialog file
xTrain, yTrain, filenames_train = createFeatureList(files_train)
xTest, yTest, filenames_test = createFeatureList(files_test)


# train crf model
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(xTrain, yTrain):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 150,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

# The return value is not used here; the call is kept as in the original.
trainer.params()
trainer.train('baseline_model.crfsuite')

# test on dev data
tagger = pycrfsuite.Tagger()
tagger.open('baseline_model.crfsuite')

yPred = [tagger.tag(xseq) for xseq in xTest]

# Tallies of matching / total predicted labels; they are only accumulated
# here and left in the namespace for inspection.
correctCount = 0
count = 0
# write output file: a Filename="..." header per dialog, one predicted label
# per line, then a blank line. The with-block closes the file even if a
# write fails part-way through.
with open(outputFile, "w") as fileHandler:
    for testFileName, dialogPred, dialogGold in zip(filenames_test, yPred, yTest):
        fileHandler.write("Filename=\""+testFileName+"\"\n")
        for labelIndex, predictedLabel in enumerate(dialogPred):
            fileHandler.write(predictedLabel+"\n")
            if predictedLabel == dialogGold[labelIndex]:
                correctCount += 1
            count += 1
        fileHandler.write("\n")

In [2]:
import sys
import csv
import glob
import os

def getLabelData(testDir):
    """Collect the first-column values of every ``*.csv`` file in a directory.

    Args:
        testDir: Directory that is searched (non-recursively) for CSV files.

    Returns:
        dict: Maps each file's base name to the list of first-column values
        of its rows, in file order. The first row of every file is skipped
        as the header, and blank lines are ignored.
    """
    fileMap = {}
    dialog_filenames = sorted(glob.glob(os.path.join(testDir, "*.csv")))
    for dialog_filename in dialog_filenames:
        # newline="" is what the csv module documentation asks for, so that
        # line breaks inside quoted fields are parsed correctly.
        with open(dialog_filename, "r", newline="") as f:
            reader = csv.reader(f, delimiter=",")
            next(reader, None)  # skip the header row
            # A blank line parses to an empty row; skip it instead of
            # failing on row[0].
            labels = [row[0] for row in reader if row]
        # Accept both "\\" and "/" as separators so the key is the bare file
        # name on every platform, not only for Windows-style paths.
        base_name = os.path.basename(dialog_filename.replace("\\", "/"))
        fileMap[base_name] = labels

    return fileMap

def getOutputLabels(outputFile):
    """Parse a label output file into a map of file name to labels.

    A line containing ``Filename=`` starts a new entry: the ``Filename="``
    prefix and all double quotes are removed and the rest, stripped, is the
    key. Every following non-blank line is stripped and appended to that
    entry's label list. Non-blank lines before the first ``Filename=`` line
    are not included in the result.

    Args:
        outputFile: Path of the text file to read.

    Returns:
        dict: Maps each file name to its list of labels, in file order.
    """
    fileMap = {}
    labels = []
    # The with-block guarantees the file is closed once it has been read.
    with open(outputFile, "r") as fileHandler:
        for line in fileHandler:
            if "Filename=" in line:
                fileName = line.replace("Filename=\"", "").replace("\"", "").strip()
                labels = []
                fileMap[fileName] = labels
            else:
                label = line.strip()
                if label:
                    labels.append(label)

    return fileMap


print ('Argument count : ', len(sys.argv))
# The inputs below are hard-coded, so nothing is read from sys.argv. The
# earlier "exit unless exactly three arguments" gate has been dropped: it
# protected no input and would end the run whenever the process happened to
# be started with a different number of arguments.

# NOTE(review): absolute paths on one specific machine; adjust before running.
# NOTE(review): the first cell writes its predictions to output.txt, while
# this cell evaluates output1.txt — confirm this is the intended file.
devDir = 'C:/Users/91908/Desktop/Dream Hokage/Sem-6/Mini Project/SequenceLabelingWithCRF-master/dev'
outputFile = 'C:/Users/91908/Desktop/Dream Hokage/Sem-6/Mini Project/SequenceLabelingWithCRF-master/output1.txt'

# get actual file names and labels
print ('devDir : ', devDir,' outputFile : ', outputFile)
fileMap = getLabelData(devDir)

# get file names and labels from the output file
outputFileMap = getOutputLabels(outputFile)

# count incorrect and total labels
incorrect = 0
total = 0
for key in fileMap.keys():
    actualLabels = fileMap[key]
    # A file with no entry in the output is treated as having no predictions
    # (all of its labels are missing) instead of raising KeyError.
    predictedLabels = outputFileMap.get(key, [])
    wrong = 0
    total += len(actualLabels)
    for label1, label2 in zip(actualLabels, predictedLabels):
        if label1 != label2:
            wrong += 1
    missing = len(actualLabels) - len(predictedLabels)
    # zip() stops at the shorter list, so labels without a prediction are
    # never compared; count them as incorrect so they cannot inflate accuracy.
    incorrect += wrong + max(missing, 0)
    print(key+" has "+str(wrong)+" incorrect labels and "+str(missing)+" missing labels")

print("total label = "+str(total))
print("total incorrect = "+str(incorrect))

if total:
    accuracy = ((total - incorrect)/total)*100
    print("Accuracy = "+str(accuracy))
else:
    # No labels were found at all; avoid dividing by zero.
    accuracy = None
    print("Accuracy = undefined (no labels found)")


Argument count :  3
devDir :  C:/Users/91908/Desktop/Dream Hokage/Sem-6/Mini Project/SequenceLabelingWithCRF-master/dev  outputFile :  C:/Users/91908/Desktop/Dream Hokage/Sem-6/Mini Project/SequenceLabelingWithCRF-master/output1.txt
0808.csv has 19 incorrect labels and 0 missing labels
0809.csv has 25 incorrect labels and 0 missing labels
0810.csv has 32 incorrect labels and 0 missing labels
0811.csv has 45 incorrect labels and 0 missing labels
0812.csv has 13 incorrect labels and 0 missing labels
0813.csv has 51 incorrect labels and 0 missing labels
0814.csv has 38 incorrect labels and 0 missing labels
0815.csv has 29 incorrect labels and 0 missing labels
0816.csv has 61 incorrect labels and 0 missing labels
0817.csv has 45 incorrect labels and 0 missing labels
0818.csv has 58 incorrect labels and 0 missing labels
0819.csv has 39 incorrect labels and 0 missing labels
0820.csv has 30 incorrect labels and 0 missing labels
0821.csv has 74 incorrect labels and 0 missing labels
0822.csv ha