In [7]:
import os
import numpy as np
import pandas as pd
from conllu import parse
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import os
import sys
import codecs
import re
from collections import OrderedDict
import csv

In [8]:
class Node():

    def __init__(self, text):
        self.text = text
        self.lex = None
        self.type = None
        self.attributes = OrderedDict()
        self.errors = []
        self.name = None
        self.parent = None
        self.parentRelation = None
        self.alignedTo = None
        self.fsList = None
        self.analyzeNode(self.text)

    def analyzeNode(self, text):
        [token, tokenType, fsDict, fsList] = getTokenFeats(
            text.strip().split())
        attributeUpdateStatus = self.updateAttributes(
            token, tokenType, fsDict, fsList)
        if attributeUpdateStatus == 0:
            self.errors.append("Can't update attributes for node")
            self.probSent = True

    def updateAttributes(self, token, tokenType, fsDict, fsList):
        self.fsList = fsList
        self.lex = token
        self.type = tokenType
        for attribute in fsDict.keys():
            self.attributes[attribute] = fsDict[attribute]
        self.assignName()

    def assignName(self):
        if self.attributes.get('name') is not None:
            self.name = self.getAttribute('name')
        else:
            self.errors.append('No name for this token Node')

    def printValue(self):
        return self.lex

    def printSSFValue(self, prefix, allFeat):
        returnValue = [prefix, self.printValue(), self.type]
        if allFeat is False:
            fs = ['<fs']
            for key in self.attributes.keys():
                fs.append(key + "='" + self.getAttribute(key) + "'")
            delim = ' '
            fs[-1] = fs[-1] + '>'

        else:
            fs = self.fsList
            delim = '|'
        return ['\t'.join(x for x in returnValue) + '\t' + delim.join(x for x in fs)]

    def getAttribute(self, key):
        if key in self.attributes:
            return self.attributes[key]
        else:
            return None

    def addAttribute(self, key, value):
        self.attributes[key] = value

    def deleteAttribute(self, key):
        del self.attributes[key]


In [9]:
class ChunkNode():

    def __init__(self, header):
        self.text = []
        self.header = header
        self.footer = None
        self.nodeList = []
        self.parent = '0'
        self.attributes = OrderedDict()
        self.parentRelation = 'root'
        self.name = None
        self.head = None
        self.isParent = False
        self.errors = []
        self.upper = None
        self.updateDrel()
        self.type = None
        self.fsList = None

    def analyzeChunk(self):
        [chunkType, chunkFeatDict, chunkFSList] = getChunkFeats(self.header)
        self.fsList = chunkFSList
        self.type = chunkType
        self.updateAttributes(chunkFeatDict)
        self.text = '\n'.join([line for line in self.text])

    def updateAttributes(self, fsDict):
        for attribute in fsDict.keys():
            self.attributes[attribute] = fsDict[attribute]
        self.assignName()
        self.updateDrel()

    def assignName(self):
        if 'name' in self.attributes:
            self.name = self.getAttribute('name')
        else:
            self.errors.append('No name for this chunk Node')

    def updateDrel(self):
        if 'drel' in self.attributes:
            drelList = self.getAttribute('drel').split(':')
            if len(drelList) == 2:
                self.parent = drelList[1]
                self.parentRelation = self.getAttribute('drel').split(':')[0]
        elif 'dmrel' in self.attributes:
            drelList = self.getAttribute('dmrel').split(':')
            if len(drelList) == 2:
                self.parent = drelList[1]
                self.parentRelation = self.getAttribute('dmrel').split(':')[0]

    def printValue(self):
        returnString = []
        for node in self.nodeList:
            returnString.append(node.printValue())
        return ' '.join(x for x in returnString)

    def printSSFValue(self, prefix, allFeat):
        returnStringList = []
        returnValue = [prefix, '((', self.type]
        if allFeat is False:
            fs = ['<fs']
            for key in self.attributes.keys():
                fs.append(key + "='" + self.getAttribute(key) + "'")
            delim = ' '
            fs[-1] = fs[-1] + '>'

        else:
            fs = self.fsList
            delim = '|'

        returnStringList.append(
            '\t'.join(x for x in returnValue) + '\t' + delim.join(x for x in fs))
        nodePosn = 0
        for node in self.nodeList:
            nodePosn += 1
            if isinstance(node, ChunkNode):
                returnStringList.extend(
                    node.printSSFValue(prefix + '.' + str(nodePosn), allFeat))
            else:
                returnStringList.extend(
                    node.printSSFValue(prefix + '.' + str(nodePosn), allFeat))
        returnStringList.append('\t' + '))')
        return returnStringList

    def getAttribute(self, key):
        if key in self.attributes:
            return self.attributes[key]
        else:
            return None

    def addAttribute(self, key, value):
        self.attributes[key] = value

    def deleteAttribute(self, key):
        del self.attributes[key]


In [45]:
class Sentence():

    def __init__(self, sentence, ignoreErrors=True, nesting=True, dummySentence=False):
        self.ignoreErrors = ignoreErrors
        self.nesting = nesting
        self.sentence = None
        self.sentenceID = None
        self.sentenceType = None
        self.length = 0
        self.tree = None
        self.nodeList = []
        self.edges = {}
        self.nodes = {}
        self.tokenNodes = {}
        self.rootNode = None
        self.fileName = None
        self.comment = None
        self.probSent = False
        self.errors = []
        self.text = sentence
        self.dummySentence = dummySentence
        if self.dummySentence is False:

            self.analyzeSentence()

    def analyzeSentence(self, ignoreErrors=False, nesting=True):

        lastContext = self

        for line in self.text.split('\n'):
            stripLine = line.strip()

            if stripLine == "":
                continue
            elif stripLine[0] == "<" and ignoreErrors is False:
                self.errors.append('Encountered a line starting with "<"')
                self.probSent = True
            else:
                splitLine = stripLine.split()
                if len(splitLine) > 0 and splitLine[0] == '))':
                    currentChunkNode.footer = line + '\n'
                    currentChunkNode.analyzeChunk()
                    lastContext = currentChunkNode.upper
                    currentChunkNode = lastContext

                elif len(splitLine) > 1 and splitLine[1] == '((':
                    currentChunkNode = ChunkNode(line + '\n')
                    currentChunkNode.upper = lastContext
                    currentChunkNode.upper.nodeList.append(currentChunkNode)
                    if currentChunkNode.upper.__class__.__name__ != 'Sentence':
                        currentChunkNode.upper.text.append(line)
                    lastContext = currentChunkNode
                else:
                    currentNode = Node(line + '\n')
                    lastContext.nodeList.append(currentNode)
                    currentNode.upper = lastContext

    def addEdge(self, parent, child):
        if parent in self.edges.iterkeys():
            if child not in self.edges[parent]:
                self.edges[parent].append(child)
        else:
            self.edges[parent] = [child]

    def updateAttributes(self):
        populateNodesStatus = self.populateNodes()
        populateEdgesStatus = self.populateEdges()
        self.sentence = self.generateSentence()
        if populateEdgesStatus == 0 or populateNodesStatus == 0:
            return 0
        return 1

    def printSSFValue(self, allFeat):
        returnStringList = []
        returnStringList.append("<Sentence id='" + str(self.sentenceID) + "'>")
        if self.nodeList != []:
            nodeList = self.nodeList
            nodePosn = 0
            for node in nodeList:
                nodePosn += 1
                returnStringList.extend(
                    node.printSSFValue(str(nodePosn), allFeat))
        returnStringList.append('</Sentence>\n')
        return '\n'.join(x for x in returnStringList)

    def populateNodes(self, naming='strict'):
        if naming == 'strict':
            for nodeElement in self.nodeList:
                assert nodeElement.name is not None
                self.nodes[nodeElement.name] = nodeElement
        return 1

    def populateEdges(self):
        for node in self.nodeList:
            nodeName = node.name
            if node.parent == '0' or node == self.rootNode:
                self.rootNode = node
                continue
            elif node.parent not in self.nodes.iterkeys():
                #                self.errors.append('Error : Bad DepRel Parent Name ' + self.fileName + ' : ' + str(self.name))
                return 0
            assert node.parent in self.nodes.iterkeys()
            self.addEdge(node.parent, node.name)
        return 1

    def generateSentence(self):
        sentence = []
        for nodeName in self.nodeList:
            sentence.append(nodeName.printValue())
        return ' '.join(x for x in sentence)


class Document():

    def __init__(self, fileName):
        self.header = None
        self.footer = None
        self.text = None
        self.nodeList = []
        self.fileName = fileName
        self.analyzeDocument()
        self.upper = None

    def analyzeDocument(self):

        inputFD = codecs.open(self.fileName, 'r', encoding='utf8')
#        sentenceList = getSentenceIter(inputFD)
        sentenceList = findSentences(inputFD)
        for sentence in sentenceList:
            tree = Sentence(sentence[1], ignoreErrors=True, nesting=True)
            tree.text = sentence[1]
            tree.sentenceID = int(sentence[0])
            tree.footer = sentence[2]
            tree.header = "<Sentence id='" + sentence[0] + "'"
            tree.upper = self
            self.nodeList.append(tree)
        inputFD.close()


def getAddressNode(address, node, level='ChunkNode'):
    ''' Returns the node referenced in the address string relative to the node in the second argument.
        There are levels for setting the starting address-base. These are "ChunkNode", "Node" , "Sentence" , "Document" , "Relative".
        The hierarchy of levels for interpretation is :
        "Document" -> "Sentence" -> "ChunkNode" -> "Node"
        "Relative" value starts the base address from the node which contains the address. This is also the default option.
    '''

    currentContext = node

    if level != 'Relative':
        while(currentContext.__class__.__name__ != level):
            currentContext = currentContext.upper

    currentContext = currentContext.upper

    stepList = address.split('%')

    for step in stepList:
        if step == '..':
            currentContext = currentContext.upper
        else:
            refNode = [
                iterNode for iterNode in currentContext.nodeList if iterNode.name == step][0]
            currentContext = refNode
    return refNode


def getChunkFeats(line):
    lineList = line.strip().split()
    returnErrors = list()
    chunkType = None
    fsList = []
    if len(lineList) >= 3:
        chunkType = lineList[2]

    returnFeats = OrderedDict()
    multipleFeatRE = r'<fs.*?>'
    featRE = r'(?:\W*)(\S+)=([\'|\"])?([^ \t\n\r\f\v\'\"]*)[\'|\"]?(?:.*)'
    fsList = re.findall(multipleFeatRE, ' '.join(lineList))
    for x in lineList:
        feat = re.findall(featRE, x)
        if feat != []:
            if len(feat) > 1:
                returnErrors.append('Feature with more than one value')
                continue
            returnFeats[feat[0][0]] = feat[0][2]

    return [chunkType, returnFeats, fsList]


def getTokenFeats(lineList):
    tokenType, token = None, None
    returnFeats = OrderedDict()
    fsList = []
    if len(lineList) >= 3:
        tokenType = lineList[2]
    returnErrors = list()
    token = lineList[1]
    multipleFeatRE = r'<fs.*?>'
    featRE = r'(?:\W*)(\S+)=([\'|\"])?([^ \t\n\r\f\v\'\"]*)[\'|\"]?(?:.*)'
    fsList = re.findall(multipleFeatRE, ' '.join(lineList))
    for x in lineList:
        feat = re.findall(featRE, x)
        if feat != []:
            if len(feat) > 1:
                returnErrors.append('Feature with more than one value')
                continue
            returnFeats[feat[0][0]] = feat[0][2]

    return [token, tokenType, returnFeats, fsList]


def getSentenceIter(inpFD):

    sentenceRE = r'''(?P<complete>(?P<header><Sentence id=[\'\"]?(?P<sentenceID>\d+)[\'\"]?>)(?P<text>.*?)(?P<footer></Sentence>))'''
    text = inpFD.read()
    text = text.replace('0xe0', '')
    return re.finditer(sentenceRE, text, re.DOTALL)


def findSentences(inpFD):
    sentenceRE = "<Sentence id='(.*?)'>(.*?)(</Sentence>)"
    text = inpFD.read()
    text = text.replace('0xe0', '')
    return re.findall(sentenceRE, text, re.DOTALL)


def folderWalk(folderPath):
    fileList = []
    for dirPath, dirNames, fileNames in os.walk(folderPath):
        for fileName in fileNames:
            fileList.append(os.path.join(dirPath, fileName))
    return fileList


In [56]:
inputPath = "Data/"
fileList = folderWalk(inputPath)
newFileList = []
for fileName in fileList:
    # print("hi")
    xFileName = fileName.split('/')[-1]
    if xFileName == 'err.txt' or xFileName.split('.')[-1] in ['comments', 'bak'] or xFileName[:4] == 'task':
        continue
    else:
        newFileList.append(fileName)

In [57]:
# Define the filename for the CSV file
csv_filename = "sampled_data.csv"
newFileList = newFileList
# Open the CSV file in write mode and create a CSV writer object
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Write the header row
    csv_writer.writerow(["Head", "Name", "Post", "Parent", "ParentRelation", "AI", "SRL", "Pred"])

    # Iterate through each file in newFileList
    for fileName in newFileList:
        print(fileName)
        d = Document(fileName)
        for tree in d.nodeList:
            csv_writer.writerow(["----- New Sentence ------"])  # Write separator for each sentence
            for chunkNode in tree.nodeList:
                temp = list(chunkNode.attributes.values())
#                 print(temp)
                key = list(chunkNode.attributes.keys())
                print(key)
                head = ""
                ai = 0
                srl = ""
                pred = ""
                for k in range(len(key)):
                    if key[k] == "head":
                        head = temp[k]

                    elif key[k] == "pbrel":
                        ai = 1
                        srl = temp[k].split(':')[-2]
                        pred = temp[k].split(':')[-1]

                pt = temp[0].split(',')
                if len(pt) > 6:
                    post = pt[6]
                    if post == " " or post == "":
                        post = 0
                else:
                    post = 0

                # Write the extracted data to the CSV file
                csv_writer.writerow([head, chunkNode.name, post, chunkNode.parent, chunkNode.parentRelation, ai, srl, pred])

print("Data saved to", csv_filename)

Data/fullnews_id_2494337_date_1_6_2004.v00.ssf.pb
['af', 'head', 'pbrel', 'posn', 'drel', 'name', 'vpos']
['af', 'posn', 'name', 'drel', 'head', 'vpos']
['af', 'posn', 'drel', 'name', 'pbrel', 'head', 'vpos']
['af', 'drel', 'posn', 'head', 'name', 'pbrel']
['af', 'posn', 'voicetype', 'name', 'pbrole', 'stype', 'head', 'vpos']
['af', 'head', 'name', 'posn', 'drel']
['af', 'drel', 'head', 'name', 'pbrel', 'posn']
['af', 'head', 'pbrel', 'posn', 'drel', 'name', 'vpos']
['af', 'posn', 'head', 'name', 'pbrel', 'drel']
['af', 'drel', 'posn', 'name', 'pbrel', 'head']
['af', 'stype', 'drel', 'pbrole', 'voicetype', 'head', 'name', 'posn']
['af', 'posn', 'name', 'head', 'drel']
['af', 'name', 'drel', 'posn', 'head', 'vpos']
['af', 'drel', 'name', 'posn', 'pbrel', 'head', 'vpos']
['af', 'posn', 'name', 'pbrel', 'head', 'drel']
['af', 'drel', 'name', 'posn', 'head', 'pbrel', 'vpos']
['af', 'head', 'pbrel', 'name', 'posn', 'drel']
['af', 'head', 'name', 'posn', 'stype', 'drel', 'pbrole', 'voicetype

['af', 'stype', 'name', 'posn', 'voicetype', 'head']
['af', 'pbrel', 'head', 'drel', 'posn', 'name', 'vpos']
['af', 'pbrel', 'name', 'head', 'drel', 'posn', 'vpos']
['af', 'posn', 'voicetype', 'name', 'stype', 'pbrole', 'head']
['af', 'name', 'posn', 'drel', 'head', 'pbrel']
['af', 'pbrel', 'head', 'name', 'drel', 'posn']
['pbmrel', 'name', 'ectype', 'head', 'pbref']
['af', 'pbrel', 'head', 'name', 'drel', 'posn']
['af', 'name', 'posn', 'head', 'drel', 'pbrel', 'vpos']
['af', 'pbrole', 'name', 'drel', 'head', 'posn', 'vpos']
['af', 'posn', 'drel', 'name', 'head']
['af', 'drel', 'head', 'posn', 'name', 'stype', 'voicetype', 'vpos']
['af', 'posn', 'drel', 'head', 'name', 'vpos']
['af', 'posn', 'head', 'drel', 'name', 'vpos']
['af', 'pbrel', 'name', 'head', 'drel', 'posn', 'vpos']
['af', 'head', 'drel', 'posn', 'name', 'pbrel', 'vpos']
['af', 'pbrole', 'head', 'voicetype', 'posn', 'stype', 'name']
['af', 'head', 'pbrel', 'name', 'posn', 'drel']
['af', 'name', 'posn', 'drel', 'head', 'pbre

['af', 'drel', 'head', 'posn', 'name']
['af', 'drel', 'head', 'name', 'posn']
['af', 'posn', 'head', 'name', 'drel', 'vpos']
['af', 'head', 'drel', 'name', 'posn']
['af', 'name', 'head', 'posn', 'pbrel', 'drel', 'vpos']
['af', 'pbrel', 'posn', 'name', 'head', 'drel']
['af', 'drel', 'head', 'posn', 'name', 'pbrel']
['af', 'head', 'posn', 'name', 'pbrole', 'unaccusative', 'vpos']
['af', 'head', 'name', 'posn']
['af', 'posn', 'head', 'name', 'drel', 'pbrel', 'vpos']
['af', 'posn', 'name', 'head', 'drel']
['af', 'name', 'posn', 'drel', 'head']
['af', 'head', 'drel', 'stype', 'voicetype', 'posn', 'name']
['af', 'name', 'posn', 'head', 'drel']
['pbref', 'head', 'pbmrel', 'ectype', 'name']
['af', 'head', 'drel', 'pbrel', 'posn', 'name']
['af', 'voicetype', 'name', 'posn', 'head', 'pbrole', 'drel', 'stype']
['af', 'name', 'posn', 'drel', 'head']
['af', 'drel', 'posn', 'head', 'name', 'vpos']
['af', 'drel', 'name', 'posn', 'head', 'vpos']
['af', 'posn', 'name', 'head', 'drel']
['af', 'drel', 'h

In [58]:
# Define the filenames for the input and output CSV files
input_csv_filename = "sampled_data.csv"
output_csv_filename = "output_dataset.csv"

# Open the input CSV file in read mode
with open(input_csv_filename, 'r', newline='', encoding='utf-8') as input_file:
    csv_reader = csv.reader(input_file)
    
    # Open the output CSV file in write mode
    with open(output_csv_filename, 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.writer(output_file)
        
        # Write the header row to the output CSV file
        csv_writer.writerow(["Sentence", "SRLs"])
        
        # Skip the header row in the input CSV file
        next(csv_reader)
        
        # Initialize variables to store the current sentence and its SRLs
        current_sentence = []
        current_srls = []

        # Iterate through each row in the input CSV file
        for row in csv_reader:
            # Check if the row represents the start of a new sentence
            if row[0] == "----- New Sentence ------":
                # If it is not the first sentence, write the previous sentence and its SRLs to the output CSV file
                if current_sentence:
                    csv_writer.writerow([' '.join(current_sentence), ' '.join(current_srls)])
                
                # Assign the current row as the start of a new sentence
                current_sentence = []  # Store the sentence
                current_srls = []  # Reset the SRLs for the new sentence
            else:
                # If it's not the start of a new sentence
                # Append the token from the current row to the current sentence
                current_sentence.append(row[0])

                # Check if the 6th column value is "1"
                if row[5] == "1":
                    # If it is, append the SRL from the 7th column to the current SRLs
                    current_srls.append(row[6])
                else:
                    # If not, append "NO_SRL" to the current SRLs
                    current_srls.append("NO_SRL")
        
        # Write the last sentence and its SRLs after processing all rows
        if current_sentence:
            csv_writer.writerow([' '.join(current_sentence), ' '.join(current_srls)])

print("Output dataset has been saved to:", output_csv_filename)


Output dataset has been saved to: output_dataset.csv


In [59]:

# Define the filenames for the input and output CSV files
input_csv_filename = "sampled_data.csv"

# Define an empty set to store unique ParentRelation values
parent_relation_set = set()

# Open the input CSV file in read mode
with open(input_csv_filename, 'r', newline='', encoding='utf-8') as input_file:
    csv_reader = csv.reader(input_file)
    
    # Skip the header row in the input CSV file
    next(csv_reader)
    
    # Iterate through each row in the input CSV file
    for row in csv_reader:
        if row[0] == "----- New Sentence ------":
            continue
        # Get the ParentRelation value from the row
        parent_relation = row[4]  # Assuming ParentRelation is in the 4th column
        
        # Add the ParentRelation value to the set
        parent_relation_set.add(parent_relation)

# Print the set of unique ParentRelation values
print("Unique ParentRelation values:", parent_relation_set)


Unique ParentRelation values: {'root', 'k1u', 'k1g', 'pof-idiom', 'rd', 'mod', 'ccof', 'rh', 'k2u', 'ras-k7p', 'k2g', 'enm', 'k7', 'rs', 'r6v', 'pk1', 'r6', 'k7p', 'ras-k7', 'k3', 'ras-k1', 'fragof', 'nmod__k1inv', 'k2', 'k7ts', 'ras-neg-k1', 'nmod__relc', 'pof', 'sent-adv', 'k4u', 'k5prk', 'r6-k1', 'nmod__k2inv', 'nmod__pofinv', 'rbmod__relc', 'k1', 'k5', 'jjmod', 'k7t', 'jk1', 'k7pu', 'adv', 'rsp', 'rt', 'r6-k2', 'nmod', 'ras-k2', 'k2p', 'mk1', 'k2s', 'ras-r6', 'vmod', 'ras', 'k4a', 'vmod__adv', 'k4', 'k1s', 'rbmod'}


In [60]:

# Define the filenames for the input and output CSV files
input_csv_filename = "sampled_data.csv"
output_csv_filename = "output_drel_dataset.csv"

# Open the input CSV file in read mode
with open(input_csv_filename, 'r', newline='', encoding='utf-8') as input_file:
    csv_reader = csv.reader(input_file)
    
    # Open the output CSV file in write mode
    with open(output_csv_filename, 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.writer(output_file)
        
        # Write the header row to the output CSV file
        csv_writer.writerow(["Sentence", "Dependency", "SRLs"])
        
        # Skip the header row in the input CSV file
        next(csv_reader)
        
        # Initialize variables to store the current sentence and its SRLs and Dependency
        current_sentence = []
        current_srls = []
        current_dependency = []

        # Iterate through each row in the input CSV file
        for row in csv_reader:
            # Check if the row represents the start of a new sentence
            if row[0] == "----- New Sentence ------":
                # If it is not the first sentence, write the previous sentence and its SRLs to the output CSV file
                if current_sentence:
                    csv_writer.writerow([' '.join(current_sentence), ' '.join(current_dependency), ' '.join(current_srls)])
                
                # Assign the current row as the start of a new sentence
                current_sentence = []  # Store the sentence
                current_srls = []  # Reset the SRLs for the new sentence
                current_dependency = []  # Reset the Dependency for the new sentence
            else:
                # If it's not the start of a new sentence
                # Append the token from the current row to the current sentence
                current_sentence.append(row[0])

                # Check if the 6th column value is "1"
                if row[5] == "1":
                    # If it is, append the SRL from the 7th column to the current SRLs
                    current_srls.append(row[6])
                else:
                    # If not, append "NO_SRL" to the current SRLs
                    current_srls.append("NO_SRL")
                
                # Append the Dependency from the 4th column to the current Dependency
                current_dependency.append(row[4])

        # Write the last sentence, its SRLs, and Dependency after processing all rows
        if current_sentence:
            csv_writer.writerow([' '.join(current_sentence), ' '.join(current_dependency), ' '.join(current_srls)])

print("Output dataset has been saved to:", output_csv_filename)

Output dataset has been saved to: output_drel_dataset.csv


In [61]:

def print_token_counts(filename):
    # Open the CSV file in read mode
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile)

        # Get the header row
        header = next(csv_reader)

        # Iterate through each row in the CSV file
        for row_number, row in enumerate(csv_reader, start=1):
            # Split tokens in the first column to count tokens
            first_column_tokens = row[0].split()
            first_column_token_count = len(first_column_tokens)

            # Iterate through each column except the first one
            for col_number, token in enumerate(row[1:], start=2):
                # Count the number of tokens in the current column
                column_token_count = len(token.split())

                # Check if the token count for the current column is different from the first column
                if column_token_count != first_column_token_count:
                    print(f"For row {row_number}, column {col_number}: "
                          f"Different token count ({column_token_count} tokens) "
                          f"than the first column ({first_column_token_count} tokens)")


In [62]:
# Test the function with the provided CSV file
filename = "output_drel_dataset.csv"
print_token_counts(filename)
filename = "output_dataset.csv"
print_token_counts(filename)

For row 446, column 2: Different token count (20 tokens) than the first column (19 tokens)
For row 446, column 3: Different token count (20 tokens) than the first column (19 tokens)
For row 516, column 2: Different token count (22 tokens) than the first column (21 tokens)
For row 516, column 3: Different token count (22 tokens) than the first column (21 tokens)
For row 446, column 2: Different token count (20 tokens) than the first column (19 tokens)
For row 516, column 2: Different token count (22 tokens) than the first column (21 tokens)


In [63]:

def remove_rows_with_mismatch(filename):
    # Open the CSV file in read mode and create a list to store rows to remove
    rows_to_remove = []
    rows = []  # List to store non-header rows

    # Read the CSV file and identify rows with mismatched token counts
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile)
        
        # Get the header row
        header = next(csv_reader)
        
        # Iterate through each row in the CSV file
        for row_number, row in enumerate(csv_reader, start=1):
            # Split tokens in the first column to count tokens
            first_column_tokens = row[0].split()
            first_column_token_count = len(first_column_tokens)

            # Iterate through each column except the first one
            for col_number, token in enumerate(row[1:], start=2):
                # Count the number of tokens in the current column
                column_token_count = len(token.split())

                # Check if the token count for the current column is different from the first column
                if column_token_count != first_column_token_count:
                    # Add the row number to the list of rows to remove
                    rows_to_remove.append(row_number)
                    break  # Break the loop as we only need to check one column
            else:
                # If no mismatch is found, add the row to the list of non-header rows
                rows.append(row)

    # Remove rows with mismatched token counts from the list of rows
    for row_number in reversed(rows_to_remove):  # Iterate in reverse to avoid index issues
        del rows[row_number - 1]  # Adjust row number to account for header

    # Write the filtered rows back to the original CSV file
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(header)  # Write the header row
        csv_writer.writerows(rows)   # Write the filtered rows


In [64]:
# Test the function with the provided CSV file
filename = "output_drel_dataset.csv"
remove_rows_with_mismatch(filename)
# Test the function with the provided CSV file
filename = "output_dataset.csv"
remove_rows_with_mismatch(filename)


In [65]:
# Test the function with the provided CSV file
filename = "output_drel_dataset.csv"
print_token_counts(filename)
filename = "output_dataset.csv"
print_token_counts(filename)

In [28]:

inputPath = "Data1/"
fileList = folderWalk(inputPath)
newFileList = []
for fileName in fileList:
    # print("hi")
    xFileName = fileName.split('/')[-1]
    if xFileName == 'err.txt' or xFileName.split('.')[-1] in ['comments', 'bak'] or xFileName[:4] == 'task':
        continue
    else:
        newFileList.append(fileName)


In [None]:

# Define the filename for the CSV file
csv_filename = "urdu_sampled_data.csv"
newFileList = [newFileList[0]]
# Open the CSV file in write mode and create a CSV writer object
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Write the header row
    csv_writer.writerow(["Head", "Name", "Post", "Parent", "ParentRelation", "AI", "SRL", "Pred"])

    # Iterate through each file in newFileList
    for fileName in newFileList:
        print(fileName)
        d = Document(fileName)
        for tree in d.nodeList:
            csv_writer.writerow(["----- New Sentence ------"])  # Write separator for each sentence
            for chunkNode in tree.nodeList:
                temp = list(chunkNode.attributes.values())
#                 print(temp)
                key = list(chunkNode.attributes.keys())
                print(key)
                head = ""
                ai = 0
                srl = ""
                pred = ""
                for k in range(len(key)):
                    if key[k] == "head":
                        head = temp[k]

                    elif key[k] == "pbrel":
                        ai = 1
                        srl = temp[k].split(':')[-2]
                        pred = temp[k].split(':')[-1]

                pt = temp[0].split(',')
                if len(pt) > 6:
                    post = pt[6]
                    if post == " " or post == "":
                        post = 0
                else:
                    post = 0

                # Write the extracted data to the CSV file
                csv_writer.writerow([head, chunkNode.name, post, chunkNode.parent, chunkNode.parentRelation, ai, srl, pred])

print("Data saved to", csv_filename)