# **Information Retrieval course (July 2023)**

**Project Supervisor:** Prof. Alfio Ferrara

**Provider:** Reza Ghahremani

**Project Title:** Relation Classification

## Step 0: Requirements


1.   Installing packages
2.   Importing libraries
3.   Defining functions





### Installing packages
**1. Natural Language Toolkit:**

**NLTK** is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for *classification, tokenization, stemming, tagging, parsing, and semantic reasoning*, wrappers for industrial-strength NLP libraries, and an active discussion forum.


---



**Reference:** https://www.nltk.org/ and
https://www.nltk.org/install.html

In [None]:
!pip install nltk



You should consider upgrading via the 'C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


### Importing libraries

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

This tokenizer performs the following steps:


*   split standard contractions, e.g. don't -> do n't and they'll -> they 'll
*   treat most punctuation characters as separate tokens
*   split off commas and single quotes, when followed by whitespace
*   separate periods that appear at the end of line


---






**Reference:** https://www.nltk.org/api/nltk.tokenize.TreebankWordTokenizer.html

In [1]:
# Import the required libraries

import numpy as np
import os
import re
import random
from nltk.tokenize import TreebankWordTokenizer
# Set CLASSPATH for this process. NOTE(review): the value is a machine-specific
# absolute path (presumably a local Stanford POS tagger install) and nothing in
# the visible cells reads it — confirm it is still needed.
os.environ['CLASSPATH'] = "H:/Relation-Classification/stanford/stanford-postagger-2017-06-09"

In [2]:
# Paths only: nothing is read or written in this cell.

# Raw dataset files used as input to create_file.
train_file = './dataset/TRAIN_FILE.TXT'
test_file = './dataset/TEST_FILE_FULL.TXT'

# Converted files that create_file writes.
new_train_file = "./files/new_train_file.txt"
new_test_file = "./files/new_test_file.txt"

### Defining Functions

In [3]:
def clean_tokens(sentence_number, tokens):
    """Normalize tokens so that none of them contains whitespace.

    Every token is stripped and split on whitespace, and the resulting pieces
    are glued back together with underscores. A token that splits into more
    than one piece is reported on stdout together with ``sentence_number``.

    Args:
        sentence_number: Identifier printed next to multi-piece tokens; it is
            not used for anything else.
        tokens: Iterable of token strings.

    Returns:
        A new list holding one cleaned string per input token. A token that is
        empty or whitespace-only becomes the empty string.
    """
    cleaned = []
    for raw in tokens:
        pieces = raw.strip().split()
        if len(pieces) > 1:
            print(sentence_number, pieces)
        cleaned.append("_".join(pieces))
    return cleaned


In [4]:
def create_file(filepath, outputpath):
    """Convert a raw dataset file into one ``label token token ...`` line per sentence.

    The input is consumed in records of four lines. Only the first two lines of
    each record are used: the first holds a sentence number and the sentence
    text separated by a tab, the second holds the label. For every record the
    entity tags ``<e1>``/``</e1>``/``<e2>``/``</e2>`` are replaced by the
    space-padded markers ``E1_START``/``E1_END``/``E2_START``/``E2_END``, the
    sentence is tokenized with ``TreebankWordTokenizer`` and normalized with
    ``clean_tokens``, and the label followed by the tokens is written as one
    space-separated line.

    Args:
        filepath: Path of the raw dataset file.
        outputpath: Path of the file to create (overwritten if it exists).

    Raises:
        IndexError: If the first line of a record has no tab-separated second
            field, or a record has no label line.
    """
    # Read the whole input first and close it before the output is touched.
    with open(filepath) as source:
        lines = [line.strip() for line in source]

    # One tokenizer instance is enough; it does not need rebuilding per sentence.
    tokenizer = TreebankWordTokenizer()

    with open(outputpath, 'w') as out:
        for idx in range(0, len(lines), 4):
            fields = lines[idx].split("\t")
            sentence_num = fields[0]
            # Drop the first and last character of the text field
            # (presumably the quotes wrapping the sentence — verify on the data).
            sentence = fields[1][1:-1]
            label = lines[idx + 1]

            sentence = sentence.replace("<e1>", " E1_START ").replace("</e1>", " E1_END ")
            sentence = sentence.replace("<e2>", " E2_START ").replace("</e2>", " E2_END ")

            tokens = clean_tokens(sentence_num, tokenizer.tokenize(sentence))

            out.write(" ".join([label, " ".join(tokens)]))
            out.write("\n")

    print(outputpath, "created")

In [5]:
# Convert the raw train and test files into the one-line-per-sentence format.
create_file(train_file, new_train_file)
create_file(test_file, new_test_file)

print("Train / Test file created")

./files/new_train_file.txt created
./files/new_test_file.txt created
Train / Test file created


In [8]:
# Converted files produced by create_file (same values as assigned earlier).
new_train_file = './files/new_train_file.txt'
new_test_file = './files/new_test_file.txt'

# Numbered copies written by add_sent_number, plus the validation split file.
new_train_file_with_line = './files/new_train_file_with_line.txt'
val_file = './files/val_file.txt'
new_test_file_with_line = './files/new_test_file_with_line.txt'

# Answer-key files written by create_answer_keys, one per split.
train_answer_keys_path = './files/train_answer_keys.txt'
val_answer_keys_path = './files/val_answer_keys.txt'
test_answer_keys_path = './files/test_answer_keys.txt'

In [9]:
def add_sent_number(file_in, file_out):
    """Copy ``file_in`` to ``file_out`` with a 1-based line number prefixed to each line.

    Each output line is ``<number> <original line>``; the original line is
    written unchanged, including its line ending. Both paths are printed to
    stdout before any file is opened.

    Args:
        file_in: Path of the file to read.
        file_out: Path of the file to write (overwritten if it exists).
    """
    print(file_in)
    print(file_out)

    # The input is read completely and closed before the output is opened,
    # so passing the same path for both arguments still works.
    with open(file_in, 'r') as source:
        lines = source.readlines()

    with open(file_out, 'w') as target:
        for number, line in enumerate(lines, start=1):
            target.write(str(number) + " " + line)

# Write numbered copies of the converted train and test files.
add_sent_number(new_train_file, new_train_file_with_line)
add_sent_number(new_test_file, new_test_file_with_line)

./files/new_train_file.txt
./files/new_train_file_with_line.txt
./files/new_test_file.txt
./files/new_test_file_with_line.txt


In [10]:
def get_val_sent_index(file_path=None, val_fraction=0.1, seed=None):
    """Pick a per-label random sample of sentence numbers for validation.

    Every non-blank line of the input must start with
    ``<sentence number> <label>``. Sentence numbers are grouped by label, each
    group is shuffled, and the first ``int(len(group) * val_fraction)`` numbers
    of each group are selected. The size and the first five entries of the
    result are printed.

    Args:
        file_path: File to read. Defaults to the module-level
            ``new_train_file_with_line``.
        val_fraction: Share of each label's sentences to select. The default
            of 0.1 gives one tenth, rounded down.
        seed: If given, a dedicated ``random.Random(seed)`` is used so the
            selection is reproducible; otherwise the shared ``random`` module
            state is used.

    Returns:
        The selected sentence numbers as an ascending list of ints.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
        IndexError: If a non-blank line has no label field.
    """
    if file_path is None:
        file_path = new_train_file_with_line

    with open(file_path, 'r') as source:
        lines = source.readlines()

    # Group sentence numbers by label, keeping first-seen label order.
    label_to_sent_num = {}
    for line in lines:
        record = line.strip()
        if not record:
            continue  # tolerate blank lines instead of failing on int('')
        fields = record.split(" ", 2)
        label_to_sent_num.setdefault(fields[1], []).append(int(fields[0]))

    rng = random if seed is None else random.Random(seed)

    val_index = []
    for sent_nums in label_to_sent_num.values():
        quota = int(len(sent_nums) * val_fraction)
        # A single shuffle is already a uniform permutation; repeating it adds nothing.
        rng.shuffle(sent_nums)
        val_index.extend(sent_nums[:quota])

    val_index.sort()
    print("len(val_index)", len(val_index))
    print("val_index[:5]", val_index[:5])
    return val_index


# Pick the sentence numbers that will form the validation set.
val_index = get_val_sent_index()

len(val_index) 792
val_index[:5] [9, 29, 31, 34, 35]


In [11]:
def train_val_split(val_index, train_path=None, val_path=None):
    """Move the sentences listed in ``val_index`` out of the train file into a validation file.

    The train file is read completely and then REWRITTEN IN PLACE: lines whose
    leading sentence number is in ``val_index`` go to the validation file, all
    other lines are written back to the train file. Each line is written
    stripped of surrounding whitespace and terminated with a newline. Because
    the train file is overwritten, running the split a second time operates on
    the already-reduced file.

    Args:
        val_index: Iterable of integer sentence numbers to move to validation.
        train_path: Train file to split. Defaults to the module-level
            ``new_train_file_with_line``.
        val_path: Validation file to create. Defaults to the module-level
            ``val_file``.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    if train_path is None:
        train_path = new_train_file_with_line
    if val_path is None:
        val_path = val_file

    # All lines must be in memory before the same path is reopened for writing.
    with open(train_path, 'r') as source:
        lines = source.readlines()

    # Set membership is O(1); testing against the original list is O(len(val_index)) per line.
    held_out = set(val_index)

    with open(train_path, 'w') as f_train, open(val_path, 'w') as f_val:
        for line in lines:
            record = line.strip()
            if not record:
                continue  # tolerate blank lines instead of failing on int('')
            num = int(record.split(" ", 1)[0])
            target = f_val if num in held_out else f_train
            target.write(record + "\n")

    print("Train - Val - Split ")

# Move the selected sentences into val_file; the remaining lines are written
# back to new_train_file_with_line, which is overwritten in place.
train_val_split(val_index)

Train - Val - Split 


In [12]:
def train_val_total_check(train_attn_sp_path, val_attn_sp_path):
    """Print and return the combined number of sentences in two split files.

    Every non-blank line of both files must start with
    ``<sentence number> <label>``. Sentences are counted per label in each
    file and all counts are added up. Labels that occur in only one of the two
    files are included in the total. Each file path is printed as it is read.

    Args:
        train_attn_sp_path: Path of the train split file.
        val_attn_sp_path: Path of the validation split file.

    Returns:
        The total number of counted lines across both files (also printed).

    Raises:
        IndexError: If a non-blank line has no label field.
    """

    def get_count(file_path):
        """Return a ``{label: number of lines}`` dict for ``file_path``."""
        print(file_path)

        counts = {}
        with open(file_path, 'r') as source:
            for line in source:
                record = line.strip()
                if not record:
                    continue  # blank lines carry no sentence
                label = record.split(" ", 2)[1]
                counts[label] = counts.get(label, 0) + 1
        return counts

    train = get_count(train_attn_sp_path)
    val = get_count(val_attn_sp_path)

    # Sum both dicts independently so labels present only in the validation
    # file are not dropped from the total.
    total = sum(train.values()) + sum(val.values())

    print(total)
    return total


# Sanity check: print the combined sentence count of the train and validation files.
train_val_total_check(new_train_file_with_line, val_file)

./files/new_train_file_with_line.txt
./files/val_file.txt
8000


In [13]:
def create_answer_keys(in_file, out_file):
    """Write a ``<position><TAB><label>`` answer-key file for ``in_file``.

    The label is the second space-separated field of each input line. The
    number written is the 1-based position of the line within ``in_file``, NOT
    the sentence number stored as the line's first field, so keys are always
    numbered 1..N even when the stored sentence numbers have gaps. A
    confirmation message is printed once the file is written.

    Args:
        in_file: Path of a file whose lines look like
            ``<sentence number> <label> <tokens...>``.
        out_file: Path of the answer-key file to create (overwritten if it
            exists).

    Raises:
        IndexError: If a line has fewer than two space-separated fields.
    """
    with open(in_file, 'r') as source:
        lines = source.readlines()

    with open(out_file, 'w') as target:
        for position, line in enumerate(lines, start=1):
            label = line.strip().split(" ")[1]
            target.write(str(position) + "\t" + label)
            target.write("\n")

    print(out_file + " " + "Created")

In [14]:
# Write one answer-key file per split (train, validation, test).
create_answer_keys(new_train_file_with_line, train_answer_keys_path)
create_answer_keys(val_file, val_answer_keys_path)
create_answer_keys(new_test_file_with_line, test_answer_keys_path)

./files/train_answer_keys.txt Created
./files/val_answer_keys.txt Created
./files/test_answer_keys.txt Created
