# Preprocessing

In [None]:
import json
import pandas as pd
from tqdm import tqdm

## Loading the WIESP Training Jsonl file

In [2]:
import json
# Parse the JSON Lines training file: one JSON object per line.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so parsing does not
# depend on the platform's default locale encoding. Blank lines (e.g. an extra newline
# at the end of the file) are skipped because json.loads raises on an empty document.
with open("WIESP2022-NER-TRAINING.jsonl", 'r', encoding='utf-8') as f:
    wiesp_dev_json = [json.loads(line) for line in f if line.strip()]

## Keys in the JSONL file

In [3]:
# Field names of a record, read from the first parsed entry (iterating a dict yields its keys).
columns = list(wiesp_dev_json[0])

In [4]:
# Bare expression: the notebook renders the list of column names as the cell output.
columns

['bibcode',
 'label_studio_id',
 'ner_ids',
 'ner_tags',
 'section',
 'tokens',
 'unique_id']

## Converting Jsonl to DataFrame

In [8]:
# Start from an empty frame that only carries the column names taken from the records.
df = pd.DataFrame(columns = columns)

In [9]:
# Build one frame per record and concatenate them in a single call.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop re-copied
# the accumulated frame on every iteration (quadratic in the number of records).
# Presumably each record holds per-token lists next to scalar metadata, so
# pd.DataFrame(row) expands to one row per token — verify against the data.
frames = [pd.DataFrame(row) for row in tqdm(wiesp_dev_json)]
if frames:  # pd.concat raises on an empty list; the existing empty df is kept in that case
    # ignore_index gives the combined frame a fresh 0..n-1 index instead of repeating
    # each record's own 0..k-1 index.
    df = pd.concat(frames, ignore_index=True)

In [12]:
# Write the frame to CSV; index=False leaves the row index out of the file.
df.to_csv('WIESP2022-NER-TRAINING.csv', index=False)

## Saving dataframe as parquet file

Parquet files are much smaller than CSV files holding the same data. Hence, we will be working with parquet files throughout this project, as downloading parquet files from the GitHub repository will be faster. After converting the JSONL file to parquet, we store the parquet file in the following GitHub repository.

url - https://github.com/BhardwajAnshul/NER-Project

In [7]:
# Write the frame to a parquet file; index=False leaves the row index out of the file.
df.to_parquet('WIESP2022-NER-TRAINING.parquet', index=False)

In [3]:
# Reading the training parquet file from the GitHub repository.
# The "?raw=true" query requests the file contents rather than the HTML page for the blob.
# NOTE(review): this loads WIESP_TRAINING_SPLITTED.parquet, not the
# WIESP2022-NER-TRAINING.parquet written earlier in this notebook — presumably it was
# produced by a separate step; confirm.
df = pd.read_parquet('https://github.com/BhardwajAnshul/NER-Project/blob/main/WIESP_TRAINING_SPLITTED.parquet?raw=true')

### Defining functions for pre-processing

1. For preprocessing, we are replacing all the hyphens in the dataset with whitespace

2. If a token is a year (between 1900 and 2030), we are replacing it with the `<YEAR>` token

3. If any token contains a digit, we are replacing the whole token with the `<NUM>` token

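4. We are also separating punctuation characters such as `(),.:;/[]"=` from the surrounding text by padding them with spaces, so that each one becomes its own token (they are not removed)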

In [5]:
# Swaps every hyphen for a space and trims the ends.
def replace_hyphen(value):
    """Return ``value`` with each '-' turned into ' ' and outer whitespace stripped."""
    without_hyphens = value.replace('-', ' ')
    return without_hyphens.strip()

# Pads selected punctuation characters with a space on each side, e.g. a(b -> a ( b
chars = '(),.:;/[]"='
def put_space_around_chars(value):
    """Return ``value`` with every character listed in ``chars`` wrapped in spaces.

    Leading and trailing whitespace of the result is stripped.
    """
    # One translate pass; each listed character maps to itself surrounded by spaces.
    padding = {ord(symbol): f' {symbol} ' for symbol in chars}
    padded = value.translate(padding)
    return padded.strip()

# Reports whether any character of the token is a digit.
def containsNumber(value):
    """Return True if at least one character of ``value`` satisfies ``str.isdigit``."""
    return any(symbol.isdigit() for symbol in value)

# checks if a token is Year. Year range : 1900 to 2030
def isYear(value):
    """Tell whether ``value`` is a year between 1900 and 2030 (inclusive).

    Hyphens are dropped and surrounding whitespace is stripped before the check.

    Args:
        value: Token text.

    Returns:
        True when the cleaned text is a decimal integer in [1900, 2030], else False.
    """
    value = value.replace('-', '').strip()
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such as
    # superscript digits ('²'), which int() cannot parse and raises ValueError on.
    return value.isdecimal() and 1900 <= int(value) <= 2030

In [6]:
# Applying the above defined functions on DataFrame
# Fix: the helper is named replace_hyphen; the previous call to `replace_hyphan`
# raised NameError on a fresh kernel.
cleaned_tokens = df['tokens'].map(replace_hyphen).map(put_space_around_chars)

# Split each cleaned token on single spaces and give every piece its own row;
# the other columns are repeated for each piece by explode().
# NOTE(review): splitting on a single space yields empty-string tokens when two
# separators end up adjacent (e.g. 'a.)' becomes 'a .  )'); consider dropping them.
df['tokens'] = cleaned_tokens.str.split(" ")
df = df.explode("tokens").reset_index(drop=True)

# Years are tagged first; '<YEAR>' contains no digit, so the number pass leaves it alone.
df['tokens'] = ['<YEAR>' if isYear(word) else word for word in df.tokens]
df['tokens'] = ['<NUM>' if containsNumber(word) else word for word in df.tokens]

In [8]:
# Saving the preprocessed dataframe to a parquet file; index=False leaves the row index out.
df.to_parquet('WIESP_TRAINING_PREPROCESSED.parquet', index=False)