# Pre-process NYT Data
## The original data is in JSON format. Pre-process the training data to generate the CSV file NYT_train_df_nn.csv; the test data can be pre-processed in the same way. Both pre-processed data sets, NYT_train_df_nn.csv and NYT_test_df_nn.csv, were uploaded to the clustering-network-analysis/data/ folder.



In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import sys
# Add the RTER project folder to the module search path so modules stored there can be imported.
sys.path.append('/content/drive/My Drive/Colab Notebooks/RTER')

# Pre-process train NYT JSON File

In [None]:
import json
# This function takes a path as its parameter and loads a file holding one JSON object per line into a list of JSON objects
def loadJson(jfile):
    """
    Load a file containing one Json object per line into a list of Json objects.

    Blank or whitespace-only lines (for example an empty line at the end of the
    file) are skipped instead of raising a decode error.

    Arguments:
    jfile -- a path pointing to a text file containing one Json object per line

    Return:
    jsons -- a list of the parsed Json objects, in file order

    Raises:
    json.JSONDecodeError -- if a non-blank line is not valid Json
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(jfile, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [None]:
def makeDFFromJson(jsons):
    """
    Flatten sentence annotations into a data frame with one row per relation mention.

    For every relation mention of every annotation, the two entity texts are
    looked up among that annotation's entity mentions by exact text match.
    When several entity mentions share the same text, the last one supplies the
    label and start; when none matches, the label is "" and the start is -1.

    Arguments:
    jsons --  a list of Json objects, each with 'sentId', 'articleId', 'sentText',
              'entityMentions' and 'relationMentions' entries

    Return:
    df --  a data frame with the columns sentId, articleId, em1, em1label, em1start,
           em2, em2label, em2start, relation and sentText.

    """
    column_order = ("sentId", "articleId", "em1", "em1label", "em1start",
                    "em2", "em2label", "em2start", "relation", "sentText")
    columns = {name: [] for name in column_order}

    for sentence in jsons:
        for relation in sentence['relationMentions']:
            first_text = relation['em1Text']
            second_text = relation['em2Text']

            # Defaults used when an entity text has no matching entity mention.
            first_label, first_start = "", -1
            second_label, second_start = "", -1

            # No early exit: a later mention with the same text overrides an earlier one.
            for entity in sentence['entityMentions']:
                entity_text = entity['text']
                if first_text == entity_text:
                    first_label, first_start = entity['label'], entity['start']
                if second_text == entity_text:
                    second_label, second_start = entity['label'], entity['start']

            record = (
                sentence['sentId'],
                sentence['articleId'],
                first_text,
                first_label,
                first_start,
                second_text,
                second_label,
                second_start,
                relation['label'],
                sentence['sentText'],
            )
            for name, value in zip(column_order, record):
                columns[name].append(value)

    return pd.DataFrame(data=columns)

In [None]:
# Parse train.json into a list of JSON objects (the path is hard-coded).
train_json = loadJson('/content/drive/My Drive/Colab Notebooks/RTER/data/train.json')

In [None]:
# Flatten the annotations into a data frame with one row per relation mention.
train_df = makeDFFromJson(train_json)

In [None]:
# Display the (rows, columns) dimensions of the flattened frame.
train_df.shape

In [None]:
# Keep only the rows whose relation label is not the literal string "None".
# Take an explicit copy: columns are assigned to train_df_nn further down, and
# writing to a filtered selection of train_df triggers SettingWithCopyWarning.
train_df_nn = train_df[train_df.relation != "None"].copy()
train_df_nn.shape

In [None]:
# drop_duplicates() returns a new frame and leaves the original untouched, so the
# result has to be assigned back for the duplicate rows to actually be removed.
train_df_nn = train_df_nn.drop_duplicates()
train_df_nn.shape

In [None]:
# For each relation label, count the non-null values in every other column.
train_df_nn.groupby('relation').count()

In [None]:
# Re-number the rows consecutively from 0 to len(train_df_nn) - 1, replacing the
# index labels inherited from train_df, then show the last rows.
train_df_nn.index = pd.RangeIndex(len(train_df_nn))
train_df_nn.tail()

In [None]:
# Give every distinct relation label a 1-based integer id, numbered in the
# order in which the labels first appear in the frame.
unique_rel = train_df_nn["relation"].unique()
rel2idx = dict(zip(unique_rel, range(1, len(unique_rel) + 1)))
rel2idx

In [None]:
# Set up a column with relation ids: each relation label is looked up in rel2idx
train_df_nn["relIdx"] = train_df_nn["relation"].map(rel2idx)

In [None]:
# Clean up the sentText: turn each '' into ", remove line breaks, and remove whitespace before punctuation
import re
# cleanSent turns each '' into ", removes \r and \n, removes the space after a ", and removes whitespace before ? . , ! ' : ;
def cleanSent(aSent):
    """
    Clean up the text of one sentence.

    The steps are applied in this order:
    1. every pair of single quotes ('') is replaced by one double quote (")
    2. carriage returns and newlines are removed (not replaced by a space)
    3. a space directly after any double quote is removed; this applies to
       closing quotes as well as opening ones
    4. whitespace directly before any of ? . , ! ' : ; is removed

    Arguments:
    aSent -- the sentence text to clean (a string)

    Return:
    sent -- the cleaned sentence text
    """
    # The original chain also replaced a single quote with itself, which had no
    # effect; that step is dropped and the output is unchanged.
    sent = aSent.replace("''", '"').replace("\r", "").replace("\n", "").replace('" ', '"')
    return re.sub(r'\s+([?.,!\':;])', r'\1', sent)
# Store the cleaned version of every sentence in a new column.
train_df_nn['sentTextClnd'] = train_df_nn['sentText'].apply(cleanSent)

In [None]:
!pip install unidecode

In [None]:
from unidecode import unidecode

In [None]:
# Run unidecode over every cleaned sentence and keep the result in a separate column
# (unidecode transliterates Unicode text to ASCII).
train_df_nn['sentTextClndUni'] = train_df_nn.sentTextClnd.apply(unidecode)

In [None]:
# Display the (rows, columns) dimensions after the extra columns were added.
train_df_nn.shape

In [None]:
# Show how many rows carry each relation id.
train_df_nn.relIdx.value_counts()

In [None]:
#train_df_nn.to_csv("/content/drive/My Drive/Colab Notebooks/RTER/data/NYT_train_df_nn.csv")