In [1]:
import pandas as pd
import pyarrow.parquet as pq
import s3fs
s3 = s3fs.S3FileSystem()

In [72]:
import numpy as np
from IPython.display import display

In [4]:
train_df = pq.ParquetDataset('s3://dagpapsubmission/data/data_train_data.parquet', filesystem=s3).read_pandas().to_pandas()
dev_df = pq.ParquetDataset('s3://dagpapsubmission/data/data_dev_data.parquet', filesystem=s3).read_pandas().to_pandas()

In [5]:
train_df.head()
dev_df.head()

Unnamed: 0_level_0,text,tokens
index,Unnamed: 1_level_1,Unnamed: 2_level_1
12313,Phylogenetic networks are a generalization of ...,"b'[""Phylogenetic"",""networks"",""are"",""a"",""genera..."
3172,Prediction modelling is more closely aligned w...,"b'[""Prediction"",""modelling"",""is"",""more"",""close..."
6451,The heat transfer exhibits the flow of heat (t...,"b'[""The"",""heat"",""transfer"",""exhibits"",""the"",""f..."
4351,a common experience during superficial ultraso...,"b'[""a"",""common"",""experience"",""during"",""superfi..."
22694,Code metadata Current code version v1.5.9 Perm...,"b'[""Code"",""metadata"",""Current"",""code"",""version..."


In [None]:
%%time
# For running locally
train_df = pd.read_parquet('/Users/gayatri/Downloads/train_data.parquet', engine="fastparquet")

In [None]:
print(train_df.shape)
train_df.dtypes

In [20]:
print(type(train_df.iloc[0]['tokens']))
print(type(train_df.iloc[0]['token_label_ids']))
print(type(train_df.iloc[0]['annotations']))

<class 'list'>
<class 'list'>
<class 'list'>


In [47]:
# Verify if len(tokens) == len(token_label_ids) in each row
train_df['len_tokens'] = train_df['tokens'].map(len)
train_df['len_token_ids'] = train_df['token_label_ids'].map(len)
train_df.head()

Unnamed: 0_level_0,text,annotations,tokens,token_label_ids,len_tokens,len_token_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15096,"Across the world, Emergency Departments are fa...","[[0, 3779, human], [3780, 7601, NLTK_synonym_r...","[Across, the, world,, Emergency, Departments, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3821,3821
14428,lung Crab is the in the lead make of cancer-re...,"[[0, 4166, NLTK_synonym_replacement], [4167, 2...","[lung, Crab, is, the, in, the, lead, make, of,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4330,4330
2144,"The number of osteoporotic fractures, particul...","[[0, 3264, chatgpt], [3265, 17179, human], [17...","[The, number, of, osteoporotic, fractures,, pa...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",3439,3439
5826,The COVID-19 pandemic has spread to every coun...,"[[0, 3666, human], [3667, 6954, chatgpt], [695...","[The, COVID-19, pandemic, has, spread, to, eve...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3819,3819
1452,Endophytic fungi live a significant part of th...,"[[0, 10489, human], [10490, 12000, summarized]...","[Endophytic, fungi, live, a, significant, part...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6404,6404


In [48]:
train_df[train_df['len_tokens'] != train_df['len_token_ids']]

Unnamed: 0_level_0,text,annotations,tokens,token_label_ids,len_tokens,len_token_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


#### Verified that len(tokens) = len(token_label_ids) in each row

#### Splitting each row of data into multiple rows based on the annotations

In [66]:
def split_text_rows_on_annotation(input_df):
    transformed_rows = []
    for index, row in input_df.iterrows():
        # print(index)
        len_token = len(row['tokens'])
        for new_annot in row['annotations']:
            row_dict = {}
            row_dict['doc_id'] = index
            # row_dict['doc_text'] = row['text']
            row_dict['tokens'] = row['tokens'][new_annot[0]:new_annot[1]]
            row_dict['token_label_ids'] = row['token_label_ids'][new_annot[0]:new_annot[1]]
            row_dict['annotation'] = new_annot[2]
            row_dict['start_id'] = new_annot[0]
            row_dict['end_id'] = new_annot[1]
            row_dict['token_length'] = len(row_dict['tokens'])
            if row_dict['token_length'] == 0:
                row_dict['tokens'] = None
            row_dict['token_label_id_length'] = len(row_dict['token_label_ids'])
            if row_dict['token_label_id_length'] == 0:
                row_dict['token_label_ids'] = None
            
            row_dict['exp_token_length'] = new_annot[1] - new_annot[0]
            row_dict['doc_token_length'] = len_token
            
            if row_dict['tokens'] is not None:
                unique_ids = set(row_dict['token_label_ids'])
                row_dict['len_unique_token_ids'] = len(unique_ids)
                if row_dict['len_unique_token_ids'] == 1:
                    row_dict['unique_token_id'] = unique_ids.pop()
                else:
                    row_dict['unique_token_id'] = unique_ids
            else:
                row_dict['unique_token_id'] = None
                row_dict['len_unique_token_ids'] = 0

            if row_dict['annotation'] == "human":
                row_dict['expected_token_label_id'] = 0
            elif row_dict['annotation'] == "NLTK_synonym_replacement":
                row_dict['expected_token_label_id'] = 1
            elif row_dict['annotation'] == "chatgpt":
                row_dict['expected_token_label_id'] = 2
            elif row_dict['annotation'] == "summarized":
                row_dict['expected_token_label_id'] = 3
            else:
                row_dict['expected_token_label_id'] = None                
                 
            transformed_rows.append(row_dict)
            
    return pd.DataFrame(transformed_rows)

In [85]:
def sanity_checks(transformed_df):
    print("Total rows in the df", transformed_df.shape)
    print()
    # Find the number of Nulls
    print("Null check")
    display(transformed_df.isna().sum().reset_index().rename(columns={'index':'column_names', 0:'null_count'}))
    print()
    
    # Number of rows with token_length != token_label_id_length
    print("Mismatches where tokens and token_label_ids in input file do not have the same size")
    display(transformed_df[transformed_df['token_length'] != transformed_df['token_label_id_length']])
    print()
    
    # Find the number of rows with len_unique_token_ids > 1
    print("Number of rows with more than one unique token id", transformed_df[transformed_df['len_unique_token_ids'] > 1].shape[0])
    print()
    print("Number of rows with only one unique token id", transformed_df[transformed_df['len_unique_token_ids'] == 1].shape[0])
    
    # Number of rows with unique_token_id != expected_token_label_id
    print("Rows with only one unique token id but the token id does not match with the expected token label id")
    display(transformed_df[(transformed_df['len_unique_token_ids'] == 1) & 
            (transformed_df['unique_token_id'] != transformed_df['expected_token_label_id'])])
    print()

    # Label distribution for the data with unique labels
    print("Label distribution for the data with unique labels")
    display(transformed_df[transformed_df['len_unique_token_ids'] == 1][['annotation', 'expected_token_label_id']].value_counts().reset_index())
    print()
    
    # Label distribution for all data
    print("Label distribution for all the data")
    display(transformed_df[['annotation', 'expected_token_label_id']].value_counts().reset_index())
    print()

    print("Token id distribution for unique labels")
    display(transformed_df[transformed_df['len_unique_token_ids'] == 1][['annotation', 'unique_token_id']].value_counts().reset_index())
    print()


### Main eda/ pipeline

1. Read the train data file
2. Transform the data
3. Perform sanity checks
4. Save the file locally for further feature transformation

In [67]:
%%time
# Read the train data file

# Transform the data
transformed_train_df = split_text_rows_on_annotation(train_df)
print(transformed_train_df.shape)
transformed_train_df.head()

(26414, 13)
CPU times: user 907 ms, sys: 1.19 s, total: 2.1 s
Wall time: 5.49 s


Unnamed: 0,doc_id,tokens,token_label_ids,annotation,start_id,end_id,token_length,token_label_id_length,exp_token_length,doc_token_length,len_unique_token_ids,unique_token_id,expected_token_label_id
0,15096,"[Across, the, world,, Emergency, Departments, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",human,0,3779,3779,3779,3779,3821,3,"{0, 1, 3}",0
1,15096,"[Resources,, Project, administration,, Supervi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",NLTK_synonym_replacement,3780,7601,41,41,3821,3821,1,0,1
2,15096,,,summarized,7602,9401,0,0,1799,3821,0,,3
3,15096,,,human,9402,25014,0,0,15612,3821,0,,0
4,14428,"[lung, Crab, is, the, in, the, lead, make, of,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",NLTK_synonym_replacement,0,4166,4166,4166,4166,4330,2,"{0, 1}",1


In [86]:
# Perform sanity checks
sanity_checks(transformed_train_df)

Total rows in the df (26414, 13)

Null check


Unnamed: 0,column_names,null_count
0,doc_id,0
1,tokens,18630
2,token_label_ids,18630
3,annotation,0
4,start_id,0
5,end_id,0
6,token_length,0
7,token_label_id_length,0
8,exp_token_length,0
9,doc_token_length,0



Mismatches where tokens and token_label_ids in input file do not have the same size


Unnamed: 0,doc_id,tokens,token_label_ids,annotation,start_id,end_id,token_length,token_label_id_length,exp_token_length,doc_token_length,len_unique_token_ids,unique_token_id,expected_token_label_id



Number of rows with more than one unique token id 6588

Number of rows with only one unique token id 1196
Rows with only one unique token id but the token id does not match with the expected token label id


Unnamed: 0,doc_id,tokens,token_label_ids,annotation,start_id,end_id,token_length,token_label_id_length,exp_token_length,doc_token_length,len_unique_token_ids,unique_token_id,expected_token_label_id
1,15096,"[Resources,, Project, administration,, Supervi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",NLTK_synonym_replacement,3780,7601,41,41,3821,3821,1,0,1
7,2144,"[and, PKM, are, receiver, of, a, Ministry, of,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",human,3265,17179,174,174,13914,3439,1,1,0
10,5826,"[and, thus, combat, vaccine, hesitancy,, parti...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",chatgpt,3667,6954,152,152,3287,3819,1,0,2
82,24410,"[last, two, years,, we, have, been, .»»»., ., ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",human,6331,19986,630,630,13655,6961,1,3,0
89,8525,"[temporal, envelope, vectors, of, the, target,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",summarized,2692,4557,1865,1865,1865,9056,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26363,14116,"[Mo, were, enriched, in, the, austenite, grain...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",summarized,2870,4148,1278,1278,1278,5916,1,0,3
26364,14116,"[and, ultimate, tensile, strength, of, 996, MP...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",chatgpt,4149,6123,1767,1767,1974,5916,1,0,2
26369,22258,"[in, 1, H,, 15, N, amide, shifts, less, than, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",NLTK_synonym_replacement,2816,6546,3730,3730,3730,7784,1,0,1
26395,13123,"[from, soybean, (, Fig., 2, )., This, is, expl...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",summarized,2824,4013,1189,1189,1189,7300,1,0,3



Label distribution for the data with unique labels


Unnamed: 0,annotation,expected_token_label_id,count
0,human,0,663
1,summarized,3,192
2,chatgpt,2,191
3,NLTK_synonym_replacement,1,150



Label distribution for all the data


Unnamed: 0,annotation,expected_token_label_id,count
0,human,0,13346
1,chatgpt,2,4447
2,summarized,3,4376
3,NLTK_synonym_replacement,1,4245



Token id distribution for unique labels


Unnamed: 0,annotation,unique_token_id,count
0,human,0,645
1,chatgpt,0,184
2,summarized,0,184
3,NLTK_synonym_replacement,0,142
4,human,1,9
5,chatgpt,1,5
6,human,2,5
7,NLTK_synonym_replacement,1,4
8,NLTK_synonym_replacement,3,4
9,human,3,4





In [88]:
%%time
# Save the file locally for further feature transformation
transformed_train_df.to_csv('../data/tranformed_train_data.csv', index=False)