As mentioned by @erikbruin in his EDA notebook (https://www.kaggle.com/erikbruin/nlp-on-student-writing-eda), for some of the records in train.csv there is a mismatch between the discourse_text values and the predictionstring values.

Below is some of the analysis that I did on this issue.


In [None]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tqdm

In [None]:
# Load the training annotations CSV and report its dimensions.
train_csv_path = '../input/feedback-prize-2021/train.csv'
train_data = pd.read_csv(train_csv_path)
print(f"Shape of the training dataset: {train_data.shape}")

In [None]:
# Keep a handle on the column labels and preview the first ten rows
# (the last expression is rendered as the cell output).
col_name = train_data.columns
train_data.head(10)

In [None]:
# For each text column, store the number of whitespace-separated tokens in a
# companion "*_tokens" column so the two counts can be compared per row.
for source_col, count_col in (
    ('discourse_text', 'disclosure_text_tokens'),
    ('predictionstring', 'predictionstring_tokens'),
):
    train_data[count_col] = train_data[source_col].apply(lambda text: len(text.split()))
train_data.head(10)

Here we create a separate dataframe containing only the records whose token counts do not match.

In [None]:
# Select the rows whose discourse_text token count differs from the number of
# indices in predictionstring, and renumber them from 0.
length_mismatch = train_data['disclosure_text_tokens'] != train_data['predictionstring_tokens']
sample = train_data.loc[length_mismatch].reset_index(drop=True)
print(f"Shape of the above prepared sample dataset: {sample.shape}")

#### First, we extract the start and end positions listed in the predictionstring column.
The first and last words of the discourse_text are then compared with the words found at those positions in the original document.
If the strings do not match, we apply the following set of rules:

1) Check if the discourse_text begins with special characters like [',','.']; if it does, we'll skip that value.
2) If the beginning string matches but the ending string does not:
        a) Look to see if it has a special character at the end.
        b) If any extra characters are selected, they will be removed.
3) Similar checks are also done if the starting string does not match the discourse_text


In [None]:
def _first_match_after(tokens, target, start, fallback):
    """Return the first index after *start* at which *tokens* equals *target*.

    The index is returned as a string. *fallback* is returned unchanged when
    *target* does not occur after *start*.
    """
    for position in range(max(int(start) + 1, 0), len(tokens)):
        if tokens[position] == target:
            return str(position)
    return fallback


def string_cleaning(i):
    """Rebuild the prediction-string indices for row *i* of ``sample``.

    Reads the essay ``<id>.txt`` for the row, compares the first and last
    words of ``discourse_text`` with the essay words found at the first and
    last indices of ``predictionstring``, and adjusts the start/end indices
    using the rules described in the notebook text. Diagnostic output is
    printed along the way.

    Side effects:
        If ``discourse_text`` begins with a lone ',' or '.', that token is
        removed and the shortened text is written back to
        ``sample.loc[i, 'discourse_text']``.

    Args:
        i: Index label of the row in the module-level ``sample`` dataframe.

    Returns:
        A list of string indices covering ``start`` through ``end`` inclusive.
    """
    rule = '-' * 50
    essay_id = sample['id'][i]

    print('\n')
    print(f'========== {i} ==========')
    print(f"------------------- {essay_id}.txt -------------------- ")
    file_dir = f"../input/feedback-prize-2021/train/{essay_id}.txt"
    # Use a context manager so the handle is closed even if reading fails;
    # this function is called once per mismatched row.
    with open(file_dir) as file:
        text_data = file.read()
    # Tokenise the essay once instead of re-splitting it for every lookup.
    essay_tokens = text_data.split()

    print(rule)
    print(sample['discourse_text'][i])
    print(rule)
    print(sample['predictionstring'][i])
    print(rule)
    print(f"Shape of the discourse text: {sample['disclosure_text_tokens'][i]}")
    print(rule)
    print(f"Shape of the Prediction String: {sample['predictionstring_tokens'][i]}")
    print(rule)

    discourse_tokens = sample['discourse_text'][i].split()
    prediction_tokens = sample['predictionstring'][i].split()

    # `start` / `end` stay strings unless a rule below replaces them; they are
    # converted with int() wherever a number is needed.
    start = prediction_tokens[0]
    end = prediction_tokens[-1]
    print(f"Prediction string starting from position: {start}")
    print(f"Prediction string end at postion: {end}")
    first = int(start)
    last = int(end)

    print(rule)
    print(f"Discourse value: {discourse_tokens[0]}")
    print(f"Prediction value: {essay_tokens[first]}")
    print(rule)

    print(rule)
    print(f"Discourse value: {discourse_tokens[-1]}")
    print(f"Prediction value: {essay_tokens[last]}")
    print(rule)

    # Prediction index (as string) -> essay word at that index.
    dict1 = {val: essay_tokens[int(val)] for val in prediction_tokens}
    display(dict1)

    # Position within discourse_text (as string) -> discourse word.
    dict2 = {str(pos): token for pos, token in enumerate(discourse_tokens)}
    display(dict2)

    # Rule 1: drop a leading stand-alone ',' or '.' from the discourse text.
    if discourse_tokens[0] in [',', '.']:
        print(rule)
        print("Starting with a sepecial character")
        discourse_tokens = discourse_tokens[1:]
        sample.loc[i, 'discourse_text'] = ' '.join(discourse_tokens)
        print(f"New discourse value length: {len(discourse_tokens)}")
        print(rule)

    first_word = discourse_tokens[0]
    last_word = discourse_tokens[-1]
    essay_last = essay_tokens[last]
    dict_values = list(dict1.values())
    dict_key = list(dict1)

    if first_word == essay_tokens[first]:
        # Rule 2: the start agrees, so only the end may need repair.
        print("values are matching")
        if last_word == essay_last:
            print('The ending values are matching')
        else:
            print(f"Final value: {essay_last}")
            print(rule)

            if essay_last in (',', '.'):
                print('-------- Ending with special character -------------')
                end = str(last - 1)
            elif last_word in dict_values:
                # The last discourse word occurs inside the predicted span:
                # cut the span at its first occurrence.
                end = dict_key[dict_values.index(last_word)]
            else:
                print("ending values doesnt match")
                print(f"Discourse value: {last_word}")
                print(f"Extracted value: {essay_last}")
                end = _first_match_after(essay_tokens, last_word, start, end)
    else:
        # Rule 3: the start disagrees; inspect the essay token just before it.
        # NOTE(review): when first == 0 this index is -1 and wraps around to
        # the last essay token, exactly as the original code did.
        previous_token = essay_tokens[first - 1]
        print('Change the starting position')
        print(previous_token)
        print(first_word)
        for sign in [',', '.', '"']:
            if sign in previous_token:
                print('We have special character in the first position!')
                # The original guarded this assignment with
                # `val[1] is not None or ...`; str.split never yields None, so
                # the guard was always true and the start is always moved back.
                start = first - 1

        if first_word == previous_token:
            print("Now the values are matching as we chnaged the index position")
            print(f"Original Index value: {first}")
            print(f"New index position is : {first - 1}")
            start = first - 1
        elif essay_tokens[first] == '.':
            start = first + 1

        print(f" ----- Length of modified Discourse value: {len(discourse_tokens)} ------- ")

        if last_word == essay_last:
            print('The ending values are matching')
        else:
            print(f"Final value: {essay_last}")

            # Unlike the matching-start branch above, the in-span lookup is
            # tried before the trailing-punctuation check here.
            if last_word in dict_values:
                end = dict_key[dict_values.index(last_word)]
            elif essay_last in (',', '.'):
                print('-------- Ending with special character -------------')
                end = str(last - 1)
            else:
                print("ending values doesnt match")
                print(f"Discourse value: {last_word}")
                print(f"Extracted value: {essay_last}")
                end = _first_match_after(essay_tokens, last_word, start, end)

    print(rule)
    print(f"New start value: {start}")
    print(rule)
    print(f"New ending value: {end}")
    print(rule)
    list1 = [str(pos) for pos in range(int(start), int(end) + 1)]
    print(' '.join(list1))
    print(len(list1))

    print(rule)

    return list1

In [None]:
# Run the repair routine on every row of the mismatch subset, with a progress bar.
df_list = list(map(string_cleaning, tqdm(sample.index)))

In [None]:
# Number of index lists collected in df_list.
print(len(df_list))

In [None]:
def _n_tokens(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())


# Collapse each index list into a space-separated string, the same format as
# the predictionstring column.
df_list = [' '.join(index_tokens) for index_tokens in df_list]

sample['new_predictionstring'] = df_list
# Recount tokens on both sides after the repair.
sample['new_disclosure_text_tokens'] = sample['discourse_text'].apply(_n_tokens)
sample['new_predictionstring_tokens'] = sample['new_predictionstring'].apply(_n_tokens)
sample.head(10)

In [None]:
# Dimensions of the mismatch subset (rendered as the cell output).
sample.shape

In [None]:
# Left-join the repaired columns onto the full training table. Both frames
# carry 'discourse_text', so the merge suffixes them as _x (train_data) and
# _y (sample).
repaired_columns = ['id', 'discourse_id', 'discourse_text', 'new_predictionstring']
df_final = pd.merge(
    train_data,
    sample[repaired_columns],
    how='left',
    on=['id', 'discourse_id'],
)
print(f"Shape of the final prepared dataframe: {df_final.shape}")

In [None]:
# Rows that received a repaired prediction string from the merge.
repaired_rows = df_final['new_predictionstring'].notna()
# Row labels of the repaired records, kept for later inspection.
index_list = df_final.index[repaired_rows].tolist()

# Overwrite the original text / prediction string with the repaired values in
# one vectorised assignment instead of a per-row Python loop with scalar
# `.loc` writes.
df_final.loc[repaired_rows, 'discourse_text_x'] = df_final.loc[repaired_rows, 'discourse_text_y']
df_final.loc[repaired_rows, 'predictionstring'] = df_final.loc[repaired_rows, 'new_predictionstring']

In [None]:
# Recompute both token counts on the updated columns: split each string on
# whitespace, then take the length of the resulting list.
df_final['disclosure_text_tokens'] = df_final['discourse_text_x'].map(str.split).map(len)
df_final['predictionstring_tokens'] = df_final['predictionstring'].map(str.split).map(len)

In [None]:
# Remove the merge helper columns now that their values have been copied over.
df_final = df_final.drop(columns=['discourse_text_y', 'new_predictionstring'])

In [None]:
# Shape of the subset whose token counts still disagree after the repair.
still_mismatched = df_final['disclosure_text_tokens'] != df_final['predictionstring_tokens']
remaining = df_final.loc[still_mismatched]
print(f"Total number of mismtach after the above processing: {remaining.shape}")

#### There are still some mismatches. For these records, the first word of the discourse_text is missing some of its leading characters, so it is not picked up by any of the above rules.

For example:

1) listening and istening \
2) the and he\
3) Students and tudents

In addition, for some records the position difference between the text in the discourse_text and the text indicated by the predictionstring is greater than 1.

In [None]:
# Write the final dataframe to CSV in the working directory, without the index column.
df_final.to_csv('updates_train.csv', index=False)