A simple baseline: check whether any training label appears verbatim in the cleaned text of a test document (plain substring matching of training labels against test document text) and, if it does, assign that label to the test document.

In [None]:
import pandas as pd

import json
import re
import os

# Root directory of the competition input data: the CSV files and the
# per-split folders of document JSON files are resolved relative to it.
INPUT_PATH = '/kaggle/input/coleridgeinitiative-show-us-the-data'
# Kaggle working directory. NOTE(review): not referenced by the code visible
# in this notebook; confirm whether it is still needed.
WORKING_PATH = '/kaggle/working'

In [None]:
def clean_text(txt):
    """Normalise *txt* to lower-case ASCII alphanumerics separated by single spaces.

    The argument is converted with ``str()`` first, so non-string values are
    accepted. Every run of characters other than ``A-Z``, ``a-z`` and ``0-9``
    becomes one space, and leading/trailing whitespace is removed.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()


def read_doc(doc_id, split:str = 'train'):
    """Load and parse the JSON file of a single document.

    Args:
        doc_id: Document identifier; the file read is
            ``<INPUT_PATH>/<split>/<doc_id>.json``.
        split: Sub-directory of ``INPUT_PATH`` to read from.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    path = os.path.join(INPUT_PATH, split, f'{doc_id}.json')
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector; the explicit encoding keeps the
    # result independent of the platform's locale default.
    with open(path, encoding='utf-8') as doc_file:
        return json.load(doc_file)

    
def row_to_doc_text(row: pd.Series, split:str = 'train') -> str:
    """Build the cleaned full text of the document described by *row*.

    Args:
        row: A row holding the document id under ``'Id'`` and, optionally,
            its title under ``'pub_title'`` (an empty title is used when the
            key is absent).
        split: Passed through to ``read_doc`` to select the folder to read.

    Returns:
        The title followed by each section's title and text, all joined with
        single spaces and normalised with ``clean_text``.
    """
    identifier = row['Id']
    if 'pub_title' in row:
        title = row['pub_title']
    else:
        title = ''

    # Each section contributes "<section_title> <text>".
    section_texts = []
    for section in read_doc(identifier, split):
        section_texts.append(section['section_title'] + ' ' + section['text'])
    body = ' '.join(section_texts)

    cleaned = clean_text(title + ' ' + body)
    return cleaned.strip()

In [None]:
# Training metadata; its 'cleaned_label' column is the source of the
# candidate labels matched against the test documents further down.
train_df = pd.read_csv(os.path.join(INPUT_PATH, 'train.csv'))

# The sample submission provides the rows to predict for; row_to_doc_text
# reads each row's 'Id' to locate the document.
test_df = pd.read_csv(os.path.join(INPUT_PATH, 'sample_submission.csv'))
# Row-wise apply: attach every document's cleaned full text, read from the
# 'test' split folder.
test_df['pub_text'] = test_df.apply(row_to_doc_text, split='test', axis='columns')

In [None]:
# Distinct values of the training 'cleaned_label' column: the candidate
# labels that are searched for in each test document.
unique_labels = train_df['cleaned_label'].unique()

def text_to_labels(text, labels):
    """Return every label that occurs in *text*, joined with ``'|'``.

    Matching is a plain substring test (``label in text``), and the matches
    keep the order in which they appear in *labels*. The joined string is
    stripped of leading/trailing whitespace; it is empty when nothing matches.
    """
    hits = (candidate for candidate in labels if candidate in text)
    return '|'.join(hits).strip()

# For each test document, store the '|'-joined training labels found in its
# cleaned text.
test_df['PredictionString'] = test_df['pub_text'].apply(text_to_labels, labels=unique_labels)
# 'pub_text' was only needed for the matching; dropping it in place keeps it
# out of the CSV written afterwards.
test_df.drop('pub_text', inplace=True, axis=1)

In [None]:
# Write the predictions to 'submission.csv' (a relative path, so it is
# resolved against the current working directory), omitting the index column.
test_df.to_csv('submission.csv', index=False)

In [None]:
# Bare expression at the end of a notebook cell: presumably left here so the
# notebook renders the final DataFrame for inspection.
test_df