In [None]:
import numpy as np 
import pandas as pd 
import os 
import random 
import json 
from tqdm import tqdm 
import re 
from functools import partial 
import string

In [None]:
RANDOM_SEED = 42


def seed_everything(seed=RANDOM_SEED):
    """Seed the global random generators of Python and NumPy with ``seed``.

    The seed is also exported as the ``PYTHONHASHSEED`` environment variable.
    Python reads that variable at interpreter start-up, so the export is
    visible to processes launched afterwards; it does not change string
    hashing inside the kernel that is already running.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same order as before: NumPy's global generator first, then `random`.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)


seed_everything()

Prepare data

In [None]:
# Load the sample submission table; later cells add a 'text' column to it and
# set its 'PredictionString' column.
submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

In [None]:
# Root of the competition input files. Keeping it in one place means the
# dataset location only has to be changed once.
DATA_DIR = '../input/coleridgeinitiative-show-us-the-data'

# Directories holding one JSON file per publication (read by read_append_return).
train_files_path = DATA_DIR + '/train'
test_files_path = DATA_DIR + '/test'

In [None]:
# Load the training table. Later cells use its 'Id', 'dataset_title',
# 'dataset_label' and 'cleaned_label' columns.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

# Preview the first rows.
train_df.head()

In [None]:
# Print a summary of the training table's columns.
train_df.info()

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    '''Read one publication's JSON file and return its content as a single string.

    The file ``<train_files_path>/<filename>.json`` is parsed as a list of
    section objects; each section is read through its ``section_title`` and
    ``text`` keys.

    Parameters
    ----------
    filename : str
        File name without the ``.json`` extension.
    train_files_path : str
        Directory that contains the JSON files. Defaults to the training
        directory; pass another directory (e.g. the test one) to read from it.
    output : str
        ``'text'`` (default) -> section bodies joined with a single space;
        ``'head'`` -> section titles joined with a single space;
        any other value -> titles and bodies interleaved in document order,
        joined with ``'. '``.

    Returns
    -------
    str
        The joined string selected by ``output``. A section whose title or
        body is missing or null contributes an empty string instead of
        raising ``TypeError`` inside ``str.join``.
    '''
    json_path = os.path.join(train_files_path, filename + '.json')

    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # `.get` returns None for an absent key and JSON null also decodes to
    # None; normalise both to '' so the joins below always receive strings.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    # Only build the string that is actually requested.
    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)


In [None]:
%%time 
# tqdm.pandas() registers `progress_apply` on pandas objects. Each training Id
# is then mapped to the text of its JSON file (read_append_return's default
# output='text', default directory = training files).
tqdm.pandas()
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

In [None]:
# Preview the table now that the 'text' column has been added.
train_df.head()

In [None]:
%%time
# Same loading step for the submission rows; `partial` overrides the default
# directory so the JSON files are read from the test directory.
tqdm.pandas()
submission['text'] = submission['Id'].progress_apply(partial(read_append_return, train_files_path=test_files_path))



In [None]:
# Preview the submission table with its new 'text' column.
submission.head()

Clean the data

In [None]:
def text_cleaning(text):
    '''Normalise ``text`` to lowercase ASCII letters/digits separated by single spaces.

    Steps, in order:
      1. convert the input to ``str`` (so NaN, numbers or None do not raise);
      2. delete every character listed in ``string.punctuation`` (deleted, not
         replaced by a space, so ``"data-set"`` becomes ``"dataset"``);
      3. lowercase, then collapse every run of characters outside
         ``[A-Za-z0-9]`` into one space and strip the ends.

    Parameters
    ----------
    text : Any
        Value to clean; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    '''
    # Convert up front: iterating a non-string before str() is applied would
    # raise TypeError.
    text = str(text)
    # str.translate deletes the punctuation characters in one pass; same
    # result as filtering character by character.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub('[^A-Za-z0-9]+', ' ', text.lower()).strip()

    return text

In [None]:
%%time
# Replace the raw training text with its cleaned form (the column is
# overwritten in place).
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

In [None]:
%%time
# Apply the same cleaning to the submission text so it can be compared with
# the cleaned labels built below.
tqdm.pandas()
submission['text'] = submission['text'].progress_apply(text_cleaning)

String matching

In [None]:
# Cleaned form of every distinct value in the 'dataset_label' column.
ds_label = [text_cleaning(s) for s in train_df["dataset_label"].unique()]

In [None]:
# Cleaned form of every distinct value in the 'cleaned_label' column.
cleaned_label = [text_cleaning(s) for s in train_df["cleaned_label"].unique()]

In [None]:
# Cleaned form of every distinct value in the 'dataset_title' column.
ds_title = [text_cleaning(s) for s in train_df["dataset_title"].unique()]

In [None]:
# Union of the three cleaned lists; the set removes duplicates across them.
label_references = set(ds_label + cleaned_label + ds_title)

In [None]:
# Number of distinct cleaned reference strings.
len(label_references)

In [None]:
# Accumulates one '|'-joined string of matched labels per submission row.
prediction_labels = []

In [None]:
# Start from an empty list inside this cell: re-running the cell then rebuilds
# the predictions instead of appending a second copy of every row.
prediction_labels = []

# `label_references` is a set of strings, whose iteration order can change
# between interpreter runs; sorting gives a reproducible label order in the
# output. Empty strings are dropped because '' is a substring of every text.
sorted_references = sorted(label for label in label_references if label)

for item in tqdm(submission["text"]):
    # Plain substring match of each cleaned reference against the cleaned text.
    # The references were already passed through text_cleaning when the set
    # was built, so they are used as-is.
    labels = [label for label in sorted_references if label in item]
    prediction_labels.append("|".join(labels))

In [None]:
# One entry per submission row, in row order; pandas raises if the list length
# differs from the number of rows.
submission["PredictionString"] = prediction_labels

In [None]:
# Display the table with the new 'PredictionString' column.
submission

In [None]:
# Keep only the two columns written to the submission file (drops 'text').
submission = submission[["Id", "PredictionString"]]

In [None]:
# Display the final two-column table.
submission

In [None]:
# Show the prediction string of the first row.
submission["PredictionString"].iloc[0]

In [None]:
# Write the table to submission.csv without the index column.
submission.to_csv("submission.csv", index=False)