In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os
import re
import json
import glob
from copy import deepcopy
from collections import defaultdict
from textblob import TextBlob
from functools import partial

import pandas as pd
import numpy as np

from nltk import sent_tokenize

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

import unidecode

from tqdm.notebook import tqdm
import string

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

%matplotlib inline

# List the entries of the competition input folder.
# NOTE(review): the returned list is neither assigned nor printed, and this is
# not the last expression of the cell, so nothing is displayed.
os.listdir('/kaggle/input/coleridgeinitiative-show-us-the-data/')

def clean_text(txt):
    """Lowercase every item of ``txt`` and blank out non-alphanumeric runs.

    Each maximal run of characters outside ``A-Z``, ``a-z`` and ``0-9`` is
    replaced by a single space. Spaces produced at the start or end of an
    item are kept. A new list is returned with one entry per item of ``txt``.
    """
    cleaned = []
    for item in txt:
        lowered = item.lower()
        cleaned.append(re.sub('[^A-Za-z0-9]+', ' ', lowered))
    return cleaned

# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
device='cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Root folder of the competition data; the train/test folder paths defined
# further down are built from it.
basepath='/kaggle/input/coleridgeinitiative-show-us-the-data/'
# Training table; later cells read its 'Id' and 'cleaned_label' columns.
train_df=pd.read_csv(basepath+'train.csv')
# Sample submission table; later cells read its 'Id' column.
sample_sub = pd.read_csv(basepath+'sample_submission.csv')

In [None]:
# Preview the first five rows of the training table.
train_df.head(5)

In [None]:
# Folders holding the per-Id JSON files; read_append_return joins one of these
# with '<Id>.json' to locate a document.
train_files_path=basepath+'train/'
test_files_path=basepath+'test/'
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """Load one document's JSON file and return it split into sentences.

    The file ``<train_files_path>/<filename>.json`` must contain a JSON array
    of objects. For each object the ``section_title`` and ``text`` values are
    transliterated to ASCII with ``unidecode`` and split into sentences with
    ``nltk.sent_tokenize``.

    Parameters
    ----------
    filename : str
        File name without the ``.json`` extension.
    train_files_path : str
        Folder that contains the JSON file.
    output : str
        ``'text'`` returns the body sentences, ``'head'`` returns the
        section-title sentences; any other value returns, section by section,
        the title sentences followed by the body sentences.

    Returns
    -------
    list of str
        The requested sentences in document order.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    headings = []
    contents = []
    combined = []
    for section in sections:
        # `or ''` makes a missing or null field contribute no sentences
        # instead of crashing inside unidecode.
        title = section.get('section_title') or ''
        body = section.get('text') or ''
        # Tokenize each field once and reuse the result for every output list.
        title_sentences = sent_tokenize(unidecode.unidecode(title))
        body_sentences = sent_tokenize(unidecode.unidecode(body))

        headings.extend(title_sentences)
        contents.extend(body_sentences)
        combined.extend(title_sentences)
        combined.extend(body_sentences)

    if output == 'text':
        return contents
    if output == 'head':
        return headings
    return combined

In [None]:
%%time
tqdm.pandas()   # registers progress_apply on pandas objects (apply with a progress bar)
# Load every training document: with the default output='text', each cell of
# 'text' is the list of body sentences read_append_return returns for that Id.
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

In [None]:
%%time
tqdm.pandas()
# Same loader as for the training table, with train_files_path re-bound to the
# test folder through functools.partial.
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_files_path))

In [None]:
# Normalise each document's sentences with clean_text: every cell of
# 'cleaned_text' is the list clean_text returns for that row's 'text'.
# progress_apply is available because tqdm.pandas() was called in an earlier cell.
train_df['cleaned_text']=train_df.text.progress_apply(clean_text)
sample_sub['cleaned_text']=sample_sub.text.progress_apply(clean_text)

In [None]:
# Candidate label strings that later cells search for inside the documents:
# the 'cleaned_label' column of train_df, one entry per training row.
# NOTE(review): presumably the same label appears on many rows — confirm; this
# assignment does not de-duplicate the column.
labels_list=train_df.cleaned_label

In [None]:
def _drop_nested_labels(candidates):
    """Return, sorted alphabetically, the labels not contained in another label.

    ``candidates`` is an iterable of label strings. A label is dropped when it
    occurs as a substring of a different label in the same iterable, so only
    the most specific (longest) form of each match survives. Repeated labels
    are reported once.
    """
    unique = list(dict.fromkeys(candidates))
    return sorted(
        label
        for label in unique
        if not any(label != other and label in other for other in unique)
    )


# De-duplicate and length-sort the candidate labels once, instead of scanning
# and re-sorting the whole per-row label column for every document.
# dict.fromkeys keeps first-seen order, so the result is deterministic.
known_labels = sorted(dict.fromkeys(labels_list), key=len)

# Training documents: labels found in the joined text, nested matches removed.
labels = []
for sentences in tqdm(train_df.cleaned_text):
    document = ' '.join(sentences)
    matched = [label for label in known_labels if label in document]
    labels.append(_drop_nested_labels(matched))

# Test documents: every label found in the joined text, shortest first.
# Nested matches are deliberately kept here; they are filtered out later,
# when the submission strings are built from sub_labels.
sub_labels = []
for sentences in tqdm(sample_sub.cleaned_text):
    document = ' '.join(sentences)
    sub_labels.append([label for label in known_labels if label in document])

In [None]:
# Attach the per-document label lists computed above; `labels` was built by
# iterating train_df.cleaned_text in order, so it lines up with the rows.
train_df['complete_lower_labels']=labels

In [None]:
def get_text_labels(row):
    """Flag which sentences of a document mention one of its labels.

    Parameters
    ----------
    row : mapping
        Must provide ``'cleaned_text'`` (an iterable of sentence strings) and
        ``'complete_lower_labels'`` (a collection of label strings).

    Returns
    -------
    list of int
        One entry per sentence: ``1`` when at least one label occurs in the
        sentence as a substring, otherwise ``0``.
    """
    doc_labels = row['complete_lower_labels']
    # Plain substring test instead of re.finditer: the labels are literal text,
    # not regular expressions, so characters such as '+' or '.' must not be
    # interpreted as pattern syntax. any() also stops at the first hit.
    return [
        int(any(label in sentence for label in doc_labels))
        for sentence in row['cleaned_text']
    ]

In [None]:
# Row-wise apply (axis=1) so get_text_labels receives the whole row and can read
# both 'cleaned_text' and 'complete_lower_labels'.
train_df['text_labels']=train_df.progress_apply(get_text_labels,axis=1)

In [None]:
# Flatten the training documents into one sentence-level list, with the
# per-sentence flags flattened in the same order.
training_texts = [
    sentence.strip()
    for document in train_df.cleaned_text
    for sentence in document
]
training_labels = [
    flag
    for document_flags in train_df['text_labels']
    for flag in document_flags
]
# Test documents stay grouped: one list of stripped sentences per document.
test_texts = [
    [sentence.strip() for sentence in document]
    for document in sample_sub.cleaned_text
]

In [None]:
# Sentence-level training table: one (text, label) pair per row.
processed_train_df = pd.DataFrame(
    data=zip(training_texts, training_labels),
    columns=['text', 'label'],
)
# Document-level test table: the list of sentences next to the document Id.
processed_test_df = pd.DataFrame(
    data=zip(test_texts, sample_sub.Id),
    columns=['text', 'Id'],
)

# Write both tables to the working directory without the index column.
processed_train_df.to_csv(path_or_buf='processed_train_df.csv', index=False)
processed_test_df.to_csv(path_or_buf='processed_test_df.csv', index=False)

In [None]:
# Build one prediction string per test document: the matched labels with nested
# matches removed, sorted alphabetically and joined with '|'.
sub_final_labels = []
for doc_labels in sub_labels:
    # Collapse repeated labels first (order preserved) so the containment check
    # below runs over distinct labels only.
    candidates = list(dict.fromkeys(doc_labels))
    # Keep a label only when no *other* matched label contains it. Checking
    # against every other label, not just the later ones, makes the result
    # independent of the order of doc_labels.
    kept = [
        label
        for label in candidates
        if not any(label != other and label in other for other in candidates)
    ]
    sub_final_labels.append('|'.join(sorted(kept)))

# One row per test document, written in the two-column submission layout.
submissions = pd.DataFrame(zip(sample_sub.Id, sub_final_labels), columns=['Id', 'PredictionString'])
submissions.to_csv('submission.csv', index=False)