In [None]:
import transformers
print(transformers.__version__)
from transformers import *

In [None]:
# Aesthetics
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
import gc
# Basic
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import json
import os
import random
from tqdm.autonotebook import tqdm
import string
import re
from functools import partial

tqdm.pandas()

In [None]:
RANDOM_SEED = 42


def seed_everything(seed=RANDOM_SEED):
    """Seed the stdlib and NumPy global random generators and export PYTHONHASHSEED.

    seed - value passed to ``random.seed`` / ``np.random.seed`` and stored, as a
           string, in the ``PYTHONHASHSEED`` environment variable
    """
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))


seed_everything()

In [None]:
# Label table for the training documents and the submission template.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
# Folders holding one <Id>.json file per document (opened by read_append_return).
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'
train_df.head(10)

In [None]:
# Upper bound on the number of ids kept for inference when is_real_run is True.
infer_limit = 999999
# True when the test folder holds more than 10 entries; later cells use it to
# choose between the train and the test folder as inference input.
is_real_run = len(os.listdir('../input/coleridgeinitiative-show-us-the-data/test')) > 10
is_real_run

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """Load one document's JSON file and flatten its sections into a single string.

    filename         - file name without the ``.json`` extension
    train_files_path - directory that contains the JSON file
    output           - 'text' -> section bodies joined by ' '
                       'head' -> section titles joined by ' '
                       anything else -> titles and bodies interleaved, joined by '. '

    A missing or null ``section_title`` / ``text`` is treated as an empty string,
    so one incomplete section no longer makes the join raise TypeError.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave title, body, title, body, ... in document order.
    combined = []
    for heading, content in zip(headings, contents):
        combined.extend((heading, content))
    return '. '.join(combined)

In [None]:
# Attach each submission Id's concatenated section text, read from test_files_path.
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_files_path))

In [None]:
def text_cleaning(text):
    '''
    Converts all text to lower case, deletes ASCII punctuation, replaces every
    other run of non-alphanumeric characters (emojis included) with a single
    space and trims the ends.
    text - Sentence that needs to be cleaned (coerced with str() first)
    '''
    # Coerce up front so non-string input (numbers, NaN, None) cannot break the
    # character filter below; previously str() was applied only after iterating.
    text = str(text)
    # Punctuation is deleted, not replaced: "don't" -> "dont", "a-b" -> "ab".
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    # Whatever is still non-alphanumeric (whitespace runs, emojis, symbols)
    # collapses to one space. After this only [a-z0-9] and single spaces remain,
    # so the former ' +' squeeze and emoji regex were no-ops and are dropped.
    return re.sub('[^A-Za-z0-9]+', ' ', text.lower()).strip()

def clean_text(txt):
    """Lower-case ``str(txt)``, turn each non-alphanumeric run into one space, trim the ends."""
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def clean_text_v2(txt):
    """Lower-case ``str(txt)`` and replace each run of characters other than
    letters, digits and parentheses with one space.

    Whitespace is trimmed *before* the substitution, so input that starts or
    ends with other punctuation keeps a single leading/trailing space.
    """
    # Raw string: '\(' inside a normal literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning. The pattern text is unchanged.
    return re.sub(r'[^A-Za-z0-9\(\)]+', ' ', str(txt).lower().strip())

In [None]:
# Normalised variants of every known label taken from three columns of train_df.
temp_1 = [clean_text(x) for x in train_df['dataset_label'].unique()]
temp_2 = [clean_text(x) for x in train_df['dataset_title'].unique()]
temp_3 = [clean_text(x) for x in train_df['cleaned_label'].unique()]

empty_ids = []
existing_labels = set(temp_1 + temp_2 + temp_3)
id_list = []
labels_list = []
for index, row in tqdm(sample_sub.iterrows()):
    sample_text = clean_text(row['text'])
    row_id = row['Id']
    cleaned_labels = []
    # Plain substring test: a known label counts as present when it occurs
    # anywhere in the cleaned document text.
    for known_label in existing_labels:
        if known_label in sample_text:
            cleaned_labels.append(clean_text(known_label))
#     cleaned_labels = [clean_text(x) for x in cleaned_labels]
    cleaned_labels = set(cleaned_labels)
    labels_list.append('|'.join(cleaned_labels))
    id_list.append(row_id)
#     if len(labels_list[-1]) == 0:
    # The condition above is commented out, so every id is appended and
    # empty_ids ends up with the same entries as id_list.
    empty_ids.append(row_id)

In [None]:
# existing_labels

In [None]:
# Pick the documents that go through the model and the folder they are read from.
if not is_real_run:
    # Small test folder: use the first 30 rows of the training table instead.
    empty_ids = train_df.Id.values[:30]
    data_dir = "train"
else:
    # Full test folder: keep the collected ids, capped at infer_limit.
    empty_ids = empty_ids[:infer_limit]
    data_dir = "test"

In [None]:
from transformers import *
import torch
from torch import nn
import torch.nn.functional as F

import logging
import math
import os

import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

In [None]:
def convert_lines(tokenizer, df, max_sequence_length = 512, is_test=False):
    """Encode ``df.text`` into fixed-length token-id rows, plus label spans when training.

    tokenizer           - object exposing ``encode(text, add_special_tokens=False)`` and
                          the ``pad_token_id`` / ``cls_token_id`` / ``sep_token_id`` /
                          ``eos_token_id`` attributes
    df                  - DataFrame with a ``text`` column (and a ``label`` column
                          unless ``is_test`` is True)
    max_sequence_length - row width; longer inputs are truncated (the last kept id is
                          overwritten with the separator id), shorter ones are padded
    is_test             - when True, skip the label-span search

    Returns ``(outputs, type_outputs)`` when ``is_test`` is True, otherwise
    ``(outputs, type_outputs, position_outputs, offset_outputs, df)``.
    ``position_outputs`` holds ``[start + 1, end + 1]`` of the label inside the
    encoded text (the +1 accounts for the leading cls id), or ``[0, 0]`` when the
    label is empty or the span does not fit in ``max_sequence_length``.

    NOTE(review): the training path calls ``contains``, which is not defined in
    the visible part of this notebook; it must be in scope when ``is_test`` is False.
    """
    # NOTE(review): `or` also falls back to eos when an id is 0 (falsy), not only
    # when it is None. Left unchanged because the trained checkpoint may rely on
    # it - confirm before switching to an explicit `is None` test.
    pad_token_idx = tokenizer.pad_token_id or tokenizer.eos_token_id
    cls_token_idx = tokenizer.cls_token_id or tokenizer.eos_token_id
    sep_token_idx = tokenizer.sep_token_id or tokenizer.eos_token_id
    n_rows = len(df)
    outputs = np.zeros((n_rows, max_sequence_length))
    type_outputs = np.zeros((n_rows, max_sequence_length))
    position_outputs = np.zeros((n_rows, 2))
    offset_outputs = np.ones((n_rows,))
    # enumerate() supplies the positional row number. The label yielded by
    # iterrows() is the DataFrame index, which is only 0..n-1 for a default index
    # and would otherwise write to the wrong (or a non-existent) output row.
    for idx, (_, row) in enumerate(tqdm(df.iterrows(), total=n_rows)):
        input_ids_1 = tokenizer.encode(row.text,add_special_tokens=False)
        input_ids = [cls_token_idx, ] + input_ids_1 + [sep_token_idx, ]
        token_type_ids = [0,]*len(input_ids)
        if len(input_ids) > max_sequence_length:
            input_ids = input_ids[:max_sequence_length]
            input_ids[-1] = sep_token_idx
            token_type_ids = token_type_ids[:max_sequence_length]
        else:
            n_pad = max_sequence_length - len(input_ids)
            input_ids = input_ids + [pad_token_idx, ]*n_pad
            # NOTE(review): type ids are padded with the pad token id rather than
            # 0; kept as-is to stay compatible with existing consumers.
            token_type_ids = token_type_ids + [pad_token_idx, ]*n_pad
        assert len(input_ids) == len(token_type_ids)
        outputs[idx,:max_sequence_length] = np.array(input_ids)
        type_outputs[idx,:] = token_type_ids
        if is_test:
            continue
        selected_text = row.label.strip()
        if len(selected_text) == 0 or len(row.text) == 0:
            position_outputs[idx,:] = [0, 0]
            continue
        # Encode the label with a leading space when it appears that way in the
        # text, so its ids can line up with the ids of the encoded text.
        if " "+selected_text in row.text:
            input_ids_2 = tokenizer.encode(" "+selected_text,add_special_tokens=False)
        else:
            input_ids_2 = tokenizer.encode(selected_text,add_special_tokens=False)
        # Reset per row: if the label encodes to zero tokens the loop below never
        # runs, and the values would otherwise be stale from the previous row
        # (or unbound on the first one).
        start_idx = end_idx = None
        # Try the full label first, then progressively shorter prefixes of it.
        for i in range(len(input_ids_2)):
            start_idx, end_idx = contains(input_ids_2[:len(input_ids_2)-i], input_ids_1)
            if start_idx is not None:
                if i > 1:
                    print(input_ids_2, i)
                break
        if start_idx is None:
            start_idx = 0
            end_idx = 0
        position_outputs[idx,:] = [start_idx + 1, end_idx + 1]
        if max(position_outputs[idx,:]) >= max_sequence_length:
            position_outputs[idx,:] = 0,0
    if is_test:
        return outputs, type_outputs
    else:
        return outputs, type_outputs, position_outputs, offset_outputs, df
    

def find_best_combinations(start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, valid_start= 0, valid_end=512):
    """Pick the (start, end) index pair with the highest combined score.

    For every start candidate ``i`` and end candidate ``j`` the pair
    ``(start_top_index[i], end_top_index[j, i])`` is eligible when both indices
    lie in ``[valid_start, valid_end)`` and start <= end. Its score is
    ``start_top_log_probs[i] * end_top_log_probs[j, i]``; the first pair reaching
    the maximum wins. If no pair is eligible (or none scores above -9999),
    ``(valid_start, valid_end - 1)`` is returned.
    """
    best_span = (valid_start, valid_end - 1)
    top_score = -9999
    for i in range(len(start_top_log_probs)):
        begin = start_top_index[i]
        if not (valid_start <= begin < valid_end):
            continue
        for j in range(end_top_log_probs.shape[0]):
            finish = end_top_index[j, i]
            if not (valid_start <= finish < valid_end and begin <= finish):
                continue
            candidate = start_top_log_probs[i] * end_top_log_probs[j, i]
            if candidate > top_score:
                best_span, top_score = (begin, finish), candidate
    return best_span

def jaccard_similarity(str1: str, str2: str) -> float:
    """Jaccard index of the whitespace-separated token sets of two strings.

    Returns ``len(A & B) / len(A | B)``. When neither string contains a token
    the union is empty; 0.0 is returned instead of raising ZeroDivisionError.
    """
    a = set(str1.split())
    b = set(str2.split())
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('../input/roberta-base-config/')

window_size = 24
max_sequence_length = 256
# Consecutive windows start this many tokens apart, so neighbouring windows
# share window_size tokens.
stride = max_sequence_length - window_size
texts = []
labels = []  # not filled in this cell; kept so the name stays defined
ids = []
for idx in tqdm(empty_ids):
    # Context manager so each JSON file handle is closed right after parsing
    # (json.load(open(...)) left one open handle per document).
    with open(f"../input/coleridgeinitiative-show-us-the-data/{data_dir}/{idx}.json", "rt", encoding="utf-8") as json_file:
        sections = json.load(json_file)
    # split() already treats newlines as whitespace, so every section is
    # squeezed to single spaces; sections are separated (and followed) by a space.
    article = "".join(" ".join(section["text"].split()) + " " for section in sections)
    input_ids = tokenizer.encode(article, add_special_tokens=False)
    # One window per stride step; this yields ceil(len(input_ids) / stride) windows.
    for start in range(0, len(input_ids), stride):
        curr_ids = input_ids[start: start + max_sequence_length]
        texts.append(tokenizer.decode(curr_ids))
        ids.append(idx)
test_df = pd.DataFrame({"id": ids, "text": texts}).fillna("")

In [None]:
# Copy the Python scripts from the gpt-eval-script input into the working directory.
!cp ../input/gpt-eval-script/*.py .

In [None]:
# NOTE(review): chunk_size / n_chunks are computed here but not referenced by
# any other visible cell.
chunk_size = 200000
n_chunks = math.ceil(len(test_df)/chunk_size)

# Rebinds `tokenizer` (the RoBERTa tokenizer from the previous cell) to GPT-2.
tokenizer = GPT2Tokenizer.from_pretrained('../input/gpt2-config/')
# NOTE(review): `config` is loaded but not used in the visible cells.
config = GPT2Config.from_pretrained('../input/gpt2-config/', output_hidden_states=True)
X_test, X_type_test = convert_lines(tokenizer,test_df,is_test=True, max_sequence_length=256)
# Only the token-id matrix is saved; X_type_test is not written to disk.
np.save("X_test_gpt.npy",X_test)

# tokenizer = RobertaTokenizer.from_pretrained('../input/roberta-base-config/')
# config = RobertaConfig.from_pretrained('../input/roberta-base-config/', output_hidden_states=True)
# X_test, X_type_test = convert_lines(tokenizer,test_df,is_test=True)
# np.save("X_test_roberta.npy",X_test)

In [None]:
# Run eval.py on the saved id matrix; the output path o1.txt is read back by the next cell.
!python eval.py --input_path ./X_test_gpt.npy --ckpt_path ../input/gpt-test/gpt2-medium_extra_data_13.bin --config_path ../input/gpt-medium-config --output_path o1.txt --model gpt --batch_size 8 --beam_size 10 --sigmoid_decoding --threshold 0.8
# List the working directory after the run.
!ls -halt

In [None]:
# Collect the stripped prediction lines from every o<i>.txt file.
selected_texts = []
for i in [1]:
    # Context manager closes the file promptly; open(...).readlines() left the
    # handle open until garbage collection.
    with open(f"o{i}.txt") as pred_file:
        data = [line.strip() for line in pred_file]
    selected_texts.extend(data)

In [None]:
good_words = [" Dataset "," Datasets ", " Database "," Databases ", " Data ", " Survey "," Study "," Studies "," Surveys "]
bad_words = [" are ", " is ", " was ", " were "]
def has_good_words(x):
    """Return True when ``x`` looks like a dataset name.

    ``x`` qualifies when it has no '.', '!' or '?', its first token is purely
    alphabetic, it contains none of ``bad_words`` and at least one of
    ``good_words`` as a whole, space-delimited, case-sensitive word.
    Empty or whitespace-only input returns False (it used to raise IndexError
    on ``x.split()[0]``).
    """
    tokens = x.split()
    if not tokens:
        return False
    if any(mark in x for mark in ".!?") or not tokens[0].isalpha():
        return False
    # Pad with spaces so words at either end still match the space-delimited lists.
    padded = f" {x} "
    if any(w in padded for w in bad_words):
        return False
    return any(w in padded for w in good_words)
def is_long_pred(x):
    """True for comma-free strings of 11-99 characters with more than three whitespace-separated tokens."""
    if "," in x:
        return False
    return 10 < len(x) < 100 and len(x.split()) > 3

# Count how often each non-empty model prediction occurs.
temp_df = pd.DataFrame()
temp_df["preds"] = [x for x in selected_texts if len(x) > 0]
pred_counts = temp_df.preds.value_counts()
# Predictions seen more than 3 times only need to pass is_long_pred; rarer ones
# must additionally pass has_good_words.
unique_preds =  [clean_text(x) for x in pred_counts.index[pred_counts.values > 3] if is_long_pred(x)]
more_preds = [clean_text(x) for x in pred_counts.index[pred_counts.values <= 3] if has_good_words(x) and is_long_pred(x)]
unique_preds.extend(more_preds)
unique_preds = list(set(unique_preds))

# Drop candidates whose token-set Jaccard similarity to any known label is >= 0.5.
black_list = []
for pred in unique_preds:
    for label in existing_labels:
        if jaccard_similarity(pred,label) >= 0.5:
            black_list.append(pred)
            break
black_list = list(set(black_list))
#     print(black_list, preds)
for x in black_list:
    unique_preds.remove(x)
best_preds = list(set(unique_preds))

In [None]:
# Peek at the first 20 entries of each candidate list and at both list sizes.
print(best_preds[:20], more_preds[:20], len(best_preds), len(more_preds))

In [None]:
import numpy as np
import re


def find_all_pred_in_text(normed_text, all_unique_preds):
    """Return the predictions found in ``normed_text`` after absorbing nested matches.

    Every occurrence of each prediction is located (matched literally). An
    occurrence that lies inside the span of a wider occurrence of another
    prediction is replaced by that enclosing prediction, so a short prediction
    survives only if it also occurs somewhere on its own.

    normed_text      - text to search
    all_unique_preds - iterable of candidate strings; empty strings are skipped

    Returns a sorted ``np.ndarray`` of the distinct surviving predictions
    (an empty array when nothing matches).
    """
    spans = []   # (start, end) of every occurrence
    owners = []  # prediction that produced the span at the same position
    for pred in all_unique_preds:
        if not pred or pred not in normed_text:
            continue
        # re.escape: predictions are literal text, not regex patterns, so
        # characters such as '+', '.', '(' must not be interpreted.
        for match in re.finditer(re.escape(pred), normed_text):
            spans.append((match.start(), match.end()))
            owners.append(pred)

    def _inside(inner, outer):
        # inner lies within outer and is not the very same span
        return outer[0] <= inner[0] and inner[1] <= outer[1] and inner != outer

    n = len(spans)
    # outermost[k] is True when no other occurrence strictly encloses occurrence k.
    outermost = [
        not any(_inside(spans[k], spans[m]) for m in range(n))
        for k in range(n)
    ]
    resolved = list(owners)
    for k in range(n):
        if outermost[k]:
            continue
        # Map a nested occurrence straight to an outermost enclosing occurrence.
        # Copying pairwise from whatever encloses it could leave an intermediate
        # prediction behind, depending on list order. When several outermost
        # spans enclose k, the last one wins.
        for m in range(n):
            if outermost[m] and _inside(spans[k], spans[m]):
                resolved[k] = owners[m]
    return np.unique(np.array(resolved))

In [None]:
# Map every submission Id to the '|'-joined model predictions found in its full text.
extra_dict = dict()
# NOTE(review): the cleaned test_df.text is not read again in the visible cells.
test_df["orig_text"] = test_df.text.copy()
test_df.text = test_df.text.apply(clean_text)
for idx in sample_sub["Id"].unique():
    sub_df = sample_sub[sample_sub["Id"] == idx]
    sub_texts = clean_text(sub_df.text.values[0])
    preds = []
    # Keep each candidate that occurs as a substring of this document.
    for pred in best_preds:
        if pred in sub_texts and pred not in preds:
            preds.append(pred)
    black_list = []
    # Replace matches that sit inside a longer matched prediction by that longer one.
    preds = list(find_all_pred_in_text(sub_texts, preds))
    # Same Jaccard >= 0.5 filter against known labels that was applied when
    # best_preds was built.
    for pred in preds:
        for label in existing_labels:
            if jaccard_similarity(pred,label) >= 0.5:
                black_list.append(pred)
                break
    black_list = list(set(black_list))
    for x in black_list:
        preds.remove(x)
    extra_dict[idx] = '|'.join(preds)

In [None]:
# One row per Id, starting from an empty prediction string.
submission = pd.DataFrame()
submission['Id'] = id_list
submission['PredictionString'] = ""
# Fill in the predictions from extra_dict; Ids missing from it keep "".
# NOTE: labels_list (the known-label string matches) is not merged in here.
submission['PredictionString'] = submission.apply(lambda row: extra_dict.get(row.Id, row.PredictionString),axis=1)
# submission.to_csv("submission.csv",index=False)
# !head submission.csv

In [None]:
def check_valid_acronym(label, acronym):
    """True when the letters of ``acronym``'s first token overlap enough with the
    initials of ``label``'s words (Jaccard similarity of the two letter sets >= 0.5)."""
    initials = ' '.join(word[0] for word in label.split())
    first_token = acronym.split()[0]
    # Joining a string with ' ' separates its characters, one token per letter.
    letters = ' '.join(first_token)
    return jaccard_similarity(initials, letters) >= 0.5

def is_last_word_acronym(label):
    """True when the last word of ``label`` passes ``check_valid_acronym`` against
    the words that precede it."""
    tokens = label.split()
    candidate = tokens[-1]
    remainder = " ".join(tokens[:-1])
    return True if check_valid_acronym(remainder, candidate) else False

def _all_start_offsets(haystack, needle):
    """Return every offset at which ``needle`` starts in ``haystack`` (overlaps included)."""
    offsets = []
    pos = haystack.find(needle)
    while pos != -1:
        offsets.append(pos)
        pos = haystack.find(needle, pos + 1)
    return offsets


def find_all_acronyms_candidates(row):
    """Extend a row's predictions with acronyms of the predicted labels.

    row - object with ``text`` (document text in which parentheses were kept) and
          ``PredictionString`` ('|'-separated labels)

    For each non-empty label the candidates are the non-blank contents of
    ``label (...)`` occurrences in the text, plus the label's own last word when
    ``is_last_word_acronym`` accepts it. A candidate is added when it also occurs
    somewhere other than directly inside ``label (candidate)`` and
    ``check_valid_acronym`` accepts it.

    Returns the '|'-joined distinct predictions in first-seen order (original
    labels first); the previous set-based de-duplication gave an arbitrary order.
    """
    string = row.text
    all_labels = row.PredictionString.split("|")
    found = list(all_labels)
    for label in all_labels:
        if label == "":
            continue
        # Raw f-string plus re.escape: the label is literal text, and '\(' in a
        # non-raw literal is an invalid escape sequence.
        pattern = rf"{re.escape(label)} \((.*?)\)"
        candidates = {ac for ac in re.findall(pattern, string) if ac.split()}
        if is_last_word_acronym(label):
            candidates.add(label.split()[-1])
        label_prefix_len = len(f"{label} (")
        for ac in sorted(candidates):
            # Offsets of the acronym where it appears as "label (ac)" ...
            inside_label = {
                pos + label_prefix_len
                for pos in _all_start_offsets(string, f"{label} ({ac})")
            }
            # ... versus every offset at which the acronym text appears at all.
            anywhere = set(_all_start_offsets(string, ac))
            if anywhere - inside_label and check_valid_acronym(label, ac):
                found.append(ac)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return "|".join(dict.fromkeys(found))

In [None]:
# Re-clean the full text while keeping parentheses, which
# find_all_acronyms_candidates needs to spot "label (acronym)" patterns.
sample_sub["text"] = sample_sub.text.apply(clean_text_v2)
# Series assignment aligns on index; presumably both frames share the default
# 0..n-1 index and the same row order - verify.
sample_sub.PredictionString = submission.PredictionString.copy()

In [None]:
# Append accepted acronyms to each row's predictions.
sample_sub.PredictionString = sample_sub.apply(find_all_acronyms_candidates,axis=1)
# Remove empty or whitespace-only entries from the '|'-separated list.
sample_sub.PredictionString = sample_sub.PredictionString.apply(lambda x: "|".join([i for i in x.split("|") if len(i.strip()) > 0]))
print(sample_sub.PredictionString.values[:5])

In [None]:
# Write only the Id and PredictionString columns to submission.csv.
sample_sub[["Id","PredictionString"]].to_csv("submission.csv",index=False)