In [None]:
import numpy as np
import pandas as pd
import re

# Load the small English spaCy pipeline. In the cells visible here only its
# default stop-word set is used (get_acronym skips stop words).
import spacy
nlp = spacy.load('en_core_web_sm')
all_stopwords = nlp.Defaults.stop_words

# tqdm.pandas() registers `progress_apply` on pandas objects; it is used
# further down when the document texts are loaded.
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
class CFG:
    """Switches and thresholds for the label string-matching run."""
    # Clean the document text before matching; when False the matched labels
    # are cleaned after matching instead.
    clean_before_matching = True
    # Also add acronyms derived from the known labels to the label list.
    use_acronym = False
    # Also add the titles of the additional government dataset list.
    use_govt = True
    # Minimum fraction of documents that must have at least one match;
    # below it every prediction is blanked.
    probe_threshold = 0.3


# Module-level aliases of the configuration values.
CLN_BFR_MTCH, USE_ACRONYM, USE_GOVT, PROBE_THRESHOLD = (
    CFG.clean_before_matching,
    CFG.use_acronym,
    CFG.use_govt,
    CFG.probe_threshold,
)

# Label column name that corresponds to the chosen cleaning mode.
LABEL = "cleaned_label" if CLN_BFR_MTCH else "dataset_label"

In [None]:
def clean_text(txt):
    """
    Lower-case the input and replace every run of non-alphanumeric
    characters with a single space.

    Args:
        txt: str (any other value is converted with str() first)
    Returns:
        txt: str, without leading or trailing whitespace
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()


def clean_text_strong(txt):
    """
    Stronger variant of clean_text: standalone number tokens are removed
    first, then the text is lower-cased and every run of non-alphanumeric
    characters is replaced with a single space.

    Digits that are part of a longer word (e.g. "covid19") are kept, because
    only whole-number tokens delimited by word boundaries are dropped.

    Args:
        txt: str (any other value is converted with str() first)
    Returns:
        txt: str, without leading or trailing whitespace
    """
    # Coerce to str *before* the first substitution. The original only
    # coerced for the second one, so non-string input (e.g. a float NaN from
    # a blank CSV cell) raised TypeError here while clean_text accepted it.
    txt = re.sub(r"\b\d+\b", "", str(txt))
    return re.sub('[^A-Za-z0-9]+', ' ', txt.lower()).strip()


def get_acronym(txt):
    """
    Build a lower-case acronym from the first letters of the non-stop-words
    of a label.

    Args:
        txt: str
    Returns:
        txt: str, the acronym; "" when the cleaned label has fewer than
            two words
    """
    words = clean_text_strong(txt).split()

    # A single word (or nothing at all) has no meaningful acronym.
    if len(words) <= 1:
        return ""

    # Initial of every word that is not a stop word, glued together.
    return "".join(word[0] for word in words if word not in all_stopwords)

In [None]:
# Collect every known dataset name from the training annotations: title, raw
# label and cleaned label, each lower-cased, de-duplicated through a set.
dataset_label_train = set()
train = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")
for label_1, label_2, label_3 in train[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
    dataset_label_train.add(str(label_1).lower())
    dataset_label_train.add(str(label_2).lower())
    dataset_label_train.add(str(label_3).lower())
print("len(dataset_label_train) before USE_GOVT: ", len(dataset_label_train))

if USE_GOVT:
    # Optionally extend the label set with titles from the additional
    # government dataset list.
    adnl_govt_labels_path = '../input/coleridge-additional-gov-datasets-22000popular/data_set_800_with2000popular.csv'
    adnl_govt_labels = pd.read_csv(adnl_govt_labels_path)
    # A blank cell is read as float NaN (and a purely numeric title as a
    # number). Such a value would later crash string matching, which calls
    # str methods on every label, so skip missing titles and coerce the rest
    # to str. Titles are otherwise added unchanged, as before.
    for govt_title in adnl_govt_labels["title"].dropna():
        dataset_label_train.add(str(govt_title))
    print("len(dataset_label_train) after USE_GOVT: ", len(dataset_label_train))

In [None]:
# Freeze the label set into a list. Iteration order of a set of strings
# changes between interpreter runs (string hashes are randomized), which made
# the order of matched labels in the output non-reproducible; sorting fixes
# the order. key=str keeps the sort safe should a non-string value be present.
dataset_label_train = sorted(dataset_label_train, key=str)

In [None]:
# if USE_GOVT:
#     gvt = pd.read_csv("../input/bigger-govt-dataset-list/data_set_26897.csv")
#     dataset_label_train = list(set(gvt["title"]))
#     del gvt
# else:
#     # set(train[LABEL]) - set(gvt["title"]) == 0, but...
#     train = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")
#     dataset_label_train = list(set(train[LABEL]))
#     del train  
# print("len(dataset_label_train) before USE_ACRONYM: ", len(dataset_label_train))



if USE_ACRONYM:
    # Derive an acronym from every known label and keep those that are 3 to 9
    # characters long. The length bound also discards the empty string that
    # get_acronym returns for single-word labels.
    acronyms = [
        acro
        for acro in (get_acronym(txt) for txt in dataset_label_train)
        if 2 < len(acro) < 10
    ]

    # Merge and de-duplicate. Sorting (instead of list(set(...))) keeps the
    # label order identical between runs, since set order of strings depends
    # on per-process hash randomization.
    dataset_label_train = sorted(set(dataset_label_train).union(acronyms), key=str)
    print("len(dataset_label_train) after USE_ACRONYM: ", len(dataset_label_train))

In [None]:
#[label for label in dataset_label_train if len(label.split())==1]

In [None]:
# Sanity probe: show the entries of the label list equal to a given keyword.
kw = "individual"
ind = np.nonzero(np.array(dataset_label_train) == kw)[0]
np.array(dataset_label_train)[ind]

In [None]:
# Display the first ten entries of the label list as a quick visual check.
dataset_label_train[:10]

In [None]:
# Start from the sample submission; its "Id" column is used below to load the
# text of each test document.
df = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")

def get_text(filename, train=False):
    """
    Read one document JSON file and return its text as a single string.

    Args:
        filename: str, file name without the ".json" extension
        train: bool, read from the train folder when truthy, otherwise
            from the test folder
    Returns:
        text: str, the values of the file's "text" column joined by single
            spaces, passed through clean_text when CLN_BFR_MTCH is set
    """
    split = "train" if train else "test"
    doc = pd.read_json(f'../input/coleridgeinitiative-show-us-the-data/{split}/{filename}.json')

    text = " ".join(doc['text'])

    return clean_text(text) if CLN_BFR_MTCH else text

# Load the text of every test document listed in the submission frame
# (with a progress bar), then display the frame.
df["text"] = df["Id"].progress_apply(get_text)
df

In [None]:
def string_matching(txt, labels):
    """
    Find which of the known labels occur in a document.

    Multi-word labels are matched as plain substrings of txt, so they can
    also match across the inside of longer words. Single-word labels must be
    equal to a whole whitespace-separated token of txt.

    Args:
        txt: str
        labels: List[str]; entries that are not str are skipped
    Returns:
        pred: str, the matched labels in input order joined with "|"
            ("" when nothing matched)
        found_flag: int, 1 if pred is non-empty, otherwise 0
    """
    # Tokenise the document once. The original re-split txt for every
    # single-word label, which is loop-invariant work repeated per label.
    tokens = set(txt.split())

    preds = []
    for label in labels:
        if not isinstance(label, str):
            # e.g. a float NaN that came from a blank CSV cell: it cannot
            # match any text and has no .split(), so skip it.
            continue

        n_words = len(label.split())
        if n_words > 1:
            if label in txt:
                preds.append(label)
        elif n_words == 1:
            if label in tokens:
                preds.append(label)
        # n_words == 0 (empty / whitespace-only label) never matches.

    if not CLN_BFR_MTCH:
        # The text was matched raw, so clean the matched labels here instead.
        preds = [clean_text(pred) for pred in preds]

    pred = "|".join(preds)
    found_flag = 1 if pred != "" else 0

    return pred, found_flag

In [None]:
# Match every document against the label list. The predictions are collected
# first and assigned as one whole column: writing strings cell by cell with
# df.loc into a column that may have been read as float (all-NaN) is
# deprecated in recent pandas, and iterating the Series (which has a length)
# lets tqdm show a total.
matches = [string_matching(text, dataset_label_train) for text in tqdm(df["text"])]
df["PredictionString"] = [pred for pred, _ in matches]
counter = sum(found_flag for _, found_flag in matches)

# Probe: when fewer than PROBE_THRESHOLD of the documents have any match,
# blank every prediction. An empty frame is skipped to avoid dividing by zero.
if len(df) > 0 and counter/len(df) < PROBE_THRESHOLD:
    df["PredictionString"] = "" # give up the entire preds like a good grace

# The raw text is not part of the submission.
df = df.drop(columns="text")
df

In [None]:
#"a significant body" in df.loc[3, "text"]

In [None]:
#"ody" in df.loc[3, "text"].split()

In [None]:
# Write the frame to submission.csv in the working directory, without the index column.
df.to_csv("submission.csv", index=False)