In [None]:
import pandas as pd
import numpy as np
import os
import json
import glob
import re


In [None]:
class PATH:
    """Namespace for the competition input locations and the output file name."""

    # Root of the Kaggle input dataset; every input path below hangs off it.
    base = "/kaggle/input/coleridgeinitiative-show-us-the-data/"
    train = base + "train/"
    test = base + "test/"
    traincsv = base + "train.csv"

    sample_submission = base + "sample_submission.csv"
    # Relative path: written into the notebook's working directory.
    submission = "submission.csv"

In [None]:
from queue import Queue

class Node(dict):
    """One state of the pattern trie.

    The dict part of the node maps a symbol to the child `Node` reached by
    that symbol. On top of that each node carries three attributes:

    * ``final`` - boolean flag, ``False`` on creation.
    * ``out``   - set of outputs attached to this state, empty on creation.
    * ``fail``  - reference to another node (or ``None`` on creation).
    """

    def __init__(self):
        super().__init__()
        self.fail = None
        self.out = set()
        self.final = False

    def addout(self, out):
        """Attach output(s) to this node.

        A plain ``set`` is merged into the node's outputs; any other value is
        stored as a single element.
        """
        if type(out) is not set:
            self.out.add(out)
            return
        # Build a fresh set rather than mutating in place.
        self.out = self.out | out

    def addchild(self, alphabet, node=None):
        """Register a child under the symbol `alphabet`.

        A new empty `Node` is created unless `node` is given.
        """
        # Must test identity, not truthiness: a Node without children is an
        # empty dict and therefore falsy.
        child = node
        if child is None:
            child = Node()
        self[alphabet] = child

class AC():
    """Aho-Corasick automaton: finds all occurrences of many patterns in one
    left-to-right pass over a text.

    The constructor builds a trie from the patterns (`maketrie`) and then
    computes the failure links (`constructfail`). After that `search` may be
    called any number of times.

    Attributes:
        patterns: list holding the patterns the automaton was built from.
        head: root `Node` of the trie.
    """

    def __init__(self,patterns):
        """Build the automaton.

        Args:
            patterns: iterable of patterns. Each pattern is itself iterated
                symbol by symbol (for strings: character by character).
        """
        # Materialise the iterable: a generator or `map` object would
        # otherwise be left exhausted on `self.patterns` once the trie has
        # been built.
        self.patterns = list(patterns)
        self.head = Node()

        self.maketrie()
        self.constructfail()

    def search(self,sentence):
        """Return the patterns that occur in `sentence`.

        Matching is done on the raw symbols only; there is no word-boundary
        check, so a pattern also matches inside a longer token.

        Args:
            sentence: iterable of symbols (typically a string).

        Returns:
            A list with one entry per occurrence, grouped by the position
            where the occurrence ends. A pattern that occurs several times is
            listed several times. The relative order of patterns ending at the
            same position is unspecified (it follows set iteration order).
        """
        crr = self.head
        ret = []
        for c in sentence :
            # Follow failure links until a state that can consume `c` is
            # found, or the root is reached.
            while crr is not self.head and c not in crr:
                crr = crr.fail
            if c in crr:
                crr = crr[c]

            if crr.final:
                ret.extend(crr.out)
        return ret

    def maketrie(self):
        """Insert every pattern into the trie rooted at `self.head`."""
        for pattern in self.patterns:
            # An empty pattern would mark the root itself as final; the
            # failure-link propagation would then copy that to every state and
            # `search` would report the empty pattern at every position.
            if not pattern:
                continue
            crr = self.head
            for c in pattern :
                if c not in crr:
                    crr.addchild(c)
                crr = crr[c]
            crr.final = True
            crr.addout(pattern)

    def constructfail(self):
        """Compute the failure link of every trie node, breadth first.

        Each node also inherits the outputs and the `final` flag of its
        failure target, so that `search` reports patterns that end at the
        current position as a proper suffix of the matched prefix.
        """
        queue = Queue()
        self.head.fail = self.head
        queue.put(self.head)
        while not queue.empty():
            crr = queue.get()
            for nextc in crr:
                child = crr[nextc]

                if crr is self.head:
                    # Depth-1 nodes always fall back to the root.
                    child.fail = self.head
                else :
                    # Walk the parent's failure chain until a state with an
                    # outgoing `nextc` edge is found (or the root is reached).
                    f = crr.fail
                    while f is not self.head and nextc not in f:
                        f = f.fail
                    if nextc in f:
                        f = f[nextc]
                    child.fail = f

                # The failure target is shallower than `child`, so breadth-first
                # order guarantees its outputs are already complete.
                child.addout(child.fail.out)
                child.final |= child.fail.final

                queue.put(child)
        

In [None]:
# from https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview/evaluation
def clean_text(txt):
    """Lower-case `txt` and squash every non-alphanumeric run into one space.

    The argument is first converted with `str()`, so non-string values are
    accepted. Leading and trailing spaces produced by the substitution are
    kept, not trimmed.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
df_train = pd.read_csv(PATH.traincsv)

# Strip *before* de-duplicating so labels that differ only in surrounding
# whitespace collapse into one pattern. Missing values are dropped and empty
# strings are filtered out, since neither is a usable search pattern.
# The result is a real list: a lazy `map` object would be exhausted once the
# automaton has consumed it, leaving `unq_labels` empty for later cells.
unq_labels = [
    label
    for label in df_train['cleaned_label'].dropna().str.strip().unique()
    if label
]

ac = AC(unq_labels)

In [None]:
# Quick manual probe of the automaton on a hand-written, already-cleaned sentence.
# AC.search has no word-boundary check, so a label can also match inside a
# longer token such as "adniiii" -- presumably what this probe is meant to show
# (depends on which labels are in train.csv; verify against the output).
ac.search(' oh alzheimer s disease neuroimaging initiative adniiii')

In [None]:
ids, predictions = [], []

# Sorted so the submission rows come out in the same order on every run;
# glob yields paths in arbitrary order.
for path in sorted(glob.iglob(PATH.test+"*")):
    # Id = file name without its extension (no hard-coded suffix length).
    idx = os.path.splitext(os.path.basename(path))[0]

    # Explicit encoding: the platform default is locale-dependent.
    with open(path, 'r', encoding='utf-8') as file:
        json_file = json.load(file)

    # Each element of the file is searched on its own, so a label is only
    # found when it lies entirely inside one element's 'text'.
    pred = []
    for content in json_file :
        txt = clean_text(content['text'])
        pred += ac.search(txt)

    # De-duplicate, then sort so the joined string does not depend on set
    # iteration order (which varies between interpreter runs).
    pred = sorted(set(pred))
    predictions.append("|".join(pred))
    ids.append(idx)


In [None]:
# Two-column submission table, one row per processed test file; written
# without the index column.
submission = pd.DataFrame({'Id': ids, 'PredictionString': predictions})
submission.to_csv(PATH.submission, index=False)