In [None]:
import numpy as np
import pandas as pd
import os
import json
import re
import glob
import spacy
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from nltk.tag.sequential import ClassifierBasedPOSTagger

In [None]:
# Load train.csv into a DataFrame. The tagging loop further down reads its
# `Id` and `dataset_label` columns for every row.
train_csv = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")
# Preview the first rows.
train_csv.head()

In [None]:
# (number of rows, number of columns) of the training table.
train_csv.shape

In [None]:
# list of training publications: paths of every *.json file in the train directory.
# NOTE(review): this uses an absolute /kaggle/input path while the other cells
# use the relative ../input prefix, and `train_pubs` is not referenced by the
# cells visible here — confirm it is still needed.
train_pubs = glob.glob("/kaggle/input/coleridgeinitiative-show-us-the-data/train/*.json")

In [None]:
def clean_sentence(txt):
    """
    returns `txt` (converted with str()) where every run of characters other
    than ASCII letters, digits and '.' is collapsed into a single space.
    Letter case is preserved and the result is not stripped.
    """
    unwanted_run = re.compile('[^A-Za-z0-9.]+')
    as_text = str(txt)
    return unwanted_run.sub(' ', as_text)

In [None]:
# NOTE(review): repeats the preview already displayed right after train.csv
# was loaded; candidate for removal.
train_csv.head()

In [None]:
import nltk

# Build the IOB training data.
# DATA holds one entry per processed train.csv row; each entry is a flat list
# of ((word, pos), iob) pairs covering every sentence of that row's
# publication in which the row's dataset label was found. Rows whose label is
# never found contribute an empty list, so DATA stays aligned with the rows.
DATA = []
for idx, row in tqdm(train_csv[0:500].iterrows()):
    TRAIN_DATA = []

    pub = "../input/coleridgeinitiative-show-us-the-data/train/" + row.Id + ".json"
    # Context manager so the file handle is released even if parsing fails.
    with open(pub, encoding="utf-8") as f:
        data = json.load(f)

    # The sentences are searched *after* clean_sentence(), so the label has to
    # go through the same cleaning or any label containing punctuation could
    # never match. re.escape() makes the remaining '.' characters literal
    # instead of regex wildcards.
    label = clean_sentence(row.dataset_label).strip()
    if not label:
        # An empty pattern would "match" at position 0 of every sentence and
        # produce meaningless tags.
        DATA.append(TRAIN_DATA)
        continue
    label_pattern = re.compile(re.escape(label))

    # NOTE(review): str(data) is the repr of the whole parsed JSON object, so
    # sentence splitting runs over that repr. Kept unchanged because the
    # prediction loop builds its sentences the same way.
    sentences = nltk.tokenize.sent_tokenize(str(data))
    for sentence in sentences:
        sentence = clean_sentence(sentence).strip()

        # Only the first occurrence of the label in a sentence is tagged.
        loc = label_pattern.search(sentence)
        if loc is None:
            continue
        begin, end = loc.span()

        # span() is half-open, so the label is exactly sentence[begin:end];
        # slicing to end+1 would pull the following character into the entity
        # and drop it from the trailing context.
        pos_tag1 = nltk.pos_tag(nltk.word_tokenize(sentence[:begin]))
        pos_tag2 = nltk.pos_tag(nltk.word_tokenize(sentence[begin:end]))
        pos_tag3 = nltk.pos_tag(nltk.word_tokenize(sentence[end:]))

        TRAIN_DATA.extend((pos, "O") for pos in pos_tag1)
        for position, pos in enumerate(pos_tag2):
            # First token of the label opens the chunk, the rest continue it.
            iob = "B-WORK_OF_ART" if position == 0 else "I-WORK_OF_ART"
            TRAIN_DATA.append((pos, iob))
        TRAIN_DATA.extend((pos, "O") for pos in pos_tag3)

    DATA.append(TRAIN_DATA)
            

In [None]:
import pickle
import nltk
import string

from nltk import pos_tag
from nltk import word_tokenize
from nltk.chunk import ChunkParserI
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import ClassifierBasedTagger
from nltk.tag.util import untag
from nltk.stem.snowball import SnowballStemmer


In [None]:
# IOB tag name for specifying dataset label
GPE_TAG = "WORK_OF_ART"

class DatasetChunker(ChunkParserI):
    """
    Chunker that assigns IOB tags to POS-tagged tokens with a
    ClassifierBasedTagger driven by the hand-written `features` detector.

    `train_sents` = list of sentences in the form [((w1, t1), iob1), ...],
                    passed as the tagger's `train` argument
    `kwargs`      = forwarded unchanged to ClassifierBasedTagger
    """

    # One stemmer shared by all feature extractions; `features` runs once per
    # token, so building a new SnowballStemmer on every call is wasted work.
    _stemmer = SnowballStemmer('english')

    def __init__(self, train_sents, **kwargs):
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=self.features,
            **kwargs)

    def parse(self, tagged_sent):
        """
        returns the nltk.Tree built from the IOB tags predicted for
        `tagged_sent`, a POS-tagged sentence [(w1, t1), ...]
        """
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)

    @staticmethod
    def _is_capitalized(token):
        """returns True when `token` starts with an ASCII uppercase letter"""
        # Guard the empty string: token[0] would raise IndexError.
        return bool(token) and token[0] in string.ascii_uppercase

    def features(self, tokens, index, history):
        # for more details see: http://nlpforhackers.io/named-entity-extraction/

        """
        returns the feature dict for the token at `index`

        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        `history` = the previous predicted IOB tags
        """

        # Pad the sequence with placeholders so the two-token look-behind and
        # look-ahead below never fall off either end of the sentence
        tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index with 2, to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        # True only when every character is a lowercase ASCII letter. The old
        # `all([True for c in word if ...])` could only ever contain True
        # values, so the feature was constant.
        allascii = all(c in string.ascii_lowercase for c in word)

        # str.capitalize() yields "Xxxx", i.e. a title-case test; an all-caps
        # feature has to check for upper case.
        allcaps = word.isupper()
        capitalized = self._is_capitalized(word)

        prevallcaps = prevword.isupper()
        prevcapitalized = self._is_capitalized(prevword)

        # These two were previously computed from `prevword`, so the "next"
        # features merely duplicated the "prev" ones.
        nextallcaps = nextword.isupper()
        nextcapitalized = self._is_capitalized(nextword)

        stem = self._stemmer.stem

        f = {
            'word': word,
            'lemma': stem(word),
            'pos': pos,
            'all-ascii': allascii,

            'next-word': nextword,
            'next-lemma': stem(nextword),
            'next-pos': nextpos,

            'next-next-word': nextnextword,
            'nextnextpos': nextnextpos,

            'prev-word': prevword,
            'prev-lemma': stem(prevword),
            'prev-pos': prevpos,

            'prev-prev-word': prevprevword,
            'prev-prev-pos': prevprevpos,

            'prev-iob': previob,

            'contains-dash': contains_dash,
            'contains-dot': contains_dot,

            'all-caps': allcaps,
            'capitalized': capitalized,

            'prev-all-caps': prevallcaps,
            'prev-capitalized': prevcapitalized,

            'next-all-caps': nextallcaps,
            'next-capitalized': nextcapitalized,
        }

        return f

def get_dataset_chunker(dataset_file_name):
    """
    returns a DatasetChunker built from `dataset_file_name`
    `dataset_file_name` = despite its name, this value is handed to
                          DatasetChunker as-is as its training sentences;
                          no file is opened or unpickled here
    """
    return DatasetChunker(dataset_file_name)

In [None]:
def get_chuncker_accuracy(chunker, test_samples):
    """
    returns the accuracy of `chunker` measured against the gold standard
    `test_samples` = list of sentences in the form [((w1, t1), iob1), ...]
    """
    gold_trees = []
    for sample in test_samples:
        # Flatten ((word, tag), iob) pairs into (word, tag, iob) triplets
        # before converting the sentence to a tree.
        triplets = [(word, tag, iob) for (word, tag), iob in sample]
        gold_trees.append(conlltags2tree(triplets))

    return chunker.evaluate(gold_trees).accuracy()

def get_tagged_sentence(chunker, sentence):
    """
    returns the tree produced by `chunker.parse` for `sentence`
    after the sentence has been word-tokenized and POS-tagged
    """
    words = word_tokenize(sentence)
    tagged_words = pos_tag(words)
    return chunker.parse(tagged_words)

def extract_dataset(chunker, sentence):
    """
    returns one list of words per subtree labelled GPE_TAG
    found in the tagged tree of `sentence`
    """
    tree = get_tagged_sentence(chunker, sentence)
    labelled = tree.subtrees(filter=lambda node: node.label() == GPE_TAG)
    # untag() drops the POS tags, leaving only the words of each chunk.
    return [untag(node.leaves()) for node in labelled]

In [None]:
# Build the chunker from the first 25 entries of DATA. Those entries are
# handed to ClassifierBasedTagger as its `train` argument, so this cell is
# where the model gets its training data even though the message says "Loading".
print("Loading dataset...")
chunker = get_dataset_chunker(DATA[0:25])
print("Done.")

In [None]:
# load submission.csv
# The sample submission is read once: the frame is filled in with predictions
# later, and its Id column doubles as the list of test publications.
sub = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/sample_submission.csv")

# getting list of publication ids in the test set
test_pubs = sub.Id

In [None]:
# Predict dataset mentions for every test publication and store them in `sub`.
# enumerate() supplies the row position; `test_pubs` and `sub` come from the
# same CSV, so position i in one is row i of the other.
for i, pub in enumerate(test_pubs):
    print("pub:", pub)

    # Context manager so the file handle is released even if parsing fails.
    with open("../input/coleridgeinitiative-show-us-the-data/test/" + pub + ".json", encoding="utf-8") as f:
        data = json.load(f)

    sentences = nltk.tokenize.sent_tokenize(str(data))

    # Distinct mentions in order of first appearance. Previously every word of
    # every mention was appended to one space-separated string, which glued
    # separate datasets together and repeated ones mentioned more than once.
    predictions = []
    for sentence in sentences:
        text = clean_sentence(sentence).strip()
        for words in extract_dataset(chunker, text):
            mention = " ".join(words).lower().strip()
            if mention and mention not in predictions:
                predictions.append(mention)

    # As before, a row is only overwritten when something was found; otherwise
    # the value from the sample submission is left in place.
    if predictions:
        # Single .loc call instead of chained `sub.PredictionString.loc[i] = ...`,
        # which may assign into a temporary copy.
        # NOTE(review): '|' is assumed to be the delimiter the competition
        # expects between multiple predictions — confirm on the evaluation page.
        sub.loc[i, "PredictionString"] = "|".join(predictions)

In [None]:
# Write the submission file without the DataFrame index column,
# then display the frame for a final check.
sub.to_csv('submission.csv',index=False)
sub