In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction

Hello everyone! This notebook is an attempt to treat the problem as an NER (Named Entity Recognition) problem. For this purpose I have used the spaCy library. You are welcome to leave any comments and suggestions to improve the result!

## Load the data

In [None]:
# Read the sample submission CSV into `sub` and display it.
sample_submission_path = '/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)
sub

In [None]:
# Read the training CSV into `train_csv` and display it.
train_csv_path = '/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv'
train_csv = pd.read_csv(train_csv_path)
train_csv

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity between the word sets of two strings.

    Both strings are split on single spaces and compared as sets of tokens,
    so a word repeated inside one string is counted only once (the union is
    the size of the combined set, not the sum of the token-list lengths).

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        float: size of the intersection divided by size of the union,
        in the range [0, 1].
    """
    tokens1 = set(s1.split(" "))
    tokens2 = set(s2.split(" "))
    # `str.split(" ")` always yields at least one token, so the union is never empty.
    return len(tokens1 & tokens2) / len(tokens1 | tokens2)

def clean_text(txt):
    """Lower-case `txt` and collapse every non-alphanumeric run into one space.

    The input is converted with `str()` first, so non-string values are
    accepted. Leading and trailing whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

import glob

# Collect the paths of the JSON files for the train and test splits.
train_files, test_files = (
    glob.glob("/kaggle/input/coleridgeinitiative-show-us-the-data/train/*.json"),
    glob.glob("/kaggle/input/coleridgeinitiative-show-us-the-data/test/*.json"),
)

## Generate dataframes from jsons

In [None]:
from tqdm import tqdm

# Read every training JSON into its own frame, tag it with the publication id
# taken from the file name, and concatenate once at the end. Growing a
# DataFrame with `pd.concat` inside the loop re-copies all previously read
# rows on every iteration.
train_frames = []

for train_file in tqdm(train_files):
    file_data = pd.read_json(train_file)
    # File name without directory and extension: ".../train/<id>.json" -> "<id>".
    # The `replace` drops a leading "train\" left over when the path uses backslashes.
    file_data.insert(0,'pub_id', train_file.split('/')[-1].split('.')[0].replace('train\\', ''))
    train_frames.append(file_data)

# `pd.concat` raises on an empty list, so fall back to an empty frame when no file was found.
df_train_publications = pd.concat(train_frames) if train_frames else pd.DataFrame()

df_train_publications

In [None]:
# Read every test JSON into its own frame, tag it with the publication id
# taken from the file name, and concatenate once at the end. Growing a
# DataFrame with `pd.concat` inside the loop re-copies all previously read
# rows on every iteration.
test_frames = []

for test_file in tqdm(test_files):
    file_data = pd.read_json(test_file)
    # File name without directory and extension: ".../test/<id>.json" -> "<id>".
    # The `replace` drops a leading "test\" left over when the path uses backslashes.
    file_data.insert(0,'pub_id', test_file.split('/')[-1].split('.')[0].replace('test\\', ''))
    test_frames.append(file_data)

# `pd.concat` raises on an empty list, so fall back to an empty frame when no file was found.
df_test_publications = pd.concat(test_frames) if test_frames else pd.DataFrame()

df_test_publications

## Implement several functions to concatenate text

In [None]:
def compare(column):
    """Join the distinct values of `column` into one '|'-separated string.

    Duplicates are removed and the remaining values are sorted, so the
    result does not depend on set iteration order and is identical from
    run to run.

    Args:
        column: Iterable of strings (for example a pandas Series).

    Returns:
        str: The unique values in ascending order, joined with '|'.
        An empty input gives ''.
    """
    return '|'.join(sorted(set(column)))

def make_list(df):
    """Collapse `df` to one row per `Id` with its labels merged by `compare`.

    Args:
        df: DataFrame with at least the columns 'Id' and 'cleaned_label'.

    Returns:
        DataFrame with the columns 'Id' and 'cleaned_label', one row per
        distinct `Id` in order of first appearance, where 'cleaned_label'
        holds the `compare` result for that id's labels. The index is a
        fresh 0..n-1 range.
    """
    # A single grouped aggregation replaces the per-id filter + concat loop,
    # and only the 'cleaned_label' column is passed to `compare`.
    merged_labels = df.groupby('Id', sort=False)['cleaned_label'].agg(compare)
    return merged_labels.reset_index()

In [None]:
def concat(column):
    """Join the string entries of `column` into one space-separated text.

    Entries that are not strings (for example missing values) are skipped.
    A single space is placed between consecutive entries so that the last
    word of one entry does not merge with the first word of the next.

    Args:
        column: Iterable of values, typically a pandas Series of text pieces.

    Returns:
        str: The string entries joined by ' '. An input without any string
        entry gives ''.
    """
    return ' '.join(st for st in column if isinstance(st, str))

In [None]:
# One row per publication: all `text` entries of a publication merged by `concat`.
train = df_train_publications.groupby('pub_id')['text'].apply(concat).reset_index()

# Attach a label by matching publication ids. Assigning a Series through
# `.loc` aligns on the row index, and `train` and `train_csv` do not share an
# index, so the lookup has to go through `Id` explicitly.
# `train_csv` may hold several rows per `Id`; the first `cleaned_label` is
# kept so that `train` stays at one row per publication. Publications that
# are absent from `train_csv` get NaN.
first_label_by_id = train_csv.drop_duplicates(subset='Id').set_index('Id')['cleaned_label']
train['cleaned_label'] = train['pub_id'].map(first_label_by_id)

In [None]:
# Display the `train` frame (columns: pub_id, text, cleaned_label).
train

In [None]:
# One row per test publication: all of its `text` entries merged by `concat`.
test = (
    df_test_publications
    .groupby('pub_id')['text']
    .apply(concat)
    .reset_index()
)
test

## Named Entity Recognition

In [None]:
# NER
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path

from spacy.util import minibatch, compounding
from pathlib import Path
import random

def chunks(lst, n):
    """Generate consecutive slices of `lst`, each holding at most `n` items."""
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)

def train_spacy(data, iterations, n_samples=300):
    """Train a blank English spaCy pipeline with a single 'DATASET' NER label.

    NOTE(review): the calls used here (`create_pipe`, `add_pipe(component)`,
    `begin_training`, `update(texts, annotations, ...)`) look like the
    spaCy v2 API -- confirm against the installed spaCy version.

    Args:
        data: Sequence of `(text, annotations)` examples.
        iterations: Number of passes over the sampled examples.
        n_samples: Maximum number of examples drawn from `data`. Defaults to
            300, the value that was previously hard-coded.

    Returns:
        The trained `nlp` pipeline object.

    Raises:
        ValueError: If `data` is empty.
    """
    if len(data) == 0:
        raise ValueError("train_spacy() needs at least one training example")

    # Draw without replacement: no example is duplicated, and a dataset
    # smaller than `n_samples` is used once in full instead of oversampled.
    sampled = random.sample(list(data), min(n_samples, len(data)))

    nlp=spacy.blank('en', disable=['parser'])

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label('DATASET')

    optimizer = nlp.begin_training()
    for itn in range(iterations):
        print("Statring iteration " + str(itn))
        random.shuffle(sampled)
        # Losses are accumulated by `nlp.update` over one full pass.
        losses = {}
        for text, annotations in tqdm(sampled):
            nlp.update(
                [text],
                [annotations],
                drop=0.2,
                sgd=optimizer,
                losses=losses)
        print("ITERATION {}, Losses".format(itn), losses)
    return nlp

In [None]:
# Show the first rows of `train`.
train.head()

## Make TRAIN_DATA

In [None]:
# Build training examples of the form (text, {"entities": [(start, end, label), ...]}).
TRAIN_DATA = []
for raw_text, label in tqdm(zip(train['text'], train['cleaned_label']), total=len(train)):
    # Rows without a usable label (missing or empty) cannot be annotated.
    if not isinstance(label, str) or not label:
        continue

    # Normalise the text with `clean_text` (lower-case, punctuation runs -> one space).
    # `str.replace` with one long punctuation literal only matches that exact
    # sequence, so it never removed individual punctuation characters.
    # NOTE(review): presumably `cleaned_label` was normalised the same way -- verify.
    text = clean_text(raw_text)

    # Annotate every whole-word occurrence of the label. Each span ends at the
    # end of the matched label, not at the end of the text.
    label_pattern = r'\b' + re.escape(label) + r'\b'
    entities = [
        (match.start(), match.end(), "DATASET")
        for match in re.finditer(label_pattern, text)
    ]
    if entities:
        TRAIN_DATA.append((text, {"entities": entities}))

In [None]:
# Number of examples collected in TRAIN_DATA.
len(TRAIN_DATA)

## Start training

In [None]:
import warnings

# Silence all warnings and require a GPU before training starts.
warnings.filterwarnings("ignore")
spacy.require_gpu()

# Train for 3 iterations on TRAIN_DATA.
prdnlp = train_spacy(data=TRAIN_DATA, iterations=3)

# Save the trained pipeline under the path 'ner'.
modelfile = 'ner'
prdnlp.to_disk(modelfile)

## Test the resulting model

In [None]:
# Run the trained pipeline over the first 100 training texts and print every
# predicted entity, each one followed by a separator line.
for text, _annotations in tqdm(TRAIN_DATA[:100]):
    for ent in prdnlp(text).ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        print('-----------------------------------')

Okay, we can see that a model trained on 300 texts cannot find the dataset names in the texts. Do you have any ideas for improvement?