伊藤氏作成の[
EX_Data_patern+Spacy3_TR(0.604) ver.2](https://www.kaggle.com/ti110106/ex-data-patern-spacy3-tr-0-604?scriptVersionId=66122583) のコピー.<br>
inputデータはlocalnb004-spacy-trainのoutput.


This notebook uses simple string matching: if a known dataset title appears in a document, it "predicts" that title. It uses the list of 180 dataset titles from the training data and adds some hand-curated government dataset titles.

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib
import pickle
import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import torch 
if torch.cuda.is_available():
    import cupy

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct words they share divided by the number of distinct
    words in either of them.

    Arguments -
    str1 - First string.
    str2 - Second string.

    Returns -
    A float between 0.0 and 1.0.  When neither string contains a word
    (both empty or whitespace-only) the result is 0.0 instead of a
    ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # No words on either side: there is nothing that could match.
        return 0.0
    return len(a & b) / union_size

def get_count_tp_fp_fn(prediction, verbose=True):
    """Count the "TP", "FP" and "FN" tags in a space-separated tag string.

    Arguments -
    prediction - String of tags separated by single spaces.
    verbose - When True, print the list of tags obtained from the split.

    Returns -
    A list [tp_count, fp_count, fn_count]; any other tag is ignored.
    """
    tags = prediction.split(" ")
    if verbose:
        print(tags)
    return [tags.count(kind) for kind in ("TP", "FP", "FN")]

def make_col_tp_fp_fn(df, col):
    """Split a column of [tp, fp, fn] triples into 'TP', 'FP', 'FN' columns.

    Arguments -
    df - DataFrame that is modified in place.
    col - Name of the column whose values are indexable as [0], [1], [2].

    Returns -
    The same DataFrame object, with the three columns added.
    """
    for position, column_name in enumerate(('TP', 'FP', 'FN')):
        # Bind position as a default so each lambda keeps its own index.
        df[column_name] = df[col].apply(lambda counts, position=position: counts[position])
    return df

def get_precision_recall(tp, fp, fn):
    """Compute precision and recall from TP / FP / FN counts.

    Arguments -
    tp - Number of true positives.
    fp - Number of false positives.
    fn - Number of false negatives.

    Returns -
    A (precision, recall) tuple.  A ratio whose denominator is zero
    (no predictions at all, or no ground truth at all) is reported as 0.0
    instead of dividing by zero.
    """
    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted else 0.0
    recall = tp / actual if actual else 0.0
    return precision, recall

def fbeta_score(precision, recall, beta):
    """Compute the F-beta score from precision and recall.

    Arguments -
    precision - Precision value.
    recall - Recall value.
    beta - Weight of recall relative to precision.

    Returns -
    (1 + beta^2) * precision * recall / (beta^2 * precision + recall),
    or 0.0 when that denominator is zero (precision and recall both 0).
    """
    beta_sq = beta * beta
    denominator = (beta_sq * precision) + recall
    if denominator == 0:
        return 0.0
    return (1 + beta_sq) * ((precision * recall) / denominator)

def coleridge_initiative_jaccard(ground_truth, prediction, verbose=True):
    """Tag the predictions and ground truths of one row as TP / FP / FN.

    Arguments -
    ground_truth - Ground-truth labels joined with '|'.
    prediction - Predicted labels joined with '|' (processed in sorted order).
    verbose - When True, print the two split label lists.

    Returns -
    A tuple (js_scores, tags):
    js_scores - One best Jaccard score per emitted tag, in the same order.
    tags - The tags joined with single spaces.

    Tagging rules -
    * every non-empty prediction is "TP" when its best Jaccard score against
      any ground truth is >= 0.5, otherwise "FP";
    * every ground truth whose best Jaccard score against any prediction is
      below 0.5 is "FN".  (Using the same 0.5 threshold as the TP rule means
      a partially overlapping but unmatched ground truth is no longer
      silently dropped from the counts.)
    """
    gts = ground_truth.split('|')
    pds = sorted(prediction.split('|'))
    if verbose:
        print("Ground truth : " , gts)
        print("Prediction : ", pds)

    js_scores = []
    cf_matrix = []

    #### Counting True Positives (TP) and False Positives (FP)
    for pred in pds:
        if not pred:
            # An empty prediction string is not a prediction.
            continue
        score = max((jaccard(pred, gt) for gt in gts), default=-1)
        js_scores.append(score)
        cf_matrix.append("TP" if score >= 0.5 else "FP")

    #### Counting False Negatives (FN)
    for gt in gts:
        score = max((jaccard(gt, pred) for pred in pds), default=-1)
        if score < 0.5:
            js_scores.append(score)
            cf_matrix.append("FN")

    return js_scores, " ".join(cf_matrix)
    

def score_df_coleridge_initiative(output, gt_col, pred_col, beta=0.5, verbose=True):
    '''
    This function will calculate the FBeta score for Coleridge Initiative competition
    if given appropriate arguments

    Arguments -
    output - Your submission dataframe that has both ground truth and prediction columns.
             Helper columns ('evaluation', 'js_scores', 'pred_type', 'tp_fp_fn',
             'TP', 'FP', 'FN') are added to it in place.
    gt_col - This is the column name of ground truth column.
    pred_col - This is the column name of predictions column.
    beta - Beta value to calculate FBeta score (now actually used; it was
           previously ignored in favour of a hard-coded 0.5).
    verbose - Set verbose = True to print the summed TP / FP / FN counts.

    Returns -
    This function will return the FBeta score for the given beta.
    '''

    ### Jaccard Similarity
    output['evaluation'] = output.apply(
        lambda x: coleridge_initiative_jaccard(x[gt_col], x[pred_col], verbose=False),
        axis=1,
    )
    output['js_scores'] = output['evaluation'].apply(lambda x : x[0])
    output['pred_type'] = output['evaluation'].apply(lambda x : x[1])

    ### TP, FP and FN
    output['tp_fp_fn'] = output['pred_type'].apply(lambda x : get_count_tp_fp_fn(x, verbose=False))
    output = make_col_tp_fp_fn(output, 'tp_fp_fn')

    tp = sum(output['TP'])
    fp = sum(output['FP'])
    fn = sum(output['FN'])
    precision, recall = get_precision_recall(tp, fp, fn)
    fbeta = fbeta_score(precision, recall, beta)

    if verbose:
        print("TP_FP_FN : ", tp,fp,fn)

    return fbeta

In [None]:
# Sample submission; its 'Id' column lists the publications that the
# title-matching loop further down iterates over.
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# Show the frame as the cell output.
sample_sub

In [None]:
# Directories from which read_json_pub loads '<Id>.json' publication files.
train_data_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_data_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [None]:
# Training table; in the visible part of this notebook it is only referenced
# by commented-out evaluation / training-mode code.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

In [None]:
def read_json_pub(filename, train_data_path=train_data_path, output='text'):
    """Read one publication JSON file and return its text as a single string.

    The file is expected to hold a list of section objects, each read via
    its 'section_title' and 'text' keys.

    Arguments -
    filename - File name without the '.json' extension.
    train_data_path - Directory that contains the file.
    output - 'text' joins all section texts with spaces, 'head' joins all
             section titles with spaces, any other value interleaves
             title and text of every section, joined with '. '.

    Returns -
    The joined string.  A missing or null title/text is treated as an
    empty string so that joining cannot fail on None.
    """
    json_path = os.path.join(train_data_path, (filename+'.json'))
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave title and body of each section: title, text, title, text, ...
    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

In [None]:
def text_cleaning(text):
    '''
    Converts all text to lower case and replaces every run of characters that
    are not ASCII letters or digits with a single space, then strips the ends.
    text - Sentence that needs to be cleaned (converted with str() first)

    Special characters, emojis and repeated spaces all fall outside
    [A-Za-z0-9], so this single substitution already removes them; the
    separate emoji and multiple-space passes could never match anything and
    have been dropped.
    '''
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()


In [None]:
def clean_text(txt):
    """Normalise a label: lower-case it, turn every run of characters other
    than ASCII letters and digits into one space, and strip the ends.

    The argument is converted with str() first, so non-string values are
    accepted.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Table of known dataset titles; its 'title' column is what gets matched
# against the publication texts.  The commented lines are alternative lists.
df2=pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
#df2=pd.read_csv("../input/coleridge-additional-gov-datasets-22000popular/additional_gov_datasets_22000popular.csv")
#df2=pd.read_csv("../input/add-dataset-coloridge/data_set_800_with2000popular.csv")

In [None]:
# Start of the title-matching pass; the elapsed time is printed at its end.
start_time = time.time()


#### Jaccard similarity intended for dropping near-duplicate labels from predictions
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of two space-separated strings.

    The strings are split on single spaces (no case folding) and compared
    as sets of words, so a word repeated inside one string counts once.
    Previously the union was derived from the list lengths, which
    under-reported the similarity whenever a word was repeated.

    Returns -
    len(intersection) / len(union) as a float.  The union is never empty
    because str.split(" ") always yields at least one element.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    intersection = len(words1 & words2)
    union = len(words1) + len(words2) - intersection
    return float(intersection) / union

#############################
# Directory the publication JSON files are read from.  Switch to
# train_data_path (and iterate over train_df instead of sample_sub) to run
# on the training set instead of producing a submission.
#path=train_data_path
path=test_data_path

column_names = ["Id", "PredictionString"]

# Each title is searched verbatim in the cleaned publication text and the
# label written to the prediction is the cleaned title.  Neither depends on
# the publication, so both lists are built once instead of walking
# df2.iterrows() again for every document.
title_queries = [str(title) for title in df2['title']]
title_labels = [clean_text(query) for query in title_queries]

fn_list=[]   # Ids of publications without any title match
fn_text=[]   # raw text of those publications
all_list=[]  # Ids of all processed publications
all_text=[]  # raw text of all processed publications
to_append=[]
submission_rows=[]
for pub_id in sample_sub['Id']:
    large_string = str(read_json_pub(pub_id, path))
    clean_string = text_cleaning(large_string)

    prediction = ''
    for query_string, label in zip(title_queries, title_labels):
        if query_string not in clean_string:
            continue
        if prediction == '':
            prediction = label
        elif label not in prediction:
            # Substring test against the joined prediction: a label already
            # contained in the labels collected so far is not added again.
            prediction = prediction + '|' + label
    to_append = [pub_id, prediction]

    if prediction == '':
        fn_list.append(pub_id)
        fn_text.append(large_string)
    all_list.append(pub_id)
    all_text.append(large_string)

    # NOTE: a post-filter that dropped labels duplicating an already kept
    # label (via jaccard_similarity) used to sit here, commented out; it is
    # not applied.
    submission_rows.append(to_append)

# Build the frame once from the collected rows rather than enlarging it
# row by row with .loc.
submission = pd.DataFrame(submission_rows, columns=column_names)
submission.to_csv('submission.csv', index = False)
print("--- %s seconds ---" % (time.time() - start_time))
submission


In [None]:
# Disabled experiment: everything between the triple quotes below is a plain
# string literal and is never executed.  It repeats the title-matching pass
# with the titles of df3 (built by the commented-out lines) instead of df2.
#name=pd.Series(submission["PredictionString"].str.split("|").sum()).value_counts()
#use_name=name[name>200].index
#df3=pd.DataFrame({"title":use_name})
"""
start_time = time.time()


column_names = ["Id", "PredictionString"]

submission = pd.DataFrame(columns = column_names)
fn_list=[]
fn_text=[]
to_append=[]
for index, row in sample_sub.iterrows():
#for index, row in tqdm(train_df.iterrows()):
    to_append=[row['Id'],'']
    large_string = str(read_json_pub(row['Id'],path))
    clean_string=text_cleaning(large_string)
    for index, row2 in df3.iterrows():
        query_string = str(row2['title'])
        if query_string in clean_string:
            if to_append[1]!='' and clean_text(query_string) not in to_append[1]:
                to_append[1]=to_append[1]+'|'+clean_text(query_string)
            if to_append[1]=='':
                to_append[1]=clean_text(query_string)

                
    if to_append[1]=='':
        fn_list+=[row['Id']]
        fn_text+=[large_string]



    ###### remove similar jaccard
    df_length = len(submission)
    submission.loc[df_length] = to_append
submission.to_csv('submission.csv', index = False)
print("--- %s seconds ---" % (time.time() - start_time))
submission
"""

In [None]:
#df3["title"].sort_values().drop_duplicates()
#a=[len(df2["title"].loc[i].split())==1 for i in range(df2.shape[0])]
#df2[a]

In [None]:
%%time
!pip uninstall fastai en-core-web-sm en-core-web-lg spacy -y -q
!pip install ../input/spacy3/catalogue-2.0.3-py3-none-any.whl ../input/spacy3/typer-0.3.2-py3-none-any.whl ../input/spacy3/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/pathy-0.5.2-py3-none-any.whl ../input/spacy3/smart_open-3.0.0-py3-none-any.whl ../input/spacy3/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_legacy-3.0.5-py2.py3-none-any.whl -q
!pip install ../input/spacy3/en_core_web_lg-3.0.0-py3-none-any.whl ../input/spacy3/en_core_web_md-3.0.0-py3-none-any.whl ../input/spacy3/en_core_web_sm-3.0.0-py3-none-any.whl -q
!pip install ../input/spacy3/spacy_alignments-0.8.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_transformers-1.0.2-py2.py3-none-any.whl ../input/spacy3/en_core_web_trf-3.0.0-py3-none-any.whl -q

In [None]:
import spacy
# Fail fast unless the imported spaCy release is exactly 3.0.6.
assert spacy.__version__ == '3.0.6'
import en_core_web_trf
import torch 
if torch.cuda.is_available():
    # Ask spaCy to run on the GPU when CUDA is available.
    spacy.prefer_gpu()
    
#### RIOW

#nlp = spacy.load("../input/spacy-train-set/cv4-3s-model-best")
#nlp2 = spacy.load("../input/spacy-train-set/cv0-model-best") #load the best model

# spaCy pipeline used for the NER pass, loaded from the
# localnb004-spacy-train input directory.
nlp = spacy.load("../input/localnb004-spacy-train/model-best")

#### RIOWRIOW

In [None]:
# Preview the first rows of the dataset-title table.
df2.head()

In [None]:
#fn_list=list(train_df["Id"].iloc[0:10])
#fn_text=[str(read_json_pub(str(fn_list[i]),train_data_path)) for i in range(10)]

In [None]:
#from transformers import pipeline
#if torch.cuda.is_available():
#    nlp_qa = pipeline("question-answering",device=0)
#else:
#    nlp_qa = pipeline("question-answering")

In [None]:
#with open("../input/spacy-train-set/nlp_qa_model.pkl","rb") as f:
#    nlp_qa=pickle.load(f)

In [None]:
%%time


existing_labels = set(df2["title"])
def nlp_label(Id,text,existing_labels,nlp_er):
    """Return NER-detected labels that do not resemble any known title.

    Arguments -
    Id - Publication id (not used by the function).
    text - Text passed to the NER pipeline.
    existing_labels - Iterable of known dataset titles.
    nlp_er - Callable pipeline; its result must expose .ents whose items
             have .text and .label_ attributes.

    Returns -
    A one-element list holding the cleaned labels joined with '|'.  Only
    entities labelled 'DB_label' with non-empty cleaned text are considered,
    and one is kept only when its word-level Jaccard similarity to every
    cleaned known title is at most 0.7.
    """
    doc = nlp_er(text)
    # Distinct surface forms of the qualifying entities.
    ent_d = {
        ent.text
        for ent in doc.ents
        if ent.label_ == 'DB_label' and clean_text(ent.text) != ""
    }

    # Clean the known titles once per call instead of rebuilding the list
    # and re-cleaning every title for every entity.
    known_cleaned = [clean_text(label) for label in existing_labels]

    c_label = []
    for ent in ent_d:
        # clean_text lower-cases its input, so no extra .lower() is needed.
        cleaned_ent = clean_text(ent)
        if not any(jaccard(cleaned_ent, known) > 0.7 for known in known_cleaned):
            c_label.append(cleaned_ent)

    return ["|".join(c_label)]

In [None]:
%%time


existing_labels = set(df2["title"])
def nlp_label_cv(Id,text,existing_labels,nlp_list):
    """Collect the cleaned 'DB_label' entities found by several pipelines.

    Arguments -
    Id - Publication id (not used by the function).
    text - Text passed to every pipeline.
    existing_labels - Known dataset titles (not used by the function; the
                      filter against them is disabled here).
    nlp_list - Iterable of callable pipelines; each result must expose
               .ents whose items have .text and .label_ attributes.

    Returns -
    A one-element list holding the distinct cleaned labels joined with '|'.
    The labels are sorted so that the string is identical from run to run
    (joining a set directly gave an arbitrary order).
    """
    c_label = set()
    for nlp_er0 in nlp_list:
        doc = nlp_er0(text)
        for ent in doc.ents:
            if ent.label_ != 'DB_label':
                continue
            # clean_text lower-cases its input itself.
            cleaned = clean_text(ent.text)
            if cleaned != "":
                c_label.add(cleaned)

    return ["|".join(sorted(c_label))]

In [None]:


# Raw text of the publications that got no title match (ids in fn_list).
tex_df=pd.DataFrame({"Id":fn_list,"raw_text":fn_text}).drop_duplicates()
# Index the texts by Id once; doing set_index inside the loop repeated that
# work for every publication.
raw_text_by_id = tex_df.set_index("Id")["raw_text"]

# Maximum number of characters handed to the NER pipeline per publication.
# NOTE(review): presumably chosen to bound memory / run time - confirm.
max_chars = 200_000

# Pipelines to run; add further loaded models to the list passed to
# nlp_label_cv to combine their entities.
nlp_er = nlp

Id_list=[]
pred_list=[]
for Id in tqdm(fn_list):
    if torch.cuda.is_available():
        # Release cached GPU memory before each document.
        spacy.prefer_gpu()
        torch.cuda.empty_cache()
        cupy.get_default_memory_pool().free_all_blocks()
    # Slicing leaves shorter texts unchanged.
    text = raw_text_by_id.loc[Id][:max_chars]
    Id_list.append(Id)
    pred_list+=nlp_label_cv(Id,text,existing_labels,[nlp_er])


pred_ner=pd.DataFrame({"Id":Id_list,'PredictionString':pred_list})
# Cell output: number of publications with an empty NER prediction.
sum(pred_ner["PredictionString"]=="")


In [None]:
#score_df_coleridge_initiative(train_df.merge(pred_ner,left_on="Id",right_on="Id",how="right"), gt_col="cleaned_label", pred_col="PredictionString", beta=0.5, verbose=True)

In [None]:
# Show the NER predictions as the cell output.
pred_ner

In [None]:
# How often each label occurs across the NER predictions.  explode()
# flattens the per-publication label lists in their original order, which
# avoids the repeated list concatenation performed by Series.sum() and also
# works when pred_ner is empty.
name = pred_ner["PredictionString"].str.split("|").explode().value_counts()

In [None]:
# Raw text of every processed publication, looked up by Id.
tex_df=pd.DataFrame({"Id":all_list,"raw_text":all_text}).drop_duplicates()
# Index once instead of calling set_index for every publication.
raw_text_by_id = tex_df.set_index("Id")["raw_text"]

# Labels that the NER pass produced more than 100 times.
use_name=name[name>100].index
column_names = ["Id", "PredictionString"]

# The search string and the emitted (cleaned) label do not depend on the
# publication, so compute them once.
label_queries = [str(label) for label in use_name]
label_cleaned = [clean_text(query) for query in label_queries]

to_append=[]
match_rows=[]
for Id in tqdm(all_list):
    large_string = str(raw_text_by_id.loc[Id])
    clean_string=text_cleaning(large_string)

    prediction = ''
    for query_string, label in zip(label_queries, label_cleaned):
        if query_string not in clean_string:
            continue
        if prediction == '':
            prediction = label
        elif label not in prediction:
            # Substring test against the joined prediction: a label already
            # contained in the labels collected so far is not added again.
            prediction = prediction + '|' + label
    to_append=[Id, prediction]
    match_rows.append(to_append)

# Build the frame once from the collected rows rather than enlarging it
# row by row with .loc.
pred_match = pd.DataFrame(match_rows, columns=column_names)

In [None]:
def _merge_labels(prediction_strings):
    """Union the '|'-separated labels of several prediction strings.

    Empty labels (produced by splitting an empty prediction) are dropped so
    the result never starts or ends with '|', and the labels are sorted so
    the output is the same on every run.
    """
    labels = {
        label
        for prediction in prediction_strings
        for label in prediction.split("|")
        if label
    }
    return "|".join(sorted(labels))


# Combine the title-matching predictions with the matches for the frequent
# NER labels, one row per publication Id.
#sub=pd.concat([submission,pred_ner])
sub=pd.concat([submission,pred_match])
sub=sub.groupby("Id")["PredictionString"].apply(_merge_labels).reset_index()
sub

In [None]:
# Overwrite submission.csv (first written by the title-matching cell) with
# the merged predictions.
sub.to_csv('submission.csv', index=False)

In [None]:
#score_df_coleridge_initiative(train_df.merge(submission,left_on="Id",right_on="Id"), gt_col="cleaned_label", pred_col="PredictionString", beta=0.5, verbose=True)