Reference: https://www.kaggle.com/prashansdixit/coleridge-initiative-eda-baseline-model

In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os
import re
import json
import glob
from collections import defaultdict
from textblob import TextBlob
from functools import partial

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

import nltk
import spacy
# Load the large English spaCy model with its parser and NER components
# disabled; prepare_text in this notebook only reads lemmas and stop-word flags.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
# Raise spaCy's per-document character limit so long inputs are accepted.
nlp.max_length = 4000000
from nltk.probability import FreqDist
from wordcloud import WordCloud, STOPWORDS

from tqdm.autonotebook import tqdm
import string

%matplotlib inline

# List the competition input folder (absolute Kaggle path) as a quick check of
# which files are available.
os.listdir('/kaggle/input/coleridgeinitiative-show-us-the-data/')

In [None]:
# Root folder of the competition dataset; every input path below derives from
# it, so relocating the data means editing one line only.
input_dir = '../input/coleridgeinitiative-show-us-the-data'

# Label table and submission template.
train_df = pd.read_csv(os.path.join(input_dir, 'train.csv'))
sample_sub = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))

# Folders holding the '<Id>.json' files that are read later in the notebook.
train_files_path = os.path.join(input_dir, 'train')
test_files_path = os.path.join(input_dir, 'test')

In [None]:
# Detect the Kaggle runtime from the presence of the mounted dataset rather than
# a hand-edited flag, so `dirname` is correct in both environments: the Kaggle
# dataset folder when it exists, otherwise "" (paths relative to the working
# directory).
_kaggle_input_dir = "/kaggle/input/coleridgeinitiative-show-us-the-data"
kaggle_env = os.path.isdir(_kaggle_input_dir)
dirname = _kaggle_input_dir if kaggle_env else ""

In [None]:
# Frequency of each cleaned label; dropna=False also counts rows with a missing label.
train_df["cleaned_label"].value_counts(dropna=False)

> There are only 130 unique `cleaned_label` values and only 45 unique `dataset_title` values across the 14,316 training data files.
https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/discussion/228322

In [None]:
# Number of distinct cleaned labels (missing values are not counted by nunique).
train_df["cleaned_label"].nunique()

In [None]:
# Number of distinct dataset titles (missing values are not counted by nunique).
train_df["dataset_title"].nunique()

In [None]:
def show_training_example(row_idx):
    """
    Display one row of train_df followed by the parsed JSON file of that row's Id.

    row_idx - positional index of the row in train_df

    The JSON is read from train_files_path, the same folder the text loader
    uses, so this preview does not depend on a separately configured directory.
    """
    display(train_df.iloc[row_idx, :])
    # Positional lookup for the Id as well, so row and file always correspond.
    json_path = os.path.join(train_files_path, train_df["Id"].iloc[row_idx] + ".json")
    display(pd.read_json(json_path))


# Same three examples as before, without repeating the cell for each index.
for i in (0, 1, 3):
    show_training_example(i)

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """
    Read one JSON file and return its text as a single string.

    filename - name of the file without the '.json' extension
    train_files_path - folder containing the JSON files
    output - 'text' returns the section texts joined by spaces,
             'head' returns the section titles joined by spaces,
             any other value returns titles and texts interleaved, joined by '. '

    A section whose title or text is missing or null contributes an empty
    string instead of making the join raise a TypeError.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # `or ''` maps both an absent key and an explicit null to an empty string.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave title, text, title, text, ... in document order.
    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

In [None]:
%%time
# Read the body text of every training file into a new 'text' column
# (read_append_return defaults to output='text').
tqdm.pandas()   # registers progress_apply on pandas objects so the load shows a progress bar
train_df['text'] = train_df['Id'].progress_apply(read_append_return)

In [None]:
# Fixed seed so the preview shows the same five rows on every run.
train_df.sample(5, random_state=42)

In [None]:
# Text of the training row labelled 0, fetched with a single label-based lookup.
train_df.loc[0, "text"]

In [None]:
%%time
tqdm.pandas()
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(read_append_return, train_files_path=test_files_path))

In [None]:
# Inspect the submission template now that the 'text' column has been attached.
sample_sub

In [None]:
# Show the text of the first test documents in one loop rather than one
# copy-pasted cell per index; head(4) also copes with fewer than four rows.
for test_text in sample_sub["text"].head(4):
    display(test_text)

In [None]:
# Built once at definition time rather than on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_NON_ALNUM_RUN = re.compile('[^A-Za-z0-9]+')


def text_cleaning(text):
    '''
    Normalises text to lower-case ASCII letters and digits separated by single spaces.

    ASCII punctuation is deleted outright (so "it's" becomes "its"); every other
    run of characters outside A-Z, a-z and 0-9 (whitespace, emojis, accented
    letters, symbols) becomes one space. Leading and trailing spaces are stripped.

    text - Sentence that needs to be cleaned; non-string values are converted
           with str() first
    '''
    # Coerce first so non-string input (e.g. a number) is handled instead of
    # raising TypeError when its characters are iterated.
    text = str(text)
    text = text.translate(_PUNCTUATION_TABLE)
    # After this substitution only [a-z0-9] and single spaces can remain, so no
    # separate space-collapsing or emoji-removal pass is needed.
    return _NON_ALNUM_RUN.sub(' ', text.lower()).strip()

In [None]:
%%time
# Overwrite the raw training text with its cleaned form (the raw text is not kept).
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

In [None]:
%%time 
# Apply the same cleaning to the test text so both splits share one normal form.
tqdm.pandas()
sample_sub['text'] = sample_sub['text'].progress_apply(text_cleaning)

In [None]:
def prepare_text(text, nlp=nlp):
    '''
    Lemmatizes the text and drops its stop words.
    text - Sentence to be processed
    nlp - Spacy NLP model
    Returns the lemmas of the remaining tokens joined by single spaces.
    '''
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

In [None]:
def clean_text(txt):
    """
    Lower-case txt (converted with str()) and replace every run of characters
    other than ASCII letters and digits with one space, then strip the ends.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

In [None]:
# %%time
# tqdm.pandas()
# train_df['text'] = train_df['text'].progress_apply(prepare_text)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# One checkpoint name feeds both the tokenizer and the model.
ner_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)

# NOTE: this rebinds the global `nlp`, which previously held the spaCy model.
# prepare_text captured the spaCy object as its default when it was defined, so
# it is unaffected unless its defining cell is re-run after this one.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
i = 2
# Only the first 100 characters of the document are passed to the NER pipeline.
example = train_df.loc[i, "text"][:100]
ner_results = nlp(example)
print(ner_results)

In [None]:
def _ner_on_prefix(document):
    """Run the NER pipeline bound to `nlp` on the first 100 characters of document."""
    return nlp(document[:100])


train_df["ner_result"] = train_df["text"].apply(_ner_on_prefix)

In [None]:
#example = "My name is Wolfgang and I live in Berlin"
#ner_results = nlp(example)
#print(ner_results)

In [None]:
# Inspect the per-document NER output computed from each text's first 100 characters.
train_df["ner_result"]

In [None]:
# Keep only the rows for which the NER pipeline returned at least one entity.
has_entities = train_df["ner_result"].map(len) > 0
train_df_ = train_df[has_entities]

In [None]:
# NOTE(review): 85 is an index label, not a position. train_df_ is a filtered
# view of train_df, so this raises KeyError if row 85 had no entities and was
# dropped - confirm the intended row.
train_df_["ner_result"][85]

In [None]:
# Every lower-cased label spelling seen in training: dataset_label,
# dataset_title and the already-cleaned cleaned_label.
existing_labels = set()
for label_column in ('dataset_label', 'dataset_title', 'cleaned_label'):
    existing_labels.update(x.lower() for x in train_df[label_column].dropna().unique())

# The documents were normalised with text_cleaning, which deletes punctuation
# before collapsing other characters, while predictions are reported in
# clean_text form. Searching for the raw label can therefore never find one
# that contains punctuation, so each label is indexed by its text_cleaning form
# (what appears in a document) and mapped to its clean_text form (what is
# reported).
label_lookup = defaultdict(set)
for known_label in existing_labels:
    search_key = text_cleaning(known_label)
    if search_key:  # an empty key would be a substring of every document
        label_lookup[search_key].add(clean_text(known_label))

id_list = []
lables_list = []
for index, row in tqdm(sample_sub.iterrows(), total=len(sample_sub)):
    row_id = row['Id']
    # Idempotent on text that is already clean; keeps the comparison valid
    # even if the test text has not been cleaned yet.
    sample_text = text_cleaning(row['text'])

    # Labels of any training document whose cleaned text is identical.
    temp_df = train_df[train_df['text'] == sample_text]
    cleaned_labels = {clean_text(x) for x in temp_df['cleaned_label'].to_list()}

    # Labels whose normalised form occurs anywhere in the document.
    # NOTE(review): plain substring matching, so a short label can also match
    # inside a longer word.
    for search_key, reported_labels in label_lookup.items():
        if search_key in sample_text:
            cleaned_labels.update(reported_labels)

    # Sorted so the prediction string is identical from run to run.
    lables_list.append('|'.join(sorted(cleaned_labels)))
    id_list.append(row_id)

In [None]:
# Assemble the submission table in one step: one row per test Id with its
# '|'-joined predicted labels.
submission = pd.DataFrame({'Id': id_list, 'PredictionString': lables_list})
display(submission)

In [None]:
# Show the prediction strings of the first test documents in one loop rather
# than one copy-pasted cell per index; head(4) also copes with fewer than four rows.
for prediction in submission["PredictionString"].head(4):
    display(prediction)

In [None]:
# Write the predictions to 'submission.csv' in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)