In [None]:
# !pip install scispacy
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz

In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os
from tqdm.autonotebook import tqdm
import json
from functools import partial
import string
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D

from sklearn.utils import resample, shuffle
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

import nltk
import spacy
from nltk.probability import FreqDist

# Load the spaCy pipeline with the parser and NER components switched off.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
# Allow very long inputs: whole publications are passed through `nlp` later on.
nlp.max_length = 40000000

os.listdir('/kaggle/input/coleridgeinitiative-show-us-the-data/')
INPUT_SHAPE = 300   # length of the document vector fed to the classifier
OUTPUT_SHAPE = 2    # one-hot target: [citation, no citation]

# With the parser disabled, `doc.sents` needs an explicit sentencizer.
# Check for it by name instead of relying on a caught ValueError: on spaCy v3
# `add_pipe(<component object>)` itself raises ValueError, which previously
# printed "already has pipe" while leaving the pipeline without a sentencizer.
if "sentencizer" in nlp.pipe_names:
    print("already has pipe")
else:
    try:
        # spaCy v3: components are added by their registered string name.
        nlp.add_pipe("sentencizer")
    except ValueError:
        # spaCy v2: a bare string is rejected; pass the component object instead.
        nlp.add_pipe(nlp.create_pipe("sentencizer"))

# Citation worthiness data

In [None]:
# File-system locations: the citation-worthiness corpus and the folders that
# hold one JSON file per competition publication.
cite_worthiness_path = '../input/citeworthinesstrainjsonl/cite-worthiness-scaffold-train.jsonl'
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

# Competition tables: the submission template and the labelled training rows.
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

In [None]:
# Read the JSON-lines corpus: one JSON document per non-empty line.
# The context manager closes the file handle, and filtering blank lines (rather
# than blindly dropping the last element) keeps the final record even when the
# file does not end with a newline.
with open(cite_worthiness_path, "r", encoding="utf-8") as corpus_file:
    cite_worthiness = [line for line in corpus_file.read().split("\n") if line.strip()]
print(len(cite_worthiness))
print(json.loads(cite_worthiness[0]))

In [None]:
# Flatten the JSON records into a DataFrame with a running integer id.
# Every entry of `cite_worthiness` is used: the trailing empty element was
# already removed when the file was read, so slicing off one more item here
# would silently discard the last real record.
cite_columns = ['id', 'text', 'cleaned_text', 'is_citation']
arr = []
for idx, citation in enumerate(tqdm(cite_worthiness)):
    c = json.loads(citation)
    arr.append([idx, c['text'], c['cleaned_cite_text'], c['is_citation']])
x_df = pd.DataFrame(arr, columns=cite_columns)
x_df.head()

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """
    Read one publication's JSON file and return its text as a single string.

    Parameters
    ----------
    filename : str
        Publication id; the file read is ``<train_files_path>/<filename>.json``.
    train_files_path : str
        Directory that contains the publication JSON files.
    output : str
        ``'text'`` returns the section bodies joined by a space, ``'head'``
        returns the section titles joined by a space, and any other value
        returns titles and bodies interleaved and joined by ``'. '``.

    Returns
    -------
    str
        The requested concatenation for the whole publication.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        # the file holds a list with one entry per chapter/section
        sections = json.load(f)

    # A missing or null field becomes '' so that the joins below cannot fail
    # with a TypeError on None.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)
    # title, body, title, body, ... in document order
    combined = [part for pair in zip(headings, contents) for part in pair]
    return '. '.join(combined)
    

In [None]:
%%time
tqdm.pandas()   # registers `progress_apply` on pandas objects so the apply below shows a progress bar

# Read every training publication (looked up by its `Id`) and store the
# returned text in the train_df['text'] column.
train_df['text'] = train_df['Id'].progress_apply(read_append_return)
train_df.head()

In [None]:
# Partition the corpus by label: rows flagged as citations vs. rows that are not.
worthy = x_df.loc[x_df['is_citation'] == True]
unworthy = x_df.loc[x_df['is_citation'] == False]

In [None]:
# Target balance after down-sampling: 8 non-citation rows for every 2 citation rows.
unworthy_worthy_ration = 8/2
# Sampling without replacement cannot return more rows than exist, so cap the
# request at the size of the non-citation pool.
n_unworthy_samples = min(int(unworthy_worthy_ration*len(worthy)), len(unworthy))
unworthy_downsampled = resample(unworthy,
                               replace=False,
                               n_samples=n_unworthy_samples,
                               random_state=1337)
x_df_balanced = pd.concat([worthy, unworthy_downsampled])
# Seed the shuffle as well, so the row order is reproducible between runs.
x_df_balanced = shuffle(x_df_balanced, random_state=1337).reset_index()[['text', 'cleaned_text', 'is_citation']]

# Class counts: citation rows first, then non-citation rows.
print(len(x_df_balanced[x_df_balanced['is_citation'] == True]))
print(len(x_df_balanced[x_df_balanced['is_citation'] == False]))



In [None]:
# Mine extra positive examples from the competition data: for each training
# row, look at a window of text around the first occurrence of the dataset
# label and keep every sentence in that window that mentions the label.
label_sentence_rows = []
for i, row in tqdm(train_df.iterrows(), total=len(train_df)):
    text = row['text']
    label = row['dataset_label']
    location = text.find(label)
    if location == -1:
        # The label does not occur verbatim, so no sentence could match;
        # skip the (expensive) spaCy call.
        continue
    start = max(location-400, 0)
    end = location+400
    doc = nlp(text[start:end])
    for sent in doc.sents:
        if label in sent.text:
            label_sentence_rows.append({'text': sent.text, 'cleaned_text': sent.text, 'is_citation': True})

# Concatenate once instead of calling `DataFrame.append` per sentence: `append`
# copied the whole frame on every call and no longer exists in pandas 2.x.
if label_sentence_rows:
    x_df_balanced = pd.concat([x_df_balanced, pd.DataFrame(label_sentence_rows)], ignore_index=True)

In [None]:
# Class counts in the balanced frame: citation rows, then non-citation rows.
print((x_df_balanced['is_citation'] == True).sum())
print((x_df_balanced['is_citation'] == False).sum())

In [None]:
# Share of non-citation rows among the original citation rows plus the
# down-sampled non-citation rows.
unworthy_downsampled.shape[0] / (worthy.shape[0] + unworthy_downsampled.shape[0])

In [None]:
def make_dataset(data_df, input_shape, output_shape):
    """
    Turn a labelled sentence frame into model-ready arrays.

    Each row's ``cleaned_text`` is embedded with the module-level spaCy
    pipeline ``nlp`` and its ``is_citation`` flag is one-hot encoded as
    ``[1, 0]`` (citation) or ``[0, 1]`` (no citation).

    Returns
    -------
    tuple
        ``(X, y)`` with shapes ``(n, 1, input_shape)`` and
        ``(n, 1, output_shape)``, where ``n`` is ``len(data_df)``.
    """
    n_rows = len(data_df)
    features = np.zeros((n_rows, 1, input_shape))
    targets = np.zeros((n_rows, 1, output_shape))
    # `position` is the running row number; the frame's own index is ignored.
    for position, (_, record) in enumerate(tqdm(data_df.iterrows())):
        features[position] = [nlp(record['cleaned_text']).vector]
        if record['is_citation']:
            targets[position] = [1.0, 0]
        else:
            targets[position] = [0, 1.0]
    return features, targets

In [None]:
# Embed every sentence of the balanced corpus; the last line shows the sample count.
X, y = make_dataset(data_df=x_df_balanced, input_shape=INPUT_SHAPE, output_shape=OUTPUT_SHAPE)
X.shape[0]


# Model

In [None]:
class CitationWorthinessModel:
    """
    Small wrapper around a Keras classifier that labels a sentence embedding
    as citation-worthy or not.

    Targets are one-hot: index 0 is "citation", index 1 is "no citation".
    The Keras model itself is only created when `train` is called and is then
    available as ``self.model``.
    """

    def __init__(self, input_shape, output_shape, use_dropout = True, dropout_rate = 0.2, nro_hidden_layers=1):
        """
        Store the hyper-parameters.

        Parameters
        ----------
        input_shape : int
            Length of one input embedding vector.
        output_shape : int
            Number of output units (classes).
        use_dropout : bool
            Whether a Dropout layer precedes each hidden Dense layer of the MLP.
        dropout_rate : float
            Rate passed to those Dropout layers.
        nro_hidden_layers : int
            Number of hidden Dense layers in the MLP.
        """
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.use_dropout = use_dropout
        self.dropout_rate = dropout_rate
        self.nro_hidden_layers = nro_hidden_layers

    def make_model_mlp(self) -> "Sequential":
        """Build and compile the MLP: [Dropout ->] Dense(60, relu) per hidden layer, then a softmax layer."""
        model = Sequential()
        # `shape` has to be a tuple; `(n)` is just the int `n`, so the trailing
        # comma is required.
        # NOTE(review): `make_dataset` in this notebook yields arrays shaped
        # (n, 1, input_shape). If the installed Keras rejects that rank
        # mismatch, declare the input as (1, self.input_shape) the way
        # `make_model_conv` does - confirm against the Keras version in use.
        model.add(Input(shape=(self.input_shape,)))
        for _ in range(self.nro_hidden_layers):
            if self.use_dropout:
                model.add(Dropout(self.dropout_rate))
            model.add(Dense(60, activation='relu'))
        model.add(Dense(self.output_shape, activation='softmax'))

        model.compile(
            optimizer= tf.keras.optimizers.Adam(),  # Optimizer
            # Loss function to minimize
            loss=tf.keras.losses.BinaryCrossentropy(),
            # List of metrics to monitor
            metrics=["accuracy"]
        )
        model.summary()
        return model

    def make_model_conv(self) -> "Sequential":
        """
        Build and compile a stack of five Conv1D layers (32, 16, 8, 4, 2
        filters, kernel size 3, 'same' padding) followed by a softmax layer.

        The declared input shape is ``(1, input_shape)``. `train` does not use
        this builder; it always calls `make_model_mlp`.
        """
        model = Sequential()
        model.add(Input(shape=(1, self.input_shape)))

        for n_filters in (32, 16, 8, 4, 2):
            model.add(Conv1D(n_filters, 3, padding='same', activation='relu'))

        model.add(Dense(self.output_shape, activation='softmax'))
        model.compile(
            optimizer= tf.keras.optimizers.Adam(),  # Optimizer
            # Loss function to minimize
            loss=tf.keras.losses.BinaryCrossentropy(),
            # List of metrics to monitor
            metrics=["accuracy"]
        )
        return model

    def train(self, X, y, epochs, batch_size, validation_data=None, verbose=0) -> None:
        """
        Build a fresh MLP (discarding any previous ``self.model``) and fit it.

        `validation_data`, when given, is forwarded to ``fit`` unchanged.
        """
        self.model = self.make_model_mlp()
        fit_kwargs = dict(epochs=epochs, batch_size=batch_size, verbose=int(verbose))
        if validation_data:
            fit_kwargs['validation_data'] = validation_data
        self.model.fit(X, y, **fit_kwargs)

    def test(self, X, y, epochs, batch_size, verbose=0):
        """
        Run 10-fold cross-validation, retraining from scratch on every fold.

        Results are stored on the instance:

        * ``self.confusion_matrix`` - 2x2 counts summed over all folds, indexed
          ``[true, predicted]`` on the citation indicator (0 = no citation,
          1 = citation).
        * ``self.fold_metrics`` - the `get_metrics` result of every fold.
        """
        self.confusion_matrix = np.zeros(shape=(2,2))
        self.fold_metrics = []
        kfold = KFold(n_splits=10, shuffle=True)
        for fold_nr, (train_idx, val_idx) in enumerate(kfold.split(X, y), start=1):
            print(f"Training Fold-{fold_nr}")
            val_data = (X[val_idx], y[val_idx])
            self.train(X[train_idx], y[train_idx],
                       epochs=epochs, batch_size=batch_size,
                       validation_data=val_data,
                       verbose=verbose)
            self.fold_metrics.append(self.get_metrics(data=val_data, batch_size=32))
            prediction = self.predict(data=X[val_idx])

            # Reduce the one-hot targets and the rounded predictions to a 1-D
            # citation indicator (column 0), whatever extra singleton axes the
            # arrays carry.
            n_val = len(val_idx)
            y_true = np.asarray(y[val_idx]).reshape(n_val, -1)[:, 0].astype(int)
            y_pred = np.round(prediction).reshape(n_val, -1)[:, 0].astype(int)
            # Pin the label set: a fold that contains a single class would
            # otherwise produce a 1x1 matrix that broadcasts into every cell.
            self.confusion_matrix += confusion_matrix(y_true=y_true,
                                                      y_pred=y_pred,
                                                      labels=[0, 1])

    def get_metrics(self, data, batch_size) -> list:
        """Evaluate ``self.model`` on ``data = (x_val, y_val)`` and return the result of ``evaluate``."""
        x_val, y_val = data
        return self.model.evaluate(x_val, y_val, batch_size=batch_size)

    def predict(self, data) -> "np.ndarray":
        """Return the raw (un-rounded) outputs of ``self.model`` for `data`."""
        return self.model.predict(data)


In [None]:
# Create the classifier wrapper with three hidden layers and fit it on the
# whole dataset; no validation data is passed.
model = CitationWorthinessModel(
    input_shape=INPUT_SHAPE,
    output_shape=OUTPUT_SHAPE,
    dropout_rate=0.2,
    nro_hidden_layers=3,
)
model.train(X, y, epochs=50, batch_size=8, verbose=1)


In [None]:
# model.test(X, y, epochs=50, batch_size=8, verbose=1)

In [None]:
# Spot check: draw ten random row positions from the training arrays, print the
# rounded predictions and display the matching targets for comparison.
test_random_set = np.random.randint(len(X), size=10)
X_test, y_test = X[test_random_set], y[test_random_set]
print(type(X_test))
print(np.round(model.predict(X_test)))
y_test

# Identifying citation-worthy sentences in the Coleridge Initiative data

In [None]:
def clean_text(txt):
    """
    Normalise a label: lower-case it, replace every run of characters other
    than ASCII letters and digits with a single space, and trim the ends.

    The argument is converted with ``str`` first, so non-string values are
    accepted.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Collect every distinct dataset name that appears in the training table
# (title, raw label and cleaned label) into one de-duplicated array.
temp_1, temp_2, temp_3 = (
    pd.unique(train_df[column])
    for column in ('dataset_title', 'dataset_label', 'cleaned_label')
)

labels = np.unique(np.concatenate([temp_1, temp_2, temp_3]))
print(len(labels))

In [None]:
# The submission ids are publications from the *test* folder, so the reader
# must be pointed there; its default directory is the train folder.
read_test_publication = partial(read_append_return, train_files_path=test_files_path)
sample_sub['text'] = sample_sub['Id'].progress_apply(read_test_publication)
sample_sub

In [None]:
def visualize_model_results(text):
    """
    Print every sentence of `text` that the trained classifier marks as
    citation-worthy.

    The text is split into sentences with the module-level ``nlp`` pipeline.
    Each sentence is embedded separately and passed to ``model.predict``; a
    sentence is printed (prefixed by ``True``) when the rounded first output
    equals 1.
    """
    for sentence in nlp(text).sents:
        embedding = nlp(str(sentence)).vector
        # the model is fed a batch of one sample shaped (1, 1, vector_length)
        rounded = np.round(model.predict(np.array([[embedding]]))).flatten()
        is_worthy = rounded[0] == 1
        if is_worthy:
            print(is_worthy, sentence)

# Try the classifier on the first publication of the submission frame.
txt = sample_sub['text'].iloc[0]
visualize_model_results(txt)

In [None]:
def get_sentences_and_vectors(Id):
    """
    Build the prediction string for one publication of `sample_sub`.

    The publication text is split into sentences; each sentence is embedded
    and classified with ``model``. For every sentence predicted as
    citation-worthy, each known label (from the module-level ``labels``) that
    occurs verbatim in the sentence is recorded in its cleaned form.

    Parameters
    ----------
    Id : str
        Value of the ``Id`` column identifying the publication.

    Returns
    -------
    str
        The distinct cleaned labels, in order of first appearance, joined by
        ``'|'`` (empty string when nothing was found).
    """
    text = sample_sub[sample_sub['Id'] == Id]['text'].iloc[0]
    doc = nlp(text)
    found_labels = []
    for sent in doc.sents:
        sent_text = str(sent)
        vector = [nlp(sent_text).vector] # sentence vector
        p = model.predict(np.array([vector,])) # worthiness prediction
        p = np.round(p).flatten()
        if p[0] != 1: # not citation worthy
            continue
        for label in labels:
            if label not in sent_text:
                continue
            # De-duplicate on the cleaned form, which is what gets stored;
            # comparing the raw label against the cleaned list let the same
            # dataset be appended again and again.
            cleaned = clean_text(label)
            if cleaned not in found_labels:
                found_labels.append(cleaned)
    return "|".join(found_labels)

# Compute the pipe-separated label string for every publication, then attach
# it to the submission frame.
prediction_strings = sample_sub['Id'].progress_apply(get_sentences_and_vectors)
sample_sub["PredictionString"] = prediction_strings

In [None]:
# Keep only the id and prediction columns and write them out as the
# submission file (without the index column).
sample_sub = sample_sub.loc[:, ['Id', 'PredictionString']]
sample_sub.to_csv('submission.csv', index=False)

sample_sub.head()