### Install Libraries and Models 

In [None]:
#Install needed packages and NLP models
!pip install -U pysolr
!pip install -U scispacy
!pip install -U jsonpath-ng
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz
!pip install -U pyvis
    
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_craft_md-0.2.4.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_jnlpba_md-0.2.4.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bionlp13cg_md-0.2.4.tar.gz

In [None]:
!wget -O solr-8.5.0.zip "https://archive.apache.org/dist/lucene/solr/8.5.0/solr-8.5.0.zip";
!unzip solr-8.5.0.zip

In [None]:
!ls

### Data Preprocessing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
import os
import json
import glob
import re
import time
import pysolr
import csv
import time
import scipy
import spacy
import scispacy
import pandas as pd
import numpy as np
import networkx as nx
import ipywidgets as widgets
from os import path
from pandas import ExcelWriter
from pandas import ExcelFile
from jsonpath_ng.ext import parse
from collections import Counter
from collections import OrderedDict
from IPython.core.display import display, HTML
from pyvis.network import Network
from datetime import date
import dateutil.parser as dparser
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

### Preprocessing data

In [None]:
def clean_text(text) :
    """Strip citation-style noise from a passage of text.

    Applied in order:
      1. parenthesised spans ``(...)`` are removed, with one preceding space if present;
      2. square-bracketed spans ``[...]`` are removed the same way;
      3. anything starting with ``http`` up to the next whitespace is removed;
      4. runs of two or more spaces are collapsed to a single space.

    Args:
        text: The raw text (str).

    Returns:
        The cleaned text (str).
    """
    text = re.sub(r" ?\([^)]*\)", "", text)
    # The class must exclude ']' (not ')'): otherwise the greedy match runs from
    # the first '[' to the LAST ']' and swallows all the prose between citations.
    text = re.sub(r" ?\[[^\]]*\]", "", text)
    text = re.sub(r"http\S+", "", text)
    # Collapse whole runs of spaces, not just pairs, since the removals above
    # can leave three or more spaces in a row.
    text = re.sub(r" {2,}", " ", text)
    return text

In [None]:
def _parse_publish_time(raw_value):
    """Convert a metadata ``publish_time`` value to a ``YYYY-MM-DDT00:00:00Z`` string.

    The value is parsed fuzzily with dateutil. If that fails and the value has
    more than two space-separated tokens, only the first three tokens are
    parsed. Returns ``''`` when no date can be extracted.
    """
    solr_date_format = "%Y-%m-%dT%H:%M:%SZ"
    # str() guards against non-string cells (e.g. a bare numeric year).
    raw_text = str(raw_value)
    try:
        return dparser.parse(raw_text, fuzzy=True).date().strftime(solr_date_format)
    except Exception:
        parts = raw_text.split(' ')
        if len(parts) > 2:
            try:
                return dparser.parse(' '.join(parts[:3]), fuzzy=True).date().strftime(solr_date_format)
            except Exception:
                return ''
        return ''


def preprocess_data() :
    """Build the cleaned per-paper CSV used by the rest of the notebook.

    Reads the CORD-19 metadata CSV and the section-head spreadsheet, walks every
    ``*.json`` paper under the CORD-19 input directory, collects the abstract and
    the introduction / discussion / result body text, cleans it with
    ``clean_text`` and writes one row per paper to
    ``/kaggle/working/CORD-19.csv``.

    A paper is kept when it has a non-blank body or a non-empty abstract.
    Rows that repeat an earlier (title, abstract) pair are dropped, except rows
    where both fields are empty, which cannot be told apart by that key.

    Returns:
        None. The result is the CSV file plus the printed summary.
    """
    start = time.time()

    meta_df = pd.read_csv('/kaggle/input/CORD-19-research-challenge/metadata.csv', sep=',', header=0)
    meta_df.fillna('', inplace=True)

    # Filter to pick only needed sections
    include_set = ['Abstract','Introduction', 'background', 'Discussion', 'Results', 'Results and Discussion', 'methods,results']
    section_df = pd.read_excel('/kaggle/input/covidsectionheads/covid_section_heads.xlsx', sheet_name='Sheet1')
    section_df.dropna(subset=['Section Heads'], inplace=True)
    section_df = section_df[section_df['Section Heads'].isin(include_set)]

    # Categories/Targets to pick from Section Heads column
    introduction_categories = ["Introduction"]
    discussion_categories = ["Discussion"]
    result_categories = ["Results", "Results and Discussion", 'methods,results']

    def _section_names(categories):
        """Stripped first-column values of the rows whose 'Section Heads' is in categories."""
        rows = section_df[section_df['Section Heads'].isin(categories)]
        return {str(name).strip() for name in rows.iloc[:, 0]}

    # Sets give O(1) membership tests in the per-paragraph loop below.
    # NOTE(review): paper section names are lower-cased before lookup, so this only
    # matches if the spreadsheet's first column holds lower-case names - confirm.
    intro_sections = _section_names(introduction_categories)
    discussion_sections = _section_names(discussion_categories)
    result_sections = _section_names(result_categories)

    input_dir = '/kaggle/input/CORD-19-research-challenge/'
    paths = glob.glob(input_dir + "**/*.json", recursive=True)
    files_size = len(paths)

    col_names = ['paper_id','title','source', 'abstract','introduction','discussion','result','body', 'has_covid']

    covid_syns = ['COVID-19','COVID19','2019-nCoV','2019nCoV','Coronavirus','SARS-CoV-2','SARSCov2','novel Coronavirus']
    covid_syns_lower = [syn.lower() for syn in covid_syns]

    target_empty_count = 0

    abstract_expr = parse('$.abstract[*].text')

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # appending to a DataFrame per paper is quadratic and DataFrame.append no
    # longer exists in pandas 2.
    records = []

    for json_path in paths:
        with open(json_path, encoding='utf-8') as f:
            data = json.load(f)

        intro_text_list = list()
        discussion_text_list = list()
        result_text_list = list()

        abstract_texts = [match.value for match in abstract_expr.find(data)]

        for entry in data['body_text']:
            section_name = entry['section'].strip().lower()
            entry_text = entry['text']

            if section_name in intro_sections:
                intro_text_list.append(entry_text)

            if section_name in discussion_sections:
                discussion_text_list.append(entry_text)

            if section_name in result_sections:
                result_text_list.append(entry_text)

        if not intro_text_list and not discussion_text_list and not result_text_list:
            target_empty_count = target_empty_count + 1

        paper_id = data['paper_id']
        title = data['metadata']['title']

        # Metadata rows for this paper are matched on the sha column.
        sha_rows = meta_df[meta_df.sha == paper_id]

        # If several rows match, the last one wins (including a failed parse).
        pubtime = ''
        for raw_pubtime in sha_rows['publish_time']:
            pubtime = _parse_publish_time(raw_pubtime)

        source = ''
        for source_value in sha_rows['source_x']:
            source = source_value

        if not source:
            # Fall back to matching the metadata row by title.
            for source_value in meta_df.loc[meta_df.title == title, 'source_x']:
                source = source_value

        abstract = clean_text(" ".join(abstract_texts))
        introduction = clean_text(" ".join(intro_text_list))
        discussion = clean_text(" ".join(discussion_text_list))
        result = clean_text(" ".join(result_text_list))
        body = " ".join([introduction, discussion, result])

        body_lower = body.lower()
        has_covid = 'true' if any(syn in body_lower for syn in covid_syns_lower) else 'false'

        if len(body.strip()) > 0 or len(abstract) > 0:
            records.append({'paper_id': paper_id, 'title': title.strip(), 'source': source,'abstract': abstract.strip(),
                            'introduction': introduction.strip(),'discussion': discussion.strip(),
                            'result': result.strip(), 'body': body.strip(), 'publish_time': pubtime,'has_covid': has_covid})

    clean_df = pd.DataFrame(records, columns=col_names + ['publish_time'])

    # Drop duplicate papers. The previous call used inplace=False and discarded
    # the result, so nothing was removed. Rows whose title AND abstract are both
    # empty are kept: they share a key but are not necessarily the same paper.
    blank_key = (clean_df['title'] == '') & (clean_df['abstract'] == '')
    is_duplicate = clean_df.duplicated(subset=['title','abstract'], keep='first') & ~blank_key
    clean_df = clean_df[~is_duplicate]
    clean_df.to_csv('/kaggle/working/CORD-19.csv', index=True)

    print('Final DataFrame Shape - ', clean_df.shape)
    print("Papers that dont have Intro, Discussion or Result  - ", target_empty_count)
    print('Total Papers processed - ', files_size)

    print('Time Elaspsed - ', time.time() - start)
    

### Configure Search Engine

In [None]:
!solr-8.5.0/bin/solr start -force

In [None]:
!solr-8.5.0/bin/solr create -c covid19 -s 1 -rf 1 -force

In [None]:
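# Disable automatic field creation (update.autoCreateFields=false) so documents are indexed only into the fields defined explicitly below.
# The _default configset's data-driven (schemaless) mode is NOT RECOMMENDED for production use.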
!solr-8.5.0/bin/solr config -c covid19 -p 8983 -action set-user-property -property update.autoCreateFields -value false

In [None]:
# Set up synonyms. NOTE: `>>` appends, so re-running this cell adds duplicate lines to synonyms.txt.

!echo 'COVID-19,covid19,2019-nCoV,2019nCoV,Coronavirus,SARS-CoV-2,SARSCov2,novel Coronavirus' >> solr-8.5.0/server/solr/covid19/conf/synonyms.txt;
!echo 'heart,cardiac,tachycardia,myocardial' >> solr-8.5.0/server/solr/covid19/conf/synonyms.txt;
!echo 'pulmonary,respiratory' >> solr-8.5.0/server/solr/covid19/conf/synonyms.txt;

In [None]:
!cat solr-8.5.0/server/solr/covid19/conf/synonyms.txt

In [None]:
# Reload the covid19 core so the newly added synonyms take effect.
# Synonyms are applied by both the index and query analyzers, so any documents indexed before a synonym change must be re-indexed.
!curl 'http://localhost:8983/solr/admin/cores?action=RELOAD&core=covid19'

In [None]:
# Add a custom field type that does not tokenize its value (KeywordTokenizer), for fields such as `source` that should match as whole phrases.
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field-type" : {"name":"keywordText","class":"solr.TextField", "positionIncrementGap":"100", "indexAnalyzer" : {"tokenizer":{"class":"solr.KeywordTokenizerFactory" }, "filters":[{"class":"solr.TrimFilterFactory"},{"class":"solr.StopFilterFactory", "ignoreCase":true, "words":"lang/stopwords_en.txt"},{"class":"solr.ManagedSynonymGraphFilterFactory", "managed":"english" },{"class":"solr.RemoveDuplicatesTokenFilterFactory"},{"class":"solr.FlattenGraphFilterFactory"}]},"queryAnalyzer" : {"tokenizer":{"class":"solr.KeywordTokenizerFactory" },"filters":[{"class":"solr.TrimFilterFactory"},{"class":"solr.StopFilterFactory", "ignoreCase":true, "words":"lang/stopwords_en.txt"},{"class":"solr.ManagedSynonymGraphFilterFactory", "managed":"english" },{"class":"solr.RemoveDuplicatesTokenFilterFactory"}]}}}' http://localhost:8983/solr/covid19/schema

In [None]:
#Create SOLR field definitions
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"title", "type":"text_en_splitting_tight", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"abstract", "type":"text_en_splitting_tight", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"source", "type":"keywordText", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"introduction", "type":"text_en_splitting_tight", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"discussion", "type":"text_en_splitting_tight", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"result", "type":"text_en_splitting_tight", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"body", "type":"text_en_splitting_tight", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"publish_time", "type":"pdate", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;
!curl -X POST -H 'Content-type:application/json' --data-binary '{"add-field": {"name":"has_covid", "type":"boolean", "multiValued":false, "stored":true, "indexed":true}}' http://localhost:8983/solr/covid19/schema;

### Create Search Index

In [None]:
# pysolr client for the local `covid19` core created above; used by the indexing
# cell and by search(). timeout=10 is passed to pysolr (seconds, per its docs).
solr = pysolr.Solr('http://localhost:8983/solr/covid19/', timeout=10)

In [None]:
# General-purpose scispaCy pipeline, used for entity extraction and title similarity.
generic_model = spacy.load('en_core_sci_md')

#Load preprocessed CSV data
# Prefer a pre-built CSV attached as an input dataset. If it is absent, use the
# file preprocess_data() writes to /kaggle/working, building it first if needed.
# The cell used to re-read the input path after preprocessing, but
# preprocess_data() never writes there, so the fallback raised FileNotFoundError.
csv_path = '/kaggle/input/working/CORD-19.csv'
generated_csv_path = '/kaggle/working/CORD-19.csv'

if not path.exists(csv_path):
    if not path.exists(generated_csv_path):
        preprocess_data()
    csv_path = generated_csv_path

df = pd.read_csv(csv_path, sep=',', header=0)
df.fillna('', inplace=True)
print('DF candidate_list size - ', df.shape)

df.head(2)

In [None]:
# Index each pandas row as a document into SOLR search engine

# A missing comma after 'corona virus' used to merge the last two entries into
# the single string 'corona virusnovel coronavirus'.
covid_syns = ('SARSCoV2','SARS-CoV-2', '2019-nCoV','2019nCoV','COVID-19', 'COVID19','coronavirus', 'corona virus', 'novel coronavirus')
# Matching is case-insensitive, consistent with preprocess_data().
covid_syns_lower = tuple(syn.lower() for syn in covid_syns)

# Number of documents sent to Solr per add() request.
SOLR_BATCH_SIZE = 1000

list_for_solr=[]
counter = 0
for index, row in df.iterrows():
    paper_id = row['paper_id']
    title = row["title"]
    source = row["source"]
    abstract = row["abstract"]
    introduction = row["introduction"]
    discussion = row["discussion"]
    result = row["result"]
    publish_time = row["publish_time"]
    body = row["body"]  # Concatenated text of introduction, discussion and result

    # Skip rows with no usable text at all. The old test required every field to
    # be non-empty AND whitespace-only, so empty strings were never skipped.
    if not (str(title).strip() or str(abstract).strip() or str(body).strip()):
        continue

    body_lower = str(body).lower()
    has_covid = 'false'
    if any(syn in body_lower for syn in covid_syns_lower):
        has_covid = 'true'

    solr_content = {}
    solr_content['id'] = paper_id
    solr_content['title'] = title
    solr_content['source'] = source
    solr_content['abstract'] = abstract
    solr_content['introduction'] = introduction
    solr_content['discussion'] = discussion
    solr_content['result'] = result
    solr_content['body'] = body
    solr_content['has_covid'] = has_covid
    # publish_time is a Solr date field; an empty string is not a valid date,
    # so the field is left out when no date is known.
    if publish_time:
        solr_content['publish_time'] = publish_time

    list_for_solr.append(solr_content)

    # Flush on batch size rather than on the row index, so skipped rows do not
    # distort batch boundaries.
    if len(list_for_solr) >= SOLR_BATCH_SIZE:
        solr.add(list_for_solr)
        counter = counter + len(list_for_solr)
        list_for_solr = []
        print('Counter ', counter)

# Send the final partial batch; previously these documents were never indexed.
if list_for_solr:
    solr.add(list_for_solr)
    counter = counter + len(list_for_solr)
    list_for_solr = []
    print('Counter ', counter)

#Commit is very costly use it sparingly
solr.commit()
print('Indexing Finished !')

In [None]:
def extract_entities(models, text) :
    """Run each pipeline over ``text`` and group the entity texts by label.

    Args:
        models: Iterable of callables; each is called with ``text`` and must
            return an object whose ``ents`` items expose ``label_`` and ``text``.
        text: The text to analyse.

    Returns:
        dict mapping each entity label to a list of the distinct entity texts
        seen for it, in first-seen order across all pipelines.
    """
    grouped = {}

    for pipeline in models :
        for span in pipeline(text).ents:
            texts_for_label = grouped.setdefault(span.label_, [])
            if span.text not in texts_for_label:
                texts_for_label.append(span.text)

    return grouped

In [None]:
def initilize_nlp_models(model_names):
    """Load every named spaCy model once.

    Args:
        model_names: Iterable of model names accepted by ``spacy.load``.

    Returns:
        dict mapping each name to its loaded model, in the order given.
    """
    loaded = {model_name: spacy.load(model_name) for model_name in model_names}

    print('Models Loaded')
    return loaded

In [None]:
def search_task_answers(search_results) :
    """Render the highlighted snippets of each hit and collect them per document.

    For every document in ``search_results`` the title is displayed (when the
    document has any highlighted field), followed by each highlighted field
    name and its numbered snippets.

    Args:
        search_results: Iterable of result documents (dict-like, read via
            ``.get``) that also exposes ``highlighting``, a mapping of
            document id -> {field name: [snippet, ...]}.

    Returns:
        list of dicts, one per document, holding ``'id'``, ``'title'`` and, for
        each field that has at least one snippet, the snippets joined by spaces.
        Documents without highlights are still returned with only id and title.
    """
    answers_list = list()

    for search_result in search_results:
        doc_hl_dict = {}

        doc_id = search_result.get('id', "MISSING")
        title = search_result.get('title', "MISSING")

        # A document may have no highlighting entry (or no id at all); treat that
        # as "no highlights" instead of failing the whole task with a KeyError.
        doc_highlights = search_results.highlighting.get(doc_id, {})

        doc_hl_dict['id'] = doc_id
        doc_hl_dict['title'] = title

        if len(doc_highlights) > 0:
            display(HTML(f'<h4><i>{title}\n</i></h4>'))

        for doc_hl_field in doc_highlights:
            hl_snippets = doc_highlights[doc_hl_field]

            if len(hl_snippets) > 0 :
                answer_snippet = ''

                display(HTML(f'\t<h5>{doc_hl_field}\n</h5>\n'))

                for index, snippet in enumerate(hl_snippets, start=1):
                    answer_snippet = answer_snippet.strip() + " " + snippet.strip()

                    display(HTML(f'<blockquote>{index}. {snippet.strip()}\n</blockquote>'))

                doc_hl_dict[doc_hl_field] = answer_snippet.strip()

        # doc_hl_dict always holds at least id and title, so every document is kept.
        answers_list.append(doc_hl_dict)

    return answers_list

In [None]:
def search(query, rows=5):
    """Query the Solr core for COVID-related papers and request highlights.

    Args:
        query: Solr query string.
        rows: Maximum number of documents to return (default 5).

    Returns:
        Tuple ``(num_docs_found, search_results)``: the total hit count reported
        by Solr and the pysolr results object (iterable, with ``highlighting``).
    """
    # Search for data
    # `rows` must be a keyword argument: the second positional parameter of
    # pysolr's Solr.search() is the search handler, not the row count.
    # NOTE(review): qf/pf are (e)DisMax parameters and no defType is sent here, so
    # the boosts only apply if the core's /select handler defaults to
    # dismax/edismax - confirm.
    search_results = solr.search(query, rows=rows, **{
    'fq':'has_covid:true',
    'qf':'title^50.0 abstract^40.0 introduction^30.0 discussion^20.0 result^50.0 body^10.0',
    'pf':'title^60.0 abstract^50.0 introduction^40.0 discussion^30.0 result^60.0 body^20.0',
    'hl': 'true',
    'hl.bs.type': 'SENTENCE',
    'hl.method' : 'unified',
    'hl.snippets' : 5,
    'hl.usePhraseHighlighter': 'true',
    'hl.highlightMultiTerm' : 'true',
    'hl.tag.pre':'',
    'hl.tag.post':'',
    'df':'body',
    'hl.fl':'introduction,discussion,result'
    })

    num_docs_found = search_results.hits
    num_search_results = len(search_results)
    display(HTML(f'<h3 style="color:blue">Top {num_search_results} search result(s) of {num_docs_found} total. \n</h3>'))

    return num_docs_found, search_results

In [None]:
def populate_labels(task_answers) :
    """Label each document's entities by the sections they were found in.

    For every answer dict, entities labelled ``'ENTITY'`` are extracted (with
    ``extract_entities`` and the module-level ``chosen_models``) from the
    ``'introduction'``, ``'discussion'`` and ``'result'`` snippets. Each entity
    is then classified by where it occurs (intro / result / discussion):

        yes / yes / yes -> 'prior-newdata'
        yes / no  / yes -> 'prior-strong'
        yes / no  / no  -> 'prior'
        no  / no  / yes -> 'speculative'
        no  / yes / no  -> 'unknown'
        no  / yes / yes -> 'novel'

    Entities found in introduction and result but not discussion get no label.

    Args:
        task_answers: list of dicts as produced by ``search_task_answers``.

    Returns:
        The same list; each dict gains one key per non-empty label whose value
        is the sorted list of entity texts.
    """
    # Keyed by (in introduction, in result, in discussion). Dict order is the
    # order in which label keys are added to each document.
    label_by_presence = {
        (True, True, True): 'prior-newdata',
        (True, False, True): 'prior-strong',
        (True, False, False): 'prior',
        # This discussion-only case used to fall through unlabelled, so
        # 'speculative' was never produced.
        (False, False, True): 'speculative',
        (False, True, False): 'unknown',
        (False, True, True): 'novel',
    }

    for doc_answer_dict in task_answers:
        section_entities = {'introduction': set(), 'discussion': set(), 'result': set()}

        for field_name, answer_text in doc_answer_dict.items():
            if field_name in section_entities:
                ent_dict = extract_entities(chosen_models, answer_text)
                # A snippet with no recognised entity yields no 'ENTITY' key.
                section_entities[field_name].update(ent_dict.get('ENTITY', []))

        intro_entities = section_entities['introduction']
        discussion_entities = section_entities['discussion']
        result_entities = section_entities['result']
        all_entities = intro_entities | discussion_entities | result_entities

        # Now set up labels for entities
        labelled = {label: set() for label in label_by_presence.values()}
        for a_ent in all_entities :
            presence = (a_ent in intro_entities, a_ent in result_entities, a_ent in discussion_entities)
            label = label_by_presence.get(presence)
            if label is not None:
                labelled[label].add(a_ent)

        for label, entities in labelled.items():
            if entities:
                # Sorted so the output does not depend on set iteration order.
                doc_answer_dict[label] = sorted(entities)

    return task_answers

### What do we know about COVID-19 risk factors?

Task Details

What do we know about COVID-19 risk factors? What have we learned from epidemiological studies?
Specifically, we want to know what the literature reports about:

* Data on potential risk factors
    1. Smoking, pre-existing pulmonary disease
    2. Co-infections (determine whether co-existing respiratory/viral infections make the virus more transmissible or virulent) and other co-morbidities
    3. Neonates and pregnant women
    4. Socio-economic and behavioral factors to understand the economic impact of the virus and whether there were differences.
* Transmission dynamics of the virus, including the basic reproductive number, incubation period, serial interval, modes of transmission and environmental factors
* Severity of disease, including risk of fatality among symptomatic hospitalized patients, and high-risk patient groups
* Susceptibility of populations
* Public health mitigation measures that could be effective for control


In [None]:
# Human-readable sub-task titles; shown as the heading / plot title for each task.
tasks = ['Smoking and pre-existing pulmonary disease', 
         'Co-infections, co-morbidities and respiratory infections',
         'Neonates and pregnant women',
         'Socio-economic and behavioral factors to understand the economic impact of the virus and whether there were differences.',
         'Transmission dynamics of the virus, including the basic reproductive number, incubation period, serial interval, modes of transmission and environmental factors',
         'Severity of disease, including risk of fatality among symptomatic hospitalized patients, and high-risk patient groups',
         'Susceptibility of populations',
         'Public health mitigation measures that could be effective for control'
        ]

# Solr query strings, paired with `tasks` by position (the cells below zip the two lists).
# "..."~10 is a proximity phrase query.
# NOTE(review): the second clause of the first query ("pulmonary disease"~10) has no
# COVID-19 term, unlike the other proximity queries - confirm this is intended.
queries = ['("Smoking COVID-19"~10 OR "pulmonary disease"~10)', 
           '("Co-infections COVID-19"~10 OR "co-morbidities COVID-19"~10 OR "respiratory infections COVID-19"~10)',
           '("Neonates COVID-19"~10 OR "pregnant women COVID-19"~10)',
           'Socio-economic, behavioral factors and economic impact',
           'Transmission dynamics, basic reproductive number, incubation period, serial interval, modes of transmission and environmental factors',
           'Severity of disease, risk of fatality among symptomatic hospitalized patients and high-risk patient groups',
           'Susceptibility of populations',
           'Public health mitigation measures that are effective for control'
        ]

In [None]:
#Load NLP Models only once
# model_names = ['en_ner_craft_md','en_ner_jnlpba_md', 'en_ner_bc5cdr_md','en_ner_bionlp13cg_md']
# models_dict = initilize_nlp_models(model_names)

In [None]:
# Label for each presence pattern of an entity; the three characters stand for
# introduction, result and discussion, in that order ('1' = present).
label_def = {
    '111': 'prior-newdata',
    '101': 'prior-strong',
    '100': 'prior',
    '001': 'speculative',
    '010': 'unknown',
    '011': 'novel',
}
# Pipelines handed to extract_entities() by populate_labels().
chosen_models = [generic_model]

In [None]:
# label_def_list and label_entities are assigned here but not read within this cell.
label_def_list = list(label_def.values())

# For each task: show its heading, run the paired query, then display the
# highlighted snippets and label their entities.
for task,query in zip(tasks, queries):
    label_entities = []
    display(HTML(f'<h3 style="color:red">Task - {task} \n</h3>'))
    numDocsFound, search_results = search(query)
    if numDocsFound > 0:
        task_answers = search_task_answers(search_results)
        # task_answers is overwritten on every iteration, so after the loop it
        # holds the answers of the last task that returned any documents.
        task_answers = populate_labels(task_answers)


In [None]:
# Specialised scispaCy NER models.
# NOTE(review): their install lines are commented out in the first cell - confirm
# the models are available before running this cell.
model_names = ['en_ner_craft_md','en_ner_jnlpba_md','en_ner_bc5cdr_md','en_ner_bionlp13cg_md']
models = initilize_nlp_models(model_names)

# One bar chart per task: the 50 most frequent entities in the top 100 hits.
for task,query in zip(tasks, queries):
    # Reset per task so each chart counts only this task's documents; the list
    # used to be created once and kept accumulating across tasks.
    q_entity_list = []
    # `q_text` was an undefined name here; the query for this task is `query`.
    num_docs_found, search_results = search(query, 100)
    for search_result in search_results:
        body = search_result.get('body', "")
        if not body:
            continue
        for model in models.values():
            body_doc = model(body)
            q_entity_list.extend(e.text for e in body_doc.ents)

    nerCntr = Counter(q_entity_list)
    freq_ners = nerCntr.most_common(50)
    print(freq_ners)
    print(len(freq_ners))

    # zip(*[]) cannot be unpacked into x, y; skip the chart when nothing was found.
    if not freq_ners:
        continue

    x,y = zip(*freq_ners)
    x,y = list(x),list(y)

    plt.figure(figsize=(15,10))
    ax= sns.barplot(x=x, y=y,palette = sns.cubehelix_palette(len(x)))
    plt.xlabel('Entity')
    plt.xticks(rotation=90)
    plt.ylabel('Frequency')
    plt.title(task)

In [None]:
def search_custom_query(query):
    """Search for ``query``, display the highlights and label their entities.

    Args:
        query: Solr query string.

    Returns:
        The labelled answer list from ``populate_labels`` when the search found
        at least one document, otherwise ``None``.
    """
    hit_count, results = search(query)
    if not hit_count > 0:
        return None

    return populate_labels(search_task_answers(results))

# Interactive wrapper around search_custom_query with 'Pregnant women' as the
# initial query value; leaving `searchbar` as the last expression renders the widget.
searchbar = widgets.interactive(search_custom_query, query='Pregnant women')
searchbar

## Force-Directed Graph Visualization

In [None]:
# Read the query currently held by the search widget and run it again, keeping
# the labelled answers in `task_answers` (also bound to `a`).
# search_custom_query returns None when nothing matched.
temp_query = searchbar.kwargs
searched_query = temp_query['query']
a = search_custom_query(searched_query)
task_answers = a

In [None]:
def cosine_score(x):
    """Score the pairwise similarity of a list of titles.

    Each title is parsed once with the module-level ``generic_model``; every
    unordered pair (i < j) is then scored with ``doc.similarity`` and the score
    is multiplied by 30.

    Args:
        x: Sequence of title strings.

    Returns:
        list of dicts with keys ``'Title1'``, ``'Title2'`` and ``'Score'``, one
        per pair, ordered by first title then second. Empty when ``x`` has
        fewer than two items.
    """
    # Parse each title once up front; re-parsing both titles for every pair
    # costs O(n^2) model calls for the same result.
    docs = [generic_model(title) for title in x]

    d = []
    for i in range(len(x)):
        for j in range(i + 1, len(x)):
            d.append({
                'Title1': x[i],
                'Title2': x[j],
                'Score': docs[i].similarity(docs[j]) * 30
            })

    return d

class network_graph:
    
    def __init__(self,search_result):
        data = pd.DataFrame(search_result)
        self.data = data
        col_list = data.columns
        drop_list = ['introduction','discussion','result']
        self.new_cols = []
        for cols in col_list:
            if cols not in drop_list:
                self.new_cols.append(cols)

        
    def extract_titles(self,search_result):
        title_list = []
        for title in range(len(search_result)):
            title_list.append(search_result[title]['title'])
            
        title_dict_temp = cosine_score(title_list)
        self.title_df = pd.DataFrame(title_dict_temp)
        title_df_temp1 = self.title_df[['Title1']]
        title_df_temp1.rename(columns={'Title1':'Title'},inplace=True)
        title_df_temp2 = self.title_df[['Title2']]
        title_df_temp2.rename(columns={'Title2':'Title'},inplace=True)
        self.merged_titles = pd.concat([title_df_temp1,title_df_temp2],axis=0)
        
    
    def extract_words(self,search_result):
        
        word_df = self.data[self.new_cols]
        
        word_df_temp = word_df.drop(['id','title'],axis=1)
        
        col_list = list(word_df_temp.columns)
        
        self.word_df_prior =pd.DataFrame()
        self.word_df_prior_strong =pd.DataFrame()
        self.word_df_prior_newdata =pd.DataFrame()
        self.word_df_speculative =pd.DataFrame()
        self.word_df_unknown =pd.DataFrame()
        self.word_df_novel =pd.DataFrame()
        
        word_df_prior_1 =pd.DataFrame()
        word_df_prior_strong_1 =pd.DataFrame()
        word_df_prior_newdata_1 =pd.DataFrame()
        word_df_speculative_1 =pd.DataFrame()
        word_df_unknown_1 =pd.DataFrame()
        word_df_novel_1 =pd.DataFrame()
            
        for col in col_list:
            if col =='prior':
                self.word_df_prior = word_df[['id','title','prior']]
                self.word_df_prior = self.word_df_prior.explode('prior')
                self.word_df_prior.dropna(subset = ["prior"], inplace=True)
                self.word_df_prior['Weight'] = 8
                
                word_df_prior_1 = self.word_df_prior[['prior']]
                word_df_prior_1.drop_duplicates(inplace=True)
                word_df_prior_1['Color'] = 'tomato'
                word_df_prior_1['Size'] = 10
                word_df_prior_1.columns = ['Words','Color','Size']
                
            elif col== 'prior-strong':
                self.word_df_prior_strong = word_df[['id','title','prior-strong']]
                self.word_df_prior_strong = self.word_df_prior_strong.explode('prior-strong')
                self.word_df_prior_strong.dropna(subset = ["prior-strong"], inplace=True)
                self.word_df_prior_strong['Weight'] = 6
                
                word_df_prior_strong_1 = self.word_df_prior_strong[['prior-strong']]
                word_df_prior_strong_1.drop_duplicates(inplace=True)
                word_df_prior_strong_1['Color'] = 'sienna'
                word_df_prior_strong_1['Size'] = 10
                word_df_prior_strong_1.columns = ['Words','Color','Size']
                
                
            elif col == 'prior_newdata':
                self.word_df_prior_newdata = word_df[['id','title','prior_newdata']]
                self.word_df_prior_newdata = self.word_df_prior_newdata.explode('prior_newdata')
                self.word_df_prior_newdata.dropna(subset = ["prior_newdata"], inplace=True)
                self.word_df_prior_newdata['Weight'] = 4
                
                word_df_prior_newdata_1 = self.word_df_prior_newdata[['prior_newdata']]
                word_df_prior_newdata_1.drop_duplicates(inplace=True)
                word_df_prior_newdata_1['Color'] = 'bisque'
                word_df_prior_newdata_1['Size'] = 10
                word_df_prior_newdata_1.columns = ['Words','Color','Size']
                
                
            elif col =='speculative':
                self.word_df_speculative = word_df[['id','title','speculative']]
                self.word_df_speculative = self.word_df_speculative.explode('speculative')
                word_df_speculative.dropna(subset = ["speculative"], inplace=True)
                self.word_df_speculative['Weight'] = 10
                
                word_df_speculative_1 = self.word_df_speculative[['speculative']]
                word_df_speculative_1.drop_duplicates(inplace=True)
                word_df_speculative_1['Color'] = 'dimgray'
                word_df_speculative_1['Size'] = 10
                word_df_speculative_1.columns = ['Words','Color','Size']
                
            elif col =='unknown':
                self.word_df_unknown = word_df[['id','title','unknown']]
                self.word_df_unknown = self.word_df_unknown.explode('unknown')
                self.word_df_unknown.dropna(subset = ["unknown"], inplace=True)
                self.word_df_unknown['Weight'] = 30
                
                word_df_unknown_1 = self.word_df_unknown[['unknown']]
                word_df_unknown_1.drop_duplicates(inplace=True)
                word_df_unknown_1['Color'] = 'black'
                word_df_unknown_1['Size'] = 10
                word_df_unknown_1.columns = ['Words','Color','Size']
                
            elif col =='novel':
                self.word_df_novel = word_df[['id','title','novel']]
                self.word_df_novel = self.word_df_novel.explode('novel')
                self.word_df_novel.dropna(subset = ["novel"], inplace=True)
                self.word_df_novel['Weight'] = 2
                
                word_df_novel_1 = self.word_df_novel[['novel']]
                word_df_novel_1.drop_duplicates(inplace=True)
                word_df_novel_1['Color'] = 'purple'
                word_df_novel_1['Size'] = 10
                word_df_novel_1.columns = ['Words','Color','Size']
                
    # Take all the titles and categorize them into one group
    
        title_df_1 = self.merged_titles[['Title']]
        title_df_1.drop_duplicates(inplace=True,keep='first')
        title_df_1['Color'] = 'firebrick'
        title_df_1['Size'] = 30
        title_df_1.columns = ['Words','Color','Size']
        
        # Combining all the dataframe together in one place

        frames = [title_df_1,word_df_prior_1,word_df_prior_strong_1,word_df_prior_newdata_1,word_df_speculative_1,word_df_unknown_1,word_df_novel_1]
         
        self.merged_df = pd.concat(frames)
        
        
    # Changing the index of merged_df to 'Words' so that can combine it with node

        self.merged_df_index = self.merged_df.drop_duplicates(subset='Words',keep='first')
        self.merged_df_index.set_index('Words',inplace=True)
        
        
    def network_creation(self):
        """Draw a static spring-layout picture of the title/keyword graph.

        Edges are read from ``self.title_df`` and from each per-category
        ``self.word_df_*`` frame. Node colours come from the ``Color`` column
        of ``self.merged_df_index`` and node sizes from the node degree. The
        picture is drawn on a new 22x22 matplotlib figure; nothing is returned.
        """
        # One entry per edge table:
        #   (attribute holding the frame, source column, target column, weight column)
        # The order is significant: it fixes the order in which nodes enter the
        # graph, and that order is reused below to line the colours up with nodes.
        # NOTE(review): the title entry uses 'Title1' for both endpoints, so every
        # title row becomes a self-loop. A second title column may have been
        # intended -- confirm against the code that builds self.title_df.
        edge_tables = (
            ('title_df', 'Title1', 'Title1', 'Score'),
            ('word_df_prior', 'title', 'prior', 'Weight'),
            ('word_df_prior_strong', 'title', 'prior-strong', 'Weight'),
            ('word_df_novel', 'title', 'novel', 'Weight'),
            ('word_df_speculative', 'title', 'speculative', 'Weight'),
            ('word_df_prior_newdata', 'title', 'prior_newdata', 'Weight'),
            ('word_df_unknown', 'title', 'unknown', 'Weight'),
        )

        graph = nx.Graph()
        for frame_attr, source_col, target_col, weight_col in edge_tables:
            frame = getattr(self, frame_attr)
            if frame.empty:
                continue
            for _, record in frame.iterrows():
                graph.add_edge(
                    record[source_col],
                    record[target_col],
                    weight=record[weight_col],
                )

        # Align the per-word styling rows with the graph's node order, then turn
        # the colour names into integer category codes usable with a colormap.
        node_styles = self.merged_df_index.reindex(graph.nodes())
        node_styles['Color'] = pd.Categorical(node_styles['Color'])
        colour_codes = node_styles['Color'].cat.codes

        # Plot the force-directed graph.
        plt.figure(figsize=(22, 22))
        degree_of = nx.degree(graph)
        layout = nx.spring_layout(graph, k=0.5)
        # The +1 keeps degree-0 nodes visible.
        marker_sizes = [100 * (degree_of[node] + 1) for node in graph.nodes()]
        nx.draw_networkx(
            graph,
            pos=layout,
            node_color=colour_codes,
            cmap=plt.cm.Set2,
            node_size=marker_sizes,
            alpha=0.7,
        )
        
    def force_directed_graphs(self):
        """Render the title/keyword graph as an interactive pyvis network.

        Nodes come from ``self.merged_df`` (columns ``Words``, ``Color`` and
        ``Size``). Edges come from ``self.title_df`` and each per-category
        ``self.word_df_*`` frame. The network is written to ``mygraph.html``
        and shown in the notebook; nothing is returned.
        """
        vis_net = Network(notebook=True)

        # Renumber the rows 0..n-1 and discard the old index column, so the
        # node columns handed to pyvis share one positional index.
        node_table = self.merged_df.reset_index().drop(columns=['index'])
        vis_net.add_nodes(
            node_table['Words'],
            title=node_table['Words'],
            color=node_table['Color'],
            size=node_table['Size'].to_list(),
        )

        # One entry per edge table:
        #   (attribute holding the frame, source column, target column, weight column)
        # NOTE(review): the title entry uses 'Title1' for both endpoints, so every
        # title row becomes a self-loop. A second title column may have been
        # intended -- confirm against the code that builds self.title_df.
        edge_tables = (
            ('title_df', 'Title1', 'Title1', 'Score'),
            ('word_df_prior', 'title', 'prior', 'Weight'),
            ('word_df_prior_strong', 'title', 'prior-strong', 'Weight'),
            ('word_df_novel', 'title', 'novel', 'Weight'),
            ('word_df_speculative', 'title', 'speculative', 'Weight'),
            ('word_df_prior_newdata', 'title', 'prior_newdata', 'Weight'),
            ('word_df_unknown', 'title', 'unknown', 'Weight'),
        )
        for frame_attr, source_col, target_col, weight_col in edge_tables:
            frame = getattr(self, frame_attr)
            if frame.empty:
                continue
            for _, record in frame.iterrows():
                vis_net.add_edge(
                    record[source_col],
                    record[target_col],
                    weight=record[weight_col],
                )

        # Physics is not configured explicitly here.
        display(vis_net.show("mygraph.html"))

In [None]:
# Run the network-graph pipeline end to end for `task_answers`
# (`task_answers` is defined in an earlier cell).
# Call order matters: extract_words() assembles the merged node table
# (self.merged_df) that force_directed_graphs() reads, and both plotting
# methods also read self.title_df -- presumably prepared by extract_titles();
# confirm against its definition.
ng= network_graph(task_answers)
ng.extract_titles(task_answers)
ng.extract_words(task_answers)
# Interactive pyvis rendering; writes and displays "mygraph.html".
ng.force_directed_graphs()
# Alternative: static networkx/matplotlib drawing built from the same edge
# tables. Left disabled.
# ng.network_creation()