#### (3月27日) Import XML files

In [19]:
import numpy as np
import pandas as pd

from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup

pd.options.mode.chained_assignment = None

import os
file_list = os.listdir('refs')

In [None]:
def import_article(df, file):
    
    # Specify path of xml file
    path = 'refs/' + file
    
    # Read data inside xml file, parse with BeautifulSoup
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()
    Bs_data = BeautifulSoup(data, "xml")
    
    # Extract ID data
    ID = Bs_data.find('article-id', {'pub-id-type':'doi'}).text
    title = Bs_data.find('article-title').text
    author = Bs_data.find('surname').text
    year = Bs_data.find_all('pub-date')[1].find('year').text  #2nd element for collection publication
    
    # Get text of every <p> element (article texts)
    body_text = [i.text for i in Bs_data.findAll('p')]    
    
    # Insert extracted info in dataframe/corpus
    INFO = {'Author': author, 'Year': year, 'Title': title, 'Text': body_text, 'ID': ID}
    
    df_row = pd.DataFrame([INFO])
    df = pd.concat([df, df_row])
    
    return df    

In [None]:
# Get corpus as dataframe
# Done 27 Mar 2022!

df = pd.DataFrame()
for file in file_list:
    df = import_article(df, file)
    
corpus_raw = df.reset_index(drop=True)
corpus_raw

In [None]:
# print to inspect

print(corpus_raw.loc[0, 'Text'])

#### (4月7日) Removing unsuitable paragraphs

In [None]:
import copy
corpus_new = copy.deepcopy(corpus_raw)

In [None]:
def screenmanually(doc):
    
    from IPython.display import clear_output
    
    newdoc = ["DONE"]
    
    for para in doc:
        print(para)
        #print('\n')
        oper = input('Keep? 1 = Yes, 0 = No     ')
        if oper == '1':
            newdoc.append(para)
        clear_output()
        print('\n')
    
    return newdoc       

In [None]:
def replacer(n, corpus_raw=corpus_raw, corpus_new=corpus_new):
    
    doc = corpus_raw.loc[n, 'Text']
    newdoc = screenmanually(doc)
    corpus_new.loc[n, 'Text'] = newdoc
    print(newdoc)
    
    return corpus_new   

In [None]:
# Remove paras in each doc in corpus_raw not containing useful information
# Keep: Abstract, Body texts, Fig/Tab legends
# Remove: COI, Acknowledgments, Author contrib

# DONE - 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19
# TODO - finished 4.9

# corpus_new = replacer()

In [None]:
# Save
# corpus_new_save = copy.deepcopy(corpus_new)

# Restore
# corpus_new = copy.deepcopy(corpus_new_save)

In [None]:
# Remove "DONE" tag

# for i in range(0,20):
#     corpus_new.loc[i, "Text"] = corpus_new.loc[i, "Text"][1:]

In [None]:
# Label docs by group (JP-EN, EN-EN)

# corpus_new['Group'] = pd.Series(['JP-EN', 'JP-EN', 'EN-EN', 'EN-EN', 'EN-EN',
#                                 'JP-EN', 'EN-EN', 'EN-EN', 'EN-EN', 'EN-EN',
#                                 'JP-EN', 'JP-EN', 'JP-EN', 'JP-EN', 'JP-EN',
#                                 'EN-EN', 'EN-EN', 'JP-EN', 'EN-EN', 'JP-EN'])

In [None]:
# Pickle
# corpus_new.to_pickle('savefiles/corpusfull_20220410.pkl')

# Unpickle
corpus_new = pd.read_pickle('savefiles/corpusfull_20220410.pkl')

In [None]:
corpus_new

#### (4月10日) Text preprocessing

In [1]:
import numpy as np
import pandas as pd
import copy

# Unpickle
corpus_new = pd.read_pickle('savefiles/corpusfull_20220410.pkl')

In [2]:
def collapse(text):
    
    """
    Paragraphs extracted from XML files contain different numbers of sentences,
    and even incomplete sentences. This function collapses a list of paragraphs
    into a list of sentences or sentence-equivalents in advance of NLP processing.
    """
    
    doc_raw = ""
    punc = set([".", ",", ";", ":"])
    for i, para in enumerate(text):
        if para[-1] in punc:
            doc_raw += (para + " ")
        else:
            doc_raw += (para + ". ")
        
    return doc_raw

In [3]:
def preprocess_gen(text):
    
    """
    This function is primarily for removing references, and fixing spacing
    between/within sentences in preparation for scispaCy language modeling.
    """
    
    import re

    text = text.replace('\u200a', '').replace('\n', '')   # remove weird space code, newlines
    text = re.sub('\[(\d+)\]', '', text)                  # remove refs ([1], [23], etc.)
    text = text.replace(' ,', '').replace(' .', '.')      # fix spaces created by prev line
    text = text.replace(' ;', ';').replace(' :', ':')     # 
    text = text.replace('  ', ' ')
    
    return text

In [4]:
# Converts each doc from list of paras to one long string

for i in range(0,20):
    corpus_new.loc[i, "Text"] = collapse(corpus_new.loc[i, "Text"])

# Preprocess each doc before Spacy modeling

for i in range(0,20):
    corpus_new.loc[i, "Text"] = preprocess_gen(corpus_new.loc[i, "Text"])

In [None]:
# For practicing on single texts

# text = corpus_new.loc[0, "Text"]
# text

#### (4月11日) Language modeling

In [5]:
import scispacy
import spacy

nlp = spacy.load("en_core_sci_md")

In [None]:
# function to get unique tokens from list
def unique(ls):
 
    unique_list = []
     
    for x in ls:
        if x not in unique_list:
            unique_list.append(x)
    
    return unique_list

In [13]:
def preprocess_ttr(doc):
    
    punc_to_skip = set(['±', '=', '>', '<'])
    
    # Collect lemmas not tagged by spaCy as 1. punctuation, 2. digits, 3. URLs, or 4. stop words
    tokens = [tok.lemma_ for tok in doc if not (tok.is_punct | tok.is_digit | tok.like_url | tok.is_stop)]
    # Remove any tokens containing mid-string digits (e.g. "P5-a") or punc ('t(are')
    tokens = [tok for tok in tokens if not re.search("\d", tok)]
    tokens = [tok for tok in tokens if not re.search("\(", tok)]
    tokens = [tok for tok in tokens if not re.search("\)", tok)]
    tokens = [tok for tok in tokens if tok not in punc_to_skip] # can skip?
    # Remove small words (e.g. 'a', 'P', 'mm')
    tokens = [tok for tok in tokens if len(tok) > 3]
    # Unify to lowercase (for matching)
    tokens = [tok.lower() for tok in tokens]
    
    return tokens

def type_token_ratio(doc):
    
    token_list = preprocess_ttr(doc)
    n_type = len(unique(token_list))
    n_token = len(token_list)
    ttr = n_type/n_token
    
    return ttr

In [6]:
# Sandbox
punc_to_skip = set(['±', '=', '>', '<'])

# Getting "amount"

# Pull text from df
text = corpus_new.loc[0, "Text"]
# Run scispaCy model
doc = nlp(text)

In [10]:
import re

tokens = [tok.lemma_ for tok in doc if not (tok.is_punct | tok.is_digit | tok.like_url | tok.is_stop)]
tokens = [tok for tok in tokens if not re.search("\d", tok)]
tokens = [tok for tok in tokens if not re.search("\(", tok)]
tokens = [tok for tok in tokens if not re.search("\)", tok)]
tokens = [tok for tok in tokens if tok not in punc_to_skip]
tokens = [tok for tok in tokens if len(tok) > 3]
print(tokens)

['emotional', 'maturity', 'social', 'awareness', 'important', 'adolescent', 'particularly', 'college', 'student', 'begin', 'face', 'challenge', 'risk', 'adult', 'world', 'relatively', 'little', 'research', 'personality', 'maturation', 'psychological', 'development', 'late', 'adolescence', 'neural', 'change', 'underlie', 'development', 'investigate', 'correlation', 'psychological', 'property', 'neuroticism', 'extraversion', 'anxiety', 'depression', 'late', 'adolescent', 'year', 'month', 'year', 'month', 'result', 'reveal', 'late', 'adolescent', 'neurotic', 'anxious', 'depressive', 'extraverted', 'aged', 'participant', 'observe', 'video', 'clip', 'depict', 'hand', 'movement', 'risk', 'harm', 'risk-taking', 'safe', 'action', 'functional', 'magnetic', 'resonance', 'imaging', 'fmri', 'result', 'reveal', 'risk-taking', 'action', 'elicit', 'significantly', 'strong', 'activation', 'bilateral', 'inferior', 'parietal', 'lobule', 'temporal', 'visual', 'region', 'superior/middle', 'temporal', 'are

In [12]:
counter = 0
for tok in tokens:
    if tok == "period":
        counter += 1
counter

15

In [None]:
tokens = [tok.lemma_ for tok in doc if not (tok.is_punct | tok.is_digit | tok.like_url | tok.is_stop)]
tokens = [tok for tok in tokens if not re.search("\d", tok)]
tokens = [tok for tok in tokens if not re.search("\(", tok)]
tokens = [tok for tok in tokens if not re.search("\)", tok)]
tokens = [tok for tok in tokens if tok not in punc_to_skip]
tokens = [tok for tok in tokens if len(tok) > 3]
print(tokens)

In [14]:
def lexeme_counter(doc, string):
    
    """
    Function for getting raw lemma count in a document.
    """
        
    tokens = preprocess_ttr(doc)
    counter = 0
    for tok in tokens:
        if str(tok) == str(string):
            counter += 1
    
    return counter

In [22]:
for lexeme in ["amount", "value", "level"]:
    
    new_name_n = "n_" + str(lexeme)
    new_name_adj = str(new_name_n) + "_adj"
    
    for i in range(0,20):
        # Pull text from df
        text = corpus_new.loc[i, "Text"]
        # Run scispaCy model
        doc = nlp(text)
        # Count lexeme (includes preprocessing)
        ct = lexeme_counter(doc, lexeme)
        # Put in results_df
        results_df.loc[i, new_name_n] = ct
        results_df.loc[i, new_name_adj] = (ct / results_df.loc[i, 'Word Count']) * np.mean(results_df['Word Count'])

results_df

Unnamed: 0,Group,Author,Title,n_amount,Word Count,n_amount_adj,n_value,n_value_adj,n_level,n_level_adj
0,JP-EN,Tamura,Neural Network Development in L,0.0,6926.0,0.0,4.0,3.842189,8.0,7.684378
1,JP-EN,Watanabe,Diminished Medial Prefrontal Ac,0.0,6918.0,0.0,1.0,0.961658,4.0,3.846632
2,EN-EN,Sobhani,Interpersonal Liking Modulates,0.0,5953.0,0.0,5.0,5.587729,14.0,15.645641
3,EN-EN,Majdandžić,The Human Factor: Behavioral an,0.0,9587.0,0.0,4.0,2.775738,4.0,2.775738
4,EN-EN,Dixon,The Decision to Engage Cognitiv,3.0,8620.0,2.315342,24.0,18.522738,5.0,3.858904
5,JP-EN,Ohta,Syntactic Computation in the Hu,1.0,8739.0,0.761271,7.0,5.328899,8.0,6.090171
6,EN-EN,Deeley,Using Hypnotic Suggestion to Mo,0.0,7103.0,0.0,2.0,1.873223,5.0,4.683056
7,EN-EN,Pawliczek,Anger under Control: Neural Cor,0.0,4688.0,0.0,7.0,9.933714,19.0,26.962937
8,EN-EN,Jansma,fMRI Guided rTMS Evidence for R,0.0,4865.0,0.0,0.0,0.0,3.0,4.102415
9,EN-EN,Lidzba,Complex Visual Search in Childr,0.0,4478.0,0.0,0.0,0.0,7.0,10.399565


#### 1. Type/token ratio (lexical diversity)

In [None]:
def compare_means(var):
    
    from scipy.stats import ttest_ind
    
    jp_stats = list(results_df[results_df['Group'] == 'JP-EN'].loc[:, var])
    en_stats = list(results_df[results_df['Group'] == 'EN-EN'].loc[:, var])
    P = ttest_ind(jp_stats, en_stats).pvalue
    
    print(f'Mean {var}, JP-EN:  {np.mean(jp_stats)}')
    print(f'Mean {var}, EN-EN:  {np.mean(en_stats)}')
    print(f'Sig. (unpaired t-test): {P}')
    print('\n')

In [15]:
# Prepare new df for inspecting results

results_df = corpus_new[['Group', 'Author', 'Title']]
results_df['Title'] = pd.Series([title[:31] for title in results_df['Title']])
results_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['Title'] = pd.Series([title[:31] for title in results_df['Title']])


Unnamed: 0,Group,Author,Title
0,JP-EN,Tamura,Neural Network Development in L
1,JP-EN,Watanabe,Diminished Medial Prefrontal Ac
2,EN-EN,Sobhani,Interpersonal Liking Modulates
3,EN-EN,Majdandžić,The Human Factor: Behavioral an
4,EN-EN,Dixon,The Decision to Engage Cognitiv
5,JP-EN,Ohta,Syntactic Computation in the Hu
6,EN-EN,Deeley,Using Hypnotic Suggestion to Mo
7,EN-EN,Pawliczek,Anger under Control: Neural Cor
8,EN-EN,Jansma,fMRI Guided rTMS Evidence for R
9,EN-EN,Lidzba,Complex Visual Search in Childr


In [None]:
# Calculate raw type/token ratio (lemmatized)

for i in range(0,20):
    # Pull text from df
    text = corpus_new.loc[i, "Text"]
    # Run scispaCy model
    doc = nlp(text)
    # Preprocess
    ttr = type_token_ratio(doc)
    # Put in results_df
    results_df.loc[i, 'Type/Token'] = ttr

results_df.head(5)

In [21]:
# Calculate adjusted type/token ratio (lemmatized) divided by word count

for i in range(0,20):
    # Pull text from df
    text = corpus_new.loc[i, "Text"]
    # Split by word (" " for simplicity)
    word_ct = len(text.split(' '))
    # Put in results_df
    results_df.loc[i, 'Word Count'] = word_ct
    
results_df

Unnamed: 0,Group,Author,Title,n_amount,Word Count,n_amount_adj
0,JP-EN,Tamura,Neural Network Development in L,0.0,6926.0,
1,JP-EN,Watanabe,Diminished Medial Prefrontal Ac,,6918.0,
2,EN-EN,Sobhani,Interpersonal Liking Modulates,,5953.0,
3,EN-EN,Majdandžić,The Human Factor: Behavioral an,,9587.0,
4,EN-EN,Dixon,The Decision to Engage Cognitiv,,8620.0,
5,JP-EN,Ohta,Syntactic Computation in the Hu,,8739.0,
6,EN-EN,Deeley,Using Hypnotic Suggestion to Mo,,7103.0,
7,EN-EN,Pawliczek,Anger under Control: Neural Cor,,4688.0,
8,EN-EN,Jansma,fMRI Guided rTMS Evidence for R,,4865.0,
9,EN-EN,Lidzba,Complex Visual Search in Childr,,4478.0,


In [None]:
# Divide by mean word count of all documents
results_df['TTR_adj'] = (results_df['Type/Token'] / results_df['Word Count']) * np.mean(results_df['Word Count'])
    
results_df.head(6)

In [None]:
# Compare mean stats between JP-EN and EN-EN corpora

compare_means('Type/Token')
compare_means('Word Count')
compare_means('TTR_adj')

Comments (4/10): H1 seems to be rejected. fMRI studies authored by Japanese scientists are just as lexically sophisticated as comparable docs authored by Anglophone counterparts. Perhaps this is a good thing: i.e., any differences discovered later are a product of linguistic features, rather than scientific knowledge/ignorance.

In [None]:
# Examine the entities extracted by the mention detector. Note that they don't have types
# like in SpaCy, and they are more general (e.g including verbs) - these are any spans
# which might be an entity in UMLS, a large biomedical database.
print(a.ents)


#>>> (Myeloid derived suppressor cells,
#     MDSC,
#     immature,
#     myeloid cells,
#     immunosuppressive activity,
#     accumulate,
#     tumor-bearing mice,
#     humans,
#     cancer,
#     hepatocellular carcinoma,
#     HCC)

In [None]:
# We can also visualise dependency parses
# (This renders automatically inside a jupyter notebook!):
from spacy import displacy
# displacy.render(next(doc2.sents), style='dep', jupyter=True)
displacy.render(a, style='dep', jupyter=True)

# See below for the generated SVG.
# Zoom your browser in a bit!