#### (3月27日) Import XML files

In [None]:
import numpy as np
import pandas as pd

from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup

pd.options.mode.chained_assignment = None

import os

# os.listdir returns entries in arbitrary order. The corpus rows follow this list
# and later cells address documents by row number, so sort the listing to make
# the row order reproducible across machines and re-runs.
file_list = sorted(os.listdir('refs'))

In [None]:
def import_article(df, file):
    """
    Parse one article XML file from the 'refs' folder and append it to a corpus.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus collected so far (may be empty).
    file : str
        Name of the XML file inside 'refs/'.

    Returns
    -------
    pandas.DataFrame
        A new frame made of `df` plus one row with the columns 'Author', 'Year',
        'Title', 'Text' (list holding the text of every <p> element) and 'ID'
        (the text of the <article-id pub-id-type="doi"> element).
        The appended row keeps its own index label; callers reset the index.
    """

    # Specify path of xml file
    path = 'refs/' + file

    # Read data inside xml file, parse with BeautifulSoup
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), "xml")

    # Extract ID data (each find() returns the first matching element)
    ID = soup.find('article-id', {'pub-id-type': 'doi'}).text
    title = soup.find('article-title').text
    author = soup.find('surname').text

    # The 2nd <pub-date> is the collection publication date. Files with a single
    # <pub-date> used to fail with IndexError; fall back to that only date.
    pub_dates = soup.find_all('pub-date')
    year_source = pub_dates[1] if len(pub_dates) > 1 else pub_dates[0]
    year = year_source.find('year').text

    # Get text of every <p> element (article texts).
    # find_all is the current name of the deprecated findAll alias.
    body_text = [p.text for p in soup.find_all('p')]

    # Insert extracted info in dataframe/corpus
    INFO = {'Author': author, 'Year': year, 'Title': title, 'Text': body_text, 'ID': ID}
    df_row = pd.DataFrame([INFO])

    return pd.concat([df, df_row])

In [None]:
# Get corpus as dataframe
# Done 27 Mar 2022!

# Start from an empty frame; each import_article call returns the frame with
# one more row (one row per file in file_list).
df = pd.DataFrame()
for file in file_list:
    df = import_article(df, file)
    
# Drop the index labels carried over from the concatenated rows and renumber.
corpus_raw = df.reset_index(drop=True)
corpus_raw

In [None]:
# print to inspect

print(corpus_raw.loc[0, 'Text'])

#### (4月7日) Removing unsuitable paragraphs

In [None]:
import copy
# Work on a copy: the screening cells below write their results into corpus_new
# and read the original paragraphs from corpus_raw.
corpus_new = copy.deepcopy(corpus_raw)

In [None]:
def screenmanually(doc):
    """
    Show each paragraph of `doc` and ask on the console whether to keep it.

    A paragraph is kept only when the answer is exactly '1'; any other answer
    drops it. The returned list starts with the marker string "DONE", followed
    by the kept paragraphs in their original order.
    """

    from IPython.display import clear_output

    kept = ["DONE"]

    for paragraph in doc:
        print(paragraph)
        answer = input('Keep? 1 = Yes, 0 = No     ')
        if answer == '1':
            kept.append(paragraph)
        # Wipe the cell output so only the next paragraph is on screen
        clear_output()
        print('\n')

    return kept

In [None]:
def replacer(n, corpus_raw=None, corpus_new=None):
    """
    Manually screen document `n` and store the kept paragraphs in `corpus_new`.

    Parameters
    ----------
    n : row label of the document to screen.
    corpus_raw : DataFrame to read the unscreened 'Text' from.
        Defaults to the notebook-level `corpus_raw`.
    corpus_new : DataFrame that receives the screened 'Text'.
        Defaults to the notebook-level `corpus_new`.

    Returns
    -------
    The `corpus_new` frame that was written to (modified in place).
    """

    # Resolve the defaults when the function is CALLED. Binding them in the
    # signature captured the frames that existed when the cell was run, so after
    # corpus_new was rebound (restore / unpickle cells) the function kept
    # writing to, and returning, the stale object.
    if corpus_raw is None:
        corpus_raw = globals()['corpus_raw']
    if corpus_new is None:
        corpus_new = globals()['corpus_new']

    doc = corpus_raw.loc[n, 'Text']
    newdoc = screenmanually(doc)
    # .at stores the list as the value of this single cell
    corpus_new.at[n, 'Text'] = newdoc
    print(newdoc)

    return corpus_new

In [None]:
# Remove paras in each doc in corpus_raw not containing useful information
# Keep: Abstract, Body texts, Fig/Tab legends
# Remove: COI, Acknowledgments, Author contrib

# DONE - 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19
# TODO - finished 4.9

# corpus_new = replacer()

In [None]:
# Save
# corpus_new_save = copy.deepcopy(corpus_new)

# Restore
# corpus_new = copy.deepcopy(corpus_new_save)

In [None]:
# Remove "DONE" tag

# for i in range(0,20):
#     corpus_new.loc[i, "Text"] = corpus_new.loc[i, "Text"][1:]

In [None]:
# Label docs by group (JP-EN, EN-EN)

# corpus_new['Group'] = pd.Series(['JP-EN', 'JP-EN', 'EN-EN', 'EN-EN', 'EN-EN',
#                                 'JP-EN', 'EN-EN', 'EN-EN', 'EN-EN', 'EN-EN',
#                                 'JP-EN', 'JP-EN', 'JP-EN', 'JP-EN', 'JP-EN',
#                                 'EN-EN', 'EN-EN', 'JP-EN', 'EN-EN', 'JP-EN'])

In [63]:
# Pickle (kept commented out; writes the screened corpus to disk)
# corpus_new.to_pickle('savefiles/corpusfull_20220410.pkl')

# Unpickle: rebinds corpus_new to the frame stored in the save file.
# NOTE(review): only unpickle files you created yourself - loading a pickle can run code.
corpus_new = pd.read_pickle('savefiles/corpusfull_20220410.pkl')

In [64]:
corpus_new

Unnamed: 0,Author,Year,Title,Text,ID,Group
0,Tamura,2012,Neural Network Development in Late Adolescents...,[Emotional maturity and social awareness are i...,10.1371/journal.pone.0039527,JP-EN
1,Watanabe,2012,Diminished Medial Prefrontal Activity behind A...,[Individuals with autism spectrum disorders (A...,10.1371/journal.pone.0039561,JP-EN
2,Sobhani,2012,Interpersonal Liking Modulates Motor-Related N...,[Observing someone perform an action engages b...,10.1371/journal.pone.0046809,EN-EN
3,Majdandžić,2012,The Human Factor: Behavioral and Neural Correl...,[The extent to which people regard others as f...,10.1371/journal.pone.0047698,EN-EN
4,Dixon,2012,The Decision to Engage Cognitive Control Is Dr...,[Cognitive control is a fundamental skill refl...,10.1371/journal.pone.0051637,EN-EN
5,Ohta,2013,Syntactic Computation in the Human Brain: The ...,[Our goal of this study is to characterize the...,10.1371/journal.pone.0056230,JP-EN
6,Deeley,2013,Using Hypnotic Suggestion to Model Loss of Con...,[The feeling of voluntary control and awarenes...,10.1371/journal.pone.0078324,EN-EN
7,Pawliczek,2013,Anger under Control: Neural Correlates of Frus...,[Antisocial behavior and aggression are promin...,10.1371/journal.pone.0078503,EN-EN
8,Jansma,2013,fMRI Guided rTMS Evidence for Reduced Left Pre...,[Cognitive tasks that do not change the requir...,10.1371/journal.pone.0080256,EN-EN
9,Lidzba,2013,Complex Visual Search in Children and Adolesce...,[Complex visuospatial processing relies on dis...,10.1371/journal.pone.0085168,EN-EN


#### (4月10日) Text preprocessing

In [65]:
import numpy as np
import pandas as pd
import copy

# Unpickle (same save file as loaded in the previous section;
# presumably repeated so this section can be run on its own - confirm)
corpus_new = pd.read_pickle('savefiles/corpusfull_20220410.pkl')

In [14]:
def collapse(text):
    
    """
    Paragraphs extracted from XML files contain different numbers of sentences,
    and even incomplete sentences. This function collapses a list of paragraphs
    into one string of sentences or sentence-equivalents in advance of NLP processing.

    Each paragraph is followed by a single space; a paragraph that does not end
    in one of ". , ; :" gets a period appended first. Empty paragraphs are
    skipped (they previously raised IndexError on para[-1]).

    Parameters
    ----------
    text : iterable of str
        The paragraphs of one document.

    Returns
    -------
    str
        The concatenated document ("" for an empty list).
    """
    
    punc = {".", ",", ";", ":"}
    pieces = []
    for para in text:
        if not para:
            continue
        pieces.append(para if para[-1] in punc else para + ".")
        
    return "".join(piece + " " for piece in pieces)

In [67]:
def preprocess_gen(text):
    
    """
    This function is primarily for removing references, and fixing spacing
    between/within sentences in preparation for scispaCy language modeling.

    Parameters
    ----------
    text : str
        One document as a single string.

    Returns
    -------
    str
        The text without '\u200a' characters, newlines and numeric reference
        markers such as [1] or [23], with the space before , . ; : removed and
        runs of spaces reduced to one space.
    """
    
    import re

    text = text.replace('\u200a', '').replace('\n', '')   # remove weird space code, newlines
    text = re.sub(r'\[(\d+)\]', '', text)                 # remove refs ([1], [23], etc.)
    # Fix spaces created by prev line. The comma case used to replace ' ,' with ''
    # which deleted the comma itself ("cortex [1], and" -> "cortex and").
    text = text.replace(' ,', ',').replace(' .', '.')
    text = text.replace(' ;', ';').replace(' :', ':')
    # Collapse any run of spaces; a single replace('  ', ' ') left doubles behind
    # when three or more spaces were adjacent.
    text = re.sub(r' {2,}', ' ', text)
    
    return text

In [69]:
# Converts each doc from list of paras to one long string,
# then preprocesses each doc before Spacy modeling.
#
# Every row of the corpus is processed (no hard-coded range(0, 20)).
# A Text that is already a string is not collapsed again: collapse() iterates
# over its argument, so re-running this cell on collapsed text would split the
# document into single characters.

corpus_new["Text"] = [
    preprocess_gen(text if isinstance(text, str) else collapse(text))
    for text in corpus_new["Text"]
]

In [None]:
# For practicing on single texts

# text = corpus_new.loc[0, "Text"]
# text

#### (4月11日) Language modeling

In [17]:
import scispacy
import spacy

# Load the en_core_sci_md pipeline once; later cells call this `nlp` object on each text.
nlp = spacy.load("en_core_sci_md")

In [22]:
# `text` is otherwise only assigned in a commented-out practice cell, so this cell
# failed on a fresh kernel. Use document 0, which is the document shown in the output.
text = corpus_new.loc[0, "Text"]
doc = nlp(text)

list(doc.sents)
#print(list(doc.sents))

[Emotional maturity and social awareness are important for adolescents, particularly college students beginning to face the challenges and risks of the adult world.,
 However, there has been relatively little research into personality maturation and psychological development during late adolescence and the neural changes underlying this development.,
 We investigated the correlation between psychological properties (neuroticism, extraversion, anxiety, and depression) and age among late adolescents (n=25, from 18 years and 1 month to 22 years and 8 months).,
 The results revealed that late adolescents became less neurotic, less anxious, less depressive and more extraverted as they aged.,
 Participants then observed video clips depicting hand movements with and without a risk of harm (risk-taking or safe actions) during functional magnetic resonance imaging (fMRI).,
 The results revealed that risk-taking actions elicited significantly stronger activation in the bilateral inferior parieta

In [None]:
# function to get unique tokens from list
def unique(ls):
    """
    Return the distinct items of `ls` as a list, in order of first appearance.

    Hashable items (e.g. token strings) are de-duplicated in linear time through
    a dict; if any item is unhashable the original pairwise comparison is used.
    """

    items = list(ls)
    try:
        # dict keys keep insertion order, so the first occurrence wins
        return list(dict.fromkeys(items))
    except TypeError:
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

In [52]:
def preprocess_ttr(doc):
    """
    Reduce a parsed document to the list of lemmas used for type/token counting.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose `lemma_`, `is_punct`, `is_digit`, `like_url`
        and `is_stop` (as the tokens of the spaCy Doc returned by `nlp` do).

    Returns
    -------
    list of str
        Lemmas, in document order, of tokens that are not flagged as
        punctuation, digits, URLs or stop words, contain no digit or
        parenthesis, and are longer than 3 characters.
    """

    # Imported here because no visible cell imports `re` at notebook level;
    # without it this function raised NameError on a fresh kernel.
    import re

    # Any digit (e.g. "P5-a") or parenthesis (e.g. 't(are') disqualifies a lemma
    unwanted_chars = re.compile(r"[\d()]")

    tokens = []
    for tok in doc:
        # Skip what is tagged as 1. punctuation, 2. digits, 3. URLs, or 4. stop words
        if tok.is_punct or tok.is_digit or tok.like_url or tok.is_stop:
            continue
        lemma = tok.lemma_
        if unwanted_chars.search(lemma):
            continue
        # Remove small words (e.g. 'a', 'P', 'mm'). This also drops the single
        # symbols '±', '=', '>', '<', so no separate symbol filter is needed.
        if len(lemma) > 3:
            tokens.append(lemma)

    return tokens

def type_token_ratio(doc):
    """
    Compute the lemmatized type/token ratio of a parsed document.

    The document is filtered with preprocess_ttr; the ratio is the number of
    distinct lemmas divided by the total number of lemmas that remain.

    Returns
    -------
    float
        The ratio, or NaN when no token survives the filtering (the ratio is
        undefined there; this used to raise ZeroDivisionError).
    """

    token_list = preprocess_ttr(doc)
    n_token = len(token_list)
    if n_token == 0:
        return float('nan')

    # Lemmas are strings, so a set gives the number of distinct types directly
    n_type = len(set(token_list))

    return n_type / n_token

In [50]:
# Sandbox
# The filtering steps tried out here are the ones in preprocess_ttr; call the
# function instead of keeping a second copy that can drift out of sync.
tokens = preprocess_ttr(doc)
print(tokens)

['emotional', 'maturity', 'social', 'awareness', 'important', 'adolescent', 'particularly', 'college', 'student', 'begin', 'face', 'challenge', 'risk', 'adult', 'world', 'relatively', 'little', 'research', 'personality', 'maturation', 'psychological', 'development', 'late', 'adolescence', 'neural', 'change', 'underlie', 'development', 'investigate', 'correlation', 'psychological', 'property', 'neuroticism', 'extraversion', 'anxiety', 'depression', 'late', 'adolescent', 'year', 'month', 'year', 'month', 'result', 'reveal', 'late', 'adolescent', 'neurotic', 'anxious', 'depressive', 'extraverted', 'aged', 'participant', 'observe', 'video', 'clip', 'depict', 'hand', 'movement', 'risk', 'harm', 'risk-taking', 'safe', 'action', 'functional', 'magnetic', 'resonance', 'imaging', 'fmri', 'result', 'reveal', 'risk-taking', 'action', 'elicit', 'significantly', 'strong', 'activation', 'bilateral', 'inferior', 'parietal', 'lobule', 'temporal', 'visual', 'region', 'superior/middle', 'temporal', 'are

#### 1. Type/token ratio (lexical diversity)

In [99]:
def compare_means(var):
    """
    Print the mean of column `var` for the 'JP-EN' and 'EN-EN' rows of the
    notebook-level `results_df`, then the p-value of scipy's ttest_ind between
    the two groups, then a blank separator.

    Parameters
    ----------
    var : str
        Name of the results_df column to compare.
    """

    from scipy.stats import ttest_ind

    def group_values(label):
        # Values of `var` for the rows whose Group equals `label`
        in_group = results_df['Group'] == label
        return list(results_df[in_group].loc[:, var])

    jp_values = group_values('JP-EN')
    en_values = group_values('EN-EN')
    p_value = ttest_ind(jp_values, en_values).pvalue

    report = [
        f'Mean {var}, JP-EN:  {np.mean(jp_values)}',
        f'Mean {var}, EN-EN:  {np.mean(en_values)}',
        f'Sig. (unpaired t-test): {p_value}',
        '\n',
    ]
    for line in report:
        print(line)

In [75]:
# Prepare new df for inspecting results

# .copy() makes results_df an independent frame; without it the column
# assignments here and in later cells write into a slice of corpus_new
# (the warning for that is silenced by the chained_assignment option).
results_df = corpus_new[['Group', 'Author', 'Title']].copy()
# Keep the first 31 characters of each title. .str keeps each value on its own
# row, whereas a freshly built Series is matched to rows by a new 0..n-1 index.
results_df['Title'] = results_df['Title'].str[:31]
results_df

Unnamed: 0,Group,Author,Title
0,JP-EN,Tamura,Neural Network Development in L
1,JP-EN,Watanabe,Diminished Medial Prefrontal Ac
2,EN-EN,Sobhani,Interpersonal Liking Modulates
3,EN-EN,Majdandžić,The Human Factor: Behavioral an
4,EN-EN,Dixon,The Decision to Engage Cognitiv
5,JP-EN,Ohta,Syntactic Computation in the Hu
6,EN-EN,Deeley,Using Hypnotic Suggestion to Mo
7,EN-EN,Pawliczek,Anger under Control: Neural Cor
8,EN-EN,Jansma,fMRI Guided rTMS Evidence for R
9,EN-EN,Lidzba,Complex Visual Search in Childr


In [97]:
# Calculate raw type/token ratio (lemmatized)

# Iterate over the corpus index rather than a hard-coded range(0, 20), so the
# cell still covers every document if the corpus size changes.
for i in corpus_new.index:
    # Pull text from df
    text = corpus_new.loc[i, "Text"]
    # Run scispaCy model
    doc = nlp(text)
    # Preprocess
    ttr = type_token_ratio(doc)
    # Put in results_df
    results_df.loc[i, 'Type/Token'] = ttr

results_df.head(5)

KeyboardInterrupt: 

In [102]:
# Calculate adjusted type/token ratio (lemmatized):
# TTR divided by the document's word count, rescaled by the mean word count

# Iterate over the corpus index rather than a hard-coded range(0, 20)
for i in corpus_new.index:
    # Pull text from df
    text = corpus_new.loc[i, "Text"]
    # Split by word (" " for simplicity)
    word_ct = len(text.split(' '))
    # Put in results_df
    results_df.loc[i, 'Word Count'] = word_ct

# Multiply by mean word count of all documents, so the adjusted values stay
# on the same scale as the raw ratio
results_df['TTR_adj'] = (results_df['Type/Token'] / results_df['Word Count']) * np.mean(results_df['Word Count'])
    
results_df.head(6)

Unnamed: 0,Group,Author,Title,Type/Token,Word Count,TTR_adj
0,JP-EN,Tamura,Neural Network Development in L,0.281772,6926.0,0.270656
1,JP-EN,Watanabe,Diminished Medial Prefrontal Ac,0.239601,6918.0,0.230414
2,EN-EN,Sobhani,Interpersonal Liking Modulates,0.27918,5953.0,0.311996
3,EN-EN,Majdandžić,The Human Factor: Behavioral an,0.248466,9587.0,0.172419
4,EN-EN,Dixon,The Decision to Engage Cognitiv,0.239557,8620.0,0.184885
5,JP-EN,Ohta,Syntactic Computation in the Hu,0.250934,8739.0,0.191028


In [100]:
# Compare mean stats between JP-EN and EN-EN corpora
# (each call prints both group means and the unpaired t-test p-value for one column)

compare_means('Type/Token')
compare_means('Word Count')
compare_means('TTR_adj')

Mean Type/Token, JP-EN:  0.2593239275616061
Mean Type/Token, EN-EN:  0.2680039190672282
Sig. (unpaired t-test): 0.546251531992626


Mean Word Count, JP-EN:  6852.3
Mean Word Count, EN-EN:  6453.2
Sig. (unpaired t-test): 0.5616242355640981


Mean TTR_adj, JP-EN:  0.26424782154984205
Mean TTR_adj, EN-EN:  0.2999937518697087
Sig. (unpaired t-test): 0.42224882523025575




Comments (4/10): H1 does not appear to be supported: none of the group differences is statistically significant (all p > 0.4). On these measures, fMRI studies authored by Japanese scientists appear to be about as lexically sophisticated as comparable docs authored by Anglophone counterparts. Perhaps this is a good thing: i.e., any differences discovered later are more likely a product of linguistic features than of scientific knowledge/ignorance.

In [None]:
# Examine the entities extracted by the mention detector. Note that they don't have types
# like in SpaCy, and they are more general (e.g including verbs) - these are any spans
# which might be an entity in UMLS, a large biomedical database.
# NOTE(review): `a` is not assigned in any cell visible here, and the sample output
# below does not come from the corpus documents - presumably copied from an
# example. Confirm, or pass a parsed document such as `doc` instead.
print(a.ents)


#>>> (Myeloid derived suppressor cells,
#     MDSC,
#     immature,
#     myeloid cells,
#     immunosuppressive activity,
#     accumulate,
#     tumor-bearing mice,
#     humans,
#     cancer,
#     hepatocellular carcinoma,
#     HCC)

In [None]:
# We can also visualise dependency parses
# (This renders automatically inside a jupyter notebook!):
from spacy import displacy
# displacy.render(next(doc2.sents), style='dep', jupyter=True)
# NOTE(review): `a` is not assigned in any cell visible here - confirm which
# parsed document (e.g. `doc`, or one sentence of it) should be rendered.
displacy.render(a, style='dep', jupyter=True)

# See below for the generated SVG.
# Zoom your browser in a bit!