#### (4月12日) Corpus Analysis

In [1]:
import numpy as np
import pandas as pd
import copy
import re

pd.options.mode.chained_assignment = None

In [2]:
# Unpickle
corpus_new = pd.read_pickle('savefiles/corpusfull_20220410.pkl')

In [3]:
def collapse(text):
    
    """
    Paragraphs extracted from XML files contain different numbers of sentences,
    and even incomplete sentences. This function collapses a list of paragraphs
    into a list of sentences or sentence-equivalents in advance of NLP processing.
    """
    
    doc_raw = ""
    punc = set([".", ",", ";", ":"])
    for i, para in enumerate(text):
        if para[-1] in punc:
            doc_raw += (para + " ")
        else:
            doc_raw += (para + ". ")
        
    return doc_raw

In [4]:
def preprocess_gen(text):
    
    """
    This function is primarily for removing references, and fixing spacing
    between/within sentences in preparation for scispaCy language modeling.
    """
    
    import re

    text = text.replace('\u200a', '').replace('\n', '')   # remove weird space code, newlines
    text = re.sub('\[(\d+)\]', '', text)                  # remove refs ([1], [23], etc.)
    text = text.replace(' ,', '').replace(' .', '.')      # fix spaces created by prev line
    text = text.replace(' ;', ';').replace(' :', ':')     # 
    text = text.replace('  ', ' ')
    
    return text

In [6]:
# Converts each doc from list of paras to one long string
for i in range(0,20):
    corpus_new.loc[i, "Text"] = collapse(corpus_new.loc[i, "Text"])

# Preprocess each doc before Spacy modeling
for i in range(0,20):
    corpus_new.loc[i, "Text"] = preprocess_gen(corpus_new.loc[i, "Text"])
    
# For practicing on single texts
# text = corpus_new.loc[0, "Text"]
# text

In [32]:
def initialize_results_df():
    
    """
    Results DF should be reset for every hypothesis tested.
    """
    
    results_df = corpus_new[['Group', 'Author', 'Title']]
    results_df['Title'] = pd.Series([title[:31] for title in results_df['Title']])  # prune title for readability
    
    # with word count
    for i in range(0,20):
        # Pull text from df
        text = corpus_new.loc[i, "Text"]
        # Split by word (" " for simplicity)
        word_ct = len(text.split(' '))
        # Put in results_df
        results_df.loc[i, 'Word Count'] = word_ct
    
    return results_df

#### Language modeling

In [7]:
import scispacy
import spacy

nlp = spacy.load("en_core_sci_md")

In [8]:
def unique(ls):
    
    """
    Creates a list of unique items from an existing list.
    """
 
    unique_list = []
     
    for x in ls:
        if x not in unique_list:
            unique_list.append(x)
    
    return unique_list

In [9]:
def preprocess_ttr(doc):
    
    # Collect lemmas not tagged by spaCy as 1. punctuation, 2. digits, 3. URLs, or 4. stop words
    tokens = [tok.lemma_ for tok in doc if not (tok.is_punct | tok.is_digit | tok.like_url | tok.is_stop)]
    
    # Remove any tokens containing mid-string digits (e.g. "P5-a") or punc ('t(are')
    tokens = [tok for tok in tokens if not re.search("\d", tok)]
    tokens = [tok for tok in tokens if not re.search("\(", tok)]
    tokens = [tok for tok in tokens if not re.search("\)", tok)]
    
    # Remove punc and small words (e.g. 'a', 'P', 'mm')
    punc_to_skip = set(['±', '=', '>', '<'])
    tokens = [tok for tok in tokens if tok not in punc_to_skip]     # can skip?
    tokens = [tok for tok in tokens if len(tok) > 3]
    
    # Unify to lowercase (to simplify matching)
    tokens = [tok.lower() for tok in tokens]
    
    return tokens

In [10]:
def type_token_ratio(doc):
    
    token_list = preprocess_ttr(doc)
    n_type = len(unique(token_list))
    n_token = len(token_list)
    ttr = n_type/n_token
    
    return ttr

In [24]:
def find_tokens_unique_to_one_doc(i):
    
    other_docs = list(range(0,20))
    other_docs.remove(i)
    
    # Doc in question
    text_i = corpus_new.loc[i, "Text"]
    doc_i = nlp(text_i)
    tokens_i = unique(preprocess_ttr(doc_i))
    
    # Iterate thru all 19 other docs
    for j in other_docs:
        text_j = corpus_new.loc[j, "Text"]
        doc_j = nlp(text_j)
        tokens_j = set(unique(preprocess_ttr(doc_j)))
        
        for tok in tokens_i:
            if tok in tokens_j:
                tokens_i.remove(tok)
    
    return tokens_i            

In [42]:
toks = find_tokens_unique_to_one_doc(3)
toks

['full-blown',
 'humanization',
 'prosocial',
 'sacrifice',
 'save',
 'utilitarian',
 'victim',
 'humanness',
 'vicarious',
 'fictitious',
 'humanize',
 'humanized',
 'pgacc/mofc',
 'insula/ifg',
 'precuneus/pcc',
 'other-related',
 'human-like',
 'deeply',
 'living',
 'reciprocity',
 'coalition',
 'greatly',
 'belief',
 'conspecific',
 'anthropomorphism',
 'deny',
 'dehumanization',
 'outgroup',
 'dehumanized',
 'historic',
 'barbarian',
 'deserving',
 'justify',
 'cohesion',
 'harmful',
 'first-hand',
 'intense',
 'embody',
 'reluctance',
 'kill',
 'non-utilitarian',
 'refrain',
 'death',
 'respondent',
 'trolley',
 'rail',
 'worker',
 'confront',
 'pull',
 'modification',
 'footbridge',
 'heavy',
 'rucksack',
 'proximity',
 'mechanic',
 'greene',
 'agent-authored',
 'morally',
 'conceive',
 'shock',
 'underpinning',
 'morality',
 'valuation',
 'care-based',
 'justice-based',
 'abovementioned',
 'fronto-insular',
 'supposedly',
 'deliberation',
 'blend',
 'allegedly',
 'emotional-mot

### For counting 1-grams

In [28]:
def lexeme_counter(doc, string):
    
    """
    Function for getting raw lemma count in a document.
    """
        
    tokens = preprocess_ttr(doc)
    counter = 0
    for tok in tokens:
        if str(tok) == str(string):
            counter += 1
    
    return counter

In [51]:
results_df = initialize_results_df()

for lexeme in ["think", "consider", "report"]:
    
    new_name_n = "n_" + str(lexeme)
    new_name_adj = str(new_name_n) + "_adj"
    
    for i in range(0,20):
        # Pull text from df
        text = corpus_new.loc[i, "Text"]
        # Run scispaCy model
        doc = nlp(text)
        # Count lexeme (includes preprocessing)
        ct = lexeme_counter(doc, lexeme)
        # Put in results_df
        results_df.loc[i, new_name_n] = ct
        results_df.loc[i, new_name_adj] = (ct / results_df.loc[i, 'Word Count']) #* np.mean(results_df['Word Count'])

results_df

Unnamed: 0,Group,Author,Title,Word Count,n_think,n_think_adj,n_consider,n_consider_adj,n_report,n_report_adj
0,JP-EN,Tamura,Neural Network Development in L,6926.0,2.0,0.000289,4.0,0.000578,20.0,0.002888
1,JP-EN,Watanabe,Diminished Medial Prefrontal Ac,6918.0,1.0,0.000145,3.0,0.000434,7.0,0.001012
2,EN-EN,Sobhani,Interpersonal Liking Modulates,5953.0,1.0,0.000168,3.0,0.000504,3.0,0.000504
3,EN-EN,Majdandžić,The Human Factor: Behavioral an,9587.0,4.0,0.000417,7.0,0.00073,1.0,0.000104
4,EN-EN,Dixon,The Decision to Engage Cognitiv,8620.0,1.0,0.000116,3.0,0.000348,2.0,0.000232
5,JP-EN,Ohta,Syntactic Computation in the Hu,8739.0,2.0,0.000229,1.0,0.000114,10.0,0.001144
6,EN-EN,Deeley,Using Hypnotic Suggestion to Mo,7103.0,0.0,0.0,0.0,0.0,12.0,0.001689
7,EN-EN,Pawliczek,Anger under Control: Neural Cor,4688.0,0.0,0.0,2.0,0.000427,15.0,0.0032
8,EN-EN,Jansma,fMRI Guided rTMS Evidence for R,4865.0,0.0,0.0,1.0,0.000206,2.0,0.000411
9,EN-EN,Lidzba,Complex Visual Search in Childr,4478.0,1.0,0.000223,2.0,0.000447,0.0,0.0


In [52]:
compare_means('n_think')
compare_means('n_consider')
compare_means('n_think_adj')
compare_means('n_consider_adj')
compare_means('n_report')
compare_means('n_report_adj')

Mean n_think, JP-EN:  2.5
Mean n_think, EN-EN:  0.9
Sig. (unpaired t-test): 0.08096764680310031


Mean n_consider, JP-EN:  3.8
Mean n_consider, EN-EN:  3.0
Sig. (unpaired t-test): 0.5626770820919398


Mean n_think_adj, JP-EN:  0.00036150768325888323
Mean n_think_adj, EN-EN:  0.00012803460328847217
Sig. (unpaired t-test): 0.08375907208446082


Mean n_consider_adj, JP-EN:  0.0005628961972975831
Mean n_consider_adj, EN-EN:  0.0004480197983520875
Sig. (unpaired t-test): 0.5674801108857972


Mean n_report, JP-EN:  10.7
Mean n_report, EN-EN:  4.5
Sig. (unpaired t-test): 0.029472239581535983


Mean n_report_adj, JP-EN:  0.0015340292107768054
Mean n_report_adj, EN-EN:  0.0007649338089320477
Sig. (unpaired t-test): 0.07608897487248438




In [40]:
for i in range(0,20):
    results_df.loc[i, 'Combined'] = results_df.loc[i, 'n_think'] + results_df.loc[i, 'n_consider']
    results_df.loc[i, 'Combined_adj'] = results_df.loc[i, 'Combined'] / results_df.loc[i, 'Word Count']
    
compare_means('Combined')
compare_means('Combined_adj')

### For 2,3-grams

In [49]:
results_df = initialize_results_df()
cols = []

for lexeme in ["in this study", "therefore", "in addition", "bold response"]:
    
    new_name_n = "n_" + str(lexeme.replace(" ", ""))
    new_name_adj = str(new_name_n) + "_adj"
    cols.append(new_name_n)
    cols.append(new_name_adj)

    for i in range(0,20):
        text = corpus_new.loc[i, "Text"].lower()
        ct = text.count(lexeme)
        # Put in results_df
        results_df.loc[i, new_name_n] = ct
        results_df.loc[i, new_name_adj] = (ct / results_df.loc[i, 'Word Count']) #* np.mean(results_df['Word Count'])

results_df

Unnamed: 0,Group,Author,Title,Word Count,n_inthisstudy,n_inthisstudy_adj,n_therefore,n_therefore_adj,n_inaddition,n_inaddition_adj,n_boldresponse,n_boldresponse_adj
0,JP-EN,Tamura,Neural Network Development in L,6926.0,4.0,0.000578,1.0,0.000144,11.0,0.001588,0.0,0.0
1,JP-EN,Watanabe,Diminished Medial Prefrontal Ac,6918.0,1.0,0.000145,7.0,0.001012,3.0,0.000434,0.0,0.0
2,EN-EN,Sobhani,Interpersonal Liking Modulates,5953.0,3.0,0.000504,1.0,0.000168,4.0,0.000672,0.0,0.0
3,EN-EN,Majdandžić,The Human Factor: Behavioral an,9587.0,0.0,0.0,3.0,0.000313,5.0,0.000522,0.0,0.0
4,EN-EN,Dixon,The Decision to Engage Cognitiv,8620.0,0.0,0.0,3.0,0.000348,1.0,0.000116,3.0,0.000348
5,JP-EN,Ohta,Syntactic Computation in the Hu,8739.0,0.0,0.0,5.0,0.000572,2.0,0.000229,0.0,0.0
6,EN-EN,Deeley,Using Hypnotic Suggestion to Mo,7103.0,1.0,0.000141,8.0,0.001126,3.0,0.000422,0.0,0.0
7,EN-EN,Pawliczek,Anger under Control: Neural Cor,4688.0,0.0,0.0,6.0,0.00128,1.0,0.000213,0.0,0.0
8,EN-EN,Jansma,fMRI Guided rTMS Evidence for R,4865.0,1.0,0.000206,0.0,0.0,1.0,0.000206,0.0,0.0
9,EN-EN,Lidzba,Complex Visual Search in Childr,4478.0,1.0,0.000223,2.0,0.000447,4.0,0.000893,1.0,0.000223


In [50]:
for col in cols:
    compare_means(col)

Mean n_inthisstudy, JP-EN:  3.7
Mean n_inthisstudy, EN-EN:  1.1
Sig. (unpaired t-test): 0.3213334124330426


Mean n_inthisstudy_adj, JP-EN:  0.00048706861612406925
Mean n_inthisstudy_adj, EN-EN:  0.00018166228041333954
Sig. (unpaired t-test): 0.34355427166588526


Mean n_therefore, JP-EN:  4.3
Mean n_therefore, EN-EN:  2.8
Sig. (unpaired t-test): 0.22402026964038416


Mean n_therefore_adj, JP-EN:  0.0006323718848864725
Mean n_therefore_adj, EN-EN:  0.00044529173124458626
Sig. (unpaired t-test): 0.3293363734101986


Mean n_inaddition, JP-EN:  4.4
Mean n_inaddition, EN-EN:  3.1
Sig. (unpaired t-test): 0.31323440605148106


Mean n_inaddition_adj, JP-EN:  0.0006424914450415286
Mean n_inaddition_adj, EN-EN:  0.0004992948110014182
Sig. (unpaired t-test): 0.44839753078284084


Mean n_boldresponse, JP-EN:  4.2
Mean n_boldresponse, EN-EN:  0.5
Sig. (unpaired t-test): 0.39119419168286584


Mean n_boldresponse_adj, JP-EN:  0.0008943781942078365
Mean n_boldresponse_adj, EN-EN:  7.210648341095022e-

### Statistical testing
#### 1. Type/token ratio (lexical diversity)

In [35]:
def compare_means(var):
    
    from scipy.stats import ttest_ind
    
    jp_stats = list(results_df[results_df['Group'] == 'JP-EN'].loc[:, var])
    en_stats = list(results_df[results_df['Group'] == 'EN-EN'].loc[:, var])
    P = ttest_ind(jp_stats, en_stats).pvalue
    
    print(f'Mean {var}, JP-EN:  {np.mean(jp_stats)}')
    print(f'Mean {var}, EN-EN:  {np.mean(en_stats)}')
    print(f'Sig. (unpaired t-test): {P}')
    print('\n')

In [None]:
# Calculate raw type/token ratio (lemmatized)

for i in range(0,20):
    # Pull text from df
    text = corpus_new.loc[i, "Text"]
    # Run scispaCy model
    doc = nlp(text)
    # Preprocess
    ttr = type_token_ratio(doc)
    # Put in results_df
    results_df.loc[i, 'Type/Token'] = ttr

results_df.head(5)

In [None]:
# Calculate adjusted type/token ratio (lemmatized) divided by word count

for i in range(0,20):
    # Pull text from df
    text = corpus_new.loc[i, "Text"]
    # Split by word (" " for simplicity)
    word_ct = len(text.split(' '))
    # Put in results_df
    results_df.loc[i, 'Word Count'] = word_ct
    
results_df

In [None]:
# Divide by mean word count of all documents
results_df['TTR_adj'] = (results_df['Type/Token'] / results_df['Word Count']) * np.mean(results_df['Word Count'])
    
results_df.head(6)

In [None]:
# Compare mean stats between JP-EN and EN-EN corpora

compare_means('Type/Token')
compare_means('Word Count')
compare_means('TTR_adj')

Comments (4/10): H1 seems to be rejected. fMRI studies authored by Japanese scientists are just as lexically sophisticated as comparable docs authored by Anglophone counterparts. Perhaps this is a good thing: i.e., any differences discovered later are a product of linguistic features, rather than scientific knowledge/ignorance.

In [None]:
# Examine the entities extracted by the mention detector. Note that they don't have types
# like in SpaCy, and they are more general (e.g including verbs) - these are any spans
# which might be an entity in UMLS, a large biomedical database.
print(a.ents)


#>>> (Myeloid derived suppressor cells,
#     MDSC,
#     immature,
#     myeloid cells,
#     immunosuppressive activity,
#     accumulate,
#     tumor-bearing mice,
#     humans,
#     cancer,
#     hepatocellular carcinoma,
#     HCC)

In [None]:
# We can also visualise dependency parses
# (This renders automatically inside a jupyter notebook!):
from spacy import displacy
# displacy.render(next(doc2.sents), style='dep', jupyter=True)
displacy.render(a, style='dep', jupyter=True)

# See below for the generated SVG.
# Zoom your browser in a bit!