#### (4月12日) Corpus Analysis

In [1]:
import numpy as np
import pandas as pd
import copy
import re
from tqdm import tqdm 

pd.options.mode.chained_assignment = None

# Project-specific functions
from functions import collapse, preprocess_gen, initialize_results_df, unique

In [2]:
# Unpickle
corpus_new = pd.read_pickle('savefiles/corpusfull_20220410.pkl')
toks_df = pd.read_pickle('savefiles/toksdf_20220413.pkl')

In [3]:
# Two full passes over the 20 documents, in this order:
#   1. collapse       - turns each doc from a list of paragraphs into one long string
#   2. preprocess_gen - general clean-up of each doc before spaCy modeling
for step in (collapse, preprocess_gen):
    for row in range(20):
        corpus_new.loc[row, "Text"] = step(corpus_new.loc[row, "Text"])

In [4]:
# Row labels (in corpus_new) of the documents belonging to each sub-corpus.
# jpen_i drives the JP-EN sentence search below, enen_i the EN-EN one;
# together the two lists cover rows 0-19 exactly once.
jpen_i = [0,  1,  5, 10, 11, 12, 13, 14, 17, 19]
enen_i = [2,  3,  4,  6,  7,  8,  9, 15, 16, 18]

### (4月26日) Token results

In [34]:
# Look up one token (lemma) in toks_df to compare its JP-EN and EN-EN statistics.
# Change the string on the right-hand side to inspect a different lemma.

toks_df[toks_df['Token'] == 'active']

Unnamed: 0,Token,JP-EN Mean Count,JP-EN Mean Freq,EN-EN Mean Count,EN-EN Mean Freq,P (count),P (freq)
723,active,0.5,0.079231,2.8,0.425237,0.039044,0.028967


### (4月23日) Search for tokens/strings

In [7]:
import scispacy
import spacy

# Load the `en_core_sci_md` pipeline once; `nlp` is reused by the cells below
# to parse documents and to supply the vocab for the Matcher.
nlp = spacy.load("en_core_sci_md")

In [8]:
# Parse every text with the loaded pipeline and keep the resulting docs
# in a new `spacy_docs` column (one doc per row of corpus_new).
corpus_new['spacy_docs'] = [nlp(text) for text in corpus_new['Text']]

# Disabled: builds a per-document list of sentence spans in a `spacy_sents` column.
# NOTE: while this stays commented out, nothing in this notebook creates `spacy_sents`.
# sent_list = []
# for i in range(0,20):
#     s = list([x for x in corpus_new.spacy_docs[i].sents])
#     sent_list.append(s)
# corpus_new['spacy_sents'] = pd.Series(sent_list)
# print([x for x in corpus_new.spacy_docs[i].sents])

In [32]:
# Get list of matching sentences (JP-EN)
from spacy.matcher import Matcher

# The pattern and the matcher do not depend on the document being searched,
# so they are built once instead of once per document.
pattern = [{'LEMMA': 'activate'}]   # any token whose lemma is 'activate'
matcher = Matcher(nlp.vocab)
matcher.add("SCREENER", [pattern])

for n in jpen_i:
    doc = corpus_new['spacy_docs'][n]
    matches = matcher(doc)

    # Per-document header, then the full sentence around every match
    print(f'{len(matches)} match(es) in {corpus_new["Author"][n]} [doc {n}]')
    for _match_id, start, end in matches:
        # doc[start:end] is the matched span; .sent is the sentence containing it
        print('- ', doc[start:end].sent.text)
    print('')

3 match(es) in Tamura [doc 0]
-  The posterior cingulate cortex (PCC) was also activated by risk-taking vs. safe actions.
-  These reports are consistent with the present finding that the observation of risk-taking actions (compared to safe ones) activated the PCC.
-  For example, the precuneus is activated during ‘forgivability’ judgments in social scenarios and in the attribution of emotions to the self and others.

0 match(es) in Watanabe [doc 1]

14 match(es) in Ohta [doc 5]
-  Lastly, we established that nonlinguistic order-related and error-related factors significantly activated the right (R.) lateral premotor cortex and R. F3op/F3t, respectively.
-  Other significantly activated regions were the right (R.) F3op/F3t, R. LPMC, anterior cingulate cortex (ACC), and R. SMG.
-  Seed masks were set in the pair of L. F3op/F3t and L. SMG, both of which were significantly activated in Nested’(L)>Simple’(S).
-  Other significantly activated regions were L. LPMC/F3op and ACC under the sent

In [35]:
# Get list of matching sentences (EN-EN)

# The pattern and the matcher do not depend on the document being searched,
# so they are built once instead of once per document.
# `Matcher` is imported in the JP-EN cell above.
pattern = [{'LEMMA': 'detail'}]   # any token whose lemma is 'detail'
matcher = Matcher(nlp.vocab)
matcher.add("SCREENER", [pattern])

for n in enen_i:
    doc = corpus_new['spacy_docs'][n]
    matches = matcher(doc)

    # Per-document header, then the full sentence around every match
    print(f'{len(matches)} match(es) in {corpus_new["Author"][n]} [doc {n}]')
    for _match_id, start, end in matches:
        # doc[start:end] is the matched span; .sent is the sentence containing it
        print('- ', doc[start:end].sent.text)
    print('')

3 match(es) in Sobhani [doc 2]
-  Each of these study components is described in detail below.
-  Participants were also asked to identify each of the targets and recount details of the target’s story as a way of ensuring that all the targets were equally and accurately remembered.
-  ROIs were hand-drawn for each subject based upon anatomical boundaries detailed in (See Figure 2 for locations; see Table S1 for boundaries).

6 match(es) in Majdandžić [doc 3]
-  In order to examine the effects of our manipulation in more detail, we presented the participants with a questionnaire after the scanning session in which we explored their feelings towards the fictitious persons.
-  Planned comparisons and post-hoc t-tests were applied to assess specific effects in more detail.
-  The comparison between dilemma decisions involving Humanized versus Neutral persons (contrast Humanized > Neutral Dilemmas) revealed the following significant clusters (see Table 1, Figure 4, for details).
-  The PPI 

In [None]:
import re

# Collect (sentence index, sentence text) for every sentence of doc 0 whose
# lemmatized text contains the substring 'report'.
# Sentences are taken straight from the parsed doc in `spacy_docs`: the
# `spacy_sents` column is only built by code that is commented out, so reading
# it fails on a fresh kernel.
matches = [
    (i, sent.text)
    for i, sent in enumerate(corpus_new['spacy_docs'][0].sents)
    if 'report' in sent.lemma_
]

matches

In [None]:
def preprocess_tok(doc):
    """Reduce a parsed document to a list of cleaned, lowercased lemmas.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose `lemma_`, `is_punct`, `is_digit`, `like_url`
        and `is_stop` (as spaCy tokens do).

    Returns
    -------
    list of str
        Lemmas in document order (duplicates kept), after dropping
        punctuation, digits, URLs, stop words, anything containing a digit,
        parenthesis or '+', dotted abbreviations, and pieces of 3 characters
        or fewer.
    """
    # Collect lemmas not tagged by spaCy as 1. punctuation, 2. digits, 3. URLs, or 4. stop words
    tokens = [tok.lemma_ for tok in doc
              if not (tok.is_punct or tok.is_digit or tok.like_url or tok.is_stop)]

    # Remove any tokens containing mid-string digits (e.g. "P5-a") or parentheses ('t(are')
    tokens = [tok for tok in tokens if not re.search(r"[\d()]", tok)]

    # (4.13) Break apart hyphen- or slash-separated compounds.
    # These are literal separators for str.split, not regex patterns, so the
    # brackets must NOT be backslash-escaped (an escaped '\]' never matches).
    seps = ['-', '–', '―',
            ';', ':',
            ']', '[',
            '’', '”',
            '>', '<', '/']
    for sep in seps:
        tokens = [piece for tok in tokens for piece in tok.split(sep)]

    # (4.13) Remove remaining abbreviations (e.g. "e.g.") and anything containing '+'
    tokens = [tok for tok in tokens if not re.search(r"[a-zA-Z]\.[a-zA-Z]\.", tok)]
    tokens = [tok for tok in tokens if "+" not in tok]

    # Remove small words (e.g. 'a', 'P', 'mm'). This also removes the empty
    # strings left by splitting and stray one-character symbols ('±', '=', ...),
    # so no separate symbol filter is needed.
    tokens = [tok for tok in tokens if len(tok) > 3]

    # Unify to lowercase (to simplify matching)
    return [tok.lower() for tok in tokens]

In [None]:
def find_tokens_unique_to_one_doc(i):
    """Return the preprocessed tokens of document `i` found in no other document.

    Reads the module-level `corpus_new` (column "Text", rows 0-19) and parses
    each text with the module-level `nlp`, then cleans it with
    `preprocess_tok` and `unique`.

    Parameters
    ----------
    i : int
        Row label of the document in question; must be in 0-19.

    Returns
    -------
    list of str
        Tokens of document `i`, in the order `unique` yields them, that do not
        occur among the tokens of any of the 19 other documents.

    Raises
    ------
    ValueError
        If `i` is not in 0-19.
    """
    other_docs = list(range(0,20))
    other_docs.remove(i)

    # Doc in question
    # NOTE(review): `unique` is a project helper; presumably it returns the
    # distinct tokens - list() guards against it returning a non-list.
    tokens_i = list(unique(preprocess_tok(nlp(corpus_new.loc[i, "Text"]))))

    # Iterate thru all 19 other docs
    for j in other_docs:
        tokens_j = set(unique(preprocess_tok(nlp(corpus_new.loc[j, "Text"]))))

        # Build a new list instead of calling .remove() while iterating:
        # removing from a list during its own iteration skips the next
        # element, which left shared tokens in the result.
        tokens_i = [tok for tok in tokens_i if tok not in tokens_j]

    return tokens_i

In [None]:
# Check performance of preprocessing function: list the tokens that occur
# only in doc 5, so leftover junk tokens are easy to spot by eye.
toks_unique = find_tokens_unique_to_one_doc(5)
toks_unique

### Word-level analysis (4/18)

In [None]:
## Lemmas more common in JP-EN than EN-EN
# Keep lemmas with a higher mean frequency in JP-EN that also appear at least
# once per JP-EN document on average, ordered by the frequency p-value.
jp_more_frequent = toks_df["JP-EN Mean Freq"] > toks_df["EN-EN Mean Freq"]
jp_min_count = toks_df["JP-EN Mean Count"] >= 1
toks_jp = toks_df[jp_more_frequent & jp_min_count].sort_values("P (freq)")

report_cols = ["Token", "JP-EN Mean Freq", "EN-EN Mean Freq", "P (freq)"]
results_tokjp = toks_jp[report_cols].reset_index(drop=True)

results_tokjp.head(60)
#results_tokjp[:40].to_csv("savefiles/results_tokjp.csv")

In [None]:
## Lemmas less common in JP-EN than EN-EN
# Keep lemmas with a higher mean frequency in EN-EN that also appear at least
# once per EN-EN document on average, ordered by the frequency p-value.
en_more_frequent = toks_df["JP-EN Mean Freq"] < toks_df["EN-EN Mean Freq"]
en_min_count = toks_df["EN-EN Mean Count"] >= 1
toks_en = toks_df[en_more_frequent & en_min_count].sort_values("P (freq)")

report_cols = ["Token", "JP-EN Mean Freq", "EN-EN Mean Freq", "P (freq)"]
results_token = toks_en[report_cols].reset_index(drop=True)

results_token.head(60)
#results_token[:40].to_csv("savefiles/results_token.csv")

In [None]:
# Compare JP-EN vs EN-EN means for the think/consider/report counts and their
# `_adj` counterparts.
# NOTE(review): `compare_means` is defined in a later cell (under "Statistical
# testing") and `results_df` is created in a later cell as well, so this cell
# fails on Restart & Run All. The n_think / n_consider / n_report columns are
# not created anywhere in the visible notebook - TODO confirm where they come from.
compare_means('n_think')
compare_means('n_consider')
compare_means('n_think_adj')
compare_means('n_consider_adj')
compare_means('n_report')
compare_means('n_report_adj')

In [None]:
# Per document: sum of the `n_think` and `n_consider` columns, and that sum
# divided by the document's `Word Count`.
for row in range(20):
    combined = results_df.loc[row, 'n_think'] + results_df.loc[row, 'n_consider']
    results_df.loc[row, 'Combined'] = combined
    results_df.loc[row, 'Combined_adj'] = combined / results_df.loc[row, 'Word Count']

compare_means('Combined')
compare_means('Combined_adj')

### For 2,3-grams

In [None]:
# Start from a fresh results table and record, for each phrase, a raw count
# column ("n_<phrase without spaces>") and a per-word rate column ("..._adj").
results_df = initialize_results_df()
cols = []   # names of every column added here, in creation order

for lexeme in ["in this study", "therefore", "in addition", "bold response"]:

    count_col = "n_" + lexeme.replace(" ", "")
    rate_col = count_col + "_adj"
    cols.extend([count_col, rate_col])

    for i in range(20):
        # Case-insensitive substring count of the phrase in the document text
        ct = corpus_new.loc[i, "Text"].lower().count(lexeme)
        results_df.loc[i, count_col] = ct
        # Rate per word (not rescaled by the mean word count)
        results_df.loc[i, rate_col] = ct / results_df.loc[i, 'Word Count']

results_df

In [None]:
# Run the JP-EN vs EN-EN comparison for every column name collected in `cols`.
for col in cols:
    compare_means(col)

### Statistical testing
#### 1. Type/token ratio (lexical diversity)

In [None]:
def type_token_ratio(doc):
    """Return the type/token ratio of a parsed document.

    The document is cleaned with `preprocess_tok`; the ratio is the number of
    entries returned by `unique` for that token list divided by the total
    number of tokens.

    Parameters
    ----------
    doc : parsed document accepted by `preprocess_tok`

    Returns
    -------
    float
        Type/token ratio, or NaN when preprocessing leaves no tokens (the
        ratio is undefined there; previously this raised ZeroDivisionError).
    """
    token_list = preprocess_tok(doc)
    n_token = len(token_list)

    # Guard the empty case so one degenerate document cannot abort a whole run
    if n_token == 0:
        return float("nan")

    n_type = len(unique(token_list))
    return n_type / n_token

In [None]:
def compare_means(var):
    """Print group means of one `results_df` column and a t-test p-value.

    Splits the module-level `results_df` by its 'Group' column into the
    'JP-EN' and 'EN-EN' rows, then prints the mean of column `var` for each
    group and the p-value of scipy's `ttest_ind` between the two samples.

    Parameters
    ----------
    var : str
        Name of the `results_df` column to compare.

    Returns
    -------
    None
    """
    from scipy.stats import ttest_ind

    group_labels = results_df['Group']
    jp_stats = list(results_df.loc[group_labels == 'JP-EN', var])
    en_stats = list(results_df.loc[group_labels == 'EN-EN', var])

    p_value = ttest_ind(jp_stats, en_stats).pvalue

    print(f'Mean {var}, JP-EN:  {np.mean(jp_stats)}')
    print(f'Mean {var}, EN-EN:  {np.mean(en_stats)}')
    print(f'Sig. (unpaired t-test): {p_value}')
    print('\n')

In [None]:
# Raw (lemmatized) type/token ratio per document:
# text from corpus_new -> scispaCy doc -> type_token_ratio -> results_df
for i in range(20):
    results_df.loc[i, 'Type/Token'] = type_token_ratio(nlp(corpus_new.loc[i, "Text"]))

results_df.head(5)

In [None]:
# Word count per document, needed to adjust the type/token ratio.
# Words are counted by splitting the text on single spaces (" " for simplicity).
for i in range(20):
    results_df.loc[i, 'Word Count'] = len(corpus_new.loc[i, "Text"].split(' '))

results_df

In [None]:
# Adjusted TTR: type/token ratio per word, rescaled by the mean word count
# of all documents.
mean_word_count = np.mean(results_df['Word Count'])
results_df['TTR_adj'] = results_df['Type/Token'] / results_df['Word Count'] * mean_word_count

results_df.head(6)

In [None]:
# Compare mean stats between JP-EN and EN-EN corpora: raw type/token ratio,
# document length, and the word-count-adjusted ratio. Each call prints both
# group means and the unpaired t-test p-value.

compare_means('Type/Token')
compare_means('Word Count')
compare_means('TTR_adj')

Comments (4/10): H1 seems to be rejected. fMRI studies authored by Japanese scientists are just as lexically sophisticated as comparable docs authored by Anglophone counterparts. Perhaps this is a good thing: i.e., any differences discovered later are a product of linguistic features, rather than scientific knowledge/ignorance.