#### (4月12日) Corpus Analysis

In [1]:
import numpy as np
import pandas as pd
import copy
import re
from tqdm import tqdm 

pd.options.mode.chained_assignment = None

In [2]:
# Unpickle the saved corpus DataFrame into `corpus_new`.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
corpus_new = pd.read_pickle('savefiles/corpusfull_20220410.pkl')

In [24]:
# Get custom functions
from functions import collapse, preprocess_gen, initialize_results_df, unique, find_tokens_unique_to_one_doc

In [5]:
# Pass 1: turn each doc from a list of paragraphs into one long string
for i in range(20):
    paragraphs = corpus_new.loc[i, "Text"]
    corpus_new.loc[i, "Text"] = collapse(paragraphs)

# Pass 2: general clean-up of each doc before spaCy modeling
for i in range(20):
    flattened = corpus_new.loc[i, "Text"]
    corpus_new.loc[i, "Text"] = preprocess_gen(flattened)

# For practicing on single texts
# text = corpus_new.loc[0, "Text"]
# text

In [6]:
# Row labels in corpus_new for the documents of each sub-corpus
# (ten JP-EN docs and ten EN-EN docs; together they cover rows 0-19)
jpen_i = [0, 1, 5, 10, 11, 12, 13, 14, 17, 19]
enen_i = [2, 3, 4, 6, 7, 8, 9, 15, 16, 18]

#### Language modeling

In [8]:
import scispacy
import spacy

# Load the "en_core_sci_md" pipeline; `nlp` is applied to each document's text in later cells.
nlp = spacy.load("en_core_sci_md")

In [10]:
def preprocess_tok(doc, min_len=4):
    """
    Turn a parsed document into a list of cleaned, lowercased lemmas.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose `lemma_`, `is_punct`, `is_digit`, `like_url`
        and `is_stop` (e.g. a spaCy Doc).
    min_len : int, optional
        Minimum number of characters a piece must have to be kept. The
        default of 4 reproduces the previous hard-coded `len(tok) > 3` rule.

    Returns
    -------
    list of str
        Lowercased lemma pieces, in document order, after filtering and
        splitting on separator characters.
    """

    # Collect lemmas not tagged as 1. punctuation, 2. digits, 3. URLs, or 4. stop words
    tokens = [tok.lemma_ for tok in doc
              if not (tok.is_punct or tok.is_digit or tok.like_url or tok.is_stop)]

    # Remove any lemma containing a digit (e.g. "P5-a") or a parenthesis ('t(are')
    tokens = [tok for tok in tokens if not re.search(r"[\d()]", tok)]

    # (4.13) Break apart hyphen-, slash-, bracket- etc. separated compounds.
    # These are literal separators for str.split, not regex patterns, so the
    # brackets must NOT be backslash-escaped: '\]' is a two-character string
    # (backslash + bracket) that never occurs, which left brackets unsplit.
    seps = ['-', '–', '―',
            ';', ':',
            ']', '[',
            '’', '”',
            '>', '<', '/']
    for sep in seps:
        tokens = [piece for tok in tokens for piece in tok.split(sep)]

    # (4.13) Remove remaining abbreviations (e.g. "e.g.") and anything containing '+'
    tokens = [tok for tok in tokens
              if not re.search(r"[a-zA-Z]\.[a-zA-Z]\.", tok) and "+" not in tok]

    # Remove short pieces (with the default, anything under 4 characters).
    # This also drops the empty strings produced by splitting and stray
    # single-character symbols such as '±' or '=', so no separate symbol
    # filter is needed.
    tokens = [tok for tok in tokens if len(tok) >= min_len]

    # Unify to lowercase (to simplify matching)
    return [tok.lower() for tok in tokens]

In [None]:
# Check performance of preprocessing function by inspecting the tokens
# reported as unique to one document (5 is one of the JP-EN row indices)
toks_unique = find_tokens_unique_to_one_doc(5)
toks_unique

### (4.13) Comparing tokens not specific to one doc or corpus

In [None]:
# Collect all JP-EN tokens that are not unique to a single doc
# NOTE(review): whether "unique to one doc" is judged within JP-EN or across
# the whole corpus depends on find_tokens_unique_to_one_doc — confirm.

tokens_shared_jpen = []

for i in tqdm(jpen_i):
    text_i = corpus_new.loc[i, "Text"]
    doc_i = nlp(text_i)
    tokens_i = preprocess_tok(doc_i)
    tokens_i_unique = find_tokens_unique_to_one_doc(i)
    # Build the lookup set once per document; building it inside the
    # comprehension re-created it for every single token.
    doc_specific = set(tokens_i_unique)
    tokens_shared_i = [tok for tok in tokens_i if tok not in doc_specific]
    tokens_shared_jpen.extend(tokens_shared_i)

tokens_shared_jpen_unique = unique(tokens_shared_jpen) # remove duplicates

print(f'Non-document-specific tokens in JP-EN: {len(tokens_shared_jpen_unique)}')

In [None]:
# Collect all EN-EN tokens that are not unique to a single doc
# NOTE(review): whether "unique to one doc" is judged within EN-EN or across
# the whole corpus depends on find_tokens_unique_to_one_doc — confirm.

tokens_shared_enen = []

for i in tqdm(enen_i):
    text_i = corpus_new.loc[i, "Text"]
    doc_i = nlp(text_i)
    tokens_i = preprocess_tok(doc_i)
    tokens_i_unique = find_tokens_unique_to_one_doc(i)
    # Build the lookup set once per document; building it inside the
    # comprehension re-created it for every single token.
    doc_specific = set(tokens_i_unique)
    tokens_shared_i = [tok for tok in tokens_i if tok not in doc_specific]
    tokens_shared_enen.extend(tokens_shared_i)

tokens_shared_enen_unique = unique(tokens_shared_enen) # remove duplicates
print(f'Non-document-specific tokens in EN-EN: {len(tokens_shared_enen_unique)}')

In [None]:
# Find tokens present in both non-document-specific lists (JP-EN and EN-EN),
# keeping the JP-EN ordering.
# NOTE(review): the intent is tokens appearing in ≧4 docs (≧2 in each corpus);
# that only holds if the "unique to one doc" check is done per corpus — confirm.

# Build the EN-EN lookup once instead of once per JP-EN token
enen_lookup = set(tokens_shared_enen_unique)
toks_to_analyze = [tok for tok in tokens_shared_jpen_unique if tok in enen_lookup]
print(f'{len(toks_to_analyze)} tokens to analyze')

### (4.13) Get token counts for selected tokens
(Needed to run overnight)

In [None]:
# Get token counts for every token of interest in every RA in each corpus

from collections import Counter
from scipy.stats import ttest_ind

docs_df = initialize_results_df()

# Parse and tokenize each document exactly ONCE. The spaCy pass and the
# preprocessing do not depend on which token is being counted, so running
# them inside the token loop repeated the whole pipeline for every token.
counts_by_doc = {}
for i in tqdm(range(0,20)):
    doc = nlp(corpus_new.loc[i, "Text"])
    counts_by_doc[i] = Counter(str(doctok) for doctok in preprocess_tok(doc))

# Accumulate one record per token and build the DataFrame once at the end;
# DataFrame.append in a loop is quadratic and no longer exists in pandas 2.x.
records = []

for tok in tqdm(toks_to_analyze):
    for i in range(0,20):
        counter = counts_by_doc[i][str(tok)]  # a Counter yields 0 for absent tokens
        docs_df.loc[i, "count"] = counter
        docs_df.loc[i, "count_adj"] = (counter / docs_df.loc[i, 'Word Count']) * 1000  # per 1000 words

    # Separate values by group (JP-EN v. EN-EN)
    jp_rows = docs_df[docs_df['Group'] == 'JP-EN']
    en_rows = docs_df[docs_df['Group'] == 'EN-EN']
    jp_ct    = list(jp_rows.loc[:, "count"])
    jp_ctadj = list(jp_rows.loc[:, "count_adj"])
    en_ct    = list(en_rows.loc[:, "count"])
    en_ctadj = list(en_rows.loc[:, "count_adj"])

    # Calculate means and run unpaired t-tests on raw and per-1000-word counts
    records.append({'Token': tok,
                    'JP-EN Mean Count': np.mean(jp_ct), 'JP-EN Mean Freq': np.mean(jp_ctadj),
                    'EN-EN Mean Count': np.mean(en_ct), 'EN-EN Mean Freq': np.mean(en_ctadj),
                    'P (count)': ttest_ind(jp_ct, en_ct).pvalue,
                    'P (freq)': ttest_ind(jp_ctadj, en_ctadj).pvalue})

# Explicit column list keeps the schema stable even if there are no tokens
toks_df = pd.DataFrame(records,
                       columns=['Token',
                                'JP-EN Mean Count', 'JP-EN Mean Freq',
                                'EN-EN Mean Count', 'EN-EN Mean Freq',
                                'P (count)', 'P (freq)'])

In [None]:
# Save the per-token summary table so the counting cell does not have to be re-run
toks_df.to_pickle('savefiles/toksdf_20220413.pkl')

### Word-level analysis (4/18)

In [None]:
# Reload the per-token summary table written by the earlier to_pickle cell
toks_df = pd.read_pickle('savefiles/toksdf_20220413.pkl')

In [None]:
# Display the per-token summary table
toks_df

In [None]:
## Lemmas more common in JP-EN than EN-EN
# Keep tokens whose per-1000-word frequency is higher in JP-EN and whose
# mean raw count in JP-EN is at least 1, ordered by ascending p-value.
# sort_values returns a new frame here; sorting a filtered slice in place
# mutates a possible copy of toks_df and triggers SettingWithCopyWarning.
jp_more_frequent = toks_df["JP-EN Mean Freq"] > toks_df["EN-EN Mean Freq"]
jp_seen_enough = toks_df["JP-EN Mean Count"] >= 1
toks_jp = toks_df[jp_more_frequent & jp_seen_enough].sort_values("P (freq)")
results_tokjp = toks_jp[["Token", "JP-EN Mean Freq",
                         "EN-EN Mean Freq", "P (freq)"]].reset_index(drop=True)
# Export the 40 rows with the smallest p-values
results_tokjp[:40].to_csv("savefiles/results_tokjp.csv")


In [None]:
## Lemmas less common in JP-EN than EN-EN
# Keep tokens whose per-1000-word frequency is higher in EN-EN and whose
# mean raw count in EN-EN is at least 1, ordered by ascending p-value.
# sort_values returns a new frame here; sorting a filtered slice in place
# mutates a possible copy of toks_df and triggers SettingWithCopyWarning.
en_more_frequent = toks_df["JP-EN Mean Freq"] < toks_df["EN-EN Mean Freq"]
en_seen_enough = toks_df["EN-EN Mean Count"] >= 1
toks_en = toks_df[en_more_frequent & en_seen_enough].sort_values("P (freq)")
results_token = toks_en[["Token", "JP-EN Mean Freq",
                         "EN-EN Mean Freq", "P (freq)"]].reset_index(drop=True)
# Export the 40 rows with the smallest p-values
results_token[:40].to_csv("savefiles/results_token.csv")

## Statistical testing
#### 1. Type/token ratio (lexical diversity)

In [12]:
def type_token_ratio(doc):
    """
    Compute the type/token ratio of a parsed document.

    The document is first reduced with `preprocess_tok`; the ratio is the
    number of distinct tokens (via `unique`) divided by the total number
    of tokens that survive preprocessing.

    Parameters
    ----------
    doc : parsed document accepted by `preprocess_tok`

    Returns
    -------
    float
        The ratio, or NaN when no tokens survive preprocessing (the ratio
        is undefined there; previously this raised ZeroDivisionError).
    """

    token_list = preprocess_tok(doc)
    n_token = len(token_list)

    # Guard the division: an empty or fully filtered document has no ratio
    if n_token == 0:
        return float('nan')

    n_type = len(unique(token_list))
    return n_type / n_token

In [13]:
def compare_means(var, df=None):
    """
    Print the JP-EN and EN-EN means of one column and an unpaired t-test.

    Parameters
    ----------
    var : str
        Name of the column to compare.
    df : pandas.DataFrame, optional
        Frame containing a 'Group' column (with values 'JP-EN' / 'EN-EN')
        and the column `var`. When omitted, the notebook-level `results_df`
        is used, which keeps existing one-argument calls working.

    Returns
    -------
    None
        Results are printed, not returned.
    """

    from scipy.stats import ttest_ind

    if df is None:
        # Fall back to the notebook global, resolved at call time
        df = results_df

    jp_stats = list(df.loc[df['Group'] == 'JP-EN', var])
    en_stats = list(df.loc[df['Group'] == 'EN-EN', var])
    P = ttest_ind(jp_stats, en_stats).pvalue

    print(f'Mean {var}, JP-EN:  {np.mean(jp_stats)}')
    print(f'Mean {var}, EN-EN:  {np.mean(en_stats)}')
    print(f'Sig. (unpaired t-test): {P}')
    print('\n')

In [17]:
# Raw type/token ratio (lemmatized) for each of the 20 documents

results_df = initialize_results_df()

for i in range(20):
    # Parse the document text with the scispaCy pipeline, then score it
    doc = nlp(corpus_new.loc[i, "Text"])
    results_df.loc[i, 'Type/Token'] = type_token_ratio(doc)

# Adjusted type/token ratio: each document's ratio divided by that
# document's own word count, scaled by 10,000
ttr_per_word = results_df['Type/Token'] / results_df['Word Count']
results_df['TTR_adj'] = ttr_per_word * 10000

results_df

Unnamed: 0,Group,Author,Title,Word Count,Type/Token
0,JP-EN,Tamura,Neural Network Development in L,6926.0,0.261733
1,JP-EN,Watanabe,Diminished Medial Prefrontal Ac,6918.0,0.218375
2,EN-EN,Sobhani,Interpersonal Liking Modulates,5953.0,0.260666
3,EN-EN,Majdandžić,The Human Factor: Behavioral an,9587.0,0.233118
4,EN-EN,Dixon,The Decision to Engage Cognitiv,8620.0,0.217311


In [21]:
# Compare mean stats between JP-EN and EN-EN corpora, one column at a time

for stat_col in ('Type/Token', 'Word Count', 'TTR_adj'):
    compare_means(stat_col)

Mean Type/Token, JP-EN:  0.24102634474435575
Mean Type/Token, EN-EN:  0.25306356406579766
Sig. (unpaired t-test): 0.3898003303172157


Mean Word Count, JP-EN:  6852.3
Mean Word Count, EN-EN:  6453.2
Sig. (unpaired t-test): 0.5616242355640981


Mean TTR_adj, JP-EN:  0.3693009163913344
Mean TTR_adj, EN-EN:  0.42663807247111507
Sig. (unpaired t-test): 0.37048811653384883




Comments (4/10): H1 does not seem to be supported: the t-tests above show no significant difference in type/token ratio between the two corpora. By this measure, fMRI studies authored by Japanese scientists appear to be just as lexically diverse as comparable docs authored by Anglophone counterparts. Perhaps this is a good thing: i.e., any differences discovered later are more likely to be a product of linguistic features than of scientific knowledge/ignorance.

## N-gram comparisons
### For comparing 1-grams

In [None]:
def lexeme_counter(doc, string):

    """
    Return how many preprocessed tokens of `doc` equal `string`.

    The document is passed through `preprocess_tok` first; both sides of
    the comparison are converted with `str` before matching.
    """

    target = str(string)
    return sum(1 for tok in preprocess_tok(doc) if str(tok) == target)

In [None]:
results_df = initialize_results_df()

lexemes = ["think", "consider", "report"]

# Documents are the OUTER loop so each one is parsed once. The parse does
# not depend on the lexeme, so with lexemes outermost the scispaCy model was
# re-run on every document for every lexeme.
for i in range(0,20):
    # Pull text from df
    text = corpus_new.loc[i, "Text"]
    # Run scispaCy model (once per document)
    doc = nlp(text)

    for lexeme in lexemes:
        new_name_n = "n_" + str(lexeme)
        new_name_adj = str(new_name_n) + "_adj"

        # Count lexeme (includes preprocessing)
        ct = lexeme_counter(doc, lexeme)
        # Put raw count and count per word in results_df
        results_df.loc[i, new_name_n] = ct
        results_df.loc[i, new_name_adj] = (ct / results_df.loc[i, 'Word Count']) #* np.mean(results_df['Word Count'])

results_df

In [None]:
# Group comparison for each 1-gram column, in the original reporting order
for stat_col in ('n_think', 'n_consider',
                 'n_think_adj', 'n_consider_adj',
                 'n_report', 'n_report_adj'):
    compare_means(stat_col)

In [None]:
# Sum the "think" and "consider" counts per document, raw and per word
for i in range(20):
    combined = results_df.loc[i, 'n_think'] + results_df.loc[i, 'n_consider']
    results_df.loc[i, 'Combined'] = combined
    results_df.loc[i, 'Combined_adj'] = combined / results_df.loc[i, 'Word Count']

for stat_col in ('Combined', 'Combined_adj'):
    compare_means(stat_col)

### For comparing 2+ grams

In [None]:
results_df = initialize_results_df()
cols = []

# Phrases are counted as plain substrings of the lowercased document text
for lexeme in ["in this study", "therefore", "in addition", "bold response"]:

    # Column names: "n_<phrase without spaces>" and the same with "_adj"
    new_name_n = "n_" + lexeme.replace(" ", "")
    new_name_adj = new_name_n + "_adj"
    cols.extend([new_name_n, new_name_adj])

    for i in range(20):
        text = corpus_new.loc[i, "Text"].lower()
        ct = text.count(lexeme)
        # Raw count, then count per word of the document
        results_df.loc[i, new_name_n] = ct
        results_df.loc[i, new_name_adj] = ct / results_df.loc[i, 'Word Count']

results_df

In [None]:
# Run the group comparison for every count column collected in `cols`
for col in cols:
    compare_means(col)