In [1]:
import numpy as np
import pandas as pd
import copy
import re
from tqdm import tqdm 

pd.options.mode.chained_assignment = None

# Project-specific functions
from functions import collapse, preprocess_gen, initialize_results_df, unique

In [2]:
# Load corpora and token list from pickle files
# corpus_new: one row per document; later cells read its "Text" and "Author" columns.
# toks_df: per-token table; later cells filter it on its "Token" column.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling — only load files from a trusted source.
corpus_new = pd.read_pickle('savefiles/corpusfull_20220410.pkl')
toks_df = pd.read_pickle('savefiles/toksdf_20220413.pkl')

In [3]:
# Convert each doc from a list of paragraphs to one long string (collapse),
# then preprocess it before spaCy modeling (preprocess_gen).
# Mapping over the whole "Text" column processes every row that is present,
# so the number of documents is no longer hard-coded as range(0, 20).
# NOTE: this overwrites "Text" in place; re-running the cell without reloading
# corpus_new would apply both steps a second time.
corpus_new["Text"] = corpus_new["Text"].map(collapse).map(preprocess_gen)

In [4]:
# Row indices of docs in respective corpora
# jpen_i feeds the "JP-EN" sentence search and enen_i the "EN-EN" one.
# Together the two lists cover rows 0-19 exactly once each (10 docs per corpus).
jpen_i = [0,  1,  5, 10, 11, 12, 13, 14, 17, 19]
enen_i = [2,  3,  4,  6,  7,  8,  9, 15, 16, 18]

### Statistical results by token (lemma)

In [15]:
# Show the JP-EN vs. EN-EN statistics row for a single lemma.
# Replace the string below with the token (lemma) to compare.

toks_df.loc[toks_df['Token'].eq('think')]

Unnamed: 0,Token,JP-EN Mean Count,JP-EN Mean Freq,EN-EN Mean Count,EN-EN Mean Freq,P (count),P (freq)
714,think,2.5,0.361508,0.9,0.128035,0.080968,0.083759


### Sentences containing tokens/strings of interest

In [6]:
# NOTE(review): `scispacy` is never referenced by name in the visible cells —
# presumably it is imported because the "en_core_sci_md" model depends on it;
# confirm before removing.
import scispacy
import spacy

# Load the "en_core_sci_md" pipeline; `nlp` is used by later cells to parse
# document text and to supply the vocab for the Matcher.
nlp = spacy.load("en_core_sci_md")

In [7]:
# Get each text as scispaCy doc (parsed once and cached in the 'spacy_docs' column).
# nlp.pipe streams the texts through the pipeline in batches instead of making
# one nlp() call per text; the Docs come back in the same order as the rows.
corpus_new['spacy_docs'] = list(nlp.pipe(corpus_new['Text']))

# Sentences are not stored in a separate column: iterate `.sents` on a cached
# Doc when they are needed, e.g. list(corpus_new['spacy_docs'][0].sents).

In [30]:
# Get list of matching sentences (JP-EN)
from spacy.matcher import Matcher

pattern = [{'LEMMA': 'report'}]   # ← insert token of interest here

# The pattern and matcher do not depend on the document, so they are built
# once here rather than once per loop iteration.
matcher = Matcher(nlp.vocab)
matcher.add("SCREENER", [pattern])

for n in jpen_i:
    doc = corpus_new['spacy_docs'][n]
    matches = matcher(doc)

    # Header line: number of hits, author and row index of the document
    print(f'{len(matches)} match(es) in {corpus_new["Author"][n]} [doc {n}]')
    for match_id, start, end in matches:
        # Print the whole sentence that contains the matched token; a sentence
        # with several hits is printed once per hit.
        matched_span = doc[start:end]
        print('- ', matched_span.sent.text)
    print('')

20 match(es) in Tamura [doc 0]
-  For example, increases in white matter volume have been reported throughout childhood and adolescence, particularly in the prefrontal and parietal cortices (e.g.,.
-  In addition, grey matter volume has been reported to increase in the prefrontal and parietal cortices during the preadolescent stage, followed by a steady decline during late adolescence.
-  For example, a behavioral study using a mentalizing task requiring theory of mind and executive function reported that social abilities like ‘theory of mind’ continue to improve from adolescence to adulthood further suggesting that developmental changes continue throughout the late adolescent phase.
-  Another study using a gambling task reported that the rate of risky choices did not significantly change between early (12–15 y.o.) and mid (15–18 y.o.) adolescence, but was significantly reduced in adulthood (25–35 y.o.).
-  Functional neuroimaging studies of mental-state attribution have reported decr

22 match(es) in Higashiyama [doc 14]
-  Recently, stroke patients with isolated typing impairment without aphasia, apraxia, or visuospatial impairment and with relative preservation of writing ability have been reported, and this phenomenon has been termed dystypia.
-  The patient reported by Otsuki et al. had a lesion in the left frontal lobe involving the foot of the second frontal convolution and frontal operculum.
-  Ryu et al. reported a 64-year-old right-handed man with acute infarcts in the bilateral border-zone regions, predominantly the left frontal subcortical area, who developed a sudden typing disturbance without aphasia or neglect.
-  In addition, Cooks et al. reported a 68-year-old patient with Parkinson’s disease who had a stroke in the left temporoparietal cortex and exhibited disproportionately affected typing relative to handwriting.
-  However, it is still unknown what brain lesions are crucial for dystypia because the reported cases had multiple lesions and the loca

In [31]:
# Get list of matching sentences (EN-EN)
# (`Matcher` is imported in the JP-EN cell above.)

pattern = [{'LEMMA': 'report'}]    # ← insert token of interest here

# The pattern and matcher do not depend on the document, so they are built
# once here rather than once per loop iteration.
matcher = Matcher(nlp.vocab)
matcher.add("SCREENER", [pattern])

for n in enen_i:
    doc = corpus_new['spacy_docs'][n]
    matches = matcher(doc)

    # Header line: number of hits, author and row index of the document
    print(f'{len(matches)} match(es) in {corpus_new["Author"][n]} [doc {n}]')
    for match_id, start, end in matches:
        # Print the whole sentence that contains the matched token; a sentence
        # with several hits is printed once per hit.
        matched_span = doc[start:end]
        print('- ', matched_span.sent.text)
    print('')

3 match(es) in Sobhani [doc 2]
-  Further information on these scales and correlations between scores on these questionnaires and classification accuracy are reported in the Supporting Materials (Figure S1 and Figure S2).
-  We also performed 2 additional types of exploratory classifications, the results of which are reported in the Supporting Materials.
-  The subjects were instructed to remember how they felt about the targets while viewing the video clips in the scanner, and they did, in fact, report feeling differently about how much they liked the different groups of targets.

1 match(es) in Majdandžić [doc 3]
-  In post-fMRI ratings participants reported that they felt more similar and connected to humanized persons and felt to understand and know them better than neutral persons.

2 match(es) in Dixon [doc 4]
-  Interestingly, we found that the tendency to engage cognitive control differed across our three behavioral tasks, being lowest for the Stroop task—the hardest task based

0 match(es) in Neale [doc 15]

8 match(es) in Kim [doc 16]
-  Context-rich multimodal stimuli may enhance affect recognition for patients with traumatic brain injury and autism but the reverse pattern was also reported for schizophrenic patients possibly due to the failure of information integration across the two modalities.
-  All participants in the fMRI experiment reported normal hearing, normal or corrected to normal vision, and no history of neurological diseases.
-  The dimensions reflected the degree to which the participant reported feeling excited, positive, calm, anxious, negative, or sad.
-  The average classification accuracy for the four presentations of the test exemplars was reported.
-  R, right; L, left; cluster size reported in voxels; T indicates peak t values; Z indicates peak z values; OFC: anterior part of orbitofrontal cortex; PC: precuneus; mPFC: medial prefrontal cortex; PCC: posterior part of the cingulate cortex; STG/MTG: superior/middle temporal gyrus; MFG:

In [None]:
# Collect (sentence index, sentence text) pairs from doc 0 whose lemmatized
# text contains 'report'.
# Sentences are read from the cached Doc in 'spacy_docs'; no 'spacy_sents'
# column is created in this notebook, so reading one fails on a fresh kernel.
# Note: `in sent.lemma_` is a substring test on the sentence's lemma string,
# so it also hits lemmas that merely contain 'report'.
matches = [
    (i, sent.text)
    for i, sent in enumerate(corpus_new['spacy_docs'][0].sents)
    if 'report' in sent.lemma_
]

matches

In [None]:
def preprocess_tok(doc):
    """Reduce a parsed document to a list of cleaned, lowercased lemmas.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_``, ``is_punct``, ``is_digit``,
        ``like_url`` and ``is_stop`` (e.g. a spaCy ``Doc``).

    Returns
    -------
    list of str
        Lemmas (or pieces of split compounds) longer than three characters,
        lowercased, in document order. Duplicates are kept.
    """
    # Collect lemmas not tagged as 1. punctuation, 2. digits, 3. URLs, or 4. stop words
    tokens = [
        tok.lemma_
        for tok in doc
        if not (tok.is_punct or tok.is_digit or tok.like_url or tok.is_stop)
    ]

    # Remove any tokens containing digits (e.g. "P5-a") or parentheses ('t(are')
    tokens = [tok for tok in tokens if not re.search(r"[\d()]", tok)]

    # Break apart hyphen-, slash- and bracket-separated compounds.
    # The separators are plain characters: the previous '\]' / '\[' entries were
    # literal backslash+bracket strings, so real brackets were never split on.
    seps = ['-', '–', '―',
            ';', ':',
            ']', '[',
            '’', '”',
            '>', '<', '/']
    splitter = "[" + "".join(re.escape(sep) for sep in seps) + "]"
    tokens = [piece for tok in tokens for piece in re.split(splitter, tok)]

    # Remove remaining abbreviations (e.g. "e.g.") and anything containing '+'
    tokens = [tok for tok in tokens if not re.search(r"[a-zA-Z]\.[a-zA-Z]\.", tok)]
    tokens = [tok for tok in tokens if "+" not in tok]

    # Remove small words (e.g. 'a', 'P', 'mm'). This also drops the empty
    # strings left by splitting and any stray one-character symbols such as
    # '±' or '=', so no separate symbol filter is needed.
    tokens = [tok for tok in tokens if len(tok) > 3]

    # Unify to lowercase (to simplify matching)
    return [tok.lower() for tok in tokens]

In [None]:
def find_tokens_unique_to_one_doc(i):
    """Return the preprocessed tokens of doc ``i`` that occur in no other doc.

    Parameters
    ----------
    i : row label in ``corpus_new``
        The document whose tokens are checked against all other rows.

    Returns
    -------
    list of str
        Tokens of doc ``i`` (as produced by ``unique(preprocess_tok(...))``)
        that do not appear among the preprocessed tokens of any other row,
        in the order ``unique`` yields them.
        NOTE(review): assumes ``unique`` returns an iterable of distinct
        tokens — confirm against functions.py.
    """
    # Doc in question
    tokens_i = unique(preprocess_tok(nlp(corpus_new.loc[i, "Text"])))

    # Pool the tokens of every other doc in the corpus (the row labels come
    # from the frame itself rather than a hard-coded range(0, 20)).
    tokens_elsewhere = set()
    for j in corpus_new.index:
        if j == i:
            continue
        tokens_elsewhere.update(preprocess_tok(nlp(corpus_new.loc[j, "Text"])))

    # Build a new list instead of calling tokens_i.remove() while iterating
    # over tokens_i: removing during iteration skips the element that follows
    # each removal, which left some shared tokens in the result.
    return [tok for tok in tokens_i if tok not in tokens_elsewhere]

In [None]:
# Check performance of preprocessing function:
# display the tokens reported as unique to doc 5 (one of the JP-EN rows) so the
# quality of the token cleaning can be inspected by eye.
toks_unique = find_tokens_unique_to_one_doc(5)
toks_unique