# PubMed Paper Classification - Feature Extraction

## Import relevant libraries

In [1]:
import os
import time
import string
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm as _tqdm
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Translation table that deletes every ASCII punctuation character.
no_punc = str.maketrans(dict.fromkeys(string.punctuation))
# English stop words, held in a set for membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))
# Single Porter stemmer instance shared by the stemming code below.
porter = nltk.stem.PorterStemmer()

In [2]:
# Date stamps identifying the input snapshot and this output run.
input_dt = '8.26.21'
output_dt = '10.20.21'
# Fraction of each category's rows sampled into the training set.
train_frac = 0.9

# Pickle file holding the abstracts for the chosen input snapshot.
input_fn = '../input/abstracts-%s.pickle' % input_dt

# Suffix used in every output file name: <input date>-<train fraction>-<output date>.
output_sfx = '%s-%.1f-%s' % (input_dt, train_frac, output_dt)
print(output_sfx)

8.26.21-0.9-10.20.21


## Read in abstract data

In [3]:
# Load the pickled abstracts table. NOTE: unpickling executes code, so only
# read files that come from a trusted source.
pm = pd.read_pickle(input_fn)
# Count the distinct values of every other column within each Category.
pm.groupby('Category').nunique()

Unnamed: 0_level_0,PMID,Abstract
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
1,128,128
3,114,114
4,9713,9713
5,17,17
6,8,8
7,2019,2019
8,116,116
9,122,122
10,1459,1459


## Derive features

In [4]:
def isletters(word):
    """Return True when every character of `word` is a lowercase ASCII letter.

    Only the range 'a'..'z' is accepted, so uppercase letters, digits, spaces
    and non-ASCII letters all make the result False. An empty string yields
    True because it contains no character to reject.
    """
    return all('a' <= ch <= 'z' for ch in word)

def isletterornum(word):
    """Return True when `word` consists only of 'a'..'z' and '0'..'9'.

    Any other character (uppercase, whitespace, punctuation, non-ASCII)
    makes the result False. An empty string yields True.
    """
    return all('a' <= ch <= 'z' or '0' <= ch <= '9' for ch in word)

In [5]:
# Sanity check: text with a non-ASCII letter and spaces is rejected, a plain lowercase word passes.
isletters('ψ secondari structur sequenc'), isletters('yeet')

(False, True)

### Clean abstracts

In [6]:
def clean_abstract(abstract, custom_drop=()):
    """Lower-case an abstract, strip punctuation and drop stop words.

    Args:
        abstract: Raw abstract text.
        custom_drop: Optional iterable of extra words to discard in addition
            to `stop_words`. Defaults to no extra words.

    Returns:
        The surviving words joined by single spaces.
    """
    extra_drop = set(custom_drop)
    # split() without an argument breaks on any run of whitespace, so repeated
    # spaces, tabs and newlines never end up inside (or as) a token.
    cleaned = (w.lower().translate(no_punc) for w in abstract.split())
    # A token made only of punctuation becomes '' after translate(); skipping
    # it keeps doubled, leading and trailing spaces out of the result.
    kept = [w for w in cleaned if w and w not in stop_words and w not in extra_drop]
    return ' '.join(kept)

In [7]:
# Clean every abstract and store the result next to the original text.
pm['Cleaned Abstract'] = pm.Abstract.map(clean_abstract)
pm

Unnamed: 0,PMID,Category,Abstract,Cleaned Abstract
0,25553339,1,Cerebellar ataxia is a progressive neuro-degen...,cerebellar ataxia progressive neurodegenerativ...
1,26663098,1,Facial analysis systems are becoming available...,facial analysis systems becoming available hea...
2,27014455,1,Patients with Williams-Beuren Syndrome can be ...,patients williamsbeuren syndrome recognized cl...
3,27112773,1,The genetic basis of numerous intellectual dis...,genetic basis numerous intellectual disability...
4,27356087,1,We report four individuals from two unrelated ...,report four individuals two unrelated consangu...
...,...,...,...,...
13691,34231311,10,Deep learning (DL) has shown rapid advancement...,deep learning dl shown rapid advancement consi...
13692,34231533,10,Systemic retinal biomarkers are biomarkers ide...,systemic retinal biomarkers biomarkers identif...
13693,34233515,10,"In the last few years, artificial intelligence...",last years artificial intelligence ai research...
13694,34234854,10,Despite the significant progress in diagnosis ...,despite significant progress diagnosis treatme...


### Stem abstracts

In [8]:
def stem_abstract(abstract):
    """Split an abstract into sentences of Porter-stemmed tokens.

    Each sentence has its punctuation removed and is lower-cased before word
    tokenization. A token is kept only when it is not a stop word, is not
    made up solely of digits, and passes `isletterornum`.

    Returns:
        A list with one list of stemmed tokens per sentence.
    """
    stemmed_sentences = []
    for sentence in nltk.sent_tokenize(abstract):
        words = nltk.word_tokenize(sentence.translate(no_punc).lower())
        stemmed_sentences.append([
            porter.stem(word)
            for word in words
            if word not in stop_words
            and not word.isdigit()
            and isletterornum(word)
        ])
    return stemmed_sentences

# Stem every abstract once (with a tqdm progress bar); the derived columns below reuse this list.
stemmed = [stem_abstract(a) for a in _tqdm(pm.Abstract.tolist())]

100%|████████████████████████████████████| 13696/13696 [00:59<00:00, 231.51it/s]


In [9]:
def _sentence_ngrams(sentences, n):
    """Return the space-joined n-grams of every sentence; n-grams never span two sentences."""
    return [' '.join(gram) for sent in sentences for gram in nltk.ngrams(sent, n)]


# Flatten the tokens directly rather than joining per-sentence strings: a
# sentence left empty by the token filter would otherwise contribute ''
# and put doubled, leading or trailing spaces into the text.
pm['Stemmed Abstract'] = [' '.join(w for sent in ab for w in sent) for ab in stemmed]
pm['Stemmed 2ngram'] = [_sentence_ngrams(ab, 2) for ab in stemmed]
pm['Stemmed 3ngram'] = [_sentence_ngrams(ab, 3) for ab in stemmed]
pm['Stemmed 4ngram'] = [_sentence_ngrams(ab, 4) for ab in stemmed]
pm

Unnamed: 0,PMID,Category,Abstract,Cleaned Abstract,Stemmed Abstract,Stemmed 2ngram,Stemmed 3ngram,Stemmed 4ngram
0,25553339,1,Cerebellar ataxia is a progressive neuro-degen...,cerebellar ataxia progressive neurodegenerativ...,cerebellar ataxia progress neurodegen diseas m...,"[cerebellar ataxia, ataxia progress, progress ...","[cerebellar ataxia progress, ataxia progress n...","[cerebellar ataxia progress neurodegen, ataxia..."
1,26663098,1,Facial analysis systems are becoming available...,facial analysis systems becoming available hea...,facial analysi system becom avail healthcar pr...,"[facial analysi, analysi system, system becom,...","[facial analysi system, analysi system becom, ...","[facial analysi system becom, analysi system b..."
2,27014455,1,Patients with Williams-Beuren Syndrome can be ...,patients williamsbeuren syndrome recognized cl...,patient williamsbeuren syndrom recogn clinic g...,"[patient williamsbeuren, williamsbeuren syndro...","[patient williamsbeuren syndrom, williamsbeure...","[patient williamsbeuren syndrom recogn, willia..."
3,27112773,1,The genetic basis of numerous intellectual dis...,genetic basis numerous intellectual disability...,genet basi numer intellectu disabl id syndrom ...,"[genet basi, basi numer, numer intellectu, int...","[genet basi numer, basi numer intellectu, nume...","[genet basi numer intellectu, basi numer intel..."
4,27356087,1,We report four individuals from two unrelated ...,report four individuals two unrelated consangu...,report four individu two unrel consanguin fami...,"[report four, four individu, individu two, two...","[report four individu, four individu two, indi...","[report four individu two, four individu two u..."
...,...,...,...,...,...,...,...,...
13691,34231311,10,Deep learning (DL) has shown rapid advancement...,deep learning dl shown rapid advancement consi...,deep learn dl shown rapid advanc consider prom...,"[deep learn, learn dl, dl shown, shown rapid, ...","[deep learn dl, learn dl shown, dl shown rapid...","[deep learn dl shown, learn dl shown rapid, dl..."
13692,34231533,10,Systemic retinal biomarkers are biomarkers ide...,systemic retinal biomarkers biomarkers identif...,system retin biomark biomark identifi retina r...,"[system retin, retin biomark, biomark biomark,...","[system retin biomark, retin biomark biomark, ...","[system retin biomark biomark, retin biomark b..."
13693,34233515,10,"In the last few years, artificial intelligence...",last years artificial intelligence ai research...,last year artifici intellig ai research rapidl...,"[last year, year artifici, artifici intellig, ...","[last year artifici, year artifici intellig, a...","[last year artifici intellig, year artifici in..."
13694,34234854,10,Despite the significant progress in diagnosis ...,despite significant progress diagnosis treatme...,despit signific progress diagnosi treatment pa...,"[despit signific, signific progress, progress ...","[despit signific progress, signific progress d...","[despit signific progress diagnosi, signific p..."


In [10]:
# For each n-gram column, print the total number of n-grams over all
# abstracts and the number of distinct n-grams among them.
for label, column in (
    ('2ngram | pre-dedupe', 'Stemmed 2ngram'),
    ('3ngram | pre-dedupe', 'Stemmed 3ngram'),
    ('4ngram | pre-dedupe', 'Stemmed 4ngram'),
):
    unique = [gram for grams in pm[column].tolist() for gram in grams]
    print(label, len(unique), '| post-dedupe', len(set(unique)))

2ngram | pre-dedupe 1318972 | post-dedupe 566633
3ngram | pre-dedupe 1223441 | post-dedupe 1013520
4ngram | pre-dedupe 1128023 | post-dedupe 1061826


### Fold split

In [11]:
# Number of folds the training set is divided into.
n_folds = 9

# Directory template: the fold count is filled in now, while the literal '%s'
# passed as the second value leaves a placeholder for the file name.
output_dir = '../input/%dfold/%s' % (n_folds, '%s')
# Filling the placeholder with '' gives the directory path itself; create it if missing.
Path(output_dir % '').mkdir(parents=True, exist_ok=True)

# Output file names. The per-fold names keep a '%d' placeholder that is
# filled with the fold index when the files are written.
train_all_fn = output_dir % ('train-all-%s.pickle' % output_sfx)
train_fn = output_dir % ('train-f%s-%s.pickle' % ('%d', output_sfx))
valid_fn = output_dir % ('valid-f%s-%s.pickle' % ('%d', output_sfx))
test_fn = output_dir % ('test-%s.pickle' % output_sfx)

In [12]:
# Stratified training sample: draw `train_frac` of the rows of every category
# with a fixed seed, then stack the per-category samples in category order.
train = pd.concat([
    pm[pm.Category == category].sample(frac=train_frac, random_state=420)
    for category in pm.Category.unique()
])
train.shape

(12326, 8)

In [13]:
# Deal each category's training rows into n_folds chunks so that every split
# receives a share of every category.
splits = [[] for _ in range(n_folds)]

for category in pm.Category.unique():
    shuffled = train[train.Category == category].sample(frac=1, random_state=420)
    # Split row positions instead of the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which recent pandas versions
    # deprecate. The chunk boundaries are the same either way.
    for n, rows in enumerate(np.array_split(np.arange(len(shuffled)), n_folds)):
        splits[n].append(shuffled.iloc[rows])

# Merge the per-category chunks of each split and shuffle the rows.
splits = [pd.concat(s).sample(frac=1, random_state=69) for s in splits]

# Fold n validates on split n and trains on all remaining splits.
valids = list(splits)
folds = [
    pd.concat(splits[:n] + splits[n + 1:]).sample(frac=1, random_state=1337)
    for n in range(n_folds)
]

# Per-fold sizes; 'match' confirms train + valid covers the whole training set.
[
    'train %d valid %d total %d match %r' % (
        folds[n].shape[0],
        valids[n].shape[0],
        folds[n].shape[0] + valids[n].shape[0],
        folds[n].shape[0] + valids[n].shape[0] == train.shape[0],
    )
    for n in range(n_folds)
]

['train 10953 valid 1373 total 12326 match True',
 'train 10953 valid 1373 total 12326 match True',
 'train 10954 valid 1372 total 12326 match True',
 'train 10955 valid 1371 total 12326 match True',
 'train 10956 valid 1370 total 12326 match True',
 'train 10957 valid 1369 total 12326 match True',
 'train 10958 valid 1368 total 12326 match True',
 'train 10960 valid 1366 total 12326 match True',
 'train 10962 valid 1364 total 12326 match True']

In [14]:
# Held-out test set: every row of `pm` whose index label was not sampled into `train`.
test = pm[~pm.index.isin(train.index)]
test.shape

(1370, 8)

## Save files

In [15]:
# Delete any existing file at the target path, then write the full training set.
train_all_path = Path(train_all_fn)
if train_all_path.is_file():
    train_all_path.unlink()

train.to_pickle(train_all_fn)

In [16]:
# Write the training and validation frame of every fold, deleting any
# existing file at each target path first.
for n in range(n_folds):
    for frame, template in ((folds[n], train_fn), (valids[n], valid_fn)):
        target = template % n
        if os.path.isfile(target):
            os.remove(target)
        frame.to_pickle(target)

In [17]:
# Delete any existing file at the target path, then write the held-out test set.
test_path = Path(test_fn)
if test_path.is_file():
    test_path.unlink()

test.to_pickle(test_fn)