In [4]:
import pandas as pd
#data_df = pd.read_csv('clean_newsgroups.csv')
data_df = pd.read_csv('../Output/PubMedArticle(normalize+lemmatize).csv', encoding='utf-8')
data_df.head(10)

Unnamed: 0.1,Unnamed: 0,Document,lemmatized,Target Label,Target Name
0,0,Cancer and cure: A critical analysis. Cancer i...,"['cancer', 'cure', 'critical', 'analysis', 'ca...",1,Cancer
1,1,Tumor microenvironment: recent advances in var...,"['tumor', 'microenvironment', 'recent', 'advan...",1,Cancer
2,2,Global Cancer Incidence and Mortality Rates an...,"['global', 'cancer', 'incidence', 'mortality',...",1,Cancer
3,3,Recent Updates on the Relationship between Can...,"['recent', 'update', 'relationship', 'cancer',...",1,Cancer
4,4,What Is Cancer? This essay focuses on themes i...,"['cancer', 'essay', 'focus', 'theme', 'explain...",1,Cancer
5,5,New methods in the diagnosis of cancer and gen...,"['new', 'method', 'diagnosis', 'cancer', 'gene...",1,Cancer
6,6,Cancer-associated fibroblasts in tumor microen...,"['cancerassociated', 'fibroblast', 'tumor', 'm...",1,Cancer
7,7,"Clinical, Prognostic and Therapeutic Significa...","['clinical', 'prognostic', 'therapeutic', 'sig...",1,Cancer
8,8,Cancer prevention: from 1727 to milestones of ...,"['cancer', 'prevention', 'milestone', 'past', ...",1,Cancer
9,9,,['nan'],1,Cancer


In [5]:
data_labels_map = dict(enumerate(pd.unique(data_df['Target Name'])))
data_labels_map

{0: 'Cancer', 1: 'COVID-19', 2: 'Genome', 3: 'Vaccine', 4: 'Virus'}

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names =\
                                 train_test_split(np.array(data_df['lemmatized']), np.array(data_df['Target Label']),
                                                       np.array(data_df['Target Name']), test_size=0.33, random_state=42)

train_corpus.shape, test_corpus.shape

((33500,), (16500,))

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# build BOW features on train articles
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus.astype('U'))
# transform test articles into features
tv_test_features = tv.transform(test_corpus.astype('U'))
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

TFIDF model:> Train features shape: (33500, 85199)  Test features shape: (16500, 85199)


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC ,SVC
from sklearn.model_selection import GridSearchCV


svm_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('svm', LinearSVC(random_state=42))])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 1)],
              'svm__C': [0.01, 0.1, 1, 5]
}

gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2)
gs_svm = gs_svm.fit(train_corpus.astype('U'), train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   6.6s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   5.4s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   5.2s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   5.3s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   5.1s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   5.3s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   5.4s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   5.8s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   6.1s
[CV] END .............svm__C=0.01, tfidf__ngram_range=(1, 1); total time=   5.3s
[CV] END ..............svm__C=0.1, tfidf__ngram_range=(1, 1); total time=   5.2s
[CV] END ..............svm__C=0.1, tfidf__ngram_r

In [10]:
gs_svm.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()),
  ('svm', LinearSVC(C=0.1, random_state=42))],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'svm': LinearSVC(C=0.1, random_state=42),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'svm__C': 0.1,
 'svm__class_weight': None,
 'svm__dual': True,
 'svm__fit_intercept': True,
 'svm__intercept_scaling': 1,
 'svm__loss': 'squared_hinge',
 'svm__max_iter': 1000,
 'svm__multi_class': '

In [11]:
best_svm_test_score = gs_svm.score(test_corpus.astype('U'), test_label_names)
print('Test Accuracy :', best_svm_test_score)

Test Accuracy : 0.8323636363636364


In [12]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

lr_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('lr', LogisticRegression(penalty='l2', max_iter=1000, random_state=42))
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 1)],
              'lr__C': [1, 5, 10]
}

gs_lr = GridSearchCV(lr_pipeline, param_grid, cv=5, verbose=2)
gs_lr = gs_lr.fit(train_corpus.astype('U'), train_label_names)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  23.6s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  19.9s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  21.2s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  20.1s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  22.6s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  23.4s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  19.4s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  21.2s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  20.8s
[CV] END .................lr__C=1, tfidf__ngram_range=(1, 1); total time=  23.4s
[CV] END .................lr__C=5, tfidf__ngram_range=(1, 1); total time=  38.2s
[CV] END .................lr__C=5, tfidf__ngram_r

In [13]:
gs_lr.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()),
  ('lr', LogisticRegression(C=1, max_iter=1000, random_state=42))],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'lr': LogisticRegression(C=1, max_iter=1000, random_state=42),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'lr__C': 1,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 1

In [14]:
best_lr_test_score = gs_lr.score(test_corpus.astype('U'), test_label_names)
print('Test Accuracy :', best_lr_test_score)

Test Accuracy : 0.8306666666666667


In [15]:
import model_evaluation_utils as meu


In [16]:
svm_predictions = gs_svm.predict(test_corpus.astype('U'))
unique_classes = list(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=svm_predictions)

Accuracy: 0.8324
Precision: 0.895
Recall: 0.8324
F1 Score: 0.843


In [18]:
meu.display_classification_report(true_labels=test_label_names, 
                                  predicted_labels=svm_predictions, classes=unique_classes)

              precision    recall  f1-score   support

       Virus       0.98      0.71      0.82      3285
     Vaccine       0.56      0.98      0.71      3305
      Cancer       0.98      0.88      0.93      3374
      Genome       0.99      0.89      0.94      3245
    COVID-19       0.97      0.71      0.82      3291

    accuracy                           0.83     16500
   macro avg       0.90      0.83      0.84     16500
weighted avg       0.90      0.83      0.84     16500



In [19]:
lr_predictions = gs_lr.predict(test_corpus.astype('U'))
unique_classes = list(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=lr_predictions)

Accuracy: 0.8307
Precision: 0.8938
Recall: 0.8307
F1 Score: 0.8412


In [20]:
meu.display_classification_report(true_labels=test_label_names, 
                                  predicted_labels=lr_predictions, classes=unique_classes)

              precision    recall  f1-score   support

       Virus       0.98      0.71      0.82      3285
     Vaccine       0.56      0.98      0.71      3305
      Cancer       0.98      0.87      0.93      3374
      Genome       0.98      0.89      0.93      3245
    COVID-19       0.97      0.71      0.82      3291

    accuracy                           0.83     16500
   macro avg       0.89      0.83      0.84     16500
weighted avg       0.89      0.83      0.84     16500



In [22]:
label_data_map = {v:k for k, v in data_labels_map.items()}
label_map_df = pd.DataFrame(list(label_data_map.items()), columns=['Label Name', 'Label Number'])
label_map_df

Unnamed: 0,Label Name,Label Number
0,Cancer,0
1,COVID-19,1
2,Genome,2
3,Vaccine,3
4,Virus,4


In [26]:
train_idx, test_idx = train_test_split(np.array(range(len(data_df['lemmatized']))), test_size=0.33, random_state=42)
test_idx

array([33553,  9427,   199, ..., 33321, 40225, 28203])

In [37]:
predict_probas = gs_lr.predict_proba(test_corpus.astype('U')).max(axis=1)
test_df = data_df.loc[test_idx]
test_df['Predicted Name'] = lr_predictions
test_df['Predicted Confidence'] = predict_probas
test_df.head()

Unnamed: 0.1,Unnamed: 0,Document,lemmatized,Target Label,Target Name,Predicted Name,Predicted Confidence
33553,33553,Vaccines for International Travel. The pretravel management of the international traveler should be based on risk management principles. Prevention strategies and medical interventions should be b...,"['vaccine', 'international', 'travel', 'pretravel', 'management', 'international', 'traveler', 'based', 'risk', 'management', 'principle', 'prevention', 'strategy', 'medical', 'intervention', 'bas...",4,Vaccine,Vaccine,0.750842
9427,9427,Diffusion-weighted magnetic resonance imaging of urinary epithelial cancer with upper urinary tract obstruction: preliminary results. BACKGROUND: Various malignant tumors of the body show high sig...,"['diffusionweighted', 'magnetic', 'resonance', 'imaging', 'urinary', 'epithelial', 'cancer', 'upper', 'urinary', 'tract', 'obstruction', 'preliminary', 'result', 'background', 'various', 'malignan...",1,Cancer,Cancer,0.958875
199,199,Synchronous cancers in patients with head and neck cancer: risks in the era of human papillomavirus-associated oropharyngeal cancer. BACKGROUND: Second primary malignancies (SPMs) are the leading ...,"['synchronous', 'cancer', 'patient', 'head', 'neck', 'cancer', 'risk', 'era', 'human', 'papillomavirusassociated', 'oropharyngeal', 'cancer', 'background', 'second', 'primary', 'malignancy', 'spms...",1,Cancer,Cancer,0.987422
12447,12447,,['nan'],2,COVID-19,Vaccine,0.314664
39489,39489,,['nan'],4,Vaccine,Vaccine,0.314664


In [38]:
pd.set_option('display.max_colwidth', 200)
res_df = (test_df[(test_df['Target Name'] == 'Cancer') & (test_df['Predicted Name'] == 'Vaccine')]
       .sort_values(by=['Predicted Confidence'], ascending=False).head(5))
res_df

Unnamed: 0.1,Unnamed: 0,Document,lemmatized,Target Label,Target Name,Predicted Name,Predicted Confidence
83,83,"[Prevention of cervical cancer (II): prophylactic HPV vaccination, current knowledge, practical procedures and new issues]. Despite the considerable success of early screening for prevention of ce...","['prevention', 'cervical', 'cancer', 'ii', 'prophylactic', 'hpv', 'vaccination', 'current', 'knowledge', 'practical', 'procedure', 'new', 'issue', 'despite', 'considerable', 'success', 'early', 's...",1,Cancer,Vaccine,0.892501
5356,5356,"9-Valent HPV vaccine for cancers, pre-cancers and genital warts related to HPV. Human papillomavirus (HPV) is the causative agent of nearly all cervical cancer cases as well as a substantial propo...","['valent', 'hpv', 'vaccine', 'cancer', 'precancers', 'genital', 'wart', 'related', 'hpv', 'human', 'papillomavirus', 'hpv', 'causative', 'agent', 'nearly', 'cervical', 'cancer', 'case', 'well', 's...",1,Cancer,Vaccine,0.755714
5712,5712,Cancer vaccines generated by photodynamic therapy. The development of photodynamic therapy (PDT)-generated cancer vaccines is potentially one of the most significant achievements in the field of P...,"['cancer', 'vaccine', 'generated', 'photodynamic', 'therapy', 'development', 'photodynamic', 'therapy', 'pdtgenerated', 'cancer', 'vaccine', 'potentially', 'one', 'significant', 'achievement', 'fi...",1,Cancer,Vaccine,0.628414
231,231,[Human papillomavirus nonavalent vaccine. Update 2017]. Human papillomavirus (HPV) is the causative agent of 5% of human cancers. HPV infection is necessary for the development of cervical cancer ...,"['human', 'papillomavirus', 'nonavalent', 'vaccine', 'update', 'human', 'papillomavirus', 'hpv', 'causative', 'agent', 'human', 'cancer', 'hpv', 'infection', 'necessary', 'development', 'cervical'...",1,Cancer,Vaccine,0.535143
13,13,,['nan'],1,Cancer,Vaccine,0.314664


In [39]:
pd.set_option('display.max_colwidth', 200)
res_df = (test_df[(test_df['Target Name'] == 'Virus') & (test_df['Predicted Name'] == 'Covid-19')]
       .sort_values(by=['Predicted Confidence'], ascending=False).head(5))
res_df

Unnamed: 0.1,Unnamed: 0,Document,lemmatized,Target Label,Target Name,Predicted Name,Predicted Confidence


In [40]:
pd.set_option('display.max_colwidth', 200)
res_df = (test_df[(test_df['Target Name'] == 'Virus') & (test_df['Predicted Name'] == 'Genome')]
       .sort_values(by=['Predicted Confidence'], ascending=False).head(5))
res_df

Unnamed: 0.1,Unnamed: 0,Document,lemmatized,Target Label,Target Name,Predicted Name,Predicted Confidence
45338,45338,"VirusSeq: software to identify viruses and their integration sites using next-generation sequencing of human cancer tissue. SUMMARY: We developed a new algorithmic method, VirusSeq, for detecting ...","['virusseq', 'software', 'identify', 'virus', 'integration', 'site', 'using', 'nextgeneration', 'sequencing', 'human', 'cancer', 'tissue', 'summary', 'developed', 'new', 'algorithmic', 'method', '...",5,Virus,Genome,0.952566
42885,42885,System to assess genome sequencing needs for viral protein diagnostics and therapeutics. Computational analyses of genome sequences may elucidate protein signatures unique to a target pathogen. We...,"['system', 'ass', 'genome', 'sequencing', 'need', 'viral', 'protein', 'diagnostics', 'therapeutic', 'computational', 'analysis', 'genome', 'sequence', 'may', 'elucidate', 'protein', 'signature', '...",5,Virus,Genome,0.942942
45590,45590,Frequency analysis techniques for identification of viral genetic data. Environmental metagenomic samples and samples obtained as an attempt to identify a pathogen associated with the emergence of...,"['frequency', 'analysis', 'technique', 'identification', 'viral', 'genetic', 'data', 'environmental', 'metagenomic', 'sample', 'sample', 'obtained', 'attempt', 'identify', 'pathogen', 'associated'...",5,Virus,Genome,0.94043
45051,45051,"Genome Analysis of a Novel Clade II.b Alphabaculovirus Obtained from Artaxa digramma. Artaxa digramma is a lepidopteran pest distributed throughout southern China, Myanmar, Indonesia, and India. A...","['genome', 'analysis', 'novel', 'clade', 'iib', 'alphabaculovirus', 'obtained', 'artaxa', 'digramma', 'artaxa', 'digramma', 'lepidopteran', 'pest', 'distributed', 'throughout', 'southern', 'china'...",5,Virus,Genome,0.92302
43312,43312,Microbial virus genome annotation-mustering the troops to fight the sequence onslaught. The revolution in virus genome sequencing promises to effectively map the extant biological universe and rev...,"['microbial', 'virus', 'genome', 'annotationmustering', 'troop', 'fight', 'sequence', 'onslaught', 'revolution', 'virus', 'genome', 'sequencing', 'promise', 'effectively', 'map', 'extant', 'biolog...",5,Virus,Genome,0.91963
