In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


#modeling
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

import warnings
# Suppress every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides any convergence or undefined-metric
# warnings the classifiers/metrics below may raise -- consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# relative path to the input CSV (PubMed records with per-term jargon columns)
filename = '../data/pubmed_vectorized_jargon.csv'

In [3]:
# load the CSV and use the PubMed id column as the row index in one step
df = pd.read_csv(filename, index_col='pmid')

In [4]:
# preview the first rows to confirm the load and the pmid index
df.head()

Unnamed: 0_level_0,citations,fulltext,A/B testing,accuracy,action,activation function,active learning,AdaGrad,agent,agglomerative clustering,...,validation,validation set,vanishing gradient problem,Wasserstein loss,weight,Weighted Alternating Least Squares (WALS),wide model,width,total_jargon_count,log_citations
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22627698,35,extracting biological information computationa...,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,11,3.555348
22952238,4,uncovering transcription factor modules using ...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,1.386294
22944687,19,understanding substrate specificity convention...,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,2.944439
22075226,12,membrane protein structural bioinformatics des...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,2.484907
23012584,10,future medical diagnostics: large digitized da...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2.302585


### Modeling Attempt I: Citations as Target

#### Transform the Target into a Binary Variable

In [6]:
# citation count a paper must exceed to be labelled 'highly-cited' (value chosen during EDA)
thresh = 10

10

In [8]:
# number of papers whose citation count is strictly above the threshold
df['citations'].gt(thresh).sum()

678

In [9]:
# binary target as a plain list: 1 when the citation count exceeds thresh, otherwise 0
citations = (df['citations'] > thresh).astype(int).tolist()

In [10]:
# Replace the raw citation counts with the 0/1 labels built in the previous cell.
# NOTE: the raw counts are lost after this line, so re-running the cell that builds
# `citations` from df['citations'] would compare 0/1 labels to thresh and yield all
# zeros -- reload df before re-running that cell.
df = df.assign(citations=citations)

In [11]:
# share of each class; the majority-class share is the accuracy a trivial
# always-predict-0 classifier would achieve, i.e. the baseline to beat
class_balance = df['citations'].value_counts(normalize=True)
class_balance

0    0.703152
1    0.296848
Name: citations, dtype: float64

In [13]:
# list the columns: the target, the raw text, the jargon-term columns,
# then total_jargon_count and log_citations
df.columns

Index(['citations', 'fulltext', 'A/B testing', 'accuracy', 'action',
       'activation function', 'active learning', 'AdaGrad', 'agent',
       'agglomerative clustering',
       ...
       'validation', 'validation set', 'vanishing gradient problem',
       'Wasserstein loss', 'weight',
       'Weighted Alternating Least Squares (WALS)', 'wide model', 'width',
       'total_jargon_count', 'log_citations'],
      dtype='object', length=416)

#### Classification Modeling

In [12]:
# total number of missing cells across the whole frame (should be 0 before modeling)
df.isna().sum().sum()

0

In [14]:
# target: the binarised citation label
y = df['citations']

# features: everything except the target itself, the raw text, and log_citations
# (a transform of the original citation counts, so keeping it would leak the target)
X = df.drop(
    columns=[
        'citations',
        'fulltext',
        'log_citations',
    ]
)

In [15]:
# hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

#### Run Through Classifiers

In [16]:
# Seed passed to every estimator that accepts one, so a Restart & Run All
# reproduces the results table (same value as the train/test split seed).
SEED = 42

# Candidate classifiers, all with default hyperparameters apart from the seed.
# BernoulliNB has no random_state parameter.
model_list = [
    LogisticRegression(random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    BaggingClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    SVC(random_state=SEED),
    BernoulliNB()
]

results_list = []

for model in model_list:
    # fit on the training split (the list already holds instantiated estimators)
    model.fit(X_train, y_train)

    # predict once per split and reuse the predictions for every metric
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # one record per model; key order defines the column order of the results table
    results_dict = {
        # keep the 'ClassName()' label format regardless of constructor arguments
        'model_name': f'{type(model).__name__}()',
        # accuracy on each split (what a classifier's .score() reports)
        'train_score': accuracy_score(y_train, train_pred),
        'test_score': accuracy_score(y_test, test_pred),
        # mean 5-fold CV accuracy on the training split; cross_val_score fits
        # fresh clones, so the fitted `model` above is not modified
        'cv_score': cross_val_score(model, X_train, y_train, cv=5).mean(),
        # a model that predicts no positives has undefined precision; report 0
        # explicitly instead of relying on a silenced warning
        'precision_score': precision_score(y_test, test_pred, zero_division=0),
        'train_f1_score': f1_score(y_train, train_pred),
        'test_f1_score': f1_score(y_test, test_pred),
    }

    #add the dictionary to the list
    results_list.append(results_dict)

In [17]:
# tabulate the per-model records: one row per classifier, one column per metric
results = pd.DataFrame.from_records(results_list)
results

Unnamed: 0,model_name,train_score,test_score,cv_score,precision_score,train_f1_score,test_f1_score
0,LogisticRegression(),0.713191,0.693654,0.684728,0.409091,0.13245,0.113924
1,DecisionTreeClassifier(),0.974822,0.601751,0.602639,0.328358,0.95619,0.325926
2,BaggingClassifier(),0.944718,0.678337,0.664479,0.4,0.900493,0.230366
3,RandomForestClassifier(),0.974822,0.684902,0.676519,0.366667,0.956604,0.13253
4,AdaBoostClassifier(),0.727422,0.678337,0.673235,0.342857,0.229102,0.140351
5,SVC(),0.705528,0.702407,0.703887,0.0,0.014652,0.0
6,BernoulliNB(),0.717022,0.702407,0.689108,0.5,0.153846,0.105263


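None of these models beats the baseline accuracy of about 0.70 (always predicting the majority "not highly cited" class) at predicting whether a paper will be highly cited, and their low test F1 scores show that they rarely identify the highly cited papers.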