In [184]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
engstopwords = stopwords.words('English') 
engstopwords.extend(['mg', 'kg', 'mg kg', 'hcc', 'aarc'])
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_validate,LeaveOneOut, train_test_split

In [161]:
df = pd.read_csv("/Users/smruthi/Desktop/TechTogether2020/data/docs_100_share.csv")

In [162]:
#Cleaning up non-ascii chars from abstract

newabstracts = []
for abstract in df['abstract']:
    test_list = (abstract.split())
    for i in range(len(test_list)):
        if not test_list[i].isascii():
            test_list[i] = re.sub(r'[^\x00-\x7F]+',' ', test_list[i])
    newabstracts.append(' '.join(test_list))
    
df['abstract'] = newabstracts

In [191]:
df.head()

Unnamed: 0,pmid,title,abstract,include/exclude
0,27989839,Cilostazol and enzymatically modified isoquerc...,We previously reported the anti-inflammatory e...,include
1,26878064,Zinc Ionophore (Clioquinol) Inhibition of Huma...,Prostate cancer remains the second leading cau...,include
2,23999004,Antitumor activity of histamine and clozapine ...,BACKGROUND: Functional presence of histamine H...,include
3,20798593,Use of dabigatran etexilate to reduce breast c...,Coagulation proteases and the generation of th...,include
4,30685222,A study in a rat initiation-promotion bladder ...,"Dapagliflozin, a sodium-glucose co-transporter...",include


In [238]:
def createFeatures(corpus):
    
    newcorpus = []
    
    for example in corpus:
        words = nltk.word_tokenize(example)
        words = nltk.pos_tag(words)
        wordlist = []
        for word in words:
            #filtering stopwords, checking for alphabetic chars
            if word[0].isalpha() and word[0] not in engstopwords:
                #each feature is word+position tag
                wordlist.append(word[0]+"_"+word[1])
                
        newcorpus.append(" ".join(wordlist))
    
    #adding maximum document frequency of 5 
    #since high frequency words in every abstract will be terms like "patient"
    #these don't tell us anything about relevancy
    vectorizer = CountVectorizer(max_df=5)
    texts = vectorizer.fit_transform(newcorpus).toarray()
    vocab = vectorizer.get_feature_names()

    return texts,vocab

In [235]:
def evaluateModel(X,y,vocab,penalty="l1"):
    
    #create and fit the model
    model1 = LogisticRegression(penalty=penalty,solver="liblinear")
    results1 = cross_validate(model1,X,y,cv=LeaveOneOut())
    
    model2 = LinearSVC(random_state=0, tol=1e-5)
    results2 = cross_validate(model2,X,y,cv=LeaveOneOut())

  
    #determine the average accuracy
    scores1 = results1["test_score"]
    avg_score1 = sum(scores1)/len(scores1)
    
    scores2 = results2["test_score"]
    avg_score2 = sum(scores2)/len(scores2)

    model1.fit(X,y)
    class1_prob_sorted1 = model1.coef_[0, :].argsort()
    class2_prob_sorted1 = (-model1.coef_[0, :]).argsort()

    model2.fit(X,y)
    class1_prob_sorted2 = model2.coef_[0, :].argsort()
    class2_prob_sorted2 = (-model2.coef_[0, :]).argsort()
    
    termsToTake = 20
    class1_indicators1 = [vocab[i] for i in class1_prob_sorted1[:termsToTake]]
    class2_indicators1 = [vocab[i] for i in class1_prob_sorted1[:termsToTake]]
    
    class1_indicators2 = [vocab[i] for i in class2_prob_sorted2[:termsToTake]]
    class2_indicators2 = [vocab[i] for i in class2_prob_sorted2[:termsToTake]]

    return avg_score1,class1_indicators1,class2_indicators1,avg_score2,class1_indicators2,class2_indicators2

def runEvaluation(X,y,vocab):
    print("----------L2 Norm-----------")
    avg_score1,class1_indicators1,class2_indicators1,avg_score2,class1_indicators2,class2_indicators2 = evaluateModel(X,y,vocab,"l2")
    print("Logistic")
    print("The model's average accuracy is %f"%avg_score1)
    print("The most informative terms for exclude are: %s"%class1_indicators1)
    print("The most informative terms for include are: %s"%class1_indicators2)
    print("SVM")
    print("The model's average accuracy is %f"%avg_score2)
    print("The most informative terms for exclude are: %s"%class2_indicators1)
    print("The most informative terms for include are: %s"%class2_indicators2)
    

In [236]:
corpus = list(df['abstract'])
corpus = [x.split(". ")[-2:] for x in corpus]
corpus = [x[0]+". "+x[1] for x in corpus]
y = df['include/exclude']

In [239]:
X,vocab = createFeatures(corpus)
runEvaluation(X, y, vocab)

----------L2 Norm-----------
Logistic
The model's average accuracy is 0.730000
The most informative terms for exclude are: ['tolerated_vbn', 'trial_nnp', 'safe_jj', 'postoperative_jj', 'morphine_nn', 'opioid_jj', 'registration_nnp', 'asp_nnp', 'regimens_nns', 'safety_nn', 'analgesia_nn', 'cancer_nnp', 'registration_nn', 'children_nns', 'management_nn', 'undergoing_vbg', 'aggressive_jj', 'neutropenia_nn', 'consumption_nn', 'improve_vb']
The most informative terms for include are: ['demonstrated_vbd', 'metformin_nn', 'potent_jj', 'hcc_nnp', 'antitumor_nn', 'angiogenesis_nn', 'new_jj', 'promising_jj', 'docetaxel_nn', 'development_nn', 'preclinical_jj', 'inhibitory_jj', 'malignant_jj', 'synergistic_jj', 'human_jj', 'found_vbd', 'ato_nnp', 'dfo_nnp', 'induce_vb', 'beneficial_jj']
SVM
The model's average accuracy is 0.740000
The most informative terms for exclude are: ['tolerated_vbn', 'trial_nnp', 'safe_jj', 'postoperative_jj', 'morphine_nn', 'opioid_jj', 'registration_nnp', 'asp_nnp', 'reg