### Assignment 4: MNB and SVM for Causal Language Detection 

Importing the libraries and the data

In [1]:
import numpy as np
import pandas as p
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
train=p.read_csv("pubmed_causal_language_use.csv")
y=train['label'].values
X=train['sentence'].values

Cross validating using 5 runs for different vectorizers and picking the vectroizer with the highest average score

In [2]:
##MNB TFIDF
mNB_tfidf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False, stop_words='english')),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tfidf_pipe,X,y,cv=5)
print(sum(scores)/len(scores))

##MNB TF
mNB_tf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=False,binary=False, stop_words='english')),('nb',MultinomialNB())])
scores = cross_val_score(mNB_tf_pipe,X,y,cv=5)
print(sum(scores)/len(scores))

##MNB with Bool
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1',ngram_range =(1,2), binary=True, stop_words='english')),('nb', MultinomialNB())])
scores = cross_val_score(nb_clf_pipe, X, y, cv=5)
avg=sum(scores)/len(scores)
print(avg)

0.6504446150401433
0.6618793248675218
0.6736392327458443


Splitting the data into train and test for class prediction

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

Training the data using the count boolean vectorizer

In [4]:
ngram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=True, stop_words='english')
X_train_vec1 = ngram_count_vectorizer.fit_transform(X_train)
X_test_vec1 = ngram_count_vectorizer.transform(X_test)
print(X_test_vec1.shape)

(1225, 5318)


In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

# initialize the MNB model
nb_clf= MultinomialNB()

# use the training data to train the MNB model
# feature_log_prob_ stores the conditional probs for all categories
# if the labels are strings, the index is in alphabetic order
# e.g. 'f' comes before 't' in alphabet, so 'f' is in [0] dimension and 't' in [1]

nb_clf.fit(X_train_vec1,y_train)
print(nb_clf.classes_)
print(nb_clf.feature_log_prob_.shape)

[0 1 2 3]
(4, 5318)


Top 10 features for all the classes:

In [6]:
feature_ranks0 = sorted(zip(nb_clf.feature_log_prob_[0], ngram_count_vectorizer.get_feature_names()))
no_relationship_features = feature_ranks0[-10:]
print(no_relationship_features)

[(-5.6636113099669245, 'treatment'), (-5.6636113099669245, 'trial'), (-5.642992022764188, 'findings'), (-5.642992022764188, 'high'), (-5.474369310328396, 'needed'), (-5.423938456701504, 'clinical'), (-5.2721324438335, 'risk'), (-5.165364468407794, 'study'), (-5.092161064384499, 'studies'), (-4.636972520923904, 'patients')]


In [7]:
feature_ranks1 = sorted(zip(nb_clf.feature_log_prob_[1], ngram_count_vectorizer.get_feature_names()))
direct_causal_features = feature_ranks1[-10:]
print(direct_causal_features)

[(-6.031122227714141, 'did'), (-6.031122227714141, 'effect'), (-6.031122227714141, 'improved'), (-5.986670465143307, 'cancer'), (-5.9441108507245115, 'effective'), (-5.789960170897253, 'study'), (-5.789960170897253, 'treatment'), (-5.754868851085982, 'weight'), (-5.625657119605977, 'risk'), (-5.027820118850356, 'patients')]


In [8]:
feature_ranks2 = sorted(zip(nb_clf.feature_log_prob_[2], ngram_count_vectorizer.get_feature_names()))
conditional_causal_features = feature_ranks2[-10:]
print(conditional_causal_features)

[(-6.5466419284444735, 'suggest'), (-6.451331748640149, 'disease'), (-6.451331748640149, 'reduce'), (-6.451331748640149, 'results'), (-6.3643203716505194, 'cancer'), (-6.3643203716505194, 'role'), (-6.284277663976983, 'increase'), (-6.2101696918232605, 'improve'), (-5.55339015543419, 'patients'), (-5.383491118638792, 'risk')]


In [9]:
feature_ranks3 = sorted(zip(nb_clf.feature_log_prob_[3], ngram_count_vectorizer.get_feature_names()))
correlational_features = feature_ranks3[-10:]
print(correlational_features)

[(-5.5838961648388485, 'cancer'), (-5.5838961648388485, 'diabetes'), (-5.54222346843828, 'women'), (-5.502218133824581, 'higher'), (-5.373600756002488, 'levels'), (-5.373600756002488, 'study'), (-5.290909040157374, 'increased'), (-4.63002272181565, 'patients'), (-4.436831492784792, 'risk'), (-4.263559771510756, 'associated')]


Confusion Matrix

In [10]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred = nb_clf.fit(X_train_vec1, y_train).predict(X_test_vec1)
cm=confusion_matrix(y_test, y_pred, labels=[0,1,2,3])
print(cm)

[[451  32   2  87]
 [ 53  71   0  65]
 [ 31  21   3  21]
 [ 37  10   0 341]]


Classification report

In [11]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
target_names = ['0','1','2','3']
print(classification_report(y_test, y_pred, target_names=target_names))

[0.78846154 0.52985075 0.6        0.66342412]
[0.78846154 0.37566138 0.03947368 0.87886598]
              precision    recall  f1-score   support

           0       0.79      0.79      0.79       572
           1       0.53      0.38      0.44       189
           2       0.60      0.04      0.07        76
           3       0.66      0.88      0.76       388

    accuracy                           0.71      1225
   macro avg       0.65      0.52      0.51      1225
weighted avg       0.70      0.71      0.68      1225



Errors

In [12]:
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]==3 and y_pred[i]==1):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

The worst QOL domain related to environmental stimuli and the best QOL domain to limitations of the activities.
Incident PD patients treated with low GDP solution have less severe systemic inflammation but trends of less ultrafiltration, and more fluid accumulation.
Weight loss, satiety and adverse symptoms demonstrated only slight changes between 3 and 8\\xa0years post-operatively.
Epidemiological trends are more or less common to those of developing countries with a predominance of invasive ductal carcinoma.
Reductions in LDL-C were greater among women randomized to both calcium+vitamin D and hormone therapy than for those randomized to either intervention alone or to placebo.
Compared to a cognitive-behavioral program, after the intervention, adolescents who received mindfulness showed greater reductions in depressive symptoms and better insulin resistance.
Team leader and team member are under moderate workloads during a pediatric sepsis scenario with team leader under high workloa

In [13]:
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]==0 and y_pred[i]==2):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

However, most had playground equipment, courts, and outdoor play areas.
The potential disease-modifying effects of simvastatin on CSF phospho-tau should be further investigated in persons with hypercholesterolemia.
errors: 2


Cross validating using 5 runs for different vectorizers and picking the vectroizer with the highest average score

In [14]:
##SVC TFIDF
svc_tfidf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=True,binary=False, stop_words='english')),('svc',LinearSVC())])
scores = cross_val_score(svc_tfidf_pipe,X,y,cv=5)
print(sum(scores)/len(scores))

##SVC TF
svc_tf_pipe = Pipeline([('nb_tf',TfidfVectorizer(encoding='latin-1',use_idf=False,binary=False, stop_words='english')),('svc',LinearSVC())])
scores = cross_val_score(svc_tf_pipe,X,y,cv=5)
print(sum(scores)/len(scores))

##SVC with Bool
svc_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1',ngram_range =(1,3), binary=True, stop_words='english')),('svc', LinearSVC())])
scores = cross_val_score(svc_clf_pipe, X, y, cv=5)
avg=sum(scores)/len(scores)
print(avg)

0.6948767979187325
0.7066393713548498
0.7082738913945132


Splitting the data into train and test for class prediction

In [15]:
ngram_count_vectorizer = CountVectorizer(encoding='latin-1',ngram_range =(1,3), binary=True, stop_words='english')
X_train_vec2 = ngram_count_vectorizer.fit_transform(X_train)
X_test_vec2 = ngram_count_vectorizer.transform(X_test)
print(X_test_vec1.shape)

(1225, 5318)


In [16]:
svm_clf = LinearSVC(C=1)

# use the training data to train the model
svm_clf.fit(X_train_vec2,y_train)

LinearSVC(C=1)

Top 10 features for all the classes:

In [17]:
feature_ranks01 = sorted(zip(svm_clf.coef_[0], ngram_count_vectorizer.get_feature_names()))
no_relationship_features = feature_ranks01[-10:]
print(no_relationship_features)

[(0.27118103767448165, 'implications'), (0.281271136429408, 'appropriate'), (0.28239645665549507, 'warranted'), (0.320370516940447, 'required'), (0.32501153522390197, 'need'), (0.3345731182977772, 'necessary'), (0.35255033282683756, 'research'), (0.41359608663428654, 'safety'), (0.45821902063676995, 'needed'), (0.47300240932011867, 'studies')]


In [18]:
feature_ranks11 = sorted(zip(svm_clf.coef_[1], ngram_count_vectorizer.get_feature_names()))
direct_causal_features = feature_ranks11[-10:]
print(direct_causal_features)

[(0.2784801419768663, 'contributed'), (0.2830401830237102, 'does'), (0.29554792850273665, 'benefits'), (0.3158617837774444, 'did'), (0.3197973841858111, 'improves'), (0.3280390480421855, 'effect'), (0.38021941884597116, 'effects'), (0.42590354631187605, 'resulted'), (0.49800960155085194, 'effective'), (0.49841973633059633, 'improved')]


In [19]:
feature_ranks21 = sorted(zip(svm_clf.coef_[2], ngram_count_vectorizer.get_feature_names()))
coditional_causal_features = feature_ranks21[-10:]
print(coditional_causal_features)

[(0.23330942239722516, 'play role'), (0.234466007540729, 'appears'), (0.2676226172478021, 'useful'), (0.27806354736097505, 'protective'), (0.2801402147969131, 'play'), (0.32616222202520345, 'mediated bmi'), (0.32616222202520345, 'relations'), (0.32616222202520345, 'relations mediated'), (0.32616222202520345, 'relations mediated bmi'), (0.33072537426796417, 'improve')]


In [20]:
feature_ranks31 = sorted(zip(svm_clf.coef_[3], ngram_count_vectorizer.get_feature_names()))
correlational_features = feature_ranks31[-10:]
print(correlational_features)

[(0.3214848901794152, 'variable'), (0.32382960595114096, 'associations'), (0.3429875861288185, 'increased'), (0.36042548512064326, 'higher'), (0.41992066195961864, 'predict'), (0.4976041950824333, 'predictor'), (0.5058405320508966, 'correlated'), (0.5415959560489805, 'association'), (0.5592446227604638, 'related'), (0.9350804907517982, 'associated')]


Confusion Matrix:

In [23]:
# print confusion matrix (row: ground truth; col: prediction)

from sklearn.metrics import confusion_matrix
y_pred = svm_clf.fit(X_train_vec2, y_train).predict(X_test_vec2)
cm=confusion_matrix(y_test, y_pred, labels=[0,1,2,3])
print(cm)

[[515  13   2  42]
 [ 76  73   5  35]
 [ 46   9   4  17]
 [ 70   6   1 311]]


Classification Report

In [22]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
target_names = ['0','1','2','3']
print(classification_report(y_test, y_pred, target_names=target_names))

[0.78846154 0.52985075 0.6        0.66342412]
[0.78846154 0.37566138 0.03947368 0.87886598]
              precision    recall  f1-score   support

           0       0.79      0.79      0.79       572
           1       0.53      0.38      0.44       189
           2       0.60      0.04      0.07        76
           3       0.66      0.88      0.76       388

    accuracy                           0.71      1225
   macro avg       0.65      0.52      0.51      1225
weighted avg       0.70      0.71      0.68      1225



Errors:

In [24]:
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]==3 and y_pred[i]==1):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

"Condom use  did increase over time in both groups."
Patients with body mass index >40\\xa0kg/m2 have greater than twice the risk for complications with odds ratios increasing with increasing body mass index class.
Dietary cholesterol intake did not have an association with LDL-C level or with risk for coronary artery calcification in apparently healthy Korean adults.
Weight loss, satiety and adverse symptoms demonstrated only slight changes between 3 and 8\\xa0years post-operatively.
Overall hypoglycemia rates were similar except for an increase in 0-2-h postmeal hypoglycemia with faster aspart.
During the same time period, the weight gain of very preterm infants improved, significantly.
errors: 6
