In [40]:
#Import Libraries

import numpy
import sys
from gensim.models import KeyedVectors
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn import metrics
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

#Please check the path of the folder 'Dataset' before running the code.

In [2]:
folder_path="./Dataset"

In [0]:
#Retrieving files from drive

umls_semantic_types_files = folder_path + "/UMLS Semantic File Types.txt"
w2v_model_file = folder_path + "Health_2.5mreviews.s200.w10.n5.v15.cbow.bin"
W2V_FEATURES_NUM = 200
mpqa_lexicon = folder_path + "/subjclueslen1-HLTEMNLP05.txt"
senti_word_net = folder_path + "/SentiWordNet_3.0.0.txt"
negative_words = folder_path + "/negative_words.txt"
sentiment_score_dict = folder_path + "/sentiment_score_dic_with_pmi.txt"


In [0]:
#Extract Window Word Features

from sklearn.base import BaseEstimator

class ExtractWindowWordsFeature(BaseEstimator):
  def __init__(self):
    pass

  def get_feature_names(self):
    return 'extract_window_words'

  def extract_window_words(self, entities):
    window_words = []

    for entity in entities:
      window_words.append(entity['text'])

    return window_words

  def fit(self, entities, y=None):
    return self

  def transform(self, entities):
    print("extract word transform")
    return self.extract_window_words(entities)

  def fit_transform(self, entities, y= None):
    print ("extract word fit_transform")
    return self.extract_window_words(entities)

In [0]:
#PoS Tag Features

import numpy as np
from scipy import sparse
import nltk
from sklearn import preprocessing
from sklearn.base import BaseEstimator
from pandas import DataFrame

class PosTagFeatures(BaseEstimator):

  def __init__(self):
    pass

  def get_feature_names(self):
    return 'pos_tag'
  
  def fit(self, documents, y=None):
    return self

  def find_pos_tag_index(self, pos_tag, pos_dict):
    index = 0
    
    for pos in pos_dict:
      if pos.find(pos_tag[1]) != -1:
        return index

      index += 1

    return -1
  
  def transform(self, window_words, y=None):
    features = []
    pos_dict = ["NN", "JJ", "VB", "RB"]

    for window_word in window_words:
      feature = [0]*len(pos_dict)
      tokens = nltk.tokenize.word_tokenize(window_word)
      pos_tags = nltk.pos_tag(tokens)

      for pos_tag in pos_tags:
        index = self.find_pos_tag_index(pos_tag, pos_dict)

        if index != -1:
          feature[index] += 1
        
      features.append(feature)

    return features

In [0]:
#Sentiment Features

import nltk
import numpy
#from dill.dill import FileNotFoundError
from pandas import DataFrame
from scipy import sparse
from sklearn import preprocessing

from sklearn.base import BaseEstimator

class SentimentFeature(BaseEstimator):

  def __init__(self):
    pass

  def get_feature_names(self):
    return 'sentiment'

  def fit(self, documents, y=None):
    return self

  def load_sentiment_dict_subj(self):
    sentiment_dict = {}
    f = open(mpqa_lexicon)
    
    for line in f:
      terms = line.split(" ")
      sentiment_dict[terms[2][terms[2].index("=") + 1:]] = terms[len(terms) - 1][terms[len(terms) - 1].index("=") + 1:].strip()
        
    return sentiment_dict

  def load_sentiment_dict_word_net(self):
    result = {}
    f = open(senti_word_net)
    
    for line in f:
      if not (line.startswith("#") or line.startswith(";")):
        line_parts = line.split("\t")
        pos_score = line_parts[2]
        neg_score = line_parts[3]
        syn_terms_split = line_parts[4].split(" ")
        for syn_term_split in syn_terms_split:
          result[syn_term_split.split("#")[0]] = {'pos_score' : float(pos_score), 'neg_score' : abs(float(neg_score))}
                        
    return result

  def create_sentiment_feature_word_net(self, sentiment_dict, window_word, negative_words, punctuation):
    feature = [0] * 8
    zero_pmi_score = 0
    total_score = 0
    max_score = 0
    last_score = 0
    zero_pmi_score_neg = 0
    total_score_neg = 0
    max_score_neg = 0
    last_score_neg = 0
    is_context_negated = False
    
    for word in window_word:
      if word in negative_words:
        is_context_negated = True
      elif word in punctuation:
        is_context_negated = False
            
      if word in sentiment_dict:
        sentiment = sentiment_dict[word]
        pos = float(sentiment['pos_score'])
        neg = float(sentiment['neg_score'])
        score = pos - neg
        
        if is_context_negated:
          if score != 0:
            zero_pmi_score_neg += 1
            
          total_score_neg += score
            
          if score != 0 and (score > max_score_neg or (abs(score) > max_score_neg and max_score_neg == 0)):
            max_score_neg = score
              
          last_score_neg = score
          
        else:
          if score != 0:
            zero_pmi_score += 1
            
            if score > max_score or ( abs(score) > max_score and max_score == 0):
              max_score = score
              
            last_score = score
            
          total_score += score
        
    feature[0] = zero_pmi_score
    feature[1] = total_score
    feature[2] = max_score
    feature[3] = last_score
    feature[4] = zero_pmi_score_neg
    feature[5] = total_score_neg
    feature[6] = max_score_neg
    feature[7] = last_score_neg
    
    return feature


  def load_negated_words(self):
    f = open(negative_words)
    negative_words_dict = []

    for line in f:
      negative_words_dict.append(line.strip())
        
    return negative_words_dict

  def create_sentiment_feature_subj(self, sentiment_dict, window_word, negative_words, punctuation):
    feature = [0] * 4
    positive_affirmative = 0
    negative_affirmative = 0
    positive_negated = 0
    negative_negated = 0
    is_negative_context = False
        
    for word in window_word:
      if word in negative_words:
        is_negative_context = True
      elif word in punctuation:
        is_negative_context = False
            
      if word in sentiment_dict:
        sentiment = sentiment_dict[word].strip()
                
        if sentiment == "negative":
          if is_negative_context:
            negative_negated += 1
          else:
            negative_affirmative += 1
                
        elif sentiment == "positive":
          if is_negative_context:
            positive_negated += 1
          else:
            positive_affirmative += 1
        
    feature[0] = positive_affirmative
    feature[1] = negative_affirmative
    feature[2] = positive_negated
    feature[3] = negative_negated
    
    return feature

    
  def transform(self, window_words, y=None):
    features = []
    sentiment_dict_subj = self.load_sentiment_dict_subj()
    sentiment_dict_word_net = self.load_sentiment_dict_word_net()
    negative_words = self.load_negated_words()
    punctuation = [',', '.', '!', '?']
    
    for window_word in window_words:
      window_word_parts = nltk.word_tokenize(window_word)
      feature_sent_word_net = self.create_sentiment_feature_word_net(sentiment_dict_word_net, window_word_parts, negative_words, punctuation)
      feature_sent_subj = self.create_sentiment_feature_subj(sentiment_dict_subj, window_word_parts, negative_words, punctuation)
      feature = []
      feature.extend(feature_sent_word_net)
      feature.extend(feature_sent_subj)
      features.append(feature)
    
    return features

In [0]:
#Sentiment Score with PMI

from sklearn.base import BaseEstimator

class SentimentScore(BaseEstimator):

  def __init__(self):
    pass

  def get_feature_names(self):
    return "sentiment_score"

  def fit(self, documents, y=None):
    return self

  def transform(self, window_words, y=None):
    f = open(sentiment_score_dict)
    word_dict = {}

    for line in f:
      line_parts = line.split('\t')
      word_dict[line_parts[0]] = float(line_parts[3])

    features = []
    i=0
    max_len = 0

    for window_word in window_words:
      window_word_parts = window_word.split(" ")
      
      if len(window_word_parts) > max_len:
        max_len = len(window_word_parts)

    print("Sentiment score = " + str(max_len))

    for window_word in window_words:
      window_word_parts = window_word.split(" ")
      feature = [0]
      max=0
      min=0

      for word in window_word_parts:
        if word in word_dict:
          pmi = word_dict[word]
          feature[0] += pmi
      
      features.append(feature)

    return features

In [0]:
#Entities Feature

from sklearn.base import BaseEstimator

class ExtractEntitiesFeature(BaseEstimator):

  def __init__(self):
    pass

  def get_feature_name(self):
    return "extract_entities"

  def extract_entities(self, entities):
    entity_text = []

    for entity in entities:
      entity_text.append(entity['entity'])

    return entity_text

  def fit(self, entities, y=None):
    return self

  def transform(self, entities):
    return self.extract_entities(entities)

  def fit_transform(self, entities, y= None):
    return self.extract_entities(entities)

In [0]:
# Word2Vec Features

import nltk
from sklearn.base import BaseEstimator
import numpy
import re
import codecs

class W2VFeatures(BaseEstimator):

  def __init__(self, model = None, num_features = None):
    self.model = model
    self.num_features = num_features
    pass

  def get_feature_name(self):
    return "w2v"

  def fit(self, entities, y=None):
    return self

  def makeFeatureVec(self, words, model, num_features):
    featureVec = numpy.zeros((num_features,), dtype = "float32")
    nwords = 0
    index2word_set = set(model.index2word)

    for word in words:
      if word in index2word_set:
        nwords +=1
        featureVec = numpy.add(featureVec, model[word])

    featureVec = numpy.divide(featureVec, nwords)

    return featureVec

  def getAvgFeatureVecs(self, reviews, model, num_features):
    counter = 0.
    wout = codecs.open("PubMed-and-PMC-w2v_mesh_words.txt", "w", encoding = "utf-8")
    reviewFeatureVecs = numpy.zeros((len(reviews), num_features), dtype = "float32")

    for review in reviews:
      clean_train_reviews =[]
      w = re.sub("\+", " ", review)
      clean_train_reviews = [w.split()]

      for word in clean_train_reviews:
        if counter%1000 == 0:
          print("Review %d of %d" %(counter, len(reviews)))

        vec1 = self.makeFeatureVec(word, model, self.num_features)

        reviewFeatureVecs[counter] = vec1
      
      vec1 = [str(l) for l in vec1.tolist()]
      print(vec1)

      wout.write(review + "\t" + "\t".join(vec1) + "\n")
      counter+=1

    return reviewFeatureVecs

  def transform(self, entities):
    return self.create_w2v_feature(entities)

  def create_w2v_feature(self, entities):
    features = []

    for entity in entities:
      text = entity
      words = nltk.word_tokenize(text)
      feature = self.makeFeatureVec(words, self.model, self.num_features)
      feature_clear = []

      for x in feature:
        if np.math.isnan(x):
          feature_clear.append(0.0)
        else:
          feature_clear.append(x)

      features.append(feature_clear)

    return features

  def convert_to_list(self, features):
    result = []

    for feature in features:
      result.append(list(list(feature.toarray())[0]))
    
    return result

  def fit_transform(self, entities, y=None):
    return self.create_w2v_feature(entities)

In [0]:
#UMLS Semantic Type Features

import nltk
from pandas import DataFrame
from sklearn.base import BaseEstimator

class UMLSemanticTypeFeature(BaseEstimator):

  def __init__(self, semantic_types):
    self.semantic_types = semantic_types
    pass

  def get_feature_names(self):
    return "umls_semantic_types"

  def create_semantic_type_features(self,entities):
    features = []
    semantic_types_set = set()
    keys = self.semantic_types.keys()

    for key in keys:
      semantic_types_set.add(self.semantic_types[key])
    
    cluster_number_list = list(semantic_types_set)

    for entity in entities:
      feature = [0]*len(self.semantic_types)

      if entity in self.semantic_types:
        semantic_type = self.semantic_types[entity]
        feature[cluster_number_list.index(semantic_type)] = 1

      features.append(feature)

    return features

  def fit(self, documents, y=None):
    return self

  def transform(self, entities):
    return self.create_semantic_type_features(entities)

  def fit_transform(self, entities, y=None):
    return self.create_semantic_type_features(entities)

In [0]:
#Load UMLS Semantic Types

def load_umls_semantic_types():
  f = open(umls_semantic_types_files)
  dict_cluss = {}
  
  for line in f:
    terms = line.split("\t")
    dict_cluss[terms[0]] = terms[2].strip()

  return dict_cluss

In [0]:
#Load Dictionaries

def load_dict(file_name):
  f = open(file_name)
  adv_dict = []

  for line in f:
    adv_dict.append(line.strip())

  return adv_dict

In [0]:
#Load Data

def load_data(f):
  reviews = []

  for line in f:
    reviews.append(eval(line))

  return reviews;


In [0]:
#Extract Labels

def extract_labels(reviews):
  labels=[]

  for review in reviews:
    if(review["label"] == "Adverse"):
      labels.append("Adverse")
    else:
      labels.append("Unknown")
    #labels.append(review["label"])

  return labels

In [0]:
#Extract Entities

def extract_entities(reviews):
  entities = []

  for review in reviews:
    entities.append(review['entity'])

  return entities

In [0]:
#Feature Extraction

def extract_features_for_twitter_corpus(entities, is_train):
  window_words = ExtractWindowWordsFeature().transform(entities)
  pos_tag_features = numpy.array(PosTagFeatures().transform(window_words))
  sentiment_feature = numpy.array(SentimentFeature().transform(window_words))
  sentiment_score = numpy.array(SentimentScore().transform(window_words))
  entities_text = ExtractEntitiesFeature().transform(entities)
  w2v_feature = numpy.array(W2VFeatures(model=w2v_model, num_features=W2V_FEATURES_NUM).transform(entities_text))
  umls_semantic_type_feature = numpy.array(UMLSemanticTypeFeature(umls_semantic_types).transform(entities_text))

  if is_train:
    X = vectorizer.fit_transform(window_words)
  else:
    X = vectorizer.transform(window_words)

  X = X.toarray()
  

  features = numpy.concatenate((X, pos_tag_features), axis = 1)
  features = numpy.concatenate((features, sentiment_feature), axis = 1)
  features = numpy.concatenate((features, sentiment_score), axis = 1)
  features = numpy.concatenate((features, umls_semantic_type_feature), axis = 1)
  features = numpy.concatenate((features, w2v_feature), axis = 1)
   
  return features

In [0]:
#SVM Classifier for Twitter Corpus

def svm_classifier_for_twitter_corpus():
  svc = LinearSVC(penalty='l2')
  f_measure = []
  entities = []
  predicted = []
  right = []
  train_data = Bunch()
  test_data = Bunch()

  for i in range(1, 6):
    f_train = open(folder_path + '/Twitter Corpus/' + str(i) + '/train.txt')
    f_test = open(folder_path + '/Twitter Corpus/' + str(i) + '/test.txt')
    train_data.reviews = load_data(f_train)
    test_data.reviews = load_data(f_test)
    train_data.labels = extract_labels(train_data.reviews)
    test_data.labels = extract_labels(test_data.reviews)
    train_data.entities = extract_entities(train_data.reviews)

    features_train = extract_features_for_twitter_corpus(train_data.reviews, True)
    
    svc.fit(numpy.array(features_train), numpy.array(train_data.labels))

    features_test = extract_features_for_twitter_corpus(test_data.reviews, False)

    predicted_block = svc.predict(numpy.array(features_test))
    predicted.extend(predicted_block)
    right.extend(test_data.labels)

    print(metrics.f1_score(test_data.labels, predicted_block, average='macro'))
    f_measure.append(metrics.f1_score(test_data.labels, predicted_block, average='macro'))

    entities.extend(test_data.reviews)

  
  print(str(f_measure))
  print("\033[1m" + "SVM Classification Report")
  print(classification_report(right, predicted, digits=3))
  print (metrics.precision_score(right, predicted, average='macro'))
  print (metrics.recall_score(right, predicted, average='macro'))
  print (metrics.f1_score(right, predicted, average='macro'))

In [0]:
#LR Classifier for Twitter Corpus

def lr_classifier_for_twitter_corpus():
  lr = LogisticRegression(penalty='l2')
  f_measure = []
  entities = []
  predicted = []
  right = []
  train_data = Bunch()
  test_data = Bunch()

  for i in range(1, 6):
    f_train = open(folder_path + '/Twitter Corpus/' + str(i) + '/train.txt')
    f_test = open(folder_path + '/Twitter Corpus/' + str(i) + '/test.txt')
    train_data.reviews = load_data(f_train)
    test_data.reviews = load_data(f_test)
    train_data.labels = extract_labels(train_data.reviews)
    test_data.labels = extract_labels(test_data.reviews)
    train_data.entities = extract_entities(train_data.reviews)

    features_train = extract_features_for_twitter_corpus(train_data.reviews, True)
    
    lr.fit(numpy.array(features_train), numpy.array(train_data.labels))

    features_test = extract_features_for_twitter_corpus(test_data.reviews, False)

    predicted_block = lr.predict(numpy.array(features_test))
    predicted.extend(predicted_block)
    right.extend(test_data.labels)

    print(metrics.f1_score(test_data.labels, predicted_block, average='macro'))
    f_measure.append(metrics.f1_score(test_data.labels, predicted_block, average='macro'))

    entities.extend(test_data.reviews)

  
  print(str(f_measure))
  print("\033[1m" + "Logistic regression Classification Report")
  print(classification_report(right, predicted, digits=3))
  print (metrics.precision_score(right, predicted, average='macro'))
  print (metrics.recall_score(right, predicted, average='macro'))
  print (metrics.f1_score(right, predicted, average='macro'))

In [58]:
# Main Function

if __name__ == '__main__':
  vectorizer = CountVectorizer(ngram_range = (1,2))
  transformer = TfidfTransformer()
  umls_semantic_types = load_umls_semantic_types()
  w2v_model = KeyedVectors.load_word2vec_format(w2v_model_file, binary=True)
  
  
  svm_classifier_for_twitter_corpus()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


extract word transform
Sentiment score = 14




extract word transform
Sentiment score = 14
0.841248303934871
extract word transform




Sentiment score = 15




extract word transform
Sentiment score = 13
0.6295793758480325
extract word transform




Sentiment score = 13




extract word transform
Sentiment score = 11
0.5821428571428571
extract word transform




Sentiment score = 15




extract word transform
Sentiment score = 12
0.7692307692307693
extract word transform




Sentiment score = 20




extract word transform
Sentiment score = 12
0.5989974937343359
[0.841248303934871, 0.6295793758480325, 0.5821428571428571, 0.7692307692307693, 0.5989974937343359]
[1mSVM Classification Report
              precision    recall  f1-score   support

     Adverse      0.904     0.910     0.907       156
     Unknown      0.548     0.531     0.540        32

    accuracy                          0.846       188
   macro avg      0.726     0.721     0.724       188
weighted avg      0.844     0.846     0.845       188

0.7264228477501541
0.7207532051282051
0.7235153912470206




In [59]:
lr_classifier_for_twitter_corpus()

extract word transform
Sentiment score = 14




extract word transform
Sentiment score = 14
0.8852941176470589
extract word transform




Sentiment score = 15




extract word transform
Sentiment score = 13
0.6859903381642511
extract word transform




Sentiment score = 13




extract word transform
Sentiment score = 11
0.607645875251509
extract word transform




Sentiment score = 15




extract word transform
Sentiment score = 12
0.7450980392156863
extract word transform




Sentiment score = 20




extract word transform
Sentiment score = 12
0.4666666666666667
[0.8852941176470589, 0.6859903381642511, 0.607645875251509, 0.7450980392156863, 0.4666666666666667]
[1mLogistic regression Classification Report
              precision    recall  f1-score   support

     Adverse      0.902     0.942     0.922       156
     Unknown      0.640     0.500     0.561        32

    accuracy                          0.867       188
   macro avg      0.771     0.721     0.742       188
weighted avg      0.857     0.867     0.860       188

0.770920245398773
0.7211538461538461
0.7415168014079085


