In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 31 12:55:35 2019

@author: iab
"""

# import necessary packages
import torch
import transformers as ppb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import warnings
import pickle
import boto3
import io

# ignore warnings
warnings.filterwarnings('ignore')

class keymessage_affinityscore_model:
  """Score sentences against a fixed set of key-message categories.

  Sentences are embedded with a pretrained BERT model (the last-layer hidden
  state at the first token position), a grid-searched logistic regression is
  fitted on those embeddings, and per-sentence class probabilities are
  reported next to the sentence text and its author.
  """

  # Label columns read from the input frame, in the order used throughout.
  KEY_MESSAGE_COLUMNS = [
      'Brand Information', 'Comparative Info', 'Cost', 'Disease Management',
      'Dosing & Administration', 'Efficacy', 'Indication', 'Pathology',
      'Patient Profiles', 'Pharmacology', 'Physician Enablement', 'Risk Factors',
      'Safety & Tolerability', 'Study Design', 'Unmet Needs', 'Anatomy',
      'Physiology', 'Subjective Evidence', 'Novel Therapy', 'Counselling',
  ]

  # Column names of the probability frame returned by predict().
  PROBABILITY_COLUMNS = [name.replace(' ', '_') for name in KEY_MESSAGE_COLUMNS]

  ##########################################################################################################################################
  # Learn the representation for sentences.
  def pubsent_representation_labels(self, dfTextLabels, max_tokens=145):
    """Embed the 'Text' column with BERT and return the matching label rows.

    Args:
      dfTextLabels: DataFrame with a 'Text' column and every column listed in
        KEY_MESSAGE_COLUMNS.
      max_tokens: sentences whose token count (special tokens included) is not
        strictly below this value are left out of both outputs.

    Returns:
      (features, dfLabels): `features` is a NumPy array with one row per kept
      sentence; `dfLabels` holds the label columns of exactly those sentences
      and keeps their original index.

    Raises:
      Exception: wraps any failure, including the case where no sentence is
        short enough to embed.
    """
    try:

      # Load pretrained BERT model/tokenizer
      # (for DistilBERT use ppb.DistilBertModel / ppb.DistilBertTokenizer with 'distilbert-base-uncased')
      model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
      tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
      bert = model_class.from_pretrained(pretrained_weights)

      # Tokenize
      tokenized = dfTextLabels['Text'].apply(
          lambda text: tokenizer.encode(text, add_special_tokens=True))

      # One mask decides which rows survive, so features and labels can never
      # disagree about a sentence (previously a sentence of exactly
      # `max_tokens` tokens lost its features but kept its labels).
      keep = tokenized.map(len) < max_tokens
      tokenized = tokenized[keep]
      if tokenized.empty:
        raise ValueError('no sentence shorter than %d tokens to embed' % max_tokens)

      # Right-pad every token list with 0 up to the longest kept sentence
      max_len = int(tokenized.map(len).max())
      padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

      # Masking: padding positions (id 0) are ignored by the model
      attention_mask = np.where(padded != 0, 1, 0)

      input_ids = torch.tensor(padded)
      attention_mask = torch.tensor(attention_mask)

      with torch.no_grad():
        last_hidden_states = bert(input_ids, attention_mask=attention_mask)

      # First output of the model, sliced at token position 0 for every sentence
      features = last_hidden_states[0][:, 0, :].numpy()

      # Copy so that nothing downstream writes through to the caller's frame
      dfLabels = dfTextLabels.loc[keep, self.KEY_MESSAGE_COLUMNS].copy()

      return features, dfLabels

    except Exception as e:
      raise Exception('exception in pubsent_representation_labels() function: ' + str(e)) from e

  ################################################################################################################
  # fit the model to sentence representations
  def fit(self, dfHCPTextLabels):
    """Grid-search a logistic regression on BERT sentence features.

    The target of each sentence is the label column holding its highest score
    (the first such column on ties). The fitted search object is returned and
    also stored on the instance as `model_` so that predict() can use it.

    Args:
      dfHCPTextLabels: frame accepted by pubsent_representation_labels().

    Returns:
      The fitted GridSearchCV object.

    Raises:
      Exception: wraps any failure during embedding or fitting.
    """
    try:

      features, dfLabels = self.pubsent_representation_labels(dfHCPTextLabels)

      # Same random split as before; only the training part is used for
      # fitting, the held-out part is left for a later evaluation.
      train_features, _test_features, train_labels, _test_labels = train_test_split(features, dfLabels)

      # Name of the highest-scoring label column per sentence, taken from the
      # frame's own columns so names and positions cannot drift apart.
      train_targets = train_labels.idxmax(axis=1).to_numpy()

      ############
      # Create hyperparameter space

      # create regularization penalty space
      # NOTE(review): recent scikit-learn defaults to a solver without 'l1'
      # support, in which case those candidates fail and are scored NaN -
      # confirm the intended solver.
      penalty = ['l1', 'l2']

      # create regularization hyperparameter space
      C = np.logspace(0, 4, 10)

      hyperparameters = dict(C=C, penalty=penalty)

      lr_clf = LogisticRegression()

      # Create grid search using 5-fold cross validation
      clf = GridSearchCV(lr_clf, hyperparameters, cv=5, verbose=0)

      # Fit grid search
      best_model = clf.fit(train_features, train_targets)

      # Remember the classifier so predict() does not depend on a global
      self.model_ = best_model

      return best_model

    except Exception as e:
      raise Exception('exception in fit() function: ' + str(e)) from e

  ####################################################################################################
  # prepare the sentences_predicted_prob file from df_Prob as a df
  def createAuthorAbstractProbabilties(self, dfHCPTextLabels, df_Prob):
    """Put the 'Text' and 'author' columns next to the probability columns.

    Rows are matched on index label. A sentence without a row in `df_Prob`
    gets NaN in every probability column.

    Args:
      dfHCPTextLabels: frame with 'Text' and 'author' columns.
      df_Prob: probability frame, e.g. the result of predict().

    Returns:
      DataFrame with 'Text', 'author' and the columns of `df_Prob`.

    Raises:
      Exception: wraps any failure (for example a missing column).
    """
    try:

      df_text_author = dfHCPTextLabels[['Text', 'author']]
      df_text_author_prob = pd.concat([df_text_author, df_Prob], axis=1)

      return df_text_author_prob

    except Exception as e:
      raise Exception('exception in createAuthorAbstractProbabilties() function: ' + str(e)) from e

  ####################################################################################################
  def _fitted_classifier(self):
    """Return the classifier to predict with, or raise if none is available."""
    clf = getattr(self, 'model_', None)
    if clf is None:
      # Backward compatibility: earlier versions read a module-level `model`.
      clf = globals().get('model')
    if clf is None or not hasattr(clf, 'predict_proba'):
      raise ValueError('no fitted classifier available; call fit() first')
    return clf

  ####################################################################################################
  def predict(self, features, dfLabels):
    """Return per-sentence class probabilities as a DataFrame.

    Args:
      features: 2-D array of sentence features, one row per sentence.
      dfLabels: frame whose index identifies the sentences behind `features`.
        When its length matches, that index is reused for the result so it
        lines up with the original sentences; otherwise (or when None) the
        result keeps a default 0..n-1 index.

    Returns:
      DataFrame with the columns in PROBABILITY_COLUMNS. A category the
      classifier never saw during training gets probability 0.0.

    Raises:
      Exception: wraps any failure, including a missing fitted classifier.
    """
    try:

      clf = self._fitted_classifier()

      # One vectorised call instead of one call per sentence
      probabilities = clf.predict_proba(features)

      classes = getattr(clf, 'classes_', None)
      if classes is None:
        # No class information available: fall back to positional naming
        dfSentencesProb = pd.DataFrame(probabilities)
        dfSentencesProb.columns = self.PROBABILITY_COLUMNS
      else:
        # predict_proba columns follow clf.classes_ (sorted labels), not the
        # order of PROBABILITY_COLUMNS, so name them from the classifier and
        # then rearrange.
        class_columns = [str(label).replace(' ', '_') for label in classes]
        dfSentencesProb = pd.DataFrame(probabilities, columns=class_columns)
        dfSentencesProb = dfSentencesProb.reindex(columns=self.PROBABILITY_COLUMNS, fill_value=0.0)

      if dfLabels is not None and len(dfLabels) == len(dfSentencesProb):
        dfSentencesProb.index = dfLabels.index

      return dfSentencesProb

    except Exception as e:
      raise Exception('exception in predict() function: ' + str(e)) from e

  #####################################################################################################
  def predict_sentence_level(self, dfHCPTextLabels):
    """Embed the sentences, predict probabilities and attach text and author.

    Args:
      dfHCPTextLabels: frame with 'Text', 'author' and the label columns
        required by pubsent_representation_labels().

    Returns:
      DataFrame produced by createAuthorAbstractProbabilties().

    Raises:
      Exception: wraps any failure in the steps above.
    """
    try:

      features, dfLabels = self.pubsent_representation_labels(dfHCPTextLabels)
      dfProb = self.predict(features, dfLabels)

      # attach the sentence text and author to every probability row
      dfSentencesProb = self.createAuthorAbstractProbabilties(dfHCPTextLabels, dfProb)

      return dfSentencesProb

    except Exception as e:
      raise Exception('exception in predict_sentence_level() function: ' + str(e)) from e

  

In [2]:
!pip install transformers



In [0]:
import pandas as pd
# Read the sentence/score workbook. The path is relative, so the file has to be
# in the notebook's working directory.
dfTextLabels = pd.read_excel('sentences_scores.xlsx')

In [0]:
# Remove the 'Unnamed: 21' .. 'Unnamed: 26' columns. The result is rebound
# instead of using inplace=True, and missing columns are ignored, so this cell
# can be re-run without raising KeyError.
dfTextLabels = dfTextLabels.drop(
    columns=['Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26'],
    errors='ignore',
)

In [0]:
# Create an instance of the key-message scoring class.
cls = keymessage_affinityscore_model()

In [6]:
# Preview the first rows of the loaded sentence/score table.
dfTextLabels.head()

Unnamed: 0,Text,Brand Information,Comparative Info,Cost,Disease Management,Dosing & Administration,Efficacy,Indication,Pathology,Patient Profiles,Pharmacology,Physician Enablement,Risk Factors,Safety & Tolerability,Study Design,Unmet Needs,Anatomy,Physiology,Subjective Evidence,Novel Therapy,Counselling
0,OBJECTIVE: To determine if clinically used bot...,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.1,0.0,0.25,0.0,0.0,0.0,0.0
1,DESIGN: Prospective study.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"SETTING: Academic medical center in St Louis, Mo.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SUBJECTS: Twenty-nine adult patients treated w...,0.0,0.0,0.0,0.3,0.0,0.0,0.35,0.0,0.2,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0
4,INTERVENTION: The eyebrow position at 13 diffe...,0.0,0.0,0.0,0.1,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.2,0.0,0.35,0.0,0.0


In [0]:
# Compute the BERT sentence features and the matching label rows.
# NOTE(review): despite its name, `model` holds the (features, labels) pair
# returned by the call, not a fitted classifier.
model = cls.pubsent_representation_labels(dfTextLabels)

In [11]:
# Length of the first sentence's feature vector.
len(model[0][0])

768