This notebook presents a framework for concatenating multiple features (a feature union) in a pipeline for logistic regression. The data are webpages from health organization websites that need to be classified as related to contraception or not. We feed the pipeline a cosine similarity score for every document, computed against relevant clinical vocabulary, in an attempt to train a model that better identifies the class of a document.

Cosine similarity is a metric that considers only the orientation of two vectors, not their magnitude; hence, document length does not influence its computation. Since our study deals with a specific vocabulary, our hypothesis was that cosine similarity is a fair measure of the presence of that specific information in a document.

In [81]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from nltk.stem import PorterStemmer
import re

# Load the training corpus. The preview below shows a 'Category' label column,
# a 'Text' column and three precomputed cosine-similarity columns.
df = pd.read_csv("stemmed_data_df.csv")

# Show the first rows as a sanity check.
df.head()


Unnamed: 0.1,Unnamed: 0,Category,Text,Cosine,Cosine_expandedVocab,Cosine_expandedVocab_stemmed
0,0,Other Class,b hyperthyroid caus symptom test diagnosi trea...,0.001644,0.003462,0.004109
1,1,Other Class,b health medic new and doctor s view A Z index...,0.0,0.0,0.0
2,2,Other Class,b leg pain symptom sign caus treatment ndoctor...,0.002355,0.002267,0.002075
3,3,Other Class,b what Is compart syndrom surgeri symptom trea...,0.001552,0.002179,0.002024
4,4,Other Class,b diseas condit A Z list P on medicinenet com ...,0.023041,0.021658,0.026727


In [82]:
# Train on the whole loaded frame (this binds a second name to df, not a copy).
training_data = df
# Class labels taken from the 'Category' column, as a NumPy array.
training_labels = training_data['Category'].values

In [83]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column and keeps the result 2-D.

    The input is indexed with a one-element list, so for the pandas DataFrame
    used in this notebook the output is a single-column DataFrame rather than
    a Series.
    """

    def __init__(self, key):
        # Name of the column to select.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column named by ``self.key``."""
        wanted_columns = [self.key]
        return X[wanted_columns]


class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column by plain key lookup.

    For the pandas DataFrame used in this notebook, ``X[key]`` yields the
    column as a 1-D Series, which is then passed to the next pipeline step.
    """

    def __init__(self, key):
        # Name of the column to select.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        column_name = self.key
        return X[column_name]


# Text branch: pick the 'Text' column, then turn it into TF-IDF features.
# ngram_range=(2, 2) keeps bigrams only; max_df / min_df are floats, i.e.
# document-frequency proportions (drop terms in >95% or <5% of documents).
text = Pipeline([
    ('selector', TextSelector(key='Text')),
    ('tfidf', TfidfVectorizer(ngram_range=(2, 2), use_idf=True, stop_words = 'english', max_df=0.95,  min_df=0.05))
])


# Fit the text branch once on the training data; the returned matrix is not kept.
text.fit_transform(training_data)

# Numeric branch: pass the precomputed cosine score through as a single column.
# NOTE(review): MinMaxScaler is imported above but no scaling step is applied
# to this column - confirm that feeding the raw score is intended.
cosine_expandedVocab_stemmed = Pipeline([
    ('selector', NumberSelector(key='Cosine_expandedVocab_stemmed'))

])

# Run the numeric branch once on the training data; the result is not kept.
cosine_expandedVocab_stemmed.fit_transform(training_data)


def get_stemmed_data(data):
    """Return the 'Text' column of *data* as a list of Porter-stemmed strings.

    For every row, each character outside ``a-zA-Z`` is replaced by a space,
    the result is split on whitespace, every token is stemmed with NLTK's
    ``PorterStemmer`` and the stems are joined back with single spaces.

    Args:
        data: Object supporting ``data['Text'].values`` (a pandas DataFrame
            in this notebook) whose 'Text' entries are strings.

    Returns:
        list[str]: One stemmed string per row, in row order.
    """
    # The stemmer and the pattern do not depend on the document, so build them
    # once instead of once per row.
    stemmer = PorterStemmer()
    non_letters = re.compile(r'[^a-zA-Z]')

    corpus = []
    for raw_text in data['Text'].values:
        tokens = non_letters.sub(' ', raw_text).split()
        corpus.append(' '.join(stemmer.stem(token) for token in tokens))
    return corpus

def store_goldStd_against_pred(goldStd_labels, pred_labels):
    """Pair every prediction with the gold-standard label at the same position.

    Args:
        goldStd_labels: Iterable of gold labels (list, NumPy array or pandas
            Series). It is read by position, never by index label.
        pred_labels: Iterable of predictions.

    Returns:
        list: One ``[[gold_label], prediction]`` entry per prediction; the
        gold label is wrapped in a one-element list.

    Raises:
        IndexError: If there are fewer gold labels than predictions.
    """
    # Materialise the labels so the lookup below is positional. Indexing a
    # pandas Series with an integer is label-based, which raises KeyError (or
    # picks the wrong row) when the Series index is not 0..n-1.
    gold = list(goldStd_labels)
    return [[[gold[i]], pred] for i, pred in enumerate(pred_labels)]


def compute_num_correct_preds(pred_goldStd_list):
    """Count gold labels that are contained in their paired prediction.

    Args:
        pred_goldStd_list: Iterable of entries whose first element is an
            iterable of gold labels and whose second element is the
            prediction they are checked against.

    Returns:
        int: Number of (entry, gold label) combinations for which
        ``label in prediction`` holds. The ``in`` operator is used as-is, so
        a string prediction is matched by substring and a list prediction by
        membership.
    """
    total = 0
    for entry in pred_goldStd_list:
        gold_labels, prediction = entry[0], entry[1]
        total += sum(1 for label in gold_labels if label in prediction)
    return total

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def compute_metrics(LARC_class_goldStd_labels, other_class_goldStd_labels, LARC_class_predictions, other_class_predictions):
    """Summarise binary classification results with LARC as the positive class.

    Each count pairs one gold-label sequence with one prediction sequence by
    position (``store_goldStd_against_pred``) and counts the pairs whose gold
    label is found in the prediction (``compute_num_correct_preds``):

    * TP: LARC gold labels vs. predictions for the LARC documents
    * FN: other-class gold labels vs. predictions for the LARC documents
    * TN: other-class gold labels vs. predictions for the other documents
    * FP: LARC gold labels vs. predictions for the other documents

    NOTE(review): FN and FP pair the gold labels of one test set with the
    predictions of the other, so they are only meaningful when every entry of
    a gold sequence carries the same label, and each gold sequence must be at
    least as long as the prediction sequence it is paired with - confirm
    against the test files.

    Args:
        LARC_class_goldStd_labels: Gold labels of the LARC test documents.
        other_class_goldStd_labels: Gold labels of the other-class documents.
        LARC_class_predictions: Model predictions for the LARC documents.
        other_class_predictions: Model predictions for the other documents.

    Returns:
        tuple: ``(df_metricScores, accuracy_f1)`` where ``df_metricScores`` is
        a DataFrame with columns 'Metrics' and 'Score' holding TP, FP, TN, FN,
        precision, recall, accuracy and f1, and ``accuracy_f1`` is the list
        ``[accuracy, f1]``. A metric whose denominator is zero is reported as
        0.0 instead of raising ZeroDivisionError.
    """
    TP = compute_num_correct_preds(
        store_goldStd_against_pred(LARC_class_goldStd_labels, LARC_class_predictions))
    FN = compute_num_correct_preds(
        store_goldStd_against_pred(other_class_goldStd_labels, LARC_class_predictions))
    TN = compute_num_correct_preds(
        store_goldStd_against_pred(other_class_goldStd_labels, other_class_predictions))
    FP = compute_num_correct_preds(
        store_goldStd_against_pred(LARC_class_goldStd_labels, other_class_predictions))

    total = len(LARC_class_goldStd_labels) + len(other_class_goldStd_labels)

    # Guard every division: a model that never predicts the positive class
    # (TP + FP == 0), an empty positive set or an empty test set would
    # otherwise abort the whole evaluation.
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    accuracy = _safe_ratio(TP + TN, total)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    data = [['TP', TP], ['FP', FP], ['TN', TN], ['FN', FN], ['precision', precision], ['recall', recall],
            ['accuracy', accuracy], ['f1', f1]]

    df_metricScores = pd.DataFrame(data, columns=['Metrics', 'Score'])
    accuracy_f1 = [accuracy, f1]
    return df_metricScores, accuracy_f1

In [84]:


# Load the "other class" test pages.
other_class = pd.read_csv("other class test file.csv")
# Empty frame with one row per test page; not referenced again in the code shown.
other_class_df = pd.DataFrame(index=range(len(other_class)))

# Stem the page text and wrap it in a one-column frame.
other_class_stemmed = get_stemmed_data(other_class)
other_class_stemmed_df = pd.DataFrame(other_class_stemmed, columns = ['Text'])

# arr is the scratch row, arr2 collects the rows; cosine is not used in the code shown.
arr = []
arr2 = []
cosine = []

# Build one [stemmed text, precomputed cosine similarity score] row per page;
# these two columns are what the pipeline selectors read at prediction time.
for i in range(len(other_class)):
    arr = []
    arr.append(other_class_stemmed_df['Text'].iloc[i])
    arr.append(other_class['Cosine_expandedVocab_stemmed'].iloc[i])
    arr2.append(arr)

# Frame passed to pipeline.predict for the other-class test pages.
dfObj_of_otherClass = pd.DataFrame(arr2, columns=['Text', 'Cosine_expandedVocab_stemmed'])


# Load the LARC test pages.
LARC_class = pd.read_csv("larc test file.csv")
# Empty frame with one row per test page; not referenced again in the code shown.
LARC_class_df = pd.DataFrame(index=range(len(LARC_class)))

# Stem the page text and wrap it in a one-column frame.
LARC_class_stemmed = get_stemmed_data(LARC_class)
LARC_class_stemmed_df = pd.DataFrame(LARC_class_stemmed, columns = ['Text'])

# Reset the scratch containers before building the LARC rows; cosine is not
# used in the code shown.
arr = []
arr2 = []
cosine = []

# Build one [stemmed text, precomputed cosine similarity score] row per page.
for i in range(len(LARC_class)):
    arr = []
    arr.append(LARC_class_stemmed_df['Text'].iloc[i])
    arr.append(LARC_class['Cosine_expandedVocab_stemmed'].iloc[i])
    arr2.append(arr)

# Frame passed to pipeline.predict for the LARC test pages.
dfObj_of_LARC_class = pd.DataFrame(arr2, columns=['Text', 'Cosine_expandedVocab_stemmed'])



In [85]:
# Choose the features: in this run only the TF-IDF text features are used to
# train the model.

feats = FeatureUnion([('text', text)
                      ])
# Standalone run of the feature step on the training data; the result is not
# kept and feature_processing is not used for prediction below.
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(training_data)


# Full model: the same feats object followed by an L2-regularised logistic
# regression (liblinear solver, C=5, fixed random_state).
pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression(verbose=1, solver='liblinear', random_state=0, C=5, penalty='l2', max_iter=1000))
])

# Fit on the training frame and its 'Category' labels.
pipeline.fit(training_data, training_labels)

# Predict each held-out test set separately so they can be scored per class.
other_class_predictions = pipeline.predict(dfObj_of_otherClass)
LARC_class_predictions = pipeline.predict(dfObj_of_LARC_class)

[LibLinear]

In [86]:
# Gold labels come from the 'Class' column of the test files.
# NOTE(review): training labels come from 'Category'; presumably both columns
# use the same label strings - verify against the CSV files.
other_class_goldStd_labels = other_class['Class']
LARC_class_goldStd_labels = LARC_class['Class']


# Score the text-only model and display the metrics table.
df_metricScores, accuracy_f1 = compute_metrics( LARC_class_goldStd_labels, other_class_goldStd_labels, LARC_class_predictions, other_class_predictions)
df_metricScores

Unnamed: 0,Metrics,Score
0,TP,115.0
1,FP,13.0
2,TN,138.0
3,FN,36.0
4,precision,0.898438
5,recall,0.761589
6,accuracy,0.837748
7,f1,0.824373


In [87]:
# Choose the features: in this run the TF-IDF text features are combined with
# the per-document cosine similarity score to train the model.


# FeatureUnion concatenates the outputs of the text branch and the cosine branch.
feats = FeatureUnion([('text', text),
                      ('cosine_expandedVocab_stemmed', cosine_expandedVocab_stemmed)])


# Standalone run of the feature step on the training data; the result is not
# kept and feature_processing is not used for prediction below.
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(training_data)


# Same classifier settings as the text-only run, so the two runs differ only
# in the feature set.
pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression(verbose=1, solver='liblinear', random_state=0, C=5, penalty='l2', max_iter=1000))
])

# Fit on the training frame and its 'Category' labels.
pipeline.fit(training_data, training_labels)

# Predict each held-out test set separately; these overwrite the predictions
# of the text-only run.
other_class_predictions = pipeline.predict(dfObj_of_otherClass)
LARC_class_predictions = pipeline.predict(dfObj_of_LARC_class)

[LibLinear]

In [88]:
# Gold labels come from the 'Class' column of the test files.
other_class_goldStd_labels = other_class['Class']
LARC_class_goldStd_labels = LARC_class['Class']


# Score the text + cosine model and display the metrics table.
df_metricScores, accuracy_f1 = compute_metrics( LARC_class_goldStd_labels, other_class_goldStd_labels, LARC_class_predictions, other_class_predictions)
df_metricScores

Unnamed: 0,Metrics,Score
0,TP,106.0
1,FP,7.0
2,TN,144.0
3,FN,45.0
4,precision,0.938053
5,recall,0.701987
6,accuracy,0.827815
7,f1,0.80303


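In this case, it seems that adding cosine similarity as a feature does not improve the overall performance: precision rises (0.898 → 0.938), but recall drops (0.762 → 0.702), so accuracy (0.838 → 0.828) and F1 (0.824 → 0.803) both decrease slightly. Further error analysis needs to be done to engineer the features better.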