In [75]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV  # or GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
import re, string,os
from glob import glob as gb
import pandas as pd
from collections import Counter
from tqdm import tqdm
from utils.functions import *
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords

import spacy 
nlp = spacy.load("en_core_web_sm")

# https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff


In [22]:
# Load annotations from TagTog format
translator = {"m_10":"cost","m_11":"accountability","m_12":"freedom","m_13":"inefficiency","m_14":"irrationality",
              "m_15":"centralisation","m_16":"power","m_8":"neutral","m_9":"size"}

def load_full(id_):
    f = pd.read_csv(f'/media/ruben/Elements/PhD/data/hansard/lemmatized_pm/uk.proc.d.{id_[:10]}.txt',sep='\t')
    return [list(f[f['id'] == 'uk.proc.d.' + id_]['text'])[0],list(f[f['id'] == 'uk.proc.d.' + id_]['text_lemmatized'])[0]]

def combine(fn):
    with open(fn,'r') as f:
        c = json.load(f)
    labels = [translator[x] for x in list(c['metas'].keys())]
    txtn = "/home/ruben/Documents/GitHub/CrisisBureaucracy/data/classifier/annotation-round-2/annotation-texts/" + fn[130:].replace('.ann.json','').replace('_','-')
    with open(ltf_[txtn],'r') as f:
        t = f.read()
    return [os.path.split(ltf_[txtn])[-1].replace('.txt',''),t,labels]

lf = gb('/home/ruben/Documents/GitHub/CrisisBureaucracy/data/classifier/annotation-round-2/annotation-results/*')
ltf = gb('/home/ruben/Documents/GitHub/CrisisBureaucracy/data/classifier/annotation-round-2/annotation-texts/*')
ltf_ = {k.replace('_','-'):k for k in ltf}

df = [combine(x) for x in lf]
df = pd.DataFrame(df,columns=['id','text','label'])
or_texts = [load_full("-".join(x.split('-')[1:-1])) for x in df['id']]
df['original_par'] = [x[0] for x in or_texts]
df = df[['id','original_par','label']]
df.columns = "id text label".split(' ')

In [23]:
# Preprocessing
stops = stopwords.words('english') + "hon member friend gentleman gentlemen speaker right".split(' ')
df['text'] = [" ".join([w.text for w in nlp(t) if w.pos_ in ["ADJ","NOUN","VERB"] and str(w.text) not in stops]) for t in tqdm(df['text'])]

100%|██████████| 750/750 [00:13<00:00, 54.88it/s]


In [26]:
# Model train function
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

In [44]:
print('Cross validation on separate categories \n ---------')
for cat in set([item for sublist in list(df['label']) for item in sublist]):
    dfc = df.copy()
    dfc['label'] = [cat if cat in x else "other" for x in dfc['label']]

    # Vectorize
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=8000)
    tfidf_vect.fit(dfc['text'])

    # Use Cross-Validation
    clf = make_pipeline(tfidf_vect, naive_bayes.MultinomialNB())
    scores = cross_validate(clf, dfc['text'], dfc['label'], scoring=['accuracy'], cv=5, return_train_score=False)
    print(cat,'\t'," ".join([str(x)[:4] for x in scores['test_accuracy']]))

Cross validation on separate categories 
 ---------
irrationality 	 0.98 0.98 0.98 0.98 0.98
cost 	 0.86 0.86 0.86 0.86 0.86
accountability 	 0.93 0.93 0.92 0.92 0.92
inefficiency 	 0.84 0.84 0.84 0.84 0.83
freedom 	 0.91 0.91 0.90 0.90 0.90
size 	 0.68 0.72 0.7 0.72 0.70
centralisation 	 0.95 0.95 0.95 0.94 0.94
neutral 	 0.78 0.78 0.78 0.77 0.77
power 	 0.94 0.94 0.94 0.94 0.93


In [46]:
categories = list(set([item for sublist in list(df['label']) for item in sublist]))

In [56]:
dfr = df[['id','text','label']]
for c in categories:
    dfr[c] = [1 if c in i else 0 for i in dfr['label']]
dfr =dfr.drop(['id','label'],axis=1)

In [105]:
train, test = train_test_split(dfr, random_state=42, test_size=0.33, shuffle=True)
X_train = train.text
X_test = test.text

In [86]:
from sklearn.naive_bayes import MultinomialNB

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
for category in categories:
    # train the model using X_dtm & y
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    print(f'{category}:\t {round(accuracy_score(test[category], prediction),3)}')

irrationality:	 0.98
cost:	 0.871
accountability:	 0.927
inefficiency:	 0.823
freedom:	 0.891
size:	 0.738
centralisation:	 0.927
neutral:	 0.774
power:	 0.948


In [104]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

In [123]:
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=10000)
tfidf_vect.fit(dfr['text'])
train,test = train_test_split(dfr, random_state=42, test_size=0.33, shuffle=True)

xtrain_tfidf =  tfidf_vect.transform(train.text)
xvalid_tfidf =  tfidf_vect.transform(test.text)

classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(xtrain_tfidf, train.drop(labels = ['text'], axis=1))
# predict
predictions = classifier.predict(xvalid_tfidf)
# accuracy
print("Accuracy = ",accuracy_score(test.drop(labels = ['text'], axis=1),predictions))


Accuracy =  0.11290322580645161


In [124]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())
# Training logistic regression model on train data
classifier.fit(xtrain_tfidf, train.drop(labels = ['text'], axis=1))
# predict
predictions = classifier.predict(xvalid_tfidf)
# accuracy
print("Accuracy = ",accuracy_score(test.drop(labels = ['text'], axis=1),predictions))
print("\n")

Accuracy =  0.27419354838709675




In [125]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())
# train
# Training logistic regression model on train data
classifier.fit(xtrain_tfidf, train.drop(labels = ['text'], axis=1))
# predict
predictions = classifier.predict(xvalid_tfidf)
# accuracy
print("Accuracy = ",accuracy_score(test.drop(labels = ['text'], axis=1),predictions))
print("\n")

Accuracy =  0.2056451612903226




In [126]:
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices.
x_train = lil_matrix(xtrain_tfidf).toarray()
y_train = lil_matrix(train.drop(labels = ['text'], axis=1)).toarray()
x_test = lil_matrix(xvalid_tfidf).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(test.drop(labels = ['text'], axis=1),predictions_new))
print("\n")

Accuracy =  0.13709677419354838


