In [1]:
from sklearn import preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV,train_test_split, RandomizedSearchCV,cross_val_score,cross_validate  
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,classification_report,f1_score,roc_auc_score,average_precision_score,average_precision_score,recall_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier

import re, string,os
from glob import glob as gb
import pandas as pd
from collections import Counter
from tqdm import tqdm
from utils.functions import *
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
import spacy 

# Source: https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff

In [2]:
nlp = spacy.load("en_core_web_sm")
project_path = '/home/ruben/Documents/GitHub/CrisisBureaucracy'
data_path = '/media/ruben/Elements/PhD/data/hansard'

In [3]:
# Load annotations from TagTog format
# From .json to dataframe. Also loads original texts by identifier
# Turns out that only a handfull of 'arguments' are not the full paragraph, so reverting to full paragraphs

translator = {"m_10":"cost","m_11":"accountability","m_12":"freedom","m_13":"inefficiency","m_14":"irrationality",
              "m_15":"centralisation","m_16":"power","m_8":"neutral","m_9":"size"}

def load_full(id_):
    f = pd.read_csv(f'{data_path}/lemmatized_pm/uk.proc.d.{id_[:10]}.txt',sep='\t')
    return [list(f[f['id'] == 'uk.proc.d.' + id_]['text'])[0],list(f[f['id'] == 'uk.proc.d.' + id_]['text_lemmatized'])[0]]

def combine(fn):
    with open(fn,'r') as f:
        c = json.load(f)
    labels = [translator[x] for x in list(c['metas'].keys())]
    txtn = f"{project_path}/data/classifier/annotation-round-2/annotation-texts/" + fn[130:].replace('.ann.json','').replace('_','-')
    with open(ltf_[txtn],'r') as f:
        t = f.read()
    return [os.path.split(ltf_[txtn])[-1].replace('.txt',''),t,labels]

lf = gb(f'{project_path}/data/classifier/annotation-round-2/annotation-results/*')
ltf = gb(f'{project_path}/data/classifier/annotation-round-2/annotation-texts/*')
ltf_ = {k.replace('_','-'):k for k in ltf}

df = pd.DataFrame([combine(x) for x in lf],columns=['id','text','label'])
df['text'] = [load_full("-".join(x.split('-')[1:-1]))[0] for x in df['id']]
df = df[['id','text','label']]

In [None]:
# Preprocessing:
# Remove stopwords plus frequent forms of addressing MPs ("right hon. gentleman") 
# POS-tag using spacy, save only adjectives, nouns and verbs (verbs are important because of "growing bureaucracy" etc.)

stops = stopwords.words('english') + "hon member friend gentleman gentlemen speaker right".split(' ')
df['text'] = [" ".join([w.text for w in nlp(t) if w.pos_ in ["ADJ","NOUN","VERB"] and str(w.text) not in stops]) for t in tqdm(df['text'])]

In [5]:
df['label'] = [x[0] for x in df['label']]

In [5]:
categories = list(set([item for sublist in list(df['label']) for item in sublist]))

In [6]:
dfr = df[['id','text','label']]
for c in categories:
    dfr[c] = [1 if c in i else 0 for i in dfr['label']]
dfr =dfr.drop(['id','label'],axis=1)

In [21]:
# Cross-Validation with a pipeline. To do: GridSearch (although no huge differences in earlier tests)
# Using OneVsRestClassifier, so basically making a separate classifier for every label.

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,4), max_features=10000)),
                ('clf', OneVsRestClassifier(naive_bayes.MultinomialNB(
                    fit_prior=True, class_prior=None))),
                ])
for category in categories:
    scores = cross_validate(NB_pipeline, dfr['text'], dfr[category], scoring=['accuracy'], cv=10, return_train_score=False)
    print(category[:8] + '.','\t'," ".join([str(x)[:6] for x in scores['test_accuracy']]))

centrali. 	 0.96 0.96 0.96 0.9466 0.9466 0.9466 0.9466 0.9466 0.9466 0.9466
power. 	 0.9466 0.9466 0.9466 0.9466 0.9333 0.9333 0.9333 0.9333 0.9333 0.9333
cost. 	 0.8666 0.8666 0.8666 0.8666 0.8666 0.8666 0.8666 0.8666 0.8666 0.8533
irration. 	 0.9866 0.9866 0.9866 0.9866 0.9866 0.9866 0.9733 0.9733 0.9733 0.9733
size. 	 0.7066 0.68 0.7333 0.7333 0.7066 0.68 0.7466 0.72 0.7333 0.6933
accounta. 	 0.9333 0.9333 0.9333 0.9333 0.9333 0.9333 0.9333 0.92 0.92 0.92
ineffici. 	 0.84 0.84 0.84 0.84 0.84 0.84 0.84 0.84 0.84 0.8266
freedom. 	 0.92 0.92 0.9066 0.9066 0.9066 0.9066 0.9066 0.9066 0.9066 0.9066
neutral. 	 0.7866 0.7866 0.7866 0.7733 0.7733 0.7733 0.7733 0.7733 0.7733 0.7733


In [123]:
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=10000)
tfidf_vect.fit(dfr['text'])
train,test = train_test_split(dfr, random_state=42, test_size=0.33, shuffle=True)

xtrain_tfidf =  tfidf_vect.transform(train.text)
xvalid_tfidf =  tfidf_vect.transform(test.text)

classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(xtrain_tfidf, train.drop(labels = ['text'], axis=1))
# predict
predictions = classifier.predict(xvalid_tfidf)
# accuracy
print("Accuracy = ",accuracy_score(test.drop(labels = ['text'], axis=1),predictions))


Accuracy =  0.11290322580645161


In [124]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())
# Training logistic regression model on train data
classifier.fit(xtrain_tfidf, train.drop(labels = ['text'], axis=1))
# predict
predictions = classifier.predict(xvalid_tfidf)
# accuracy
print("Accuracy = ",accuracy_score(test.drop(labels = ['text'], axis=1),predictions))
print("\n")

Accuracy =  0.27419354838709675




In [125]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())
# train
# Training logistic regression model on train data
classifier.fit(xtrain_tfidf, train.drop(labels = ['text'], axis=1))
# predict
predictions = classifier.predict(xvalid_tfidf)
# accuracy
print("Accuracy = ",accuracy_score(test.drop(labels = ['text'], axis=1),predictions))
print("\n")

Accuracy =  0.2056451612903226




In [126]:
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices.
x_train = lil_matrix(xtrain_tfidf).toarray()
y_train = lil_matrix(train.drop(labels = ['text'], axis=1)).toarray()
x_test = lil_matrix(xvalid_tfidf).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(test.drop(labels = ['text'], axis=1),predictions_new))
print("\n")

Accuracy =  0.13709677419354838




In [7]:
## Combine categories

## irrationality -> inefficiency
## power -> size
## freedom -> accountability
## centralisation -> size


dfr['inefficiency'] = dfr['inefficiency'] + dfr['irrationality']
dfr['size'] = dfr['size'] + dfr['power'] + dfr['centralisation']
#dfr['accountability'] = dfr['accountability'] + dfr['freedom']

dfr = dfr.drop(['irrationality','power','centralisation','freedom'],axis=1)


dfr = dfr.replace(2,1)
dfr = dfr.replace(3,1)
dfr = dfr.replace(4,1)

In [8]:
# Using BERT for feature extraction
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
import swifter 

model = SentenceTransformer('bert-base-nli-mean-tokens')

In [9]:
vects = dfr['text'].swifter.apply(model.encode)

Pandas Apply: 100%|██████████| 750/750 [01:16<00:00,  9.86it/s]


In [10]:
dfr['sentence-bert'] = vects

In [11]:
def stack_embeddings(embeddings):
    import numpy as np
    return np.vstack(embeddings.values)

ct = ColumnTransformer([
    ('bag of ngrams', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=10000), 'text'),
    # Lambda functions cannot be pickled
    ('sentence bert', FunctionTransformer(stack_embeddings), 'sentence-bert')],remainder='passthrough')

# lm = LogisticRegression()
xgb = XGBClassifier(random_state=0)

In [20]:
#pipeline = Pipeline([('transformer', ct), ('classifier', xgb)])
#pipeline = Pipeline([('transformer', ct), ('classifier', LogisticRegression())])
pipeline = Pipeline([('transformer', ct), ('classifier', naive_bayes.MultinomialNB)])

In [21]:
for category in "accountability inefficiency neutral size cost".split(' '):
    df_c = dfr.copy()
    y,X = df_c.pop(category),df_c
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.5, random_state=42,stratify=y)
    model = pipeline.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(category)
    print(metrics.classification_report(y_test,y_pred))
    print('------------------------------_')
    #scores = cross_validate(pipeline, dfr['text'], dfr[category], scoring=['accuracy'], cv=10, return_train_score=False)
    #print(category[:8] + '.','\t'," ".join([str(x)[:6] for x in scores['test_accuracy']]))


TypeError: fit() missing 1 required positional argument: 'y'

In [48]:
##
# Making new predictions on all the data (everything below is testing)
##

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=10000)
tfidf_vect.fit(dfr['text'])

# import all
new_df = pd.read_csv('~/Documents/GitHub/CrisisBureaucracy/data/classifier/bureaucracy-sentences-full.tsv',sep='\t').dropna()
new_df.columns = "id text".split(' ')
new_df = new_df.sample(1000).reset_index(drop=True)

# Preprocessing
stops = stopwords.words('english') + "hon member friend gentleman gentlemen speaker right".split(' ')
new_df['text'] = [" ".join([w.text for w in nlp(t) if w.pos_ in ["ADJ","NOUN","VERB"] and str(w.text) not in stops]) for t in tqdm(new_df['text'])]
new_docs_tfidf = tfidf_vect.transform(new_df.text)

100%|██████████| 1000/1000 [00:21<00:00, 46.90it/s]


In [56]:
for cat in ['accountability','cost','freedom','size']:
    model = OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None)).fit(tfidf_vect.fit_transform(dfr.text), dfr[cat])
    predictions = model.predict_proba(new_docs_tfidf)
    new_df[cat] = [x[1] for x in predictions]