In [72]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV  # or GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
import re, string,os
from glob import glob as gb
import pandas as pd
from collections import Counter
from tqdm import tqdm
from utils.functions import *
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
stops = stopwords.words('english') + ["hon","member","right","friend","mr",'hon.','make','say','great']

from classification import *
import spacy 
nlp = spacy.load("en_core_web_sm")

# First test: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/
# Word2vec: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ 

In [59]:
## Import annotated data
annotations = pd.read_csv('~/Documents/GitHub/CrisisBureaucracy/data/classifier/annotated-arguments-bureaucracy.csv')
annotations['metadata'] = ''

refdata = pd.read_csv('~/Documents/GitHub/CrisisBureaucracy/data/classifier/training_data_full.csv',sep='\t')
refdata['id-ann'] = [x + 594 for x in refdata.index]

for c,i in enumerate(annotations['id']):
    annotations['metadata'][c] = str(refdata[refdata['id-ann'] == i].reset_index(drop=True)['id'][0])

annotations['text'] = utils.preprocess_(annotations['text'])
annotations = annotations[["id","label","text"]]
annotations['text'] = [" ".join([w.text for w in nlp(t) if w.pos_ in ["ADJ","NOUN"] and w.text not in stops]) for t in tqdm(annotations['text'])]

labels = {1:"neutral",2:"inefficient",3:"powerful/large",4:"centralization",5:"freedom",6:"expensive",7:"anti-democratic"}
annotations['label'] = annotations['label'].astype(str)
annotations['label'] = [labels[int(x)] for x in annotations['label']]

df = annotations.drop_duplicates('text')
df = df[df['label'] != 'neutral'].reset_index(drop=True)

100%|██████████| 640/640 [00:09<00:00, 64.87it/s]


In [61]:
# Inspect Category TF-IDF terms
tfidfo, docterms = tfidf.get_docterms(df,"text")
tt = tfidf.get_topterms(tfidfo,docterms,df,'label')
tt.head(5)

Unnamed: 0,freedom,inefficient,anti-democratic,expensive,powerful/large,centralization
0,freedom,government,democratic,cost,government,industry
1,citizen,board,parliament,government,people,state
2,people,minister,elect,money,service,policy
3,house,small,democracy,service,local,government
4,individual,case,executive,commission,authority,nationalise


In [57]:
#df['label'] = ["argument" if x != "neutral" else "non-argument" for x in df['label']]

In [94]:
# Model train function
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

# Vectorize
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=8000)
tfidf_vect.fit(df['text'])

# Split Test/Train sets and vectorize
train_x, valid_x, train_y, valid_y = train_test_split(df['text'], df['label'], test_size=0.333)
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [95]:
# Evaluate Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, Vectors: ", accuracy)

# Evaluate SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM, Vectors: ", accuracy)

# Evaluate RF on Ngram Level TF IDF Vectors
accuracy = train_model(RandomForestClassifier(n_estimators=100), xtrain_tfidf, train_y, xvalid_tfidf)
print("Random Forest, Vectors: ", accuracy)

NB, Vectors:  0.2982456140350877
SVM, Vectors:  0.2894736842105263
Random Forest, Vectors:  0.2894736842105263


In [96]:
# Use Cross-Validation
clf = make_pipeline(tfidf_vect, naive_bayes.MultinomialNB())
scores = cross_validate(clf, df['text'], df['label'], scoring=['accuracy'], cv=25, return_train_score=False)
scores['test_accuracy']

array([0.35714286, 0.35714286, 0.35714286, 0.28571429, 0.28571429,
       0.35714286, 0.35714286, 0.35714286, 0.35714286, 0.35714286,
       0.28571429, 0.28571429, 0.28571429, 0.07142857, 0.28571429,
       0.28571429, 0.28571429, 0.30769231, 0.23076923, 0.30769231,
       0.23076923, 0.30769231, 0.30769231, 0.23076923, 0.30769231])

In [70]:
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.values())

    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), max_features=2500)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(lambda: max_idf,[(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, X):
        return np.array([np.mean([self.word2vec[w] * self.word2weight[w] for w in words if w in self.word2vec] or [np.zeros(self.dim)], axis=0) for words in X])

In [71]:
model = KeyedVectors.load_word2vec_format("~/Documents/GitHub/CrisisBureaucracy/results/w2v-models/model-single-sample.bin",binary=True)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [73]:
# Model train function
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

# Vectorize
tfidf_vect = TfidfEmbeddingVectorizer(w2v)
tfidf_vect.fit(df['text'])

# Split Test/Train sets and vectorize
train_x, valid_x, train_y, valid_y = train_test_split(df['text'], df['label'], test_size=0.6)
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# Evaluate SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM, Vectors: ", accuracy)

SVM, Vectors:  0.270935960591133
Random Forest, Vectors:  0.2315270935960591


In [51]:
from glob import glob as gb
import json 
import pandas as pd

translator = {
    "m_10":"cost",
    "m_11":"accountability",
    "m_12":"freedom",
    "m_13":"inefficiency",
    "m_14":"irrationality",
    "m_15":"centralisation",
    "m_16":"power",
    "m_8":"neutral",
    "m_9":"size"
}

lf = gb('/home/ruben/Documents/GitHub/CrisisBureaucracy/data/classifier/annotation-round-2/annotation-results/*')
ltf = gb('/home/ruben/Documents/GitHub/CrisisBureaucracy/data/classifier/annotation-round-2/annotation-texts/*')
ltf_ = {k.replace('_','-'):k for k in ltf}

In [60]:
def load_text_labels(fn):
    try:
        with open(fn,'r') as f:
            c = json.load(f)
        labels = [translator[x] for x in list(c['metas'].keys())]

        txtn = fn[130:].replace('.ann.json','').replace('_','-')
        txtn = "/home/ruben/Documents/GitHub/CrisisBureaucracy/data/classifier/annotation-round-2/annotation-texts/" + txtn

        with open(ltf_[txtn],'r') as f:
            t = f.read()
        return [os.path.split(ltf_[txtn])[-1].replace('.txt',''),t,labels]
    except:
        print(e)
        return

In [69]:
df = [load_text_labels(x) for x in lf]
d = []

for x in df:
    for l in x[2]:
        d.append([x[0],x[1],l])

In [75]:
df = pd.DataFrame(d,columns=['id','text','label'])
df['text'] = utils.preprocess_(df['text'])
df['text'] = [" ".join([w.text for w in nlp(t) if w.pos_ in ["ADJ","NOUN"] and w.text not in stops]) for t in tqdm(df['text'])]

100%|██████████| 342/342 [00:05<00:00, 60.40it/s]


In [79]:
# Inspect Category TF-IDF terms
tfidfo, docterms = tfidf.get_docterms(df,"text")
tt = tfidf.get_topterms(tfidfo,docterms,df,'label')
tt = tt.head(15)
tt.head(15)
ttm = tt.melt()
unique_words = {cat:[w for w in tt[cat] if w not in set(ttm[ttm['variable'] != cat]['value'])] for cat in tt.columns}

In [93]:
df

Unnamed: 0,id,text,label
0,10659-1983-07-18.15.1.4.7-conservative,new regime per cent reduction dlo work force m...,size
1,10327-1981-07-30.10.17.28.2-labour,line new clause system planning permission cou...,size
2,18728-1972-08-03.15.1.11.8-labour,modern world talk devolution responsibility pl...,accountability
3,18622-1967-02-01.8.2.88.1-conservative,order deputy speaker increase loan contain sup...,inefficiency
4,17461-1982-11-23.8.1.72.1-labour,gentleman main objective reform effective expa...,inefficiency
...,...,...,...
337,21715-1968-01-29.17.1.199.12-conservative,general proposition growth bureaucracy bad num...,size
338,21715-1968-01-29.17.1.199.12-conservative,general proposition growth bureaucracy bad num...,cost
339,21872-1967-05-12.7.1.4.20-ulster_unionist_party,important part british constitution many year ...,freedom
340,21872-1967-05-12.7.1.4.20-ulster_unionist_party,important part british constitution many year ...,inefficiency
