In [1]:
from sklearn import preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV,train_test_split, RandomizedSearchCV,cross_val_score,cross_validate  
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score,classification_report,f1_score,roc_auc_score,average_precision_score,average_precision_score,recall_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import TruncatedSVD

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier

import re, string,os
from glob import glob as gb
import pandas as pd
from collections import Counter
from tqdm import tqdm
from utils.functions import *
import seaborn as sns 
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import operator

plotting.style()
import spacy
nlp = spacy.load("en_core_web_sm")


project_path = '/home/ruben/Documents/GitHub/CrisisBureaucracy'
data_path = '/media/ruben/Elements/PhD/data/hansard'

m = KeyedVectors.load_word2vec_format('/media/ruben/Elements/PhD/casebureaucracy/w2v-models/model-single-sample.bin', binary = True)

In [3]:
# Load all sentences with 'bureaucracy' (not just annotated set, but everything: 4080 texts)
df = pd.read_csv('~/Documents/GitHub/CrisisBureaucracy/data/classifier/bureaucracy-sentences-full.tsv',sep='\t').dropna()

def load_full(id_):
    f = pd.read_csv(f'{data_path}/lemmatized_pm/uk.proc.d.{id_[:10]}.txt',sep='\t')
    return [list(f[f['id'] == 'uk.proc.d.' + id_]['text'])[0],list(f[f['id'] == 'uk.proc.d.' + id_]['text_lemmatized'])[0]]

    
df['text'] = [load_full("-".join(x.split('-')[1:-1]))[0] for x in tqdm(df['id'])]
df = df[['id','text']]

100%|██████████| 4080/4080 [00:53<00:00, 76.84it/s]


In [4]:
# Split metadata to columns
df['year'] = [int(x.split('-')[1]) for x in df.id]
df['party'] = [str(x.split('-')[-1]) for x in df.id]
df['id'] = ["-".join(x.split('-')[1:-1]) for x in df.id]

# Clean
stops = stopwords.words('english') + "hon member friend gentleman gentlemen speaker right".split(' ')
df['text'] = [" ".join([w.text for w in nlp(t) if w.pos_ in ["ADJ","NOUN"] and str(w.text) not in stops]) for t in tqdm(df['text'])]

100%|██████████| 4080/4080 [00:59<00:00, 68.52it/s]


In [5]:
df = df[df['party'].isin(['labour','conservative'])].reset_index(drop=True)

In [114]:
# Train a classifier for every government period

for p in [[1957,1963],[1964,1970],[1971,1973],[1974,1979]]:
    tdf = df[df['year'].isin(list(range(p[0],p[1]+1)))]
        
    ## Classifier
    # vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,3),stop_words=stops,smooth_idf=True)
    vectorizer = CountVectorizer(analyzer='word',ngram_range=(1,3),stop_words=(1,3))
    X = vectorizer.fit_transform(tdf['text']).toarray()

    X_train, X_test, y_train, y_test = train_test_split(X, tdf['party'], test_size = 0.4,random_state = 0)
    # classifier = svm.LinearSVC()
    # classifier = MultinomialNB()
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    # Classification metrics
    print("Period: ",p[0],p[1],'|',len(tdf),'texts')
    print('Accuracy: ', round(accuracy_score(y_test, y_pred),3))
    print('\n', classification_report(y_test,y_pred))
    print("Counted predictions: ",dict(Counter(y_pred)))
    print('======================================================')

Period:  1957 1963 | 204 texts
Accuracy:  0.561

               precision    recall  f1-score   support

conservative       0.25      0.06      0.10        32
      labour       0.59      0.88      0.71        50

    accuracy                           0.56        82
   macro avg       0.42      0.47      0.40        82
weighted avg       0.46      0.56      0.47        82

Counted predictions:  {'labour': 74, 'conservative': 8}
Period:  1964 1970 | 518 texts
Accuracy:  0.635

               precision    recall  f1-score   support

conservative       0.63      0.97      0.77       129
      labour       0.64      0.09      0.16        79

    accuracy                           0.63       208
   macro avg       0.64      0.53      0.46       208
weighted avg       0.64      0.63      0.53       208

Counted predictions:  {'conservative': 197, 'labour': 11}
Period:  1971 1973 | 254 texts
Accuracy:  0.598

               precision    recall  f1-score   support

conservative       0.59    

In [62]:
# Extract important words for every period
# Important here means features (words) with a coefficient of > 0.5

def important_words_period(start_year,end_year,verbose=False,n=50):
    tdf = df[df['year'].isin(list(range(start_year,end_year+1)))]
    
    ## Classifier
    vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), max_features=7500,max_df=0.3,stop_words=stops)
    classifier = MultinomialNB()
    vectorizer.fit(tdf['text'])
    train = vectorizer.transform(tdf['text'])
    classifier.fit(train,tdf['party'])
    
    ## Feature Selection
    class_labels = classifier.classes_
    feature_names =vectorizer.get_feature_names()

    topn_class1 = sorted(zip(classifier.feature_count_[0], feature_names),reverse=True)[:n]
    topn_class2 = sorted(zip(classifier.feature_count_[1], feature_names),reverse=True)[:n]

    if verbose == True:
        print("Important words in class: conservative")
        for coef, feat in topn_class1:
            print('\t',feat,'\t',round(coef,3))
        print("-----------------------------------------")
        print("Important words in class: labour")
        for coef, feat in topn_class2:
            print('\t',feat,'\t',round(coef,3))
    return topn_class1, topn_class2

In [66]:
words = []
for p in [[1957,1963],[1964,1970],[1971,1973],[1974,1976],[1977,1980]]:
    w_con,w_lab = words_period(p[0],p[1]+1,False,150)
    words += [w[1] for w in w_con if w[0] > 0.5]
    words += [w[1] for w in w_lab if w[0] > 0.5]

In [93]:
pd.DataFrame(list(set(words))).to_csv(project_path + '/data/classifier/party-classification-words.csv',index=False)