The optimised model did extremely well on the held-out test set.
Now subject it to other robustness checks after running the vectorizer again with the optimised values.


In [1]:
import os
import operator as op

import numpy as np
from numpy import array

from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

from sklearn.svm import LinearSVC

from nltk.stem.snowball import SnowballStemmer

In [2]:
# Party labels; each is also the name of the folder holding that party's releases.
parties = ['act', 'green', 'labour', 'maori', 'national', 'nzfirst']

# Substrings blanked out of the raw text before stemming (names, dates, party labels,
# punctuation, digits). Entries are applied in list order with str.replace.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python (',' '\"' used to become the single entry ',"', and
# 'Collins' 'Buckner' became 'CollinsBuckner').
strip_list = ['Posted by', '\n', 'Jacinda', 'Ardern', 'Steven', 'Joyce', ' Bill ', 'English', 'Carmel', 'Sepuloni', 'Barry',
              '2017', 'David', 'Clark', 'Phil', 'Twyford', 'Michael', 'Wood', 'Chris ', 'Hipkins', 'Grant', 'Maggie',
              'Robertson', 'Greg', 'O’Connor', 'Andrew', ' Little', 'Winston', 'Peters', 'Damien', 'O\'Connor',
              'Kelvin', 'Davis', 'Phil', 'Twyford', 'Megan', 'Woods', 'Parker', 'Nanaia', 'Mahuta', 'Paula', 'Bennett',
              'Carter', 'Gerry', 'Brownlee', 'Simon', ' Bridges', ' Amy', 'Adams', 'Jonathan', 'Coleman', 'Christopher',
              'Finlayson', 'Woodhouse', 'Nathan', 'Guy', 'Anne', 'Tolley', ' Ron ', 'Mark', 'Marama', 'Fox', ' Te ',
              'Ururoa', 'Flavel', 'Jones', 'Shane', 'Taurima', 'Seymour', 'James', 'Shaw', 'Marama', 'Davidson', ' Dr ',
              'Julie', 'Anne', 'Genter', 'Jan ', 'Logie', 'Eugenie', 'Sage', 'Gareth', 'Hughes', 'Steffan', 'Browning',
              'Rt', 'Hon', 'Nick', 'Smith', 'Nikki', 'Kaye', 'Nicky', 'Wagner', 'Minister', 'Paul', 'Goldsmith',
              'ACT', ' National ', 'Green Party', 'Labour', 'First ', 'ENDS', '.', ',', '\"', '\'', 'Māori Party',
              '“', '”', 'Facebook5Twitter', 'Steffan', 'Browning', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
              'Carlton', 'Burke', 'Chadwick', 'Catherine', 'Christine', 'Alex', 'Alexander', 'Baker', 'Carolyn',
              'Alyssa', 'Brown', 'Bob', 'Byrn', 'Augustine', 'Crawford', 'Antonio', 'Claudetta', 'Christina', 'Collins',
              'Buckner', 'centre', 'Ben', 'Boyden', 'Alan', 'Bosley', '’', 'Alastair', 'Ballantyne', 'Bruno', 'Cecelia',
              'Allan', 'Bernard', 'Anderson', 'Andrea', 'Tim', 'spokesperson', 'Scott', 'Simpson', 'Epsom', 'Metiria',
              'Turei', 'said', 'say', 'John Key', 'John', 'Tukoroirangi Morgan', 'Dame', 'Mei', 'Reedy', 'Leader',
              'Northland', 'Member of Parliament', 'Spokesperson', 'Don', 'Houlbrook', 'Stephen', 'Todd', 'Barclay',
              'Morgan', 'Tariana', 'Turia', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
              'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October',
              'November', 'December', 'Southland', 'Judith', 'Collins', 'Jacqui', 'Dean', 'Bhupind', 'Singh', 'van Velden',
              'govt.nz', 'Mitchel', 'New Zealand', 'First', 'NZ', 'Mitchell', 'Tracy', 'Martin', 'Mike', 'MP', 'Prosser',
              'William', 'Sio', 'Don', 'Zealand', 'Aupito', 'Kevin', 'Hague', 'Bhupind', 'Singh', 'Louise', 'Upston']

# Substrings blanked out of the text after stemming.
strip_from_stemmed = ['conclusionth', 'bennet', 'brydon', 'bosley', 'centrewellington', 'countrynew', 'ballantyn', 'allan',
                      'delahunti', ',', 'beth', 'ms', 'mr', 'www', 'http', 'media contact', 'govt', 'nz']


In [3]:
def read_in(party, folder):
    """Return the contents of every file in ``folder`` as a list of strings.

    Args:
        party: Party label. Kept for interface compatibility; it is no longer
            used to build file paths.
        folder: Directory whose entries are each opened and read as UTF-8 text.

    Returns:
        A list with one string per file, in ``os.listdir(folder)`` order.
    """
    release_text_list = []
    for filename in os.listdir(folder):
        # Open from the directory that was listed. The original joined the
        # filename onto ``party``, which only worked when party == folder.
        full_path = os.path.join(folder, filename)
        # ``with`` closes the handle even if read() raises.
        with open(full_path, 'r', encoding='utf8') as file_obj:
            release_text_list.append(file_obj.read())
    return release_text_list
# Create dictionary where key = label (party) and value = list of release text strings.

# Per-party release texts keyed by party label, plus one flat list of every text.
dict_of_text_lists = {}
list_of_all_texts = []

for party in parties:
    # Each party's releases live in a folder named after the party.
    releases = read_in(party, party)
    dict_of_text_lists[party] = releases
    print('Total docs for party', party, '=', len(releases))
    list_of_all_texts.extend(releases)

# Scratch name from the original cell; it ends up empty once loading finishes.
list_of_all_party_texts = []

print('Total documents: ', len(list_of_all_texts))


Total docs for party act = 130
Total docs for party green = 110
Total docs for party labour = 169
Total docs for party maori = 48
Total docs for party national = 900
Total docs for party nzfirst = 235
Total documents:  1592


In [4]:
# Build {party label: [release text, ...]} and a combined list of all release texts.
# Re-running this cell restores the full data set before any undersampling.

dict_of_text_lists = {}
list_of_all_texts = []
list_of_all_party_texts = []

for party in parties:
    dict_of_text_lists[party] = read_in(party, party)
    n_party_docs = len(dict_of_text_lists[party])
    print('Total docs for party', party, '=', n_party_docs)
    list_of_all_texts += dict_of_text_lists[party]

print('Total documents: ', len(list_of_all_texts))


Total docs for party act = 130
Total docs for party green = 110
Total docs for party labour = 169
Total docs for party maori = 48
Total docs for party national = 900
Total docs for party nzfirst = 235
Total documents:  1592


In [5]:
# Undersample National to improve dataset balance
# If re-running, need to run the cell above first!

print ('Removing three out of every four National Party press release over time:')
national_releases = dict_of_text_lists['national']
print('National had', len(national_releases), 'press releases')
top_end = len(national_releases)

# At each successive position, drop the three releases found there, so one
# release in every four survives. The list is edited in place.
# Robustness check: begin the range at 1, 2 or 3 instead of 0 to keep a
# different subset of releases.
for position in range(0, int(top_end/4)):
    del national_releases[position:position + 3]

print('National now has', len(national_releases), 'press releases')


Removing three out of every four National Party press release over time:
National had 900 press releases
National now has 225 press releases


In [6]:
# Remove names (which give away the author) and other 'cheat' strings, then stem each text.

stemmer = SnowballStemmer("english")
dict_of_proc_text_lists = {}
stem_words = []
party_list_of_proc_texts = []

for party in parties:
    party_list_of_proc_texts = []

    for text in dict_of_text_lists[party]:
        # Manual strip list: presence is tested against the untouched text,
        # while the replacement is applied to the working copy.
        strip_text = text
        present = [goner for goner in strip_list if goner in text]
        for goner in present:
            strip_text = strip_text.replace(goner, ' ')

        # Snowball-stem every whitespace-separated token and re-join with single spaces.
        stem_words = [stemmer.stem(word) for word in strip_text.split()]
        stem_text = " ".join(stem_words)

        # Blank out a few problematic strings that only show up after stemming.
        for stemword in strip_from_stemmed:
            stem_text = stem_text.replace(stemword, ' ')

        party_list_of_proc_texts.append(stem_text)

    # Processed texts for this party, keyed by party label.
    dict_of_proc_text_lists[party] = party_list_of_proc_texts

# Show the last document processed at each stage.
print('Example:')
print('Original text:', text[:80])
print('\n', 'Stripped text:', strip_text[:80])
print('\n', 'After stem:' , stem_text[:80])


Example:
Original text: 
The 2017 campaign of promises demonstrates the old parties in Parliament are de

 Stripped text:  The   campaign of promises demonstrates the old parties in Parliament are despe

 After stem: the campaign of promis demonstr the old parti in parliament are desper do anyth 


In [7]:
# Flatten the per-party processed texts into one unlabelled list (party order)
list_of_all_proc_texts = []
for party in parties:
    procd = dict_of_proc_text_lists[party]
    list_of_all_proc_texts.extend(procd)

print(len(list_of_all_proc_texts))

# Build the parallel list of labels: one party name per processed text
party_match = []
for party, party_texts in dict_of_proc_text_lists.items():
    party_match.extend([party] * len(party_texts))

print(len(party_match))


917
917


In [8]:
# Hold out 20% of the documents for testing. Stratifying on the party labels
# gives the test set the same party balance as the training set.
split_parts = model_selection.train_test_split(
    list_of_all_proc_texts,
    party_match,
    test_size=0.2,
    stratify=party_match,
)
docs_train, docs_test, labels_train, labels_test = split_parts
print(len(docs_train))

733


In [9]:
# TF-IDF settings carried over from the optimisation run
vectorizer_settings = dict(
    sublinear_tf=True,
    max_df=0.9,
    min_df=0.005,
    stop_words='english',
    ngram_range=(1, 3),
)
top_vectorizer = TfidfVectorizer(**vectorizer_settings)

# Fit on the training documents only, then apply the same mapping to the test documents
features_train_transf = top_vectorizer.fit_transform(docs_train)
features_test_transf = top_vectorizer.transform(docs_test)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version has it and fall back to the old name otherwise.
if hasattr(top_vectorizer, 'get_feature_names_out'):
    all_feature_names = top_vectorizer.get_feature_names_out()
else:
    all_feature_names = top_vectorizer.get_feature_names()
print('Initial number of features after vectorisation:', len(all_feature_names))

In [None]:
# Select top x% of most individually useful features
top_selector = SelectPercentile(f_classif, percentile=60)
top_selector.fit(features_train_transf, labels_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version has it and fall back to the old name otherwise.
if hasattr(top_vectorizer, 'get_feature_names_out'):
    vectorizer_feature_names = top_vectorizer.get_feature_names_out()
else:
    vectorizer_feature_names = top_vectorizer.get_feature_names()

# Keep only the names of the features the selector retained (boolean support mask)
my_names = np.asarray(vectorizer_feature_names)[top_selector.get_support()]  #  Feature names (alphabetical)
print('New number of features after SelectPercentile:', len(my_names))  # top 60% kept

In [None]:
# Apply the fitted selector to both splits and densify the results
cut_features_train_transf, cut_features_test_transf = (
    top_selector.transform(matrix).toarray()
    for matrix in (features_train_transf, features_test_transf)
)

# Labels: Python lists -> numpy arrays
labels_train, labels_test = np.array(labels_train), np.array(labels_test)

### Now ready for Scikit-learn

In [None]:
# Fit Naive Bayes with the optimised smoothing value and score it on the held-out split
top_nb = MultinomialNB(alpha = 0.01)
top_fitted     = top_nb.fit(cut_features_train_transf, labels_train)
test_predicted = top_nb.predict(cut_features_test_transf)
# classification_report expects (y_true, y_pred); passing them the other way round
# swaps precision and recall in the printed table.
print(classification_report(labels_test, test_predicted, labels = parties))


In [None]:
print('Confusion Matrix')
# True labels first, predictions second
nb_confusion = metrics.confusion_matrix(labels_test, test_predicted)
print(nb_confusion)
# {'act': 26, 'green': 22, 'labour': 34, 'maori': 10, 'national': 60, 'nzfirst': 47}

In [None]:
# Make list of features with highest coefficient values, per class, from most to least important

def list_top_features(classifier, feature_names, num_feat, labels=None):
    """Print and return the highest-weighted features for each class.

    Args:
        classifier: Fitted model exposing per-class feature weights, either as
            ``feature_log_prob_`` (Naive Bayes) or as ``coef_`` (linear models).
            Row ``i`` is taken to belong to ``labels[i]``.
        feature_names: Sequence of feature names, indexable by column position.
        num_feat: Maximum number of features to list per class.
        labels: Class labels in row order. Defaults to the module-level ``parties``.

    Returns:
        Dict mapping each label to its feature names, ordered from the largest
        weight downwards. Names are returned exactly as given. The original
        join-then-split on ',' left a leading space on every name except the
        first, so the same word could fail to match between two result lists.
    """
    if labels is None:
        labels = parties
    # MultinomialNB no longer has ``coef_`` from scikit-learn 1.1 on; for the
    # multi-class case it held the same values as ``feature_log_prob_``.
    weights = getattr(classifier, 'feature_log_prob_', None)
    if weights is None:
        weights = classifier.coef_

    party_words = {}
    for i, label in enumerate(labels):
        # argsort is ascending, so reverse it and keep the first num_feat columns
        top = np.argsort(weights[i])[::-1][:num_feat]
        list_top = [str(feature_names[j]) for j in top]
        print(" ")
        print(label, 'most distinguishing words from most to ' + str(num_feat) + ':')
        print(list_top)
        party_words[label] = list_top  # dict of lists where keys are parties
    return party_words



# This version also prints out coefficients:

def list_top_features_with_coefs(classifier, feature_names, num_feat, labels=None):
    """Print and return the top features per class with their Naive Bayes scores.

    Args:
        classifier: Fitted Naive Bayes model exposing ``feature_log_prob_``.
            Row ``i`` is taken to belong to ``labels[i]``.
        feature_names: Sequence of feature names aligned with the columns.
        num_feat: Maximum number of features to keep per class.
        labels: Class labels in row order. Defaults to the module-level ``parties``.

    Returns:
        Dict mapping each label to its own ``{feature name: score}`` dict holding
        that class's top features. The original shared a single inner dict between
        all classes, so every label saw a merged mapping in which later classes
        overwrote earlier ones.
    """
    if labels is None:
        labels = parties
    party_word_nb_coef = {}
    # Running extremes over every class, both starting from 0.
    maximum = 0
    minimum = 0

    for i, label in enumerate(labels):
        print('\n', label, '\n')
        # NOTE(review): the value subtracted is one scalar: the max over the whole
        # matrix when i == 0 ([-0:] is every row), otherwise over the last i rows.
        # It shifts the scores without changing their order; confirm this is the
        # intended baseline.
        diff = classifier.feature_log_prob_[i,:] - np.max(classifier.feature_log_prob_[-i:]) # Only works for NB

        scored = list(zip(feature_names, diff))
        name_diff = dict(scored)
        for _, value in scored:
            if value < minimum:
                minimum = value
            if value > maximum:
                maximum = value

        # Sort once per class (the original re-sorted after every single insert).
        names_diff_sorted = sorted(name_diff.items(), key = op.itemgetter(1), reverse = True)

        word_nb_coef = {}  # fresh dict per class
        # Slicing tolerates num_feat larger than the number of features.
        for k, entry in enumerate(names_diff_sorted[:num_feat]):
            print(k, entry)
            word_nb_coef[entry[0]] = entry[1] # Dictionary of word coefficients
        party_word_nb_coef[label] = word_nb_coef
        print ('maximum', maximum, 'minimum', minimum)
    return party_word_nb_coef

In [None]:
# Top 200 features per party for the Naive Bayes model: names only, then names with scores
top_nb_party_words = list_top_features(top_nb, my_names, 200)
party_word_nb_coef = list_top_features_with_coefs(top_nb, my_names, 200)


In [None]:
# Spot-check: look up the stored score of one word for one party
print (party_word_nb_coef['green']['river']) # test

In [None]:
# Linear SVC for comparison. Feature-selection settings are reused from above
# (min_df and max_df in the Tfidf vectorizer, percentile in the selector), so
# only the SVC hyperparameter C is left to grid-search.

svc_steps = [
    ('vect', TfidfVectorizer(sublinear_tf=True, max_df=0.9, min_df=0.005,
                             stop_words='english', ngram_range=(1, 3))),
    ('selector', SelectPercentile(percentile=60)),
    ('svc_clf', LinearSVC(class_weight='balanced')),
]
text_svc_clf = Pipeline(svc_steps)
text_svc_clf.fit(docs_train, labels_train)

# class_weight: Set the parameter C of class i to class_weight[i]*C for SVC.
# If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y
# to automatically adjust weights inversely proportional to class frequencies in the input data

In [None]:
# Grid-search the pipeline's LinearSVC step over C
svc_parameters = {'svc_clf__C': (0.001, 0.01, 0.1, 1, 10, 100, 1000)}

# Two scorers are tracked; the final refit uses the weighted F1 score
svc_scoring = {
    'F1': make_scorer(f1_score, average='weighted'),
    'Accuracy': make_scorer(accuracy_score),
}

gs_svc_clf = GridSearchCV(
    text_svc_clf,
    svc_parameters,
    scoring=svc_scoring,
    refit='F1',
    cv=6,
)
gs_svc_clf.fit(docs_train, labels_train)
svc_results = gs_svc_clf.cv_results_

In [None]:
# Report the best cross-validated score and the parameter value that produced it
print("Best score")
print(gs_svc_clf.best_score_)

best_params = gs_svc_clf.best_params_
for param_name in sorted(svc_parameters):
    print("{}: {!r}".format(param_name, best_params[param_name]))

In [None]:
# Use on test set
svc_test_predicted = gs_svc_clf.predict(docs_test)

# Assess performance. classification_report expects (y_true, y_pred); passing them
# the other way round swaps precision and recall in the printed table.
print(classification_report(labels_test, svc_test_predicted, labels = parties))

In [None]:
# Examine top features and see if there is much crossover
# Fits a standalone LinearSVC on the already-selected training features.
# NOTE(review): C is fixed at 1 here rather than read from gs_svc_clf.best_params_;
# confirm it matches the grid-search result.
top_svc = LinearSVC(class_weight = 'balanced', C = 1)
top_svc.fit(cut_features_train_transf, labels_train)

In [None]:
def list_top_svc_features_with_coefs(classifier, feature_names, num_feat, labels=None):
    """Print the top features of each class together with their coefficients.

    Args:
        classifier: Fitted linear model exposing ``coef_``. Row ``i`` is taken
            to belong to ``labels[i]``.
        feature_names: Sequence of feature names aligned with the columns.
        num_feat: Maximum number of features to print per class.
        labels: Class labels in row order. Defaults to the module-level ``parties``.

    Returns:
        None. Everything is reported through ``print``.
    """
    if labels is None:
        labels = parties
    # Running extremes over every class, both starting from 0.
    minimum = 0
    maximum = 0
    for i, label in enumerate(labels):
        print('\n', label, '\n')
        coef = classifier.coef_[i,:]

        scored = list(zip(feature_names, coef))
        name_coef = dict(scored)
        # The original comparisons were inverted, so the value printed as "min" was
        # really the largest coefficient and the one printed as "max" the smallest.
        for _, value in scored:
            if value < minimum:
                minimum = value
            if value > maximum:
                maximum = value

        # Sort once per class (the original re-sorted after every single insert).
        names_coef_sorted = sorted(name_coef.items(), key = op.itemgetter(1), reverse = True)

        # Slicing tolerates num_feat larger than the number of features.
        for k, entry in enumerate(names_coef_sorted[:num_feat]):
            print(k, entry)
        print("max:", maximum, "min", minimum)
        
 # SVC coefficients can go negative - squaring is a bad idea.       

In [None]:
# Top 200 features per party for the linear SVC: names only, then names with coefficients
top_svc_party_words = list_top_features(top_svc, my_names, 200)
list_top_svc_features_with_coefs(top_svc, my_names, 200)

In [None]:
# Recompute the Naive Bayes top-200 word lists with the same num_feat as the SVC lists
top_nb_party_words = list_top_features(top_nb, my_names, 200)
#print(type(top_nb_party_words))

In [None]:
# Compare with top nb:

# For each party, find the words that appear in both the Naive Bayes and the SVC top lists.
common_words = {}
for party in parties:
    nb_words = top_nb_party_words[party]
    svc_words = top_svc_party_words[party]

    # The percentage below only makes sense when both lists are the same size
    if len(nb_words) != len(svc_words):
        print("Warning: invalid comparison - the lists are different lengths.")

    common = list(set(nb_words).intersection(svc_words))
    overlap_pct = 100*len(common)/len(svc_words)
    print(party, ":", len(common), "out of", len(nb_words), "words are the same. (",
          overlap_pct, '%)')
    common_words[party] = common
    print(common)

# I'm more comfortable interpreting the Naive Bayes coefficients so I will use them, but use the cross-over list.

In [None]:
# Turn the NB scores of the shared words into positive integer weights for wordle.net
#party_word_nb_coef = list_top_features_with_coefs(top_nb, my_names, 200)

party_word_cloud_coef = {}

for party in parties:
    print('\n', party, '\n')
    cloud_weights = {}
    party_word_cloud_coef[party] = cloud_weights
    for word in common_words[party]:
        word = word.strip()
        # Shift by 3 and scale by 100 so the weights come out as positive integers
        shifted = party_word_nb_coef[party][word] + 3
        cloud_weights[word] = str(int(100*shifted))
        # "word:weight" is the format wordle.net accepts
        my_string = word + ':' + cloud_weights[word]
        print (my_string)

# These are still stemmed

In [None]:
# READ IN NEW (OUT-OF-SAMPLE) TEST DOCUMENTS: LATEST PRESS RELEASES
# Create dictionary where key = label (party) and value = list of release text strings.
def read_in_new(party, folder):
    """Return the contents of every file in ``folder`` as a list of strings.

    Args:
        party: Party label. Not used by the function; kept so existing calls
            keep working.
        folder: Directory whose entries are each opened and read as UTF-8 text.

    Returns:
        A list with one string per file, in ``os.listdir(folder)`` order.
    """
    release_text_list = []
    for filename in os.listdir(folder):
        full_path = os.path.join(folder, filename)
        # ``with`` closes the handle even if read() raises.
        with open(full_path, 'r', encoding='utf8') as file_obj:
            release_text_list.append(file_obj.read())
    return release_text_list

# New releases are read from test/<party>/ into a per-party dict and one flat list.
dict_of_new_text_lists = {}
list_of_all_new_texts = []

for party in parties:
    new_folder = os.path.join('test', party)
    print(new_folder)
    new_releases = read_in_new(party, new_folder)
    #print(new_releases[0][:100])
    dict_of_new_text_lists[party] = new_releases
    print('Total new docs for party', party, '=', len(new_releases))
    list_of_all_new_texts.extend(new_releases)

# Scratch name from the original cell; it ends up empty once loading finishes.
list_of_all_new_party_texts = []

print('Total documents: ', len(list_of_all_new_texts))

In [None]:
# Strip and stem the new documents with the same steps used for the training data

stemmer = SnowballStemmer("english")
dict_of_new_proc_text_lists = {}
new_stem_words = []
party_list_of_new_proc_texts = []

for party in parties:
    party_list_of_new_proc_texts = []

    for text in dict_of_new_text_lists[party]:
        # Manual strip list: presence is tested against the untouched text,
        # while the replacement is applied to the working copy.
        strip_text = text
        for goner in (entry for entry in strip_list if entry in text):
            strip_text = strip_text.replace(goner, ' ')

        # Snowball-stem each whitespace-separated token and re-join with single spaces
        new_stem_words = list(map(stemmer.stem, strip_text.split()))
        stem_text = " ".join(new_stem_words)

        # Blank out the problematic strings that only show up after stemming
        for stemword in strip_from_stemmed:
            stem_text = stem_text.replace(stemword, ' ')

        party_list_of_new_proc_texts.append(stem_text)

    # Processed texts for this party, keyed by party label
    dict_of_new_proc_text_lists[party] = party_list_of_new_proc_texts

In [None]:
# Flatten the per-party processed new texts into one unlabelled list (party order)
list_of_all_new_proc_texts = []
for party in parties:
    procd = dict_of_new_proc_text_lists[party]
    list_of_all_new_proc_texts.extend(procd)

print(len(list_of_all_new_proc_texts))

# Build the parallel list of labels: one party name per processed text
new_party_match = []
for party, party_texts in dict_of_new_proc_text_lists.items():
    new_party_match.extend([party] * len(party_texts))

print(len(new_party_match))

In [None]:
# Reuse the vectoriser and selector fitted on the training data; nothing is re-fitted here.
features_test2_transf = top_vectorizer.transform(list_of_all_new_proc_texts)
cut_features_test2_transf  = top_selector.transform(features_test2_transf).toarray()
print(cut_features_test2_transf.shape[0]) # dense array after toarray(); shape[0] = number of new documents
print(cut_features_test2_transf.shape[1]) # shape[1] = number of features kept by the selector

In [None]:
# Labels for the new documents as a numpy array
labels_new_test = np.array(new_party_match)
print(len(labels_new_test))
print(features_test2_transf.shape[0]) # rows of the vectorised matrix; should equal the label count printed above

In [None]:
# Score both fitted models on the new, out-of-sample press releases.
# classification_report expects (y_true, y_pred); passing them the other way round
# swaps precision and recall in the printed table. confusion_matrix already used
# the (true, predicted) order.

# Naive Bayes
test_new_predicted_nb = top_nb.predict(cut_features_test2_transf)
print(classification_report(labels_new_test, test_new_predicted_nb, labels = parties))
print('Confusion Matrix')
print(metrics.confusion_matrix(labels_new_test, test_new_predicted_nb))

# SVC
test_new_predicted_svc = top_svc.predict(cut_features_test2_transf)
print(classification_report(labels_new_test, test_new_predicted_svc, labels = parties))

print('Confusion Matrix')
print(metrics.confusion_matrix(labels_new_test, test_new_predicted_svc))