The optimised model did extremely well on the held-out test set.
Now subject it to other robustness checks after running the vectorizer again with the optimised values.


In [None]:
import os
import operator as op

import numpy as np
from numpy import array

from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

from sklearn.svm import LinearSVC

from nltk.stem.snowball import SnowballStemmer

In [None]:
parties = ['act', 'green', 'labour', 'maori', 'national', 'nzfirst']

strip_list = ['Posted by', '\n', 'Jacinda', 'Ardern', 'Steven', 'Joyce', ' Bill ', 'English', 'Carmel', 'Sepuloni', 'Barry',
                '2017', 'David', 'Clark', 'Phil', 'Twyford', 'Michael', 'Wood', 'Chris ', 'Hipkins', 'Grant', 'Maggie',
                'Robertson', 'Greg', 'O’Connor', 'Andrew', ' Little', 'Winston', 'Peters', 'Damien', 'O\'Connor',
                'Kelvin', 'Davis', 'Phil', 'Twyford', 'Megan', 'Woods', 'Parker', 'Nanaia', 'Mahuta', 'Paula', 'Bennett',
                'Carter', 'Gerry', 'Brownlee', 'Simon', ' Bridges', ' Amy', 'Adams', 'Jonathan', 'Coleman', 'Christopher',
                'Finlayson', 'Woodhouse', 'Nathan', 'Guy', 'Anne', 'Tolley', ' Ron ', 'Mark', 'Marama', 'Fox', ' Te ', 
                'Ururoa', 'Flavel', 'Jones', 'Shane', 'Taurima', 'Seymour', 'James', 'Shaw', 'Marama', 'Davidson', ' Dr ',
                'Julie', 'Anne', 'Genter', 'Jan ', 'Logie', 'Eugenie', 'Sage', 'Gareth', 'Hughes', 'Steffan', 'Browning',
                'Rt', 'Hon', 'Nick', 'Smith', 'Nikki', 'Kaye', 'Nicky', 'Wagner', 'Minister', 'Paul', 'Goldsmith',
                'ACT', ' National ', 'Green Party', 'Labour', 'First ', 'ENDS', '.', ',' '\"', '\'', 'Māori Party',
                '“','”', 'Facebook5Twitter', 'Steffan', 'Browning', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                'Carlton', 'Burke', 'Chadwick', 'Catherine', 'Christine', 'Alex', 'Alexander', 'Baker', 'Carolyn', 
                'Alyssa', 'Brown', 'Bob', 'Byrn', 'Augustine', 'Crawford', 'Antonio', 'Claudetta', 'Christina', 'Collins'
                'Buckner', 'centre', 'Ben', 'Boyden',  'Alan', 'Bosley', '’', 'Alastair', 'Ballantyne', 'Bruno', 'Cecelia',
                 'Allan', 'Bernard', 'Anderson', 'Andrea', 'Tim', 'spokesperson', 'Scott', 'Simpson', 'Epsom', 'Metiria',
                 'Turei', 'said', 'say', 'John Key', 'John', 'Tukoroirangi Morgan', 'Dame', 'Mei', 'Reedy', 'Leader', 
                 'Northland', 'Member of Parliament', 'Spokesperson', 'Don', 'Houlbrook', 'Stephen', 'Todd', 'Barclay',
                 'Morgan', 'Tariana', 'Turia', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
                 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 
                 'November', 'December', 'Southland', 'Judith', 'Collins', 'Jacqui', 'Dean', 'Bhupind', 'Singh', 'van Velden',
                 'govt.nz', 'Mitchel', 'New Zealand', 'First', 'NZ', 'Mitchell', 'Tracy', 'Martin', 'Mike', 'MP', 'Prosser',
                 'William', 'Sio', 'Don', 'Zealand', 'Aupito', 'Kevin', 'Hague', 'Bhupind', 'Singh', 'Louise', 'Upston'] 

strip_from_stemmed = ['conclusionth', 'bennet', 'brydon', 'bosley', 'centrewellington', 'countrynew', 'ballantyn', 'allan',
                     'delahunti', ',', 'beth', 'ms', 'mr', 'www', 'http', 'media contact', 'govt', 'nz'] 


In [None]:
def read_in(party, folder):
    release_text_list = []
    for filename in os.listdir(folder):
        full_path = os.path.join(party, filename)
        #print(full_path)
        file_obj = open(os.path.join(party, filename), 'r', encoding='utf8')
        content = file_obj.read()
        file_obj.close()
        release_text_list.append(content)
    return(release_text_list)
# Create dictionary where key = label (party) and value = list of release text strings.

dict_of_text_lists = {}
list_of_all_texts = []
list_of_all_party_texts = []

for party in parties:
    list_of_all_party_texts = read_in(party, party)
    dict_of_text_lists[party] = list_of_all_party_texts
    print('Total docs for party', party, '=', len(list_of_all_party_texts))
    list_of_all_texts = list_of_all_texts + list_of_all_party_texts
    list_of_all_party_texts = []

print('Total documents: ', len(list_of_all_texts))


In [None]:
# Create dictionary where key = label (party) and value = list of release text strings.

dict_of_text_lists = {}
list_of_all_texts = []
list_of_all_party_texts = []

for party in parties:
    list_of_all_party_texts = read_in(party, party)
    dict_of_text_lists[party] = list_of_all_party_texts
    print('Total docs for party', party, '=', len(list_of_all_party_texts))
    list_of_all_texts = list_of_all_texts + list_of_all_party_texts
    list_of_all_party_texts = []

print('Total documents: ', len(list_of_all_texts))


In [None]:
# Undersample National to improve dataset balance
# If re-running, need to run the cell above first!

print ('Removing three out of every four National Party press release over time:')
print('National had', len(dict_of_text_lists['national']), 'press releases')
top_end = len(dict_of_text_lists['national'])
              
for i in range(0, int(top_end/4)):
#for i in range(1, int(top_end/4)):  # Robustness check - try a different quarter
#for i in range(2, int(top_end/4)):  # Robustness check - try a different quarter
#for i in range(3, int(top_end/4)):  # Robustness check - try a different quarter
    del dict_of_text_lists['national'][i]
    del dict_of_text_lists['national'][i]        
    del dict_of_text_lists['national'][i]        
    i += 3
    
print('National now has', len(dict_of_text_lists['national']), 'press releases')


In [None]:
# Get rid of names (giveaways for the authoring problem) and a few other problematic ('cheat') strings, and stem the text

stem_words = []
party_list_of_proc_texts = []
dict_of_proc_text_lists = {}
stemmer = SnowballStemmer("english")

for party in parties:
    party_list_of_proc_texts = []
    
    # Remove the words in my manual strip list above
    for text in dict_of_text_lists[party]:
        strip_text = text
        for goner in strip_list:
            if goner in text:
                strip_text = strip_text.replace(goner, ' ')                
            else:
                pass
        
        # Apply Snowball stemmer:
        stem_words = []
        text_words = strip_text.split()
        for word in text_words:
            stem_word = stemmer.stem(word)
            stem_words.append(stem_word)
        stem_text = " ".join(stem_words)
        
        # Strip a few problematic strings from the stemmed text:
        for stemword in strip_from_stemmed:
            if stemword in stem_text:
                stem_text = stem_text.replace(stemword, ' ')
        party_list_of_proc_texts.append(stem_text)
        
    # Put list of processed party texts in dictionary with party as key
    dict_of_proc_text_lists[party] = party_list_of_proc_texts

        
    
print('Example:')    
print('Original text:', text[:80])
print('\n', 'Stripped text:', strip_text[:80])
print('\n', 'After stem:' , stem_text[:80])


In [None]:
# Make a list of (unlabelled) processed texts
list_of_all_proc_texts = []

for party in parties:
    procd = dict_of_proc_text_lists[party]
    list_of_all_proc_texts = list_of_all_proc_texts + procd
    
print(len(list_of_all_proc_texts))
    
# Make a list of party authors that will match up with the texts 
party_match = []
for party in dict_of_proc_text_lists:
    for text in dict_of_proc_text_lists[party]:
        party_match.append(party)
        
print(len(party_match))


In [None]:
# Split into training and testing sets:
docs_train, docs_test, labels_train, labels_test = model_selection.train_test_split(list_of_all_proc_texts, 
                                                                                    party_match, 
                                                                                    test_size = 0.2,
                                                                                    stratify = party_match)
# Stratify ensures same balance of test as training data
print(len(docs_train))

In [None]:
top_vectorizer = TfidfVectorizer(sublinear_tf = True, max_df = 0.9, min_df = 0.005, stop_words='english', ngram_range=(1, 3))
features_train_transf = top_vectorizer.fit_transform(docs_train)
features_test_transf  = top_vectorizer.transform(docs_test)

In [None]:
all_feature_names = top_vectorizer.get_feature_names()
print('Initial number of features after vectorisation:', len(all_feature_names))  

In [None]:
# Select top x% of most individually useful features
top_selector = SelectPercentile(f_classif, percentile=60) 
top_selector.fit(features_train_transf, labels_train)

my_names = np.asarray(top_vectorizer.get_feature_names())[top_selector.get_support()]  #  Feature names (alphabetical)
print('New number of features after SelectPercentile:', len(my_names)) # Halved.

In [None]:
# Transform and convert to numpy arrays
cut_features_train_transf = top_selector.transform(features_train_transf).toarray()
cut_features_test_transf  = top_selector.transform(features_test_transf).toarray()

# Convert labels from lists to numpy arrays
labels_train = np.array(labels_train)
labels_test = np.array(labels_test)

### Now ready for Scikit-learn

In [None]:
top_nb = MultinomialNB(alpha = 0.01)
top_fitted     = top_nb.fit(cut_features_train_transf, labels_train)
test_predicted = top_nb.predict(cut_features_test_transf) 
print(classification_report(test_predicted, labels_test, labels = parties))


In [None]:
"""
            precision    recall  f1-score   support

        act       0.77      0.91      0.83        22
      green       0.64      0.78      0.70        18
     labour       0.85      0.72      0.78        40
      maori       0.80      0.89      0.84         9
   national       0.89      0.89      0.89        45
    nzfirst       0.85      0.80      0.82        50

avg / total       0.83      0.82      0.82       184


"""

In [None]:
print('Confusion Matrix')
print(metrics.confusion_matrix(labels_test, test_predicted))
# {'act': 26, 'green': 22, 'labour': 34, 'maori': 10, 'national': 60, 'nzfirst': 47}

In [None]:
"""
Confusion Matrix
[[20  1  1  0  1  3]
 [ 0 14  3  0  2  3]
 [ 1  0 29  1  0  3]
 [ 0  1  0  8  1  0]
 [ 0  1  3  0 40  1]
 [ 1  1  4  0  1 40]]
In [ ]:


 
 Looks much better. Much less tendency to always predict National or NZ First. Maori party accuracy is excellent!
"""


In [None]:
"""
# Robustness check 1: try different quarters of National data

# Result with alternative 1:
precision    recall  f1-score   support

        act       0.81      0.81      0.81        26
      green       0.55      0.86      0.67        14
     labour       0.76      0.76      0.76        34
      maori       0.80      0.89      0.84         9
   national       0.98      0.83      0.90        53
    nzfirst       0.74      0.73      0.74        48

avg / total       0.81      0.79      0.80       184

# Result with alternative 2:

precision    recall  f1-score   support

        act       0.88      0.96      0.92        24
      green       0.55      0.86      0.67        14
     labour       0.62      0.88      0.72        24
      maori       0.80      1.00      0.89         8
   national       0.96      0.85      0.90        52
    nzfirst       1.00      0.75      0.85        63

avg / total       0.88      0.84      0.85       185


# Result with alternative 3:

precision    recall  f1-score   support

        act       0.81      0.88      0.84        24
      green       0.55      0.75      0.63        16
     labour       0.74      0.78      0.76        32
      maori       0.80      1.00      0.89         8
   national       0.96      0.83      0.89        54
    nzfirst       0.87      0.79      0.83        52

avg / total       0.83      0.82      0.82       186


"""

In [None]:
# Make list of features with highest coefficient values, per class, from most to least important

def list_top_features(classifier, feature_names, num_feat):
    party_words = {}
    counter = 0
    top = 'top' + str(num_feat)
    for i, label_train in enumerate(parties):          # enumerate loops with an automatic counter (in this case, i)
        top = np.argsort(classifier.coef_[i])[::-1][0:num_feat]
        list_top = str(', '.join(feature_names[j] for j in top)).split(',')
        print(" ")
        print(parties[counter], 'most distinguishing words from most to ' + str(num_feat) + ':')
        print(list_top)
        party_words[parties[counter]] = list_top  # dict of lists where keys are parties
        counter += 1        
    return(party_words)



# This version also prints out coefficients:

def list_top_features_with_coefs(classifier, feature_names, num_feat):
    word_nb_coef = {}
    party_word_nb_coef = {}
    maximum = 0
    minimum = 0
    
    len_feature_names = len(feature_names)
    for i in range(6):
        print('\n', parties[i], '\n')  
        diff = classifier.feature_log_prob_[i,:] - np.max(classifier.feature_log_prob_[-i:]) # Only works for NB
        
        name_diff = {}   
        for j in range(len_feature_names):
            name_diff[feature_names[j]] = diff[j]
            names_diff_sorted = sorted(name_diff.items(), key = op.itemgetter(1), reverse = True)
        # Check min coef - any negative?
            if diff[j] < minimum:
                minimum = diff[j]
            else:
                pass
            if diff[j] > maximum:
                maximum = diff[j]
            else:
                pass
            
        for k in range(num_feat):
            print(k, names_diff_sorted[k])
            word_nb_coef[names_diff_sorted[k][0]] = names_diff_sorted[k][1] # Dictionary of word coefficients            
        party_word_nb_coef[parties[i]] = word_nb_coef    
        print ('maximum', maximum, 'minimum', minimum)
    return party_word_nb_coef       

In [None]:
top_nb_party_words = list_top_features(top_nb, my_names, 200)   
party_word_nb_coef = list_top_features_with_coefs(top_nb, my_names, 200) 


In [None]:
print (party_word_nb_coef['green']['river']) # test


In [None]:
# Try a linear SVC for comparison - use the above values for feature selection (min_df and max_df in the Tfidf vectorizer, 
# percentile in the selector) and only gridsearch the SVC hyperparameter C.

text_svc_clf = Pipeline([('vect', TfidfVectorizer(sublinear_tf=True, max_df = 0.9, min_df = 0.005, 
                                                  stop_words='english', ngram_range=(1, 3))),
                         ('selector', SelectPercentile(percentile = 60)),
                         ('svc_clf', LinearSVC(class_weight = 'balanced'))])
text_svc_clf.fit(docs_train, labels_train) 

# class_weight: Set the parameter C of class i to class_weight[i]*C for SVC.
# If not given, all classes are supposed to have weight one. The “balanced” mode uses the values of y 
# to automatically adjust weights inversely proportional to class frequencies in the input data 

In [None]:
# Gridsearch LinearSVC over C
svc_parameters = {'svc_clf__C': (0.001, 0.01, 0.1, 1, 10, 100, 1000)}

svc_scoring = {'F1':       make_scorer(f1_score, average='weighted'), 
               'Accuracy': make_scorer(accuracy_score)}

gs_svc_clf = GridSearchCV(text_svc_clf, svc_parameters, scoring = svc_scoring, refit = 'F1', cv = 6)
gs_svc_clf.fit(docs_train, labels_train)
svc_results = gs_svc_clf.cv_results_


In [None]:
print("Best score")
print(gs_svc_clf.best_score_)   

for param_name in sorted(svc_parameters.keys()):
    print("%s: %r" % (param_name, gs_svc_clf.best_params_[param_name]))

In [None]:
# Use on test set
svc_test_predicted = gs_svc_clf.predict(docs_test)

# Assess performance:
print(classification_report(svc_test_predicted, labels_test, labels = parties))

In [None]:
"""
It beat the Naive Bayes

SVC:
            precision    recall  f1-score   support

        act       0.77      0.95      0.85        21
      green       0.73      0.84      0.78        19
     labour       0.85      0.88      0.87        33
      maori       0.80      1.00      0.89         8
   national       0.93      0.89      0.91        47
    nzfirst       0.91      0.77      0.83        56

avg / total       0.87      0.86      0.86       184




vs best NB with same features and test set:
            precision    recall  f1-score   support

        act       0.81      0.78      0.79        27
      green       0.45      0.59      0.51        17
     labour       0.68      0.82      0.74        28
      maori       0.90      0.90      0.90        10
   national       0.91      0.82      0.86        50
    nzfirst       0.87      0.79      0.83        52

avg / total       0.81      0.79      0.79       184

"""

In [None]:
# Examine top features and see if there is much crossover 
top_svc = LinearSVC(class_weight = 'balanced', C = 1)
top_svc.fit(cut_features_train_transf, labels_train)


In [None]:
def list_top_svc_features_with_coefs(classifier, feature_names, num_feat):
    len_feature_names = len(feature_names)
    word_nb_coef = {}
    minimum = 0
    maximum = 0
    for i in range(6):
        print('\n', parties[i], '\n')  
        coef = classifier.coef_[i,:]  
        name_coef = {}   
        for j in range(len_feature_names):
            name_coef[feature_names[j]] = coef[j]
            names_coef_sorted = sorted(name_coef.items(), key = op.itemgetter(1), reverse = True)
        # Check min coef - any negative?
            if classifier.coef_[i,j] > minimum:
                minimum = classifier.coef_[i,j]
            else:
                pass
            if classifier.coef_[i,j] < maximum:
                maximum = classifier.coef_[i,j]
            else:
                pass
            
        for k in range(num_feat):
            print(k, names_coef_sorted[k])   
        print("max:", maximum, "min", minimum)
        
 # SVC coefficients can go negative - squaring is a bad idea.       

In [None]:
top_svc_party_words = list_top_features(top_svc, my_names, 200)   
list_top_svc_features_with_coefs(top_svc, my_names, 200)

In [None]:
top_nb_party_words = list_top_features(top_nb, my_names, 200)   
#print(type(top_nb_party_words))

In [None]:
# Compare with top nb:

# Calculate crossover between top Naive Bayes model and top SVC model:
common_words = {}
for party in parties:
    if len(top_nb_party_words[party]) == len(top_svc_party_words[party]):
        pass
    else:
        print("Warning: invalid comparison - the lists are different lengths.")
    common = list(set(top_nb_party_words[party]).intersection(top_svc_party_words[party]))  
    print(party, ":", len(common), "out of", len(top_nb_party_words[party]), "words are the same. (", 
          100*len(common)/len(top_svc_party_words[party]), '%)')
    common_words[party] = common
    print(common)
    
# I'm more comfortable interpreting the Naive Bayes coefficients so I will use them, but use the cross-over list.    

act : 90 out of 200 words are the same. ( 45.0 %)
[' partnership school', ' manag act', ' choic', ' right', ' person', ' instead', ' campaign', ' homebuild', ' left', ' hous shortag', ' poll', ' onli', ' partnership', ' reveal', ' point', ' ll', ' red', ' share', ' polici', ' age', ' freedom', ' salari', ' resourc manag', ' achiev', ' cut tax', ' pay tax', ' busi', ' far', ' entir', ' life', ' current', ' properti', ' debat', ' tax', ' wait', ' simpli', ' benefit', ' turn', ' lie', ' replac', ' choos', ' voter', ' rural', ' market', ' polit', ' shortag', ' caus', ' scrap', ' confisc', ' power', ' infrastructur', ' use', ' anti', ' red tape', ' mean', ' welcom', ' resourc manag act', ' save', ' parti vote', ' believ', ' onli parti', ' politician', ' let', ' tape', ' cut', ' road price', ' book', ' stronger', ' green', ' taxpay money', ' actual', ' honest', ' strong', ' state school', ' superannu', ' right govern', ' words', ' candid', ' effect', ' alreadi', ' case', ' election', ' rais', ' assist die', ' hous market', ' pay', ' cap', ' kid', ' resign', ' confirm']
green : 84 out of 200 words are the same. ( 42.0 %)
[' foreign trust', ' real', ' risk', ' inquiri', ' people', ' today govern', ' aren', ' child poverti', ' concern', ' river', ' power', ' roch', ' includ', ' child', ' capit gain', ' investor', ' refus', ' need', ' landcorp', ' left', ' lead', ' farm', ' transport', ' control', ' agreement', ' larg', ' ensur', ' convers', ' law', ' select committe', ' cold', ' nativ', ' rivers', ' industri', ' protect', ' clear', ' properti specul', ' famili', ' work', ' govern need', ' ignor', ' chang', ' decis', ' percent', ' profit', ' encourag', ' better', ' benefit', ' marin', ' won', ' make', ' environ', ' bank', ' stop', ' forest', ' review', ' poverti', ' hous market', ' like', ' veri', ' children', ' thousand', ' govern stop', ' kind', ' human', ' choos', ' capit', ' push', ' low', ' kid', ' leav', ' cut', ' plan', ' amend', ' home', ' close', ' pollut', ' climat', ' isn', ' son', ' doesn', ' green', ' select', ' look']
labour : 77 out of 200 words are the same. ( 38.5 %)
[' sure', ' happen', ' cent', ' long', ' citi', ' modern', ' rapid', ' health', ' dhb', ' communiti', ' auckland', ' river', ' hospit', ' climat chang', ' pressur', ' addit', ' teacher', ' fail', ' cost', ' wrong', ' ask', ' deliv', ' healthi', ' ensur', ' allow', ' opposit', ' start', ' financ', ' fix', ' fair', ' vision', ' specul', ' commit', ' promis', ' motel', ' world class', ' build', ' time fresh approach', ' boost', ' simpli', ' skill', ' believ', ' alreadi', ' restor', ' fiscal plan', ' better', ' state', ' share', ' years', ' place', ' invest', ' fix hous', ' claim', ' evid', ' thing', ' time', ' approach', ' clean', ' afford', ' nation', ' question', ' kiwibuild', ' polici', ' fact', ' answer', ' tax cut', ' fresh approach', ' make sure', ' elect', ' qualiti', ' plan', ' time fresh', ' care', ' lack', ' underfund', ' deserv', ' state hous']
maori : 118 out of 200 words are the same. ( 59.0 %)
[' parti lead', ' issu', 'māori', ' papa', ' establish', ' presid', ' inspir', ' tribut', ' te', ' hapū', ' influenc', ' aspir', ' kai', ' tonight', ' mana', ' years', ' ko', ' togeth', ' approach', ' māori seat', ' vote', ' general roll', ' whānau hapū', ' tangata', ' care', ' issue', ' tourism', ' māori pasifika', ' suffer', ' life', ' don', ' year māori', ' ringa', ' atu', ' correct', ' dream', ' whakapapa', ' repres', ' budget', ' kōhanga reo', ' tamariki', ' network', ' tai', ' heard', ' host', ' kaupapa', ' make', ' lie', ' māori tourism', ' aotearoa', ' peopl', ' ethnic', ' voter', ' fight', ' abus', ' voic', ' parti', ' whānau', ' tonga', ' seat', ' māori hous', ' long', ' kura', ' taar', ' state care', ' stori', ' ora', ' iwi', ' prison', ' job', ' wellb', ' million budget', ' manurewa', ' kei', ' whānau hapū iwi', ' royal commiss', ' indigen', ' royal', ' non māori', ' seek', ' asian', ' veri', ' royal commiss inquiri', ' justic', ' especi', ' indigen peopl', ' think', ' seab', ' kōhanga', ' inquiri', ' speak', ' foreshor seab', ' lead', ' hand', ' reo', ' tabl', ' sharp', ' candid', ' seven', ' link', ' elector', ' ki', ' mokopuna', ' pasifika', ' term', ' hapū iwi', ' advoc', ' strength', ' foreshor', ' general', ' commiss inquiri', ' tai tonga', ' roll', ' mai', ' live', ' whānau ora', ' waitangi', ' commiss']
national : 100 out of 200 words are the same. ( 50.0 %)
[' servic', ' communiti', ' initi', ' visit', ' support', ' secur', ' complet', ' forc', ' deliv', ' improv', ' billion', ' trust', ' help', ' skill', ' chang', ' week', ' key', ' togeth', ' import', ' intern', ' month', ' process', ' role', ' achiev', ' busi', ' abl', ' technolog', ' cent', ' north', ' project', ' develop', ' current', ' progress', ' addit', ' agenc', ' design', ' continu', ' member', ' upston', ' aim', ' success', ' social hous', ' direct', ' upgrad', ' invest million', ' launch', ' today', ' inform', ' appoint', ' enabl', ' crown', ' digit', ' engag', ' encourag', ' extend', ' includ', ' provid', ' onlin', ' conserv', ' rang', ' contribut', ' signific', ' identifi', ' wider', ' follow', ' event', ' wellington', ' reduc', ' defenc', ' receiv', ' pacif', ' justic', ' play', ' increas', ' recoveri', ' connect', ' collabor', ' facil', ' south', ' regul', ' avail', ' meet', ' innov', ' partner', ' term', ' million', ' earthquak', ' implement', ' construct', ' group', ' practic', ' scienc', ' focus', ' social', ' bay', ' announc', ' number', ' today announc', ' announc today', ' associ']
nzfirst : 73 out of 200 words are the same. ( 36.5 %)
[' shut', ' hard', ' demand', ' noth', ' real', ' econom', ' year', ' mass', ' farmer', ' countri', ' scheme', ' auckland', ' concern', ' sort', ' deni', ' accept', ' farm', ' pm', ' know', ' super', ' hand', ' massiv', ' got', ' sale', ' fujixerox', ' south', ' talk', ' start', ' face', ' despit', ' industri', ' job', ' biosecur', ' ago', ' fraud', ' sold', ' net', ' taken', ' promis', ' mass immigr', ' national', ' hold', ' want', ' small', ' doe', ' parliament', ' gst', ' prime', ' end', ' did', ' anoth', ' given', ' line', ' old', ' ownership', ' parliament today', ' crime', ' deal', ' tri', ' question', ' district', ' total', ' offic', ' matter', ' asset', ' economi', ' lack', ' maori', ' big', ' export', ' deputi', ' immigr', ' built']



In [None]:
# Convert NB coefficients to suitable number for wordle.net
#party_word_nb_coef = list_top_features_with_coefs(top_nb, my_names, 200) 

party_word_cloud_coef = {}

for party in parties:
    print('\n', party, '\n')
    party_word_cloud_coef[party] = {}
    for word in common_words[party]:
        word = word.strip()
        party_word_cloud_coef[party][word] = str(int(100*(party_word_nb_coef[party][word] + 3))) # Make them positive integers
        my_string = word + ':' + party_word_cloud_coef[party][word] # Print in format ready to drop into wordle.net
        print (my_string)

# These are still stemmed

#### My best guess at de-stemming (removing unintelligible and very common words)

act 

partnership school:61
choice:96
right:90
campaign:38
homebuilding:55
left:220
housing shortage:78
poll:121
online:116
partnership:77
reveal:62
point:107
red:158
share:169
policy:118
age:103
freedom:93
salary:85
resource management act:67
achieve:98
cut tax:69
pay tax:55
business:87
property:225
debate:104
tax:69
simplify:200
benefit:98
lie:80
replace:106
choose:205
voter:96
rural:71
market:57
politics:86
shortage:171
cause:203
scrap:97
confiscate:69
power:36
infrastructure:40
anti:57
red tape:166
welcome:51
save:191
party vote:73
believe:32
only party:56
politician:85
let:62
tape:164
cut:212
roading price:74
stronger:130
green:38
taxpayers money:79
honest:66
strong:71
state school:74
superannuation:85
right to govern:72
words:106
candid:156
effect:82
election:81
raise:77
assisted dying:55
housing market:210
pay:76
cap:61
kids:179
resign:133
confirm:80

 green 

foreign trust:220
risk:242
inquiry:123
people:196
today's government:181
arena:191
child poverty:209
concern:71
river:178
power:36
children:51
capital gains:189
investor:186
refuse:196
need:117
Landcorp:216
lead:217
farm:78
transport:38
control:55
agreement:225
large:194
conversation:209
law:55
select committee:212
cold:193
native:191
rivers:214
industry:92
protect:73
clear:43
property speculators:195
family:32
work:99
government needs:214
ignore:155
change:61
decision:90
profit:205
encourage:69
better:84
benefit:98
marine:178
environment:95
bank:53
stop:49
forest:180
review:243
poverty:223
housing market:210
children:128
government to stop:179
kind:194
human:190
choose:205
capital:196
push:206
low:247
kids:179
leave:191
cut:212
plan:52
amend:182
home:61
pollution:271
climate:148
green:38

 labour 

city:45
modern:157
rapid:151
health:61
DHB:167
community:29
Auckland:126
river:178
hospital:168
climate change:149
pressure:207
teachers:195
fail:228
cost:57
wrong:148
deliver:107
healthy:176
ensure:117
allow:201
opposition:237
financial:161
fix:194
fair:196
vision:165
speculators:173
commit:111
promise:80
motel:151
world class:144
build:53
time for a fresh approach:176
simplify:200
skill:114
believe:32
restore:150
fiscal plan:160
better:84
state:77
share:169
place:83
invest:36
fix housing:148
claim:91
evidence:158
time:108
clean:152
affordability:248
nation:125
question:104
Kiwibuild:178
policy:118
fact:43
tax cut:174
fresh approach:268
make sure:161
election:95
quality:186
plan:52
fresh:182
care:123
lack:30
underfunded:151
deserve:199
state housing:211

 maori 

issue:48
māori:300
papa:105
establish:86
inspired:64
tribute:76
te:207
hapū:126
influence:71
aspire:93
kai:52
mana:85
together:60
approach:72
māori seats:123
vote:153
general roll:58
whānau hapū:96
tangata:84
care:123
issues:65
tourism:85
māori pasifika:52
suffer:64
life:90
ringa:54
atu:90
correct:77
dream:66
whakapapa:52
repressed:76
budget:87
kōhanga reo:111
tamariki:150
network:52
tai:154
host:112
kaupapa:113
make:70
lie:80
māori tourism:60
Aotearoa:137
people:96
ethnicity:54
voter:96
fight:112
voice:134
party:144
whānau:267
tonga:119
seat:156
māori housing:73
long:50
kura:62
state care:80
story:91
ora:176
iwi:164
prison:88
job:102
wellbeing:73
millions budget:66
Manurewa:87
whānau hapū iwi:96
indigenous:138
non-māori:82
Asian:72
royal commission of inquiry:140
justice:52
seabed:55
kōhanga:115
inquiry:123
foreshore and seabed:61
reo:174
candid:156
electorate:50
mokopuna:80
Pasifika:94
hapū iwi:120
advocate:94
strength:50
foreshore:76
tai tonga:125
roll:92
whānau ora:167
waitangi:91
commission:96

 national 

service:56
community:29
initial:111
support:47
secure:89
complete:96
force:61
deliver:107
improve:139
billion:42
trust:98
help:42
skill:114
change:61
week:54
key:93
together:60
important:65
intern:59
process:66
role:77
achieve:98
business:87
able:76
technology:75
project:162
develop:32
current:97
progress:62
addition:79
agency:92
design:66
continue:48
aim:69
success:100
social housing:64
direct:58
upgrade:61
invest millions:52
launch:71
inform:135
appoint:64
enable:83
crown:59
digital:61
engage:70
encourage:69
extend:59
include:159
provide:178
online:48
conservation:49
contribution:119
significant:133
identify:50
follow:91
wellington:67
defence:79
Pacific:111
justice:52
increase:36
recovery:50
connect:124
collaboration:51
facility:62
regulation:51
available:120
innovation:102
partner:80
earthquake:51
implement:73
construction:62
practical:47
science:52
focus:104
social:110
announced today:84
association:90

 nzfirst 

hard:36
demand:39
nothing:84
real:72
economy:105
mass:52
farmers:64
country:117
scheme:60
Auckland:126
concern:71
denial:42
accept:53
farms:78
super:69
massive:70
sale:52
start:65
despite:55
industry:92
jobs:102
biosecurity:48
fraud:64
sold:33
promise:80
mass immigration:38
national:39
want:114
parliament:106
GST:46
ownership:43
crime:44
deal:41
try:75
question:104
office:99
asset:33
economy:106
lack:30
maori:36
export:98
immigration:126
built:55


In [None]:
# READ IN NEW (OUT-OF-SAMPLE) TEST DOCUMENTS: LATEST PRESS RELEASES
# Create dictionary where key = label (party) and value = list of release text strings.
def read_in_new(party, folder):
    release_text_list = []
    for filename in os.listdir(folder):
        full_path = os.path.join(folder, filename)
        #print(full_path)
        file_obj = open(os.path.join(folder, filename), 'r', encoding='utf8')
        content = file_obj.read()
        file_obj.close()
        release_text_list.append(content)
    return(release_text_list)

dict_of_new_text_lists = {}
list_of_all_new_texts = []
list_of_all_new_party_texts = []

for party in parties:
    new_folder = str(os.path.join('test', party))
    print(new_folder)
    list_of_all_new_party_texts = read_in_new(party, new_folder)
    #print(list_of_all_new_party_texts[0][:100])
    dict_of_new_text_lists[party] = list_of_all_new_party_texts
    print('Total new docs for party', party, '=', len(list_of_all_new_party_texts))
    list_of_all_new_texts = list_of_all_new_texts + list_of_all_new_party_texts
    list_of_all_new_party_texts = []

print('Total documents: ', len(list_of_all_new_texts))

In [None]:
# Strip and stem

new_stem_words = []
party_list_of_new_proc_texts = []
dict_of_new_proc_text_lists = {}
stemmer = SnowballStemmer("english")

for party in parties:
    party_list_of_new_proc_texts = []
    
    # Remove the words in my manual strip list above
    for text in dict_of_new_text_lists[party]:
        strip_text = text
        for goner in strip_list:
            if goner in text:
                strip_text = strip_text.replace(goner, ' ')                
            else:
                pass
        
        # Apply Snowball stemmer:
        new_stem_words = []
        text_words = strip_text.split()
        for word in text_words:
            stem_word = stemmer.stem(word)
            new_stem_words.append(stem_word)
        stem_text = " ".join(new_stem_words)
        
        # Strip a few problematic strings from the stemmed text:
        for stemword in strip_from_stemmed:
            if stemword in stem_text:
                stem_text = stem_text.replace(stemword, ' ')
        party_list_of_new_proc_texts.append(stem_text)
        
    # Put list of processed party texts in dictionary with party as key
    dict_of_new_proc_text_lists[party] = party_list_of_new_proc_texts

In [None]:
print (dict_of_new_proc_text_lists['act'][0][0:100])
print (dict_of_new_proc_text_lists['green'][9][0:100])
print (dict_of_new_proc_text_lists['labour'][0][0:100])
print (dict_of_new_proc_text_lists['maori'][0][0:100])
print (dict_of_new_proc_text_lists['national'][0][0:100])
print (dict_of_new_proc_text_lists['nzfirst'][0][0:100])


In [None]:
# Make a list of (unlabelled) processed texts
list_of_all_new_proc_texts = []

for party in parties:
    procd = dict_of_new_proc_text_lists[party]
    list_of_all_new_proc_texts = list_of_all_new_proc_texts + procd
    
print(len(list_of_all_new_proc_texts))
    
# Make a list of party authors that will match up with the texts 
new_party_match = []
for party in dict_of_new_proc_text_lists:
    for text in dict_of_new_proc_text_lists[party]:
        new_party_match.append(party)
        
print(len(new_party_match))

In [None]:
features_test2_transf = top_vectorizer.transform(list_of_all_new_proc_texts)
cut_features_test2_transf  = top_selector.transform(features_test2_transf).toarray()
print(cut_features_test2_transf.shape[0]) # sparse numpy array. shape[0] = 184, number of docs. shape[1] = 6336, num of features
print(cut_features_test2_transf.shape[1]) # sparse numpy array. shape[0] = 184, number of docs. shape[1] = 3801, num of features

In [None]:
labels_new_test = np.array(new_party_match)
print(len(labels_new_test))
print(features_test2_transf.shape[0]) # sparse numpy array. shape[0] = 184, number of docs. shape[1] = num of features

In [None]:
# Naive Bayes 
test_new_predicted_nb = top_nb.predict(cut_features_test2_transf) 
print(classification_report(test_new_predicted_nb, labels_new_test, labels = parties))
print('Confusion Matrix')
print(metrics.confusion_matrix(labels_new_test, test_new_predicted_nb))

# SVC
test_new_predicted_svc = top_svc.predict(cut_features_test2_transf) 
print(classification_report(test_new_predicted_svc, labels_new_test, labels = parties))

print('Confusion Matrix')
print(metrics.confusion_matrix(labels_new_test, test_new_predicted_svc))

In [None]:
"""

Optimised Naive Bayes model:


            precision    recall  f1-score   support

        act       0.71      0.75      0.73        16
      green       0.45      0.86      0.59        21
     labour       0.62      0.53      0.57        34
      maori       0.89      0.73      0.80        11
   national       0.71      0.75      0.73        64
    nzfirst       0.90      0.50      0.64        38

avg / total       0.71      0.67      0.67       184

Confusion Matrix
[[12  0  0  0  3  2]
 [ 1 18  7  3  7  4]
 [ 0  2 18  0  5  4]
 [ 0  0  0  8  0  1]
 [ 2  1  9  0 48  8]
 [ 1  0  0  0  1 19]]
 
 
 Optimised SVC model:
 
             precision    recall  f1-score   support

        act       0.71      0.75      0.73        16
      green       0.70      0.85      0.77        33
     labour       0.69      0.74      0.71        27
      maori       0.89      0.73      0.80        11
   national       0.74      0.82      0.78        61
    nzfirst       0.95      0.56      0.70        36

avg / total       0.77      0.75      0.75       184

Confusion Matrix
[[12  0  0  0  4  1]
 [ 0 28  3  2  4  3]
 [ 0  3 20  1  2  3]
 [ 0  0  0  8  0  1]
 [ 4  2  4  0 50  8]
 [ 0  0  0  0  1 20]]


"""



In [None]:
print(labels_new_test)