### This file first preprocesses the text files and loads them into a numpy array ready for modeling, then fits Naive Bayes and an SVM

In [None]:
import os
import operator as op
import numpy as np
from numpy import array

from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

from nltk.stem.snowball import SnowballStemmer

In [None]:
# Class labels. Each label is also passed to read_in() as the folder holding that party's releases.
parties = ['act', 'green', 'labour', 'maori', 'national', 'nzfirst']

# Substrings removed from the raw text before stemming (names, dates, party names, punctuation,
# digits). Removal is a plain str.replace applied in list order, so ORDER MATTERS: an earlier entry
# can change what a later entry is able to match (e.g. 'John Key' must precede 'John').
# Every element must be followed by a comma - adjacent string literals are silently concatenated
# by Python (',' '\"' used to become ',"' and 'Collins' 'Buckner' used to become 'CollinsBuckner').
strip_list = ['Posted by', '\n', 'Jacinda', 'Ardern', 'Steven', 'Joyce', ' Bill ', 'English', 'Carmel', 'Sepuloni', 'Barry',
                '2017', 'David', 'Clark', 'Phil', 'Twyford', 'Michael', 'Wood', 'Chris ', 'Hipkins', 'Grant', 'Maggie',
                'Robertson', 'Greg', 'O’Connor', 'Andrew', ' Little', 'Winston', 'Peters', 'Damien', 'O\'Connor',
                'Kelvin', 'Davis', 'Phil', 'Twyford', 'Megan', 'Woods', 'Parker', 'Nanaia', 'Mahuta', 'Paula', 'Bennett',
                'Carter', 'Gerry', 'Brownlee', 'Simon', ' Bridges', ' Amy', 'Adams', 'Jonathan', 'Coleman', 'Christopher',
                'Finlayson', 'Woodhouse', 'Nathan', 'Guy', 'Anne', 'Tolley', ' Ron ', 'Mark', 'Marama', 'Fox', ' Te ',
                'Ururoa', 'Flavel', 'Jones', 'Shane', 'Taurima', 'Seymour', 'James', 'Shaw', 'Marama', 'Davidson', ' Dr ',
                'Julie', 'Anne', 'Genter', 'Jan ', 'Logie', 'Eugenie', 'Sage', 'Gareth', 'Hughes', 'Steffan', 'Browning',
                'Rt', 'Hon', 'Nick', 'Smith', 'Nikki', 'Kaye', 'Nicky', 'Wagner', 'Minister', 'Paul', 'Goldsmith',
                'ACT', ' National ', 'Green Party', 'Labour', 'First ', 'ENDS', '.', ',', '\"', '\'', 'Māori Party',
                '“','”', 'Facebook5Twitter', 'Steffan', 'Browning', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                'Carlton', 'Burke', 'Chadwick', 'Catherine', 'Christine', 'Alex', 'Alexander', 'Baker', 'Carolyn',
                'Alyssa', 'Brown', 'Bob', 'Byrn', 'Augustine', 'Crawford', 'Antonio', 'Claudetta', 'Christina', 'Collins',
                'Buckner', 'centre', 'Ben', 'Boyden',  'Alan', 'Bosley', '’', 'Alastair', 'Ballantyne', 'Bruno', 'Cecelia',
                 'Allan', 'Bernard', 'Anderson', 'Andrea', 'Tim', 'spokesperson', 'Scott', 'Simpson', 'Epsom', 'Metiria',
                 'Turei', 'said', 'say', 'John Key', 'John', 'Tukoroirangi Morgan', 'Dame', 'Mei', 'Reedy', 'Leader',
                 'Northland', 'Member of Parliament', 'Spokesperson', 'Don', 'Houlbrook', 'Stephen', 'Todd', 'Barclay',
                 'Morgan', 'Tariana', 'Turia', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
                  'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October',
                  'November', 'December', 'Southland', 'Judith', 'Collins', 'Jacqui', 'Dean', 'Bhupind', 'Singh', 'van Velden',
                 'govt.nz', 'Mitchel', 'New Zealand', 'First', 'NZ', 'Mitchell', 'Tracy', 'Martin', 'Mike', 'MP', 'Prosser',
                 'William', 'Sio', 'Don', 'Zealand', 'Aupito', 'Kevin', 'Hague', 'Bhupind', 'Singh', 'Louise', 'Upston']

# Substrings removed from the text after stemming (also by plain substring replacement).
strip_from_stemmed = ['conclusionth', 'bennet', 'brydon', 'bosley', 'centrewellington', 'countrynew', 'ballantyn', 'allan',
                     'delahunti', ',', 'beth', 'ms', 'mr', 'www', 'http', 'media contact', 'govt', 'nz']


In [None]:
def read_in(party, folder):
    """Read every regular file in `folder` and return their contents.

    Parameters
    ----------
    party : str
        Label of the party the releases belong to. Kept for backward
        compatibility with existing callers; the files are located through
        `folder` only.
    folder : str
        Directory containing one UTF-8 text file per press release.

    Returns
    -------
    list of str
        File contents, ordered by file name so repeated runs give the same
        document order (os.listdir makes no ordering guarantee).
    """
    release_text_list = []
    for filename in sorted(os.listdir(folder)):
        # Build the path from the directory that was actually listed.
        full_path = os.path.join(folder, filename)
        if not os.path.isfile(full_path):
            # Skip sub-directories (e.g. notebook checkpoint folders).
            continue
        # The context manager closes the file even if reading raises.
        with open(full_path, 'r', encoding='utf8') as file_obj:
            release_text_list.append(file_obj.read())
    return release_text_list

In [None]:
# Load the raw releases: dict_of_text_lists maps each party label to its list of release
# strings, and list_of_all_texts holds every release in party order.

dict_of_text_lists = {}
list_of_all_texts = []
list_of_all_party_texts = []

for party in parties:
    # The party label doubles as the name of the folder holding its releases.
    dict_of_text_lists[party] = read_in(party, party)
    list_of_all_party_texts = dict_of_text_lists[party]
    print('Total docs for party', party, '=', len(list_of_all_party_texts))
    list_of_all_texts.extend(list_of_all_party_texts)
    list_of_all_party_texts = []

print('Total documents: ', len(list_of_all_texts))

In [None]:
# Undersample National to improve dataset balance
#
# NOTE: this cell edits dict_of_text_lists['national'] in place, so running it a second time
# without re-loading the data removes a further three quarters of what is left.

print ('Removing three out of every four National Party press release over time:')
print('National had', len(dict_of_text_lists['national']), 'press releases')
top_end = len(dict_of_text_lists['national'])

# Each pass deletes the three releases at positions i, i+1 and i+2. The deletion shifts the rest
# of the list left, so afterwards position i holds the release that is kept and position i+1 is
# the start of the next untouched group. The loop variable therefore needs no manual adjustment
# (reassigning i inside the body would have no effect anyway: range() sets it on every pass).
# Up to three releases at the very end of the list (len % 4) are left untouched.
for i in range(0, int(top_end/4)):
#for i in range(1, int(top_end/4)):  # Robustness check - try a different quarter
#for i in range(2, int(top_end/4)):  # Robustness check - try a different quarter
#for i in range(3, int(top_end/4)):  # Robustness check - try a different quarter

    del dict_of_text_lists['national'][i:i+3]

print('National now has', len(dict_of_text_lists['national']), 'press releases')


In [None]:
print(dict_of_text_lists['green'][30][:150]) # Sanity check: first 150 characters of the Green release at index 30

In [None]:
# Get rid of names (giveaways for the authoring problem) and a few other 'cheat' strings, and stem the text
#
# For every release: (1) replace each strip_list entry with a space, (2) Snowball-stem each
# whitespace-separated token, (3) replace each strip_from_stemmed entry with a space.
# Both removals are plain substring replacements, so an entry is also removed when it occurs
# inside a longer word.

stem_words = []
party_list_of_proc_texts = []
dict_of_proc_text_lists = {}
stemmer = SnowballStemmer("english")

for party in parties:
    party_list_of_proc_texts = []

    for text in dict_of_text_lists[party]:
        # Remove the words in my manual strip list above.
        # replace() is applied unconditionally to the working copy: it is a no-op when the entry
        # is absent, and testing membership against the untouched original would skip entries
        # that only become matchable after earlier replacements.
        strip_text = text
        for goner in strip_list:
            strip_text = strip_text.replace(goner, ' ')

        # Apply Snowball stemmer:
        stem_words = [stemmer.stem(word) for word in strip_text.split()]
        stem_text = " ".join(stem_words)

        # Strip a few problematic strings from the stemmed text:
        for stemword in strip_from_stemmed:
            stem_text = stem_text.replace(stemword, ' ')
        party_list_of_proc_texts.append(stem_text)

    # Put list of processed party texts in dictionary with party as key
    dict_of_proc_text_lists[party] = party_list_of_proc_texts

# The example shows the last release processed by the loops above.
print('Example:')
print('Original text:', text[:80])
print('\n', 'Stripped text:', strip_text[:80])
print('\n', 'After stem:' , stem_text[:80])


In [None]:
# Spot-check the first 100 characters of a few processed releases
for sample_party, sample_index in (('act', 10), ('labour', 3), ('green', 53)):
    print(dict_of_proc_text_lists[sample_party][sample_index][0:100])

In [None]:
# Make a list of (unlabelled) processed texts and a parallel list of party labels.
# Both lists are built in the same pass over `parties`, so entry k of party_match is always the
# author of entry k of list_of_all_proc_texts regardless of how the dictionary orders its keys.
list_of_all_proc_texts = []
party_match = []

for party in parties:
    procd = dict_of_proc_text_lists[party]
    list_of_all_proc_texts.extend(procd)
    party_match.extend([party] * len(procd))

print(len(list_of_all_proc_texts))
print(len(party_match))

# Guard against texts and labels drifting out of step.
assert len(list_of_all_proc_texts) == len(party_match)



Text is ready for analysis


In [None]:
# Split into training and testing sets.
# A fixed seed makes the split - and every score computed from it below - reproducible
# across kernel restarts; change RANDOM_SEED to draw a different split.
RANDOM_SEED = 42

docs_train, docs_test, labels_train, labels_test = model_selection.train_test_split(list_of_all_proc_texts,
                                                                                    party_match,
                                                                                    test_size = 0.2,
                                                                                    stratify = party_match,
                                                                                    random_state = RANDOM_SEED)
# Stratifying ensures same balance of test as training data
print(len(docs_train))

In [None]:
# Text vectorization
#
# Settings tried by trial and error before settling on the interim model:
#   1. TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df = 0.01, stop_words='english')
#      (max_df=0.5 means words must not appear in more than half of docs)
#   2. as 1, adding 2- and 3-grams: ngram_range=(1, 3)
#   3. as 2, with min_df taken out again
#   4. min_df put in again (number of parameters exploded!)

# Interim chosen model before gridsearch:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.75,
    min_df=0.005,
    stop_words='english',
    ngram_range=(1, 3),
)

# Learn the vocabulary and idf weights from the training documents only, then apply them to the test documents.
features_train_transf = vectorizer.fit_transform(docs_train)
features_test_transf = vectorizer.transform(docs_test)

In [None]:
# Check number of features
# get_feature_names() no longer exists in current scikit-learn releases; use
# get_feature_names_out() when the installed version provides it, else fall back.
if hasattr(vectorizer, 'get_feature_names_out'):
    all_feature_names = vectorizer.get_feature_names_out()
else:
    all_feature_names = vectorizer.get_feature_names()
print('Initial number of features after vectorisation:', len(all_feature_names))

# Note features will be further reduced with SelectPercentile below

In [None]:
# Select top x% of most individually useful features
selector = SelectPercentile(f_classif, percentile=50)
selector.fit(features_train_transf, labels_train)

# Names of the surviving features (alphabetical). get_feature_names() no longer exists in
# current scikit-learn releases, so prefer get_feature_names_out() when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    my_names = np.asarray(vectorizer.get_feature_names_out())[selector.get_support()]
else:
    my_names = np.asarray(vectorizer.get_feature_names())[selector.get_support()]
print('New number of features after SelectPercentile:', len(my_names)) # Halved.

In [None]:
# Keep only the selected feature columns and densify the sparse matrices into numpy arrays
cut_features_train_transf, cut_features_test_transf = (
    selector.transform(matrix).toarray()
    for matrix in (features_train_transf, features_test_transf)
)

# Labels: lists -> numpy arrays
labels_train, labels_test = array(labels_train), array(labels_test)

In [None]:
# Feature rows and labels should report the same count
print('Number of training documents:',
      len(cut_features_train_transf),
      len(labels_train),
      sep='\n')

In [None]:
# Number of training documents per party
unique, counts = np.unique(labels_train, return_counts=True)
train_counts = {label: count for label, count in zip(unique, counts)}
print('Size of training dataset (documents):', train_counts)

### Now ready for scikit-learn


In [None]:
# Multinomial Naive Bayes

# Default alpha (smoothing) for now
classifier = MultinomialNB()
# fit() returns the estimator itself, so `fitted` and `classifier` are the same object
fitted = classifier.fit(cut_features_train_transf, labels_train)
test_predicted = fitted.predict(cut_features_test_transf)

In [None]:
# Simple accuracy score
print('Score:', classifier.score(cut_features_test_transf, labels_test))

# Deeper metrics
# classification_report takes (y_true, y_pred) in that order. Passing the predictions first
# interchanges precision and recall and makes `support` count predicted rather than true documents.
print(classification_report(labels_test, test_predicted, labels = parties))

# Note that although all classes are represented in the test data, this report will throw a warning if the model never predicts
# some of the classes. Here it often makes no predictions that a release comes from the Maori party (under-represented in data)

Precision: how many of the ones predicted to be (party) were in fact authored by that party?

Recall: what proportion of releases authored by (party) were in fact correctly identified as such?

In [None]:
# Number of held-out test documents per party
unique, counts = np.unique(labels_test, return_counts=True)
test_counts = {label: count for label, count in zip(unique, counts)}
print('Make-up of the test set documents:', test_counts)

#### Assess which features (words) matter for each party - check for 'cheats'


In [None]:
# Make list of features with highest coefficient values, per class, from most to least important

def list_top_features(classifier, feature_names, num_feat):
    """Print and return the `num_feat` highest-weighted features of each class.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose one row of per-feature weights per class, either as
        `feature_log_prob_` (naive Bayes) or as `coef_` (linear models).
    feature_names : sequence of str
        Name of each feature column, in the order the classifier was fitted on.
    num_feat : int
        Number of features to list per class.

    Returns
    -------
    dict
        Maps each class label to a list of feature names, highest weight first.
    """
    # Naive Bayes estimators in current scikit-learn no longer expose coef_, so read
    # feature_log_prob_ when it exists and only fall back to coef_ otherwise.
    weights = getattr(classifier, 'feature_log_prob_', None)
    if weights is None:
        weights = classifier.coef_

    # Rows of the weight matrix follow classifier.classes_; fall back to the notebook-level
    # `parties` list only if the estimator does not expose its classes.
    class_labels = classifier.classes_ if hasattr(classifier, 'classes_') else parties

    party_words = {}
    for i, label in enumerate(class_labels):
        top = np.argsort(weights[i])[::-1][0:num_feat]
        # Build the list directly: joining on ', ' and splitting on ',' left a leading space on
        # every entry but the first.
        list_top = [str(feature_names[j]) for j in top]
        print(" ")
        print(label, 'most distinguishing words from most to ' + str(num_feat) + ':')
        print(list_top)
        party_words[str(label)] = list_top  # dict of lists where keys are parties
    return party_words


# This version also prints out coefficients

def list_top_features_with_coefs(classifier, feature_names, num_feat):
    """Print, per class, the `num_feat` features that most favour that class.

    A feature's score for a class is its log-probability under that class minus
    its highest log-probability under any other class, so a large positive
    score means the feature is far more typical of this class than of its
    closest rival.

    Parameters
    ----------
    classifier : fitted naive Bayes estimator
        Must expose `feature_log_prob_` with one row per class.
    feature_names : sequence of str
        Name of each feature column, in the order the classifier was fitted on.
    num_feat : int
        Number of features to print per class; fewer are printed when the
        model has fewer features.

    Returns
    -------
    None
    """
    log_prob = np.asarray(classifier.feature_log_prob_)
    # Rows of feature_log_prob_ follow classifier.classes_; fall back to the notebook-level
    # `parties` list only if the estimator does not expose its classes.
    class_labels = classifier.classes_ if hasattr(classifier, 'classes_') else parties

    for i, label in enumerate(class_labels):
        print('\n', label, '\n')

        # Compare feature by feature against the strongest competing class. (The previous
        # np.max(feature_log_prob_[-i:]) was a single scalar taken over a varying set of rows,
        # which shifted every score by a constant and left the ranking equal to the raw one.)
        others = np.delete(log_prob, i, axis=0)
        diff = log_prob[i] - others.max(axis=0) if others.shape[0] else log_prob[i]

        # Sort once per class; sorted() is stable, so ties keep feature order.
        names_diff_sorted = sorted(zip(feature_names, diff), key=lambda pair: pair[1], reverse=True)

        # Slicing caps the listing at the number of available features.
        for k, (name, value) in enumerate(names_diff_sorted[:num_feat]):
            print(k, (str(name), float(value)))

In [None]:
# Names of the features that survived SelectPercentile. get_feature_names() no longer exists in
# current scikit-learn releases, so prefer get_feature_names_out() when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    my_names = np.asarray(vectorizer.get_feature_names_out())[selector.get_support()]
else:
    my_names = np.asarray(vectorizer.get_feature_names())[selector.get_support()]

party_words = list_top_features(classifier, my_names, 100)

list_top_features_with_coefs(classifier, my_names, 10)

So which parties does the model mix up? For example, does it confuse parties within the left (Labour, Greens, Maori) or within the right (National, ACT), or are the errors more random?

In [None]:
# Rows are the true parties, columns the predicted parties
confusion = metrics.confusion_matrix(labels_test, test_predicted)
print('Confusion Matrix', confusion, sep='\n')

### Interpreting the Confusion Matrix

### Example run of my interim chosen model
Confusion Matrix
[[ 6  0  2  0  3 15]
 [ 0  2  3  0  7 10]
 [ 0  0 13  0 10 11]
 [ 0  0  1  0  6  3]
 [ 0  0  0  0 44  1]
 [ 0  0  0  0  1 46]]


Test sample counts: {'act': 26, 'green': 22, 'labour': 34, 'maori': 10, 'national': 45, 'nzfirst': 47}

The model has a strong tendency to overpredict the classes with the most examples, namely National and NZ First (the last two columns). It totally failed in identifying Maori party releases and was nearly as poor for the Green party.
 
https://stackoverflow.com/questions/30746460/how-to-interpret-scikits-learn-confusion-matrix-and-classification-report
   

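# Interim model before optimisation

Note: the report below was generated with `classification_report(test_predicted, labels_test, ...)`, i.e. with the predictions in the position where scikit-learn expects the true labels. As a result the `support` column counts *predicted* documents per party (hence 0 for maori), and the precision and recall columns are interchanged relative to the definitions given above.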


Score: 0.614130434783
             precision    recall  f1-score   support

        act       0.27      1.00      0.42         7
      green       0.05      1.00      0.09         1
     labour       0.53      0.67      0.59        27
      maori       0.00      0.00      0.00         0
   national       0.96      0.63      0.76        68
    nzfirst       0.94      0.54      0.69        81

avg / total       0.85      0.61      0.69       184



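### Total failure for the Maori party and pretty poor for the Green party when maximising score (accuracy). Try GridSearch scored on an F1 averaged across parties instead, and hopefully improve the recall for the smaller parties. (Note: the scorer below uses `average='weighted'`, which weights each party by its number of documents; an equally weighted F1 would be `average='macro'`.)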

In [None]:
# Pipeline (vectorise -> select features -> classify) so that GridSearchCV can tune every stage together.
# For the fine-tuning round the vectorizer's min_df / max_df and the selector's percentile are
# already set to the optimised values.
text_clf = Pipeline(steps=[
    ('vect', TfidfVectorizer(sublinear_tf=True,
                             stop_words='english',
                             ngram_range=(1, 3),
                             max_df=0.9,
                             min_df=0.005)),
    ('selector', SelectPercentile(percentile=60)),
    ('clf', MultinomialNB()),
])

In [None]:
# Initial run searched this grid:
#   'vect__max_df':         (0.5, 0.7, 0.9)
#   'vect__min_df':         (0.005, 0.01)
#   'selector__percentile': (40, 60, 80)
#   'clf__alpha':           (0.0001, 0.001, 0.01)   # Laplace smoothing

# Later run: fine-tune alpha only
parameters = dict(clf__alpha=(0.005, 0.01, 0.05, 0.1, 0.2))  # Laplace smoothing

# Scoring: both metrics are recorded for every candidate
scoring = {
    'F1': make_scorer(f1_score, average='weighted'),
    'Accuracy': make_scorer(accuracy_score),
}

# average='weighted' computes the metric per label and averages the results weighted by support
# (the number of true instances of each label). It adjusts 'macro' for label imbalance and can
# give an F-score that does not lie between precision and recall.
# NOTE(review): support weighting favours the larger parties; if the aim is to weight all
# parties equally, average='macro' is the matching option - confirm which is intended.


In [None]:
# 6-fold cross-validated search over `parameters`, recording every metric in `scoring`.
# refit='F1' refits an estimator on the whole dataset passed to fit() using the parameter
# setting with the best F1 score.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    scoring=scoring,
    refit='F1',
    cv=6,
)
# fit() returns the search object itself
results = gs_clf.fit(docs_train, labels_train).cv_results_


In [None]:
# Best cross-validated score for the refit metric, then the winning value of each searched parameter
print("Best score", gs_clf.best_score_, sep='\n')

for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_clf.best_params_[param_name]))


Best identified parameters: 
clf__alpha: 0.01
selector__percentile: 60
vect__max_df: 0.9
vect__min_df: 0.005

Score with these parameters: 0.8027
NOTE: On one run it chose max_df = 0.5 with a similar score. It doesn't seem to matter too much really

Now fine-tune alpha, putting the optimised values for the other parameters as above into the Pipeline.

FINE-TUNED BEST MODEL: actually it just chose alpha = 0.01 again

In [None]:
# Now test how a model fitted with these parameters does on held out test set
# Need to re-run from vectorizer stage with these parameters

# Vectorise with the optimised max_df / min_df
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df = 0.9, min_df = 0.005, stop_words='english', ngram_range=(1, 3))
features_train_transf = vectorizer.fit_transform(docs_train)
features_test_transf  = vectorizer.transform(docs_test)

# Keep the optimised percentile of features
selector = SelectPercentile(f_classif, percentile=60)
selector.fit(features_train_transf, labels_train)
cut_features_train_transf = selector.transform(features_train_transf).toarray()
cut_features_test_transf  = selector.transform(features_test_transf).toarray()
labels_train = array(labels_train)
labels_test = array(labels_test)

# Fit with the optimised smoothing and predict the held-out documents
classifier = MultinomialNB(alpha = 0.01)
fitted    = classifier.fit(cut_features_train_transf, labels_train)
test_predicted = classifier.predict(cut_features_test_transf)

# classification_report takes (y_true, y_pred) in that order. Passing the predictions first
# interchanges precision and recall and makes `support` count predicted rather than true documents.
print(classification_report(labels_test, test_predicted, labels=parties))


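Results of optimised model on held-out test set (generated with the predictions passed in the true-label position of `classification_report`, so `support` counts predicted documents and the precision and recall columns are interchanged):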

precision    recall  f1-score   support

        act       0.81      0.78      0.79        27
      green       0.45      0.59      0.51        17
     labour       0.68      0.82      0.74        28
      maori       0.90      0.90      0.90        10
   national       0.91      0.82      0.86        50
    nzfirst       0.87      0.79      0.83        52

avg / total       0.81      0.79      0.79       184



In [None]:
# Now to test out optimised model's robustness more thoroughly, go to Vet_optimised_model notebook