In [13]:
%matplotlib inline
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sklearn
import os
import csv

Importing Maori lexicon from http://nzetc.victoria.ac.nz/tm/scholarly/tei-legalMaoriCorpus.html#legal_maori_lexicon. Removed macrons, parentheses etc from file in Sublime Text.

In [14]:
with open('../data/raw/maori_legal_lexicon.csv', 'rb') as f:
    reader = csv.reader(f)
    rough_list = list(reader)
    maori_lexicon = set([])
    for item in rough_list:
        for word in item:
            if word!='':
                for i in word.split(' '):
                    maori_lexicon.add(i.lower())
maori_lexicon.remove('')

In [15]:
print len(maori_lexicon)

1923


Using a slightly edited verison of the first ~2000 from https://github.com/first20hours/google-10000-english/blob/master/20k.txt

In [16]:
with open('../data/raw/english_lexicon_top_2000.csv', 'rb') as f:
    reader = csv.reader(f)
    rough_list = list(reader)
    english_lexicon = set([])
    for wordlist in rough_list:
        for word in wordlist:
            english_lexicon.add(word)

In [17]:
print len(english_lexicon)

1982


In [18]:
def word2features(word,maori=True):
    """Returns a word length normalized vector of all letter frequencies and bigrams.
       First entry is word, second is whether it is a Maori word or not."""
    if maori == True:
        features = [word,1]
    else:
        features = [word,0]
    word_length = len(word)
    alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l',
                'm','n','o','p','q','r','s','t','u','v','w','y','z']
    for letter in alphabet:
        features.append(word.count(letter)/float(word_length))
    for first_letter in alphabet:
        for second_letter in alphabet:
            features.append(word.count(first_letter+second_letter)/float(word_length))
    return features

In [19]:
words = []
for word in maori_lexicon:
    words.append(word2features(word))
for word in english_lexicon:
    words.append(word2features(word,maori=False))

In [63]:
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l',
            'm','n','o','p','q','r','s','t','u','v','w','y','z']
column_names = ['word', 'maori_binary']
for letter in alphabet:
    column_names.append(letter)
for first_letter in alphabet:
    for second_letter in alphabet:
        column_names.append(first_letter+second_letter)

In [64]:
words_df = pd.DataFrame(words)
words_df.columns = column_names
print words_df.head()

           word  maori_binary         a    b    c    d         e    f  \
0         ainga             1  0.400000  0.0  0.0  0.0  0.000000  0.0   
1         putua             1  0.200000  0.0  0.0  0.0  0.000000  0.0   
2  whakaminenga             1  0.250000  0.0  0.0  0.0  0.083333  0.0   
3       pononga             1  0.142857  0.0  0.0  0.0  0.000000  0.0   
4       whakata             1  0.428571  0.0  0.0  0.0  0.000000  0.0   

          g         h ...    zp   zq   zr   zs   zt   zu   zv   zw   zy   zz  
0  0.200000  0.000000 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.000000  0.000000 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.083333  0.083333 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.142857  0.000000 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.000000  0.142857 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 652 columns]


In [67]:
X = words_df.drop(['word','maori_binary'],axis=1).values
y = words_df['maori_binary']

In [68]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix

In [69]:
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y)
cm = np.array([[0,0],[0,0]])
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100, \
                              silent=True, objective='binary:logistic', nthread=-1, \
                              gamma=0, min_child_weight=1, max_delta_step=0, \
                              subsample=1, colsample_bytree=1, colsample_bylevel=1, \
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, \
                              base_score=0.5, missing=None)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm += confusion_matrix(y_test, y_pred)
print cm

[[1854  128]
 [  62 1861]]


In [70]:
final_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100, \
                              silent=True, objective='binary:logistic', nthread=-1, \
                              gamma=0, min_child_weight=1, max_delta_step=0, \
                              subsample=1, colsample_bytree=1, colsample_bylevel=1, \
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, \
                              base_score=0.5, missing=None)
final_model.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

Which words does the model get wrong?

In [61]:
misclassified = []
for word in maori_lexicon:
    test=[]
    test.append(word2features(word))
    test_df = pd.DataFrame(test)
    binary = final_model.predict(test_df.drop([0,1],axis=1).values)
    if binary == 0:
        misclassified.append(word)
        
for word in english_lexicon:
    test=[]
    test.append(word2features(word))
    test_df = pd.DataFrame(test)
    binary = final_model.predict(test_df.drop([0,1],axis=1).values)
    if binary == 1:
        misclassified.append(word)

print misclassified

['me', '1993', '1992', '1996', 'mo', 'toe', '1969', 'tono', '1974', '1973', 'pl.', 'o', 'no', '1991', 'ngerengere', 'pere', 'orite', 'momo', 'tere', 'moni', 'pinono', 'ngere', 'act', '5', 'to', 'romalpa', '1908', 'pono', 'e', '2002', '2000', 'reo', '1986', 'tene', '1987', '1983', 'here', 'how', 'hot', 'poker', 'japan', 'arizona', 'took', 'amazon', 'nature', 'iraq', 'air', 'ip', 'it', 'make', 'european', 'kit', 'photo', 'human', 'pain', 'war', 'amateur', 'heat', 'hear', 'maximum', 'time', 'team', 'appear', 'wait', 'root', 'extra', 'market', 'making', 'map', 'max', 'nation', 'name', 'white', 'take', 'where', 'week', 'output', 'ok', 'her', 'area', 'am', 'an', 'at', 'again', 'homepage', 'rate', 'what', 'put', 'taken', 'hour', 'a', 'paper', 'trip', 'hit', 'him', 'are', 'europe', 'error', 'repair', 'up', 'mature', 'tax', 'rape', 'auto', 'keep', 'who', 'hope', 'ohio', 'taking', 'mike', 'i', 'hair', 'nokia', 'hi', 'he']


# NZ Road Names

Downloaded the Road Address database https://data.linz.govt.nz

In [76]:
nz_roads_df = pd.read_csv('../data/raw/lds-nz-roads-addressing-CSV/nz-roads-addressing.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [79]:
nz_roads_df.head()

Unnamed: 0,WKT,road_id,geometry_class,road_type,road_name_class,full_road_name,road_name_label,road_name_prefix,road_name_body,road_name_type,road_name_suffix,route_name_body,route_name_number,route_name_alpha,route_name_suffix,full_road_name_ascii,road_name_label_ascii,road_name_body_ascii
0,MULTILINESTRING ((170.539440266700012 -45.8829...,1770045,Addressing Road,Unknown,Road,Harbour Heights,Harbour Hts,,Harbour,Heights,,,,,,Harbour Heights,Harbour Hts,Harbour
1,MULTILINESTRING ((170.359735333299994 -45.8823...,1770050,Addressing Road,Unknown,Road,Stanley Square,Stanley Sq,,Stanley,Square,,,,,,Stanley Square,Stanley Sq,Stanley
2,MULTILINESTRING ((169.532780550000012 -45.5881...,1770085,Addressing Road,Unknown,Road,Lake Onslow Road,Lake Onslow Rd,,Lake Onslow,Road,,,,,,Lake Onslow Road,Lake Onslow Rd,Lake Onslow
3,MULTILINESTRING ((170.490824783300013 -45.8825...,1770130,Addressing Road,Unknown,Road,William Street,William St,,William,Street,,,,,,William Street,William St,William
4,MULTILINESTRING ((170.478778566311604 -45.4729...,1770136,Addressing Road,Unknown,Road,Ainges Road,Ainges Rd,,Ainges,Road,,,,,,Ainges Road,Ainges Rd,Ainges


In [83]:
road_names = nz_roads_df['road_name_body_ascii'].values
road_name_X = []
for name in road_names:
    road_name_X.append(word2features(str(name)))
print len(road_name_X)

74594


In [84]:
road_name_X_df = pd.DataFrame(road_name_X)

In [85]:
X = road_name_X_df.drop([0,1],axis=1).values

In [86]:
nz_roads_df['maori_binary'] = final_model.predict(X)

In [94]:
maori_roads = nz_roads_df[nz_roads_df['maori_binary']==1]['road_name_body_ascii'].values
english_roads = nz_roads_df[nz_roads_df['maori_binary']==0]['road_name_body_ascii'].values

In [95]:
print list(maori_roads)
print list(english_roads)

['Muir', 'Totara Peak', 'Totara', 'Waiareka', 'Waiareka', 'Meek', 'Arapeta', 'Upper Taieri-Paerau', 'Hermitage', 'Eureka', 'White', 'Kawarau', 'Barra', 'Warren', 'Domain', 'Wakatipu', 'Rata', 'Te Ra', 'Broughton', 'Matai', 'Marewa', 'Peat', 'Orokonui', 'Cupar', 'Nairn', 'Pari', 'Rawhiti', 'Rona', 'Totara', 'View', 'Cromer', 'Ajax', 'Taratu', 'Mowat', 'Matuanui', 'Marama', 'Houghton', 'Matau', 'Pukeawa', 'John', 'Baker', 'Swamp', 'View', 'Taine', 'Briar Bank', 'Minaret', 'Otokia', 'Muriwai', 'Stowmore', 'Waimea', 'Utah', 'Keir', 'Toi Toi', 'Diana', 'Kauana', 'Marinui', 'Crimea', 'Crino', 'Aurum', 'Annan', 'Waimatuku South', 'Rimu', 'Brook', 'Moana', 'Tainui', 'Katie', 'Kaka', 'Maher', 'Oware', 'Matai', 'Main', 'Paringa', 'Johnnie', 'Tipapa', 'Wiremu', 'Tau', 'Bank', 'Weka', 'Omara', 'Orari', 'Karaka', 'Sherratt', 'Miro', 'Hooker', 'Rata', 'Iri Irikapua', 'Spur', 'Te Puke', 'Renata', 'Rima', 'Rehua', 'Harman', 'Tekoa', 'Muritai', 'Waitaki', 'Waratah', 'Barrer', 'Tauiwi', 'Ruakaka', 'Jarr