In [49]:
#Imports
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
# Notebook-wide plotting defaults: default figure size and the 'ggplot' style sheet.
plt.rcParams['figure.figsize'] = (10, 6)
plt.style.use('ggplot')

import seaborn as sns

In [67]:
# Load the 200-language sample into a DataFrame. This small sample is only used
# for initial exploration; the larger dataset is meant to replace it later on.
data = pd.read_csv(filepath_or_buffer="200-language-sample.csv")
data.head(n=10)

Unnamed: 0,wals_code,iso_code,glottocode,Name,latitude,longitude,genus,family,macroarea,countrycodes,...,130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand'_nan,58B Number of Possessive Nouns_1 None reported,58B Number of Possessive Nouns_2 One,58B Number of Possessive Nouns_3 Two to four,58B Number of Possessive Nouns_nan,79B Suppletion in Imperatives and Hortatives_1 A regular and a suppletive form alternate,79B Suppletion in Imperatives and Hortatives_2 Imperative,79B Suppletion in Imperatives and Hortatives_3 Hortative,79B Suppletion in Imperatives and Hortatives_5 None (= no suppletive imperatives reported in the reference material),79B Suppletion in Imperatives and Hortatives_nan
0,abi,axb,abip1241,Abipón,-29.0,-61.0,South Guaicuruan,Guaicuruan,South America,AR,...,1,0,0,0,1,0,0,0,0,1
1,abk,abk,abkh1244,Abkhaz,43.083333,41.0,Northwest Caucasian,Northwest Caucasian,Eurasia,GE,...,1,1,0,0,0,0,0,0,1,0
2,aco,kjq,west2632,Acoma,34.916667,-107.583333,Keresan,Keresan,North America,US,...,1,1,0,0,0,0,0,1,0,0
3,aeg,arz,egyp1253,Arabic (Egyptian),30.0,31.0,Semitic,Afro-Asiatic,Africa,EG,...,1,1,0,0,0,0,1,0,0,0
4,ain,ain,ainu1240,Ainu,43.0,143.0,Ainu,Ainu,Eurasia,JP,...,1,1,0,0,0,0,0,0,1,0
5,ala,amp,alam1246,Alamblak,-4.666667,143.333333,Sepik Hill,Sepik,Papunesia,PG,...,1,1,0,0,0,0,0,0,1,0
6,ame,aey,amel1241,Amele,-5.25,145.583333,Madang,Trans-New Guinea,Papunesia,PG,...,1,1,0,0,0,0,0,0,1,0
7,ana,aro,arao1248,Araona,-12.333333,-67.75,Tacanan,Tacanan,South America,BO,...,1,0,0,0,1,0,0,0,1,0
8,apu,apu,apur1254,Apurinã,-9.0,-67.0,Purus,Arawakan,South America,BR,...,1,1,0,0,0,0,0,0,1,0
9,arm,hye,nucl1235,Armenian (Eastern),40.0,45.0,Armenian,Indo-European,Eurasia,AM,...,1,1,0,0,0,0,1,0,0,0


In [69]:
# Which families contribute the most languages to the sample?
families = data.groupby("family")
families.size().sort_values()

# For now the analysis is restricted to the largest language families
# and their phonologies.
largest_families = [
    'Sino-Tibetan',
    'Trans-New Guinea',
    'Indo-European',
    'Niger-Congo',
    'Austronesian',
    'Afro-Asiatic',
]

# Keep only the rows whose family is one of the six listed above.
in_largest_family = data['family'].isin(largest_families)
data = data[in_largest_family]

# Names of the phonology columns to keep (features 1A through 19A).
# Each name has the form "<feature id> <feature name>_<value>", and every
# feature also has a "<feature>_nan" column -- presumably flagging a missing
# value for that feature; confirm against the dataset.
phonological_features = ['1A Consonant Inventories_1 Small',
 '1A Consonant Inventories_2 Moderately small',
 '1A Consonant Inventories_3 Average',
 '1A Consonant Inventories_4 Moderately large',
 '1A Consonant Inventories_5 Large',
 '1A Consonant Inventories_nan',
 '2A Vowel Quality Inventories_1 Small (2-4)',
 '2A Vowel Quality Inventories_2 Average (5-6)',
 '2A Vowel Quality Inventories_3 Large (7-14)',
 '2A Vowel Quality Inventories_nan',
 '3A Consonant-Vowel Ratio_1 Low',
 '3A Consonant-Vowel Ratio_2 Moderately low',
 '3A Consonant-Vowel Ratio_3 Average',
 '3A Consonant-Vowel Ratio_4 Moderately high',
 '3A Consonant-Vowel Ratio_5 High',
 '3A Consonant-Vowel Ratio_nan',
 '4A Voicing in Plosives and Fricatives_1 No voicing contrast',
 '4A Voicing in Plosives and Fricatives_2 In plosives alone',
 '4A Voicing in Plosives and Fricatives_3 In fricatives alone',
 '4A Voicing in Plosives and Fricatives_4 In both plosives and fricatives',
 '4A Voicing in Plosives and Fricatives_nan',
 '5A Voicing and Gaps in Plosive Systems_1 Other',
 '5A Voicing and Gaps in Plosive Systems_2 None missing in /p t k b d g/',
 '5A Voicing and Gaps in Plosive Systems_3 Missing /p/',
 '5A Voicing and Gaps in Plosive Systems_4 Missing /g/',
 '5A Voicing and Gaps in Plosive Systems_5 Both missing',
 '5A Voicing and Gaps in Plosive Systems_nan',
 '6A Uvular Consonants_1 None',
 '6A Uvular Consonants_2 Uvular stops only',
 '6A Uvular Consonants_3 Uvular continuants only',
 '6A Uvular Consonants_4 Uvular stops and continuants',
 '6A Uvular Consonants_nan',
 '7A Glottalized Consonants_1 No glottalized consonants',
 '7A Glottalized Consonants_2 Ejectives only',
 '7A Glottalized Consonants_3 Implosives only',
 '7A Glottalized Consonants_4 Glottalized resonants only',
 '7A Glottalized Consonants_5 Ejectives and implosives',
 '7A Glottalized Consonants_6 Ejectives and glottalized resonants',
 '7A Glottalized Consonants_7 Implosives and glottalized resonants',
 '7A Glottalized Consonants_nan',
 '8A Lateral Consonants_1 No laterals',
 '8A Lateral Consonants_2 /l/, no obstruent laterals',
 '8A Lateral Consonants_3 Laterals, but no /l/, no obstruent laterals',
 '8A Lateral Consonants_4 /l/ and lateral obstruent',
 '8A Lateral Consonants_5 No /l/, but lateral obstruents',
 '8A Lateral Consonants_nan',
 '9A The Velar Nasal_1 Initial velar nasal',
 '9A The Velar Nasal_2 No initial velar nasal',
 '9A The Velar Nasal_3 No velar nasal',
 '9A The Velar Nasal_nan',
 '10A Vowel Nasalization_1 Contrast present',
 '10A Vowel Nasalization_2 Contrast absent',
 '10A Vowel Nasalization_nan',
 '11A Front Rounded Vowels_1 None',
 '11A Front Rounded Vowels_2 High and mid',
 '11A Front Rounded Vowels_3 High only',
 '11A Front Rounded Vowels_4 Mid only',
 '11A Front Rounded Vowels_nan',
 '12A Syllable Structure_1 Simple',
 '12A Syllable Structure_2 Moderately complex',
 '12A Syllable Structure_3 Complex',
 '12A Syllable Structure_nan',
 '13A Tone_1 No tones',
 '13A Tone_2 Simple tone system',
 '13A Tone_3 Complex tone system',
 '13A Tone_nan',
 '14A Fixed Stress Locations_1 No fixed stress',
 '14A Fixed Stress Locations_2 Initial',
 '14A Fixed Stress Locations_3 Second',
 '14A Fixed Stress Locations_5 Antepenultimate',
 '14A Fixed Stress Locations_6 Penultimate',
 '14A Fixed Stress Locations_7 Ultimate',
 '14A Fixed Stress Locations_nan',
 '15A Weight-Sensitive Stress_1 Left-edge: First or second',
 '15A Weight-Sensitive Stress_3 Right-edge: Ultimate or penultimate',
 '15A Weight-Sensitive Stress_4 Right-oriented: One of the last three',
 '15A Weight-Sensitive Stress_5 Unbounded: Stress can be anywhere',
 '15A Weight-Sensitive Stress_6 Combined: Right-edge and unbounded',
 '15A Weight-Sensitive Stress_7 Not predictable',
 '15A Weight-Sensitive Stress_8 Fixed stress (no weight-sensitivity)',
 '15A Weight-Sensitive Stress_nan',
 '16A Weight Factors in Weight-Sensitive Stress Systems_1 No weight',
 '16A Weight Factors in Weight-Sensitive Stress Systems_2 Long vowel',
 '16A Weight Factors in Weight-Sensitive Stress Systems_3 Coda consonant',
 '16A Weight Factors in Weight-Sensitive Stress Systems_4 Long vowel or coda consonant',
 '16A Weight Factors in Weight-Sensitive Stress Systems_5 Prominence',
 '16A Weight Factors in Weight-Sensitive Stress Systems_6 Lexical stress',
 '16A Weight Factors in Weight-Sensitive Stress Systems_7 Combined',
 '16A Weight Factors in Weight-Sensitive Stress Systems_nan',
 '17A Rhythm Types_1 Trochaic',
 '17A Rhythm Types_2 Iambic',
 '17A Rhythm Types_3 Dual: both trochaic and iambic',
 '17A Rhythm Types_4 Undetermined',
 '17A Rhythm Types_5 No rhythmic stress',
 '17A Rhythm Types_nan',
 '18A Absence of Common Consonants_1 All present',
 '18A Absence of Common Consonants_2 No bilabials',
 '18A Absence of Common Consonants_3 No fricatives',
 '18A Absence of Common Consonants_4 No nasals',
 '18A Absence of Common Consonants_nan',
 '19A Presence of Uncommon Consonants_1 None',
 '19A Presence of Uncommon Consonants_2 Clicks',
 '19A Presence of Uncommon Consonants_3 Labial-velars',
 '19A Presence of Uncommon Consonants_4 Pharyngeals',
 "19A Presence of Uncommon Consonants_5 'Th' sounds",
 '19A Presence of Uncommon Consonants_7 Pharyngeals and "th"',
 '19A Presence of Uncommon Consonants_nan']

# Assemble the modelling frame. Column order:
#   1. one column per family named in `largest_families` (the classification targets),
#   2. the phonological feature columns,
#   3. the language name, kept so misclassified languages can be reported by name.
# Keep this order stable: other cells may index this frame by column position.
# (The family list is `largest_families`; the previous `large_families` was an
# undefined name and raised NameError on a fresh kernel.)
phonology = pd.concat(
    [data[largest_families], data[phonological_features], data['Name']],
    axis=1,
)

phonology.describe()

Unnamed: 0,Sino-Tibetan,Trans-New Guinea,Indo-European,Niger-Congo,Austronesian,Afro-Asiatic,1A Consonant Inventories_1 Small,1A Consonant Inventories_2 Moderately small,1A Consonant Inventories_3 Average,1A Consonant Inventories_4 Moderately large,...,18A Absence of Common Consonants_3 No fricatives,18A Absence of Common Consonants_4 No nasals,18A Absence of Common Consonants_nan,19A Presence of Uncommon Consonants_1 None,19A Presence of Uncommon Consonants_2 Clicks,19A Presence of Uncommon Consonants_3 Labial-velars,19A Presence of Uncommon Consonants_4 Pharyngeals,19A Presence of Uncommon Consonants_5 'Th' sounds,"19A Presence of Uncommon Consonants_7 Pharyngeals and ""th""",19A Presence of Uncommon Consonants_nan
count,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,...,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0
mean,0.123077,0.153846,0.169231,0.2,0.230769,0.123077,0.138462,0.153846,0.369231,0.276923,...,0.030769,0.0,0.046154,0.692308,0.015385,0.092308,0.046154,0.107692,0.0,0.046154
std,0.331082,0.363609,0.377874,0.403113,0.424604,0.331082,0.348072,0.363609,0.486352,0.450961,...,0.174036,0.0,0.211451,0.46513,0.124035,0.291712,0.211451,0.312404,0.0,0.211451
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0


In [71]:
# Feature matrix shared by every classifier below. The columns are selected by
# name (the `phonological_features` list) instead of by hard-coded positions
# 6..112, so the selection stays correct if the number of family columns or
# features changes. It yields the same columns, in the same order.
phonology_features = phonology[phonological_features]

# Plain list of the feature column names, used to label the exported trees.
phonology_feature_names = phonology_features.columns.to_list()

In [73]:
#As we increase max depth, how does the accuracy of the classifier change? Let's start with Indo-European.
# NOTE: the tree is scored on the same rows it was fit on, so this is training
# accuracy; it is expected to climb towards 1.0 as the depth grows.

y_true = phonology['Indo-European']

for i in np.arange(1,10):
    # random_state pins the tie-breaking between equally good splits, so the
    # printed accuracies are the same on every run.
    decisionTree = tree.DecisionTreeClassifier(max_depth=i, random_state=0)
    decisionTree = decisionTree.fit(phonology_features, y_true)
    y_pred = decisionTree.predict(phonology_features)
    print("max depth: ",i," accuracy: ",metrics.accuracy_score(y_true = y_true, y_pred = y_pred))

max depth:  1  accuracy:  0.8769230769230769
max depth:  2  accuracy:  0.8769230769230769
max depth:  3  accuracy:  0.9230769230769231
max depth:  4  accuracy:  0.9846153846153847
max depth:  5  accuracy:  1.0
max depth:  6  accuracy:  1.0
max depth:  7  accuracy:  1.0
max depth:  8  accuracy:  1.0
max depth:  9  accuracy:  1.0


In [75]:
#Sino Tibetan
# Same depth sweep as above; accuracy is again measured on the training rows.

y_true = phonology['Sino-Tibetan']

for i in np.arange(1,10):
    # random_state pins the tie-breaking between equally good splits, so the
    # printed accuracies are the same on every run.
    decisionTree = tree.DecisionTreeClassifier(max_depth=i, random_state=0)
    decisionTree = decisionTree.fit(phonology_features, y_true)
    y_pred = decisionTree.predict(phonology_features)
    print("max depth: ",i," accuracy: ",metrics.accuracy_score(y_true = y_true, y_pred = y_pred))

max depth:  1  accuracy:  0.8769230769230769
max depth:  2  accuracy:  0.9076923076923077
max depth:  3  accuracy:  0.9538461538461539
max depth:  4  accuracy:  1.0
max depth:  5  accuracy:  1.0
max depth:  6  accuracy:  1.0
max depth:  7  accuracy:  1.0
max depth:  8  accuracy:  1.0
max depth:  9  accuracy:  1.0


In [77]:
#Austronesian
# Same depth sweep as above; accuracy is again measured on the training rows.

y_true = phonology['Austronesian']

for i in np.arange(1,10):
    # random_state pins the tie-breaking between equally good splits, so the
    # printed accuracies are the same on every run.
    decisionTree = tree.DecisionTreeClassifier(max_depth=i, random_state=0)
    decisionTree = decisionTree.fit(phonology_features, y_true)
    y_pred = decisionTree.predict(phonology_features)
    print("max depth: ",i," accuracy: ",metrics.accuracy_score(y_true = y_true, y_pred = y_pred))

max depth:  1  accuracy:  0.7692307692307693
max depth:  2  accuracy:  0.9384615384615385
max depth:  3  accuracy:  0.9538461538461539
max depth:  4  accuracy:  0.9846153846153847
max depth:  5  accuracy:  1.0
max depth:  6  accuracy:  1.0
max depth:  7  accuracy:  1.0
max depth:  8  accuracy:  1.0
max depth:  9  accuracy:  1.0


In [83]:
#Now let's make a decision tree, starting again with Indo-European using a max depth of 3

y_true = phonology['Indo-European']

# random_state pins the tie-breaking between equally good splits, so the tree
# printed below is the same on every run.
decisionTree = tree.DecisionTreeClassifier(max_depth=3, random_state=0)

decisionTree = decisionTree.fit(phonology_features, y_true)

# Predictions on the training rows; computed once and reused below.
y_pred = decisionTree.predict(phonology_features)

print(metrics.accuracy_score(y_true = y_true, y_pred = y_pred))

#Printing the tree
print(tree.export_text(decisionTree, feature_names = phonology_feature_names))

#Printing which languages were misclassified
# Walk names, true labels and predictions in parallel instead of re-running
# predict() on the whole frame for every row.
for language_name, True_value, a in zip(phonology['Name'], y_true, y_pred):
    if True_value != a:
        # Look the family up in the full table, since `phonology` only holds
        # the per-family indicator columns.
        true_family = data.loc[data['Name'] == language_name, 'family']
        print("misclassified:",language_name, "\n   true family: ", true_family.to_string(index=False))

0.9230769230769231
|--- 6A Uvular Consonants_3 Uvular continuants only <= 0.50
|   |--- 12A Syllable Structure_3 Complex <= 0.50
|   |   |--- 4A Voicing in Plosives and Fricatives_nan <= 0.50
|   |   |   |--- class: 0
|   |   |--- 4A Voicing in Plosives and Fricatives_nan >  0.50
|   |   |   |--- class: 0
|   |--- 12A Syllable Structure_3 Complex >  0.50
|   |   |--- 9A The Velar Nasal_1 Initial velar nasal <= 0.50
|   |   |   |--- class: 1
|   |   |--- 9A The Velar Nasal_1 Initial velar nasal >  0.50
|   |   |   |--- class: 0
|--- 6A Uvular Consonants_3 Uvular continuants only >  0.50
|   |--- class: 1

misclassified: Arabic (Egyptian) 
   true family:  Afro-Asiatic
misclassified: Beja 
   true family:  Afro-Asiatic
misclassified: Berber (Middle Atlas) 
   true family:  Afro-Asiatic
misclassified: Irish 
   true family:  Indo-European
misclassified: Spanish 
   true family:  Indo-European


According to this model, if a language has only continuant uvular consonants, it is Indo-European.
If not, but it has a complex syllable structure and no initial velar nasal, it is also Indo-European. 
Otherwise, it's not Indo-European.

With the larger dataset, I expect uvular continuants will not be so important.

Spanish was misclassified because its syllable structure is only moderately complex. 

All three misclassified non-IE languages were Afro-Asiatic: Egyptian Arabic, Beja, and Berber (Middle Atlas). Beja is a language of Sudan with a complex syllable structure that lacks uvular consonants and velar nasals. Berber (Middle Atlas) is a language of Morocco that is similarly misclassified.

Why might this be?

How about Austronesian?

In [86]:
#Austronesian, using a max depth of 1

y_true = phonology['Austronesian']

# random_state pins the tie-breaking between equally good splits, so the tree
# printed below is the same on every run.
decisionTree = tree.DecisionTreeClassifier(max_depth=1, random_state=0)

decisionTree = decisionTree.fit(phonology_features, y_true)

# Predictions on the training rows; computed once and reused below.
y_pred = decisionTree.predict(phonology_features)

print(metrics.accuracy_score(y_true = y_true, y_pred = y_pred))

print(tree.export_text(decisionTree, feature_names = phonology_feature_names))

# Report every language whose prediction differs from its true label. Names,
# labels and predictions are walked in parallel instead of re-running
# predict() on the whole frame for every row.
for language_name, True_value, a in zip(phonology['Name'], y_true, y_pred):
    if True_value != a:
        # Look the family up in the full table, since `phonology` only holds
        # the per-family indicator columns.
        true_family = data.loc[data['Name'] == language_name, 'family']
        print("misclassified:",language_name, "\n   true family: ", true_family.to_string(index=False))

0.7692307692307693
|--- 9A The Velar Nasal_1 Initial velar nasal <= 0.50
|   |--- class: 0
|--- 9A The Velar Nasal_1 Initial velar nasal >  0.50
|   |--- class: 0

misclassified: Batak (Karo) 
   true family:  Austronesian
misclassified: Chamorro 
   true family:  Austronesian
misclassified: Drehu 
   true family:  Austronesian
misclassified: Fijian 
   true family:  Austronesian
misclassified: Indonesian 
   true family:  Austronesian
misclassified: Kilivila 
   true family:  Austronesian
misclassified: Kiribati 
   true family:  Austronesian
misclassified: Malagasy 
   true family:  Austronesian
misclassified: Maori 
   true family:  Austronesian
misclassified: Paiwan 
   true family:  Austronesian
misclassified: Paamese 
   true family:  Austronesian
misclassified: Rapanui 
   true family:  Austronesian
misclassified: Taba 
   true family:  Austronesian
misclassified: Tagalog 
   true family:  Austronesian
misclassified: Tukang Besi 
   true family:  Austronesian


In [88]:
#Trans New Guinean?

y_true = phonology['Trans-New Guinea']

# random_state pins the tie-breaking between equally good splits, so the tree
# printed below is the same on every run.
decisionTree = tree.DecisionTreeClassifier(max_depth=2, random_state=0)

decisionTree = decisionTree.fit(phonology_features, y_true)

# Predictions on the training rows; computed once and reused below.
y_pred = decisionTree.predict(phonology_features)

print(metrics.accuracy_score(y_true = y_true, y_pred = y_pred))

print(tree.export_text(decisionTree, feature_names = phonology_feature_names))

# Report every language whose prediction differs from its true label. Names,
# labels and predictions are walked in parallel instead of re-running
# predict() on the whole frame for every row.
for language_name, True_value, a in zip(phonology['Name'], y_true, y_pred):
    if True_value != a:
        # Look the family up in the full table, since `phonology` only holds
        # the per-family indicator columns.
        true_family = data.loc[data['Name'] == language_name, 'family']
        print("misclassified:",language_name, "\n   true family: ", true_family.to_string(index=False))

0.9384615384615385
|--- 1A Consonant Inventories_1 Small <= 0.50
|   |--- 5A Voicing and Gaps in Plosive Systems_5 Both missing <= 0.50
|   |   |--- class: 0
|   |--- 5A Voicing and Gaps in Plosive Systems_5 Both missing >  0.50
|   |   |--- class: 1
|--- 1A Consonant Inventories_1 Small >  0.50
|   |--- 9A The Velar Nasal_1 Initial velar nasal <= 0.50
|   |   |--- class: 1
|   |--- 9A The Velar Nasal_1 Initial velar nasal >  0.50
|   |   |--- class: 0

misclassified: Amele 
   true family:  Trans-New Guinea
misclassified: Hamtai 
   true family:  Trans-New Guinea
misclassified: Kewa 
   true family:  Trans-New Guinea
misclassified: Kobon 
   true family:  Trans-New Guinea
