In [1]:
from xml.etree import ElementTree as ET

In [2]:
# Parse the countries document from disk into an ElementTree.
# NOTE(review): `xml` is not referenced by any later cell visible here; the cells
# below re-read the same file as text and parse it with ET.fromstring instead.
xml = ET.parse("countries-full.xml")

In [3]:
class TreeNode:
    """A named tree node that keeps its children in an ordered list."""

    def __init__(self, name):
        """Create a node called ``name`` with no children yet."""
        self.children = []
        self.name = name

    def pprint(self, level=0):
        """Print this node's name and then every descendant, depth-first.

        Each line is indented by one space per level of depth; ``level`` is the
        depth of this node (0 for the node the call starts from).
        """
        indent = ' ' * level
        print(indent + self.name)
        child_level = level + 1
        for child in self.children:
            child.pprint(level=child_level)

In [4]:
def recurse(paths, current_path, xml_element):
    """Add the tag path of every leaf element under ``xml_element`` to ``paths``.

    paths: set that is updated in place with the '/'-joined tag paths of leaves.
    current_path: path of the parent element ('' when called on the root).
    xml_element: element to walk; an element with no children counts as a leaf.
    """
    element_path = current_path + '/' + xml_element.tag
    if not len(xml_element):
        # No child elements: this is a leaf, so record its full path.
        paths.add(element_path)
        return
    for child_element in xml_element:
        recurse(paths, element_path, child_element)
def get_paths_from_xml(xml_string):
    """Return the set of distinct root-to-leaf tag paths in an XML document.

    xml_string: the XML document as a string.
    Returns a set of paths such as '/Countries/Country/name'.
    """
    root_element = ET.fromstring(xml_string)
    leaf_paths = set()
    # The root has no parent, so the walk starts from an empty path prefix.
    recurse(leaf_paths, '', root_element)
    return leaf_paths

In [5]:
def collect_instance_data(xml_string):
    """Group the text of every leaf element by that element's tag path.

    xml_string: the XML document as a string.
    Returns a dict mapping each leaf path (e.g. '/Countries/Country/name') to the
    list of ``.text`` values found at that path, in document order. A leaf with
    no text contributes ``None``.
    """
    values_by_path = {}
    # Explicit stack of (parent path, element) pairs; children are pushed in
    # reverse so they are popped, and therefore visited, in document order.
    pending = [('', ET.fromstring(xml_string))]
    while pending:
        parent_path, element = pending.pop()
        element_path = parent_path + '/' + element.tag
        if len(element) == 0:
            values_by_path.setdefault(element_path, []).append(element.text)
        else:
            pending.extend((element_path, child) for child in reversed(list(element)))
    return values_by_path

In [6]:
# Read the whole countries document as text; the helpers above parse from a string.
# (The former `xml_text = ''` pre-initialisation was a dead store: the value was
# always overwritten by the read below before being used.)
with open('countries-full.xml') as xml_file:
    xml_text = xml_file.read()
# Last expression of the cell: display the distinct leaf paths of the document.
get_paths_from_xml(xml_text)

{'/Countries/Country/area',
 '/Countries/Country/callingCode',
 '/Countries/Country/capital',
 '/Countries/Country/currency',
 '/Countries/Country/languages',
 '/Countries/Country/latlng',
 '/Countries/Country/name'}

In [7]:
# Read the second sample document through a context manager so the file handle is
# closed as soon as the text has been read (a bare open(...).read() leaves the
# handle open until it happens to be garbage-collected).
with open('food.xml') as food_xml_file:
    food_xml_text = food_xml_file.read()
# Last expression of the cell: display the distinct leaf paths of the document.
get_paths_from_xml(food_xml_text)

{'/breakfast_menu/food/calories',
 '/breakfast_menu/food/description',
 '/breakfast_menu/food/name',
 '/breakfast_menu/food/price'}

In [8]:
# Collect the text of every leaf element of the countries document, grouped by tag path.
countries_instance_data = collect_instance_data(xml_text)

In [9]:
# List the leaf paths for which values were collected.
countries_instance_data.keys()

dict_keys(['/Countries/Country/latlng', '/Countries/Country/capital', '/Countries/Country/currency', '/Countries/Country/callingCode', '/Countries/Country/area', '/Countries/Country/languages', '/Countries/Country/name'])

In [10]:
# Dump the raw currency values; as the output shows, one entry may hold several
# comma-separated codes (e.g. 'SHP,GBP') and missing values appear as the string 'nan'.
print(countries_instance_data['/Countries/Country/currency'])

['AWG', 'AFN', 'AOA', 'XCD', 'EUR', 'ALL', 'EUR', 'AED', 'ARS', 'AMD', 'USD', 'nan', 'EUR', 'XCD', 'AUD', 'EUR', 'AZN', 'BIF', 'EUR', 'XOF', 'XOF', 'BDT', 'BGN', 'BHD', 'BSD', 'BAM', 'EUR', 'SHP,GBP', 'BYN', 'BZD', 'BMD', 'BOB,BOV', 'USD', 'BRL', 'BBD', 'BND', 'BTN,INR', 'NOK', 'BWP', 'XAF', 'CAD', 'AUD', 'CHE,CHF,CHW', 'CLF,CLP', 'CNY', 'XOF', 'XAF', 'CDF', 'XAF', 'NZD,CKD', 'COP', 'KMF', 'CVE', 'CRC', 'CUC,CUP', 'ANG', 'AUD', 'KYD', 'EUR', 'CZK', 'EUR', 'DJF', 'XCD', 'DKK', 'DOP', 'DZD', 'USD', 'EGP', 'ERN', 'MAD,DZD,MRO', 'EUR', 'EUR', 'ETB', 'EUR', 'FJD', 'FKP', 'EUR', 'DKK', 'USD', 'XAF', 'GBP', 'GEL', 'GBP', 'GHS', 'GIP', 'GNF', 'EUR', 'GMD', 'XOF', 'XAF', 'EUR', 'XCD', 'DKK', 'GTQ', 'EUR', 'USD', 'GYD', 'HKD', 'AUD', 'HNL', 'HRK', 'HTG,USD', 'HUF', 'IDR', 'GBP', 'INR', 'USD', 'EUR', 'IRR', 'IQD', 'ISK', 'ILS', 'EUR', 'JMD', 'GBP', 'JOD', 'JPY', 'KZT', 'KES', 'KGS', 'KHR', 'AUD', 'XCD', 'KRW', 'EUR', 'KWD', 'LAK', 'LBP', 'LRD', 'LYD', 'XCD', 'CHF', 'LKR', 'LSL,ZAR', 'EUR', 'EUR',

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Character-level, case-preserving count vectoriser fitted on the latlng values only.
vectorizer = CountVectorizer(analyzer='char', lowercase=False)
# NOTE(review): `vectors` is not referenced by any later cell visible here.
vectors = vectorizer.fit_transform(countries_instance_data['/Countries/Country/latlng'])
# Last expression of the cell: show which characters make up the learned vocabulary.
vectorizer.vocabulary_

{',': 0,
 '-': 1,
 '.': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12}

In [12]:
# Pull out the three value lists (currency, capital, latlng) used in the experiments below.
currency_data_full = countries_instance_data['/Countries/Country/currency']
capital_data_full = countries_instance_data['/Countries/Country/capital']
latlong_data_full = countries_instance_data['/Countries/Country/latlng']

In [13]:
def split_into_two(data):
    """Split a sequence into two halves and return them as a ``(first, second)`` pair.

    For an odd number of items the second half receives the extra item.
    """
    midpoint = len(data) // 2
    first_half = data[:midpoint]
    second_half = data[midpoint:]
    return first_half, second_half

# Halve each value list so one half can be fitted on and the other held back.
currency_data_1, currency_data_2 = split_into_two(currency_data_full)
# Fixed typo: the second half was previously bound to `latlang_data_2`, which did
# not match the `latlong_*` naming used for every other latlng variable.
latlong_data_1, latlong_data_2 = split_into_two(latlong_data_full)
# Old misspelt name kept as an alias in case any other cell still refers to it.
latlang_data_2 = latlong_data_2
capital_data_1, capital_data_2 = split_into_two(capital_data_full)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
def create_pipeline():
    """Build an unfitted pipeline: character-count vectoriser feeding a linear regressor.

    The vectoriser counts characters and preserves case. The steps are named
    'vectorizer' and 'regressor'.
    NOTE: a later cell of this notebook defines another ``create_pipeline`` (with a
    decision-tree classifier) which replaces this one once that cell has run.
    """
    return Pipeline([
        ('vectorizer', CountVectorizer(analyzer='char', lowercase=False)),
        ('regressor', LinearRegression()),
    ])

In [15]:
# Instantiate an unfitted pipeline from the create_pipeline() currently in scope.
currency_data_model = create_pipeline()

In [16]:
# The pipeline is fitted here without any target values, which its final
# LinearRegression step rejects. The failure is kept as a demonstration, but it is
# caught and printed so that "Restart & Run All" can continue past this cell.
# ValueError is caught as well in case another scikit-learn version reports the
# missing targets with a different exception type.
try:
    currency_data_model.fit(currency_data_1)
except (TypeError, ValueError) as fit_error:
    print('{}: {}'.format(type(fit_error).__name__, fit_error))

TypeError: Singleton array array(None, dtype=object) cannot be considered a valid collection.

In [18]:
# Build one labelled data set: every currency, latlng and capital value becomes a
# sample in `x`, and `y` holds the matching class label at the same position.
x = [*currency_data_full, *latlong_data_full, *capital_data_full]
y = (
    ['currency'] * len(currency_data_full)
    + ['latlong'] * len(latlong_data_full)
    + ['capital'] * len(capital_data_full)
)


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# Turn every sample string into a dense row of per-character counts.
# NOTE(review): the vocabulary is fitted on all samples here, i.e. before the
# train/test split performed in the next cell.
X = CountVectorizer(analyzer='char', lowercase=False).fit_transform(x).toarray()

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the vectorised samples for evaluation. A fixed random_state makes
# the shuffle, and therefore the scores reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [21]:
# Sanity check: number of samples in each of the four parts of the split.
list(map(len, [X_train, X_test, y_train, y_test]))

[502, 248, 502, 248]

In [22]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree with default settings on the vectorised training split.
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [23]:
# Score the tree on the same data it was fitted on.
classifier.score(X_train, y_train)

0.99800796812749

In [24]:
# Score the tree on the held-out split.
classifier.score(X_test, y_test)

0.9879032258064516

In [25]:
from sklearn.pipeline import Pipeline
# Same vectoriser + decision tree as above, bundled into one estimator so it can be
# fitted on, and predict from, raw strings directly.
# NOTE(review): the second step name is spelled 'desicion_tree_classifier'; it is a
# runtime identifier, so it is left as-is here.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(analyzer='char', lowercase=False)),
    ('desicion_tree_classifier', DecisionTreeClassifier())
])

In [26]:
# Split the raw strings (not the pre-vectorised array) with the default test size.
# A fixed random_state makes the shuffle, and the score reported below, reproducible.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier array split.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [27]:
# Sanity check: number of samples in each of the four parts of the new split.
list(map(len, [X_train, X_test, y_train, y_test]))

[562, 188, 562, 188]

In [28]:
# Fit the vectoriser + tree pipeline on the raw training strings, then score it on
# the held-out strings.
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.9946808510638298

In [29]:
# Spot-check the fitted pipeline on two hand-written strings.
pipeline.predict(['Didfda', 'DDR'])

array(['capital', 'currency'], dtype='<U8')

In [30]:
# Read the two countries-gen documents as text; each file is closed before moving on.
with open('countries-gen_1.xml') as file1:
    xml1 = file1.read()
with open('countries-gen_2.xml') as file2:
    xml2 = file2.read()

In [33]:
from schemamatching import get_features
# Extract features from both documents with the project's get_features helper.
# NOTE(review): get_features is defined outside this notebook; the cells below
# index its result with 'item' and 'tag' — confirm the exact return type there.
xml1_features = get_features(xml1)
xml2_features = get_features(xml2)

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
def create_pipeline():
    """Build an unfitted pipeline: character-count vectoriser feeding a decision tree.

    The vectoriser counts characters and preserves case. The steps are named
    'vectorizer' and 'classifier'.
    NOTE: this replaces the ``create_pipeline`` defined in an earlier cell of this
    notebook, which ended in a linear regressor.
    """
    return Pipeline([
        ('vectorizer', CountVectorizer(analyzer='char', lowercase=False)),
        ('classifier', DecisionTreeClassifier()),
    ])

In [35]:
# Build a fresh pipeline (rebinding `pipeline` from the earlier cells) and fit it on
# the first document: its 'item' entries are the inputs, its 'tag' entries the labels.
pipeline = create_pipeline()
pipeline.fit(xml1_features['item'], xml1_features['tag'])

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
      ...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [36]:
# Score the pipeline fitted on the first document against the second document's
# items and tags.
pipeline.score(xml2_features['item'], xml2_features['tag'])

0.6901818181818182

In [38]:
from schemamatching import compare_xmls