Transformer for XML files from Stack Overflow: converts each raw XML feed file into a CSV file.

In [2]:
import os
import glob
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import csv

In [3]:
# Project root: the parent of the current working directory.
# A notebook has no __file__, so the working directory is the anchor; asking
# for it explicitly replaces the former abspath(__name__) trick, which only
# used the module name as a throw-away path component to reach the same place.
BASE = os.path.dirname(os.getcwd())

# Directory that is searched for the raw Stack Overflow XML files.
folder = os.path.join(BASE, "data", "raw", "stackoverflow")

# Directory that receives the converted CSV files.
csv_folder = os.path.join(BASE, "data", "cleaned", "stackoverflow")

In [4]:
# Every XML file in the raw-data folder. glob.glob() returns matches in
# arbitrary order, so sort them to make the conversion order (and the log
# printed while converting) reproducible from run to run.
files = sorted(glob.glob(os.path.join(folder, "*.xml")))

In [5]:
# Clark-notation prefix of the Atom elements nested inside each <item>.
ATOM_NS = '{http://www.w3.org/2005/Atom}'

# Column order shared by every generated CSV file.
fieldnames = [
    'id', 'published_on', 'updated_on', 'company',
    'slug', 'title', 'link', 'content', 'tags'
]

# Create the output directory up front so open() below cannot fail on a
# missing folder.
os.makedirs(csv_folder, exist_ok=True)

# Convert each XML file into a CSV file of the same base name, writing one
# row per <item> element.
for file in files:

    # Parse before opening the CSV so a malformed XML file does not leave an
    # empty/truncated CSV behind.
    tree = ET.parse(file)
    root = tree.getroot()

    filename = os.path.basename(file)
    csv_filename = os.path.splitext(filename)[0] + ".csv"
    csv_path = os.path.join(csv_folder, csv_filename)

    print("Converting xml to csv....")

    # newline='' is required by the csv module so that newlines embedded in
    # fields and the writer's own line terminator are emitted correctly;
    # the explicit encoding keeps the output independent of the platform
    # locale.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:

        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for elem in root.iter(tag='item'):

            # findtext() yields None when a child element is absent, which
            # the csv writer emits as an empty field, so one incomplete item
            # no longer aborts the whole conversion with an AttributeError.
            link = elem.findtext('link') or ''

            data = {
                'id': elem.findtext('guid'),
                'published_on': elem.findtext('pubDate'),
                'updated_on': elem.findtext(f'{ATOM_NS}updated'),
                'company': elem.findtext(f'{ATOM_NS}author/{ATOM_NS}name'),
                # The slug is the last path segment of the item's link.
                'slug': link.split("/")[-1],
                'title': elem.findtext('title'),
                'link': link,
                'content': elem.findtext('description'),
                # Kept as a list: the csv writer stringifies it, so the
                # column holds the list's Python repr, e.g. "['a', 'b']".
                'tags': [cat.text for cat in elem.findall('category')],
            }

            writer.writerow(data)

    # Reported only after the with-block has closed (and flushed) the file.
    print(f"\t{csv_filename} saved")

Converting xml to csv....
	2017-10-06-14-03.csv saved
Converting xml to csv....
	2017-10-07-19-19.csv saved
Converting xml to csv....
	2017-10-10-20-07.csv saved
Converting xml to csv....
	2017-10-11-21-49.csv saved
Converting xml to csv....
	2017-10-12-15-54.csv saved
Converting xml to csv....
	2017-10-13-11-56.csv saved
Converting xml to csv....
	2017-10-14-17-40.csv saved
Converting xml to csv....
	2017-10-15-11-47.csv saved
Converting xml to csv....
	2017-10-17-08-51.csv saved
Converting xml to csv....
	2017-10-17-21-55.csv saved
Converting xml to csv....
	2017-10-18-16-57.csv saved
Converting xml to csv....
	2017-10-18-21-55.csv saved
Converting xml to csv....
	2017-10-19-21-05.csv saved
Converting xml to csv....
	2017-10-20-11-22.csv saved
Converting xml to csv....
	2017-10-22-09-27.csv saved
Converting xml to csv....
	2017-10-22-09-28.csv saved
Converting xml to csv....
	2017-10-25-09-46.csv saved
Converting xml to csv....
	2017-10-26-13-39.csv saved
