In [1]:
# Google Colab only: mount the user's Google Drive at /content/drive so that
# files stored on the Drive can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Base directory (on the mounted Drive) holding the project's dataset files.
# Keep the trailing slash: file names are appended to it with plain `+`.
folder = '/content/drive/My Drive/ire-major-project/'

# Functions for Parsing and Cleaning the Data

In [3]:
from bs4 import BeautifulSoup as Soup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem.porter import PorterStemmer
import re
import numpy as np
import xml.etree.ElementTree as et
import lxml
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from time import time
import pandas as pd
%matplotlib inline

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop words as a set, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_txt(text):
    """Normalise a piece of raw text.

    Steps, in order: lowercase, drop English stop words, remove numbers,
    remove characters outside the Latin Unicode ranges, replace every
    non-word character (and underscore) with a space, collapse runs of spaces.

    Args:
        text: string to clean. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is ``None`` or empty.
        A single leading/trailing space may remain; it is not stripped.
    """
    # Missing attributes arrive here as None; treat them like empty text
    # instead of failing on None.lower().
    if not text:
        return ""

    # Lowercase
    text = text.lower()
    # Remove stop words (the set is cached, not rebuilt for every call)
    stops = _english_stopwords()
    text = " ".join(w for w in text.split() if w not in stops)
    # Remove numbers (optionally signed, with ., : and , separators)
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", text)
    # Remove non-Latin characters
    text = re.sub(
        u'[^\\x00-\\x7F\\x80-\\xFF\\u0100-\\u017F\\u0180-\\u024F\\u1E00-\\u1EFF]', u'', text)
    # Replace punctuation, other non-word characters and underscores with a space
    text = re.sub(r'[\W_+]', ' ', text)
    # Remove multiple spaces
    text = re.sub(' +', ' ', text)
    return text

def parse_data_xml_helper(article):
    """Turn one parsed ``<article>`` element into a flat record.

    Args:
        article: object exposing ``get(attribute_name)`` and ``get_text()``;
            the caller in this file passes a BeautifulSoup tag.

    Returns:
        dict with the keys ``'id'``, ``'date'`` (the ``published-at``
        attribute), ``'title'`` and ``'article_text'``. Title and text are
        passed through ``clean_txt``; id and date are returned as read.
    """
    # get meta data
    article_id = article.get('id')
    published_at = article.get('published-at')
    # A missing title attribute is read as None; clean an empty string instead
    # so one untitled article cannot abort the whole parsing run.
    title = clean_txt(article.get('title') or '')

    # Hyperlink (<a>) information is not extracted; only the text is kept.
    # get actual text
    article_text = clean_txt(article.get_text())

    return {
        'id': article_id,
        'date': published_at,
        'title': title,
        'article_text': article_text,
    }

def get_data_from_xml(filename):
    """Stream an articles XML file and return one cleaned row per ``<article>``.

    The file is parsed incrementally. Each finished ``<article>`` element is
    serialised, re-parsed with BeautifulSoup (``parse_data_xml_helper`` uses
    the tag API) and then released, so memory use does not grow with the size
    of the file.

    Args:
        filename: path of the XML file to read.

    Returns:
        ``pandas.DataFrame`` built from the dicts returned by
        ``parse_data_xml_helper``, in document order.
    """
    records = []
    # Binary mode lets the XML parser honour the encoding declared in the file
    # instead of decoding with the platform's default text encoding.
    with open(filename, 'rb') as f:
        root = None
        for event, element in et.iterparse(f, events=('start', 'end')):
            if root is None:
                # The first 'start' event always belongs to the document root.
                root = element
            if event != 'end' or element.tag != 'article':
                continue
            markup = et.tostring(element)
            soup = Soup(markup, "xml")
            records.append(parse_data_xml_helper(soup.find('article')))
            # iterparse keeps building the full tree; drop what has already
            # been processed so multi-GB inputs do not accumulate in memory.
            element.clear()
            root.clear()
    return pd.DataFrame(records)

def parse_truth_xml_helper(article):
    """Collect the ground-truth attributes of one ``<article>`` element.

    Args:
        article: object exposing ``get(attribute_name)``; the caller in this
            file passes a BeautifulSoup tag.

    Returns:
        dict with the keys ``'id'``, ``'hyperpartisan'`` and ``'bias'``, each
        holding whatever ``article.get`` returned for that attribute.
    """
    wanted = ('id', 'hyperpartisan', 'bias')
    return {name: article.get(name) for name in wanted}

def get_ground_truth_from_xml(filename):
    """Load a ground-truth XML file into a DataFrame, one row per ``<article>``.

    The whole file is read into memory and parsed with BeautifulSoup's
    ``'lxml'`` parser; progress over the articles is shown with ``tqdm``.

    Args:
        filename: path of the ground-truth XML file.

    Returns:
        ``pandas.DataFrame`` built from the dicts returned by
        ``parse_truth_xml_helper``, in document order.
    """
    print("Loading data from xml")
    # The context manager closes the handle as soon as the text is read,
    # rather than leaving it open until garbage collection.
    with open(filename) as xml_handle:
        xml_text = xml_handle.read()
    soup = Soup(xml_text, 'lxml')
    articles = soup.find_all('article')
    print("Number of articles: ", len(articles))
    truth = [parse_truth_xml_helper(a) for a in tqdm(articles)]
    return pd.DataFrame(truth)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Preprocessing
Swap in one of these filenames to change which article-data file is loaded:
    # [this is smallest] articles-training-byarticle-20181122.xml - 3mb
    # articles-validation-bypublisher-20181122.xml - 894mb
    # articles-training-bypublisher-20181122.xml - 3gb
Swap in one of these filenames to change which ground-truth file is loaded:
    # [this is smallest] ground-truth-training-byarticle-20181122.xml - 109kb
    # ground-truth-validation-bypublisher-20181122.xml - 24mb
    # ground-truth-training-bypublisher-20181122.xml - 100mb

Run the following cell **ONLY ONCE** to save all the parsed dataset files in CSV format in your Drive (loading directly from the raw XML files takes a long time and also uses a lot of extra memory).

In [None]:
def _parse_and_save(parser, xml_name, csv_name):
    """Parse ``folder + xml_name`` with ``parser``, save the frame as CSV, return it."""
    frame = parser(folder + xml_name)
    frame.to_csv(folder + csv_name)
    return frame


# By-publisher training split (largest files).
df_data = _parse_and_save(
    get_data_from_xml,
    'articles-training-bypublisher-20181122.xml',
    'data_training_bypublisher.csv')
df_truth = _parse_and_save(
    get_ground_truth_from_xml,
    'ground-truth-training-bypublisher-20181122.xml',
    'ground_truth_training_bypublisher.csv')

# By-publisher validation split.
df_val_data = _parse_and_save(
    get_data_from_xml,
    'articles-validation-bypublisher-20181122.xml',
    'data_validation_bypublisher.csv')
df_val_truth = _parse_and_save(
    get_ground_truth_from_xml,
    'ground-truth-validation-bypublisher-20181122.xml',
    'ground_truth_validation_bypublisher.csv')

# The small by-article set is kept under the "test" names.
df_test_data = _parse_and_save(
    get_data_from_xml,
    'articles-training-byarticle-20181122.xml',
    'data_training_byarticle.csv')
df_test_truth = _parse_and_save(
    get_ground_truth_from_xml,
    'ground-truth-training-byarticle-20181122.xml',
    'ground_truth_training_byarticle.csv')

100%|██████████| 645/645 [00:00<00:00, 182262.76it/s]

Loading data from xml
Number of articles:  645



