In [60]:

import numpy
import scipy
import gensim
import keras
import tensorflow

In [61]:
feature_names = [
    "number_of_words",
    "number_of_unique_words",
    "number_of_sentences",
    "number_of_long_words",
    "number_of_monosyllable_words",
    "number_of_polsyllable_words",
    "average_number_of_syllables",
    "flesch_readability_ease",
    "first_person_pronouns",
    "second_person_pronouns",
    "third_person_pronouns",
    "conjunction_count",
    "modal_verb_count",
    "number_of_hedge_words",
    "number_of_weasel_words"
]

In [70]:
import xml.etree.ElementTree as ET
from collections import Counter
import numpy as np
import os

def read_files(cols):
    """
    For each xml file return a matrix of values asked for
    """
    path = 'data/train/'
    possibilities = ['mixture of true and false', 'mostly false', 'no factual content', 'mostly true']
    for filename in os.listdir(path):
        data_row = []
        if not filename.endswith('.xml'): continue
        xmlfile = os.path.join(path, filename)
        tree = ET.parse(xmlfile)
        if cols == "mainText":
            if tree.find("mainText").text:
                yield tree.find("mainText").text
            else:
                yield ''
        elif cols == "veracity":
            v = possibilities.index(tree.find("veracity").text)
            yield v
        else:
            for col in cols:
                try:
                    data_row.append(int(tree.find(col).text))
                except:
                    data_row.append(0)
            yield data_row


def feature_matrix(cols):
    data = []
    for row in read_files(cols):
        data.append(row)
    return np.array(data)

def get_document_text():
    data = []
    for row in read_files("mainText"):
        data.append(row)
    return data

def get_veracity():
    data = []
    for row in read_files("veracity"):
        data.append(row)
    return data

def data_distribution(col):
    """
    Return the statistics for each feature
    """
    title, distribution = "" , ""
    path = 'articles/'
    possibilities = ['mixture of true and false', 'mostly false', 'no factual content', 'mostly true']
    stats = [[],[],[],[]]
    for filename in os.listdir("articles/"):
        if not filename.endswith('.xml'): continue
        xmlfile = os.path.join(path, filename)
        tree = ET.parse(xmlfile)
        v = possibilities.index(tree.find("veracity").text)
        try:
            stats[v].append(int(tree.find(col).text))
        except:
            stats[v].append(0)
    if len(col) < 30: col += ("." * (30 - len(col)))
    title = "\t".join([col, "docs", "max","min","mode", "mean"]) + "\n"
    print(title)
    for i,stat in enumerate(stats):
        mean = sum(stat) / len(stat)
        mode = Counter(stat).most_common(1)
        Y = possibilities[i]
        if len(Y) < 30: Y += ("." * (30-len(Y)))
        distribution += "\t".join([Y, str(len(stat)), str(max(stat)), str(min(stat)), str(mode), str(mean)]) + "\n"
        print(distribution)
    return title, distribution

def write_to_feature_distribution_file():
    with open("feature_characteristics.tsv", "w") as f:
        for feature in feature_names:
            title, distribution = data_distribution(feature)
            f.write(title)
            f.write(distribution)
            f.write("\n\n")

if __name__ == '__main__':
    write_to_feature_distribution_file()

number_of_words...............	docs	max	min	mode	mean

mixture of true and false.....	212	0	0	[(0, 212)]	0.0

mixture of true and false.....	212	0	0	[(0, 212)]	0.0
mostly false..................	87	0	0	[(0, 87)]	0.0

mixture of true and false.....	212	0	0	[(0, 212)]	0.0
mostly false..................	87	0	0	[(0, 87)]	0.0
no factual content............	64	0	0	[(0, 64)]	0.0

mixture of true and false.....	212	0	0	[(0, 212)]	0.0
mostly false..................	87	0	0	[(0, 87)]	0.0
no factual content............	64	0	0	[(0, 64)]	0.0
mostly true...................	1264	0	0	[(0, 1264)]	0.0

number_of_unique_words........	docs	max	min	mode	mean

mixture of true and false.....	212	0	0	[(0, 212)]	0.0

mixture of true and false.....	212	0	0	[(0, 212)]	0.0
mostly false..................	87	0	0	[(0, 87)]	0.0

mixture of true and false.....	212	0	0	[(0, 212)]	0.0
mostly false..................	87	0	0	[(0, 87)]	0.0
no factual content............	64	0	0	[(0, 64)]	0.0

mixture of true and false.....	21

number_of_weasel_words........	docs	max	min	mode	mean

mixture of true and false.....	212	0	0	[(0, 212)]	0.0

mixture of true and false.....	212	0	0	[(0, 212)]	0.0
mostly false..................	87	0	0	[(0, 87)]	0.0

mixture of true and false.....	212	0	0	[(0, 212)]	0.0
mostly false..................	87	0	0	[(0, 87)]	0.0
no factual content............	64	0	0	[(0, 64)]	0.0

mixture of true and false.....	212	0	0	[(0, 212)]	0.0
mostly false..................	87	0	0	[(0, 87)]	0.0
no factual content............	64	0	0	[(0, 64)]	0.0
mostly true...................	1264	0	0	[(0, 1264)]	0.0



In [78]:
########################################################################################################################
# Read in the data 
########################################################################################################################
possibilities = ['mixture of true and false', 'mostly false', 'no factual content', 'mostly true']
# possibilities = ['mixture of true and false', 'mostly false', 'mostly true']
def read_files(cols, orientation='all',data='all'):
    """
    For each xml file, return a matrix of values requested
    
    orientation: Only read out data for specific political orientations ('mainstream','')
    data: 'all' tests the full dataset, 'partisan' tests just the partisan articles (i.e. not mainstream/non-factual)
    """
    path = 'articles/'
    for filename in os.listdir(path):
        data_row = []
        if not filename.endswith('.xml'): continue
        xmlfile = os.path.join(path, filename)
        tree = ET.parse(xmlfile)

# Testing whole dataset:        
        if data == 'all':
            if not tree.find("mainText").text: continue
            if orientation != "all" and tree.find("orientation").text != orientation:
                continue
        
# Testing only partisan dataset:
        elif data == 'partisan':
            if not tree.find("mainText").text or tree.find("veracity").text == "no factual content": continue
            if orientation == "all" and tree.find("orientation").text == 'mainstream':
                continue    

# For the dataset, yield the text, veracity label from XML files
        if cols == "mainText":
            if tree.find("mainText").text:
                yield tree.find("mainText").text
            else:
                continue
        elif cols == "veracity":
            v = possibilities.index(tree.find("veracity").text)
            yield v
        elif cols == "both":
            if tree.find("mainText").text:
                v = possibilities.index(tree.find("veracity").text)
                yield tree.find("mainText").text, v
            else:
                continue
        else:
            for col in cols:
                try:
                    data_row.append(float(tree.find(col).text))
                except:
                    data_row.append(0.0)
            yield data_row

def feature_matrix(cols):
    """Returns a matrix of the feature values for the files (note feature values have been written into XML)"""
    data = []
    for row in read_files(cols):
        data.append(np.array(row))
    return np.array(data)

def get_document_text():
    """Return list of article maintext - deprecated"""
    data = []
    for row in read_files("mainText"):
        if not row:
            continue
        else:
            data.append(row)
    return data

def get_veracity():
    """Return list of veracity labels - deprecated"""
    data = []
    for row in read_files("veracity"):
        data.append(row)
    return data

def get_document_text_and_veracity():
    # Return lists of document maintext and veracity labels
    docs, preds = [], []
    for row in read_files("both"):
        if not row[0]:
            continue
        else:
            docs.append(row[0])
            preds.append(row[1])
    return docs, preds

In [80]:
if __name__ == '__main__':
    read_files()
    

TypeError: read_files() missing 1 required positional argument: 'cols'