# In [1]:
from os import chdir, listdir, environ, makedirs, rename, chmod, walk, remove, path
from csv import reader,writer,QUOTE_ALL
import re
import codecs
from random import sample

#generate training data
from math import log, sqrt
import math

#UTILS

#Save csv files
def save_csv_file(urls_info_list, name_csv_file):
    """Write rows to a csv file, quoting every field.

    Args:
        urls_info_list: iterable of rows; each row is an iterable of field
            values and is converted to a tuple before being written.
        name_csv_file: path of the output file (opened in "w" mode, so an
            existing file is overwritten).
    """
    # newline='' hands line-ending control to the csv module (otherwise
    # Windows output gets an extra blank line per row); the with-block
    # guarantees the file is closed even if writing a row raises.
    with open(name_csv_file, "w", newline='') as arq_out:
        writer_out = writer(arq_out, delimiter=',', quoting=QUOTE_ALL)
        writer_out.writerows(tuple(url) for url in urls_info_list)


#Create a copy of a csv file whose third column is the second and third columns joined by a space (head + content)
def create_head_cont_csv(filename):
    """Write a copy of a csv file whose third column is columns 2 and 3 joined.

    Every input row ``t`` becomes ``[t[0], t[1], t[1] + " " + t[2]] + t[3:]``
    (presumably url, headline, headline + content -- confirm against the data).
    The result is saved as ``filename[:-4] + '_headContent.csv'``, so
    ``filename`` is expected to end in a 4-character extension such as ".csv".

    Args:
        filename: path of the input csv file.

    Returns:
        The string "completed".

    Raises:
        IndexError: if a row has fewer than three columns.
    """
    subset = []
    # newline='' keeps line breaks embedded in quoted fields intact.
    with open(filename, "r", newline='') as arq_in:
        reader_in = reader(arq_in, delimiter=',', quoting=QUOTE_ALL)
        for t in reader_in:
            headCont = t[1]+" "+t[2]
            subset.append([t[0], t[1], headCont] + t[3:])
    new_namefile_subset = filename[:-4]+'_headContent'+'.csv'
    # Call the helper defined in this file; there is no `utils` name in scope.
    save_csv_file(subset, new_namefile_subset)
    return "completed"

#Create a three-column subset csv file: second column, third column, and both joined by a space
def create_subset_csv_file(filename):
    """Write a three-column subset of a csv file.

    Every input row ``t`` becomes ``[t[1], t[2], t[1] + " " + t[2]]``; the
    first column and any columns after the third are dropped. The result is
    saved as ``filename[:-4] + '_SUBSet.csv'``, so ``filename`` is expected
    to end in a 4-character extension such as ".csv".

    Args:
        filename: path of the input csv file.

    Returns:
        The string "completed".

    Raises:
        IndexError: if a row has fewer than three columns.
    """
    # newline='' keeps line breaks embedded in quoted fields intact.
    with open(filename, "r", newline='') as arq_in:
        reader_in = reader(arq_in, delimiter=',', quoting=QUOTE_ALL)
        subset = [[t[1], t[2], t[1]+" "+t[2]] for t in reader_in]
    new_namefile_subset = filename[:-4]+'_SUBSet'+'.csv'

    save_csv_file(subset, new_namefile_subset)
    return "completed"
    
#Getting domain name from url.
def get_url_domain(url):
    """Return the domain part of a URL, without a leading "www.".

    The domain is the text between the "://" scheme separator (or the start
    of the string when there is no scheme) and the next "/" (or the end of
    the string when there is no path). Anything else in that span, such as
    a port, is kept as-is.

    Args:
        url: the URL string.

    Returns:
        The domain string; "" when ``url`` is empty.
    """
    scheme_pos = url.find('://')
    # str.find returns -1 when the separator is missing; start from 0 then
    # instead of silently skipping the first characters of the URL.
    pos_ini = scheme_pos + 3 if scheme_pos != -1 else 0
    pos_end = url.find('/', pos_ini)
    if pos_end == -1:
        # No path after the host (e.g. "http://example.com"): take the rest.
        pos_end = len(url)
    url_domain = url[pos_ini:pos_end]
    if url_domain.startswith('www.'):
        url_domain = url_domain[4:]
    return url_domain

#Generating a dictionary based on unique domains. That dictionary is indexed by domain names (keys). Each domain name is associated with the total number of urls under that domain (value). 
#input: csv file.
#output: dictionary. {'domain_name_1': total_of_urls_domain_1, 'domain_name_2': total_of_urls_domain_2, ...}
def getUniqueDomains_urls(filename):
    """Count the rows of a csv file per URL domain.

    The URL is taken from the first column of each row and reduced to its
    domain with get_url_domain. Rows whose domain comes back empty are
    counted under the key 'EmptyDomain'.

    Args:
        filename: path of the input csv file.

    Returns:
        A dict mapping each domain name to the number of rows under it.
    """
    counts = {}
    with open(filename, "r") as csv_in:
        for row in reader(csv_in, delimiter=',', quoting=QUOTE_ALL):
            # An empty string is falsy, so `or` supplies the placeholder key.
            domain = get_url_domain(row[0]) or 'EmptyDomain'
            counts[domain] = counts.get(domain, 0) + 1
    return counts

#Adding the missing domain info as a feature (new second column)
def add_domain_info(filename):
    """Write a copy of a csv file with the URL domain inserted as column 2.

    Every input row ``t`` becomes ``[t[0], domain] + t[1:]``, where
    ``domain`` is get_url_domain(t[0]) or 'EmptyDomain' when that is empty.
    The result is saved as ``filename[:-4] + '_complete.csv'``, so
    ``filename`` is expected to end in a 4-character extension such as ".csv".

    Args:
        filename: path of the input csv file.

    Returns:
        The string "completed".
    """
    all_instances = []
    # newline='' keeps line breaks embedded in quoted fields intact.
    with open(filename, "r", newline='') as arq_in:
        reader_in = reader(arq_in, delimiter=',', quoting=QUOTE_ALL)
        for t in reader_in:
            domain_name = get_url_domain(t[0]) or 'EmptyDomain'
            all_instances.append([t[0], domain_name] + t[1:])
    new_namefile = filename[:-4]+'_complete'+'.csv'
    save_csv_file(all_instances, new_namefile)
    return "completed"

# Present information about the data: Total number of domains per dataset, number of urls per domain, total of urls per dataset
def showInfoData(filename):
    """Print per-domain statistics for a csv file.

    Prints the number of unique domains, then one "domain,count" line per
    domain, then the total number of URLs followed by a blank line. The
    counts come from getUniqueDomains_urls.

    Args:
        filename: path of the input csv file.
    """
    domain_counts = getUniqueDomains_urls(filename)
    print ("TOTAL OF UNIQUE DOMAINS: " + str(len(domain_counts)))
    for name, n_urls in domain_counts.items():
        print(name + "," + str(n_urls))
    total_urls = sum(domain_counts.values())
    print ("TOTAL OF URLS: " + str(total_urls) + "\n")

#Save instances to the train file if they belong to a training domain, otherwise to the test file
def saveTrainTestData(filename, update_filename, train_domains):
    """Split the rows of a csv file into train and test files by URL domain.

    A row goes to the train file when the domain of its first column (via
    get_url_domain) is in ``train_domains``; every other row goes to the
    test file. Output paths are ``update_filename[:-4] + '_train.csv'`` and
    ``update_filename[:-4] + '_test.csv'``.

    Args:
        filename: path of the input csv file.
        update_filename: base name for the outputs; expected to end in a
            4-character extension such as ".csv".
        train_domains: iterable of domain names that belong to training.

    Returns:
        The string "completed".
    """
    # Build the lookup once so each row costs a constant-time membership test.
    train_domain_set = set(train_domains)
    # getUniqueDomains_urls reports URLs without a domain under 'EmptyDomain';
    # honour that label so such rows can be assigned to training too.
    empty_is_train = 'EmptyDomain' in train_domain_set
    train_instances = []
    test_instances = []
    # newline='' keeps line breaks embedded in quoted fields intact.
    with open(filename, "r", newline='') as arq_in:
        reader_in = reader(arq_in, delimiter=',', quoting=QUOTE_ALL)
        for t in reader_in:
            domain_name = get_url_domain(t[0])
            if domain_name in train_domain_set or (domain_name == '' and empty_is_train):
                train_instances.append(t)
            else:
                test_instances.append(t)
    new_namefile_train = update_filename[:-4]+'_train'+'.csv'
    new_namefile_test = update_filename[:-4]+'_test'+'.csv'

    save_csv_file(train_instances, new_namefile_train)
    save_csv_file(test_instances, new_namefile_test)
    return "completed"

def get_consecutive_domain(domains, total_train, total_test):
    """Split domains at a fixed position: first part test, remainder train.

    Args:
        domains: sequence of domain names.
        total_train: not used by the split; kept in the signature for callers.
        total_test: number of leading domains assigned to the test set.

    Returns:
        A (train_domains, test_domains) tuple.
    """
    test_part = domains[:total_test]
    train_part = domains[total_test:]
    return train_part, test_part
def isNotInList(current_index, list_index):
    """Return True when current_index is absent from list_index, else False."""
    return current_index not in list_index

def get_random_domain(domains, total_test):
    """Randomly split domains into train and test groups.

    ``total_test`` distinct positions are drawn with random.sample; the
    domains at those positions form the test group (in the sampled order)
    and all remaining domains form the train group (in their original order).

    Args:
        domains: sequence of domain names.
        total_test: how many domains to put in the test group.

    Returns:
        A (train_domains, test_domains) tuple of lists.

    Raises:
        ValueError: from random.sample when total_test is negative or
            larger than len(domains).
    """
    indexes_test = sample(range(len(domains)), total_test)
    # Set lookup keeps the train filter linear instead of quadratic.
    picked = set(indexes_test)

    test_domains = [domains[i] for i in indexes_test]
    train_domains = [d for i, d in enumerate(domains) if i not in picked]
    return train_domains, test_domains

#Split dataset to get training and test dataset for FAKENEWS or MSMNEWS
def getTraningData(filename, percent, index_new_data):
    """Split a csv dataset into train and test files by randomly chosen domains.

    floor(len(domains) * percent / 100) domains are drawn at random as test
    domains; rows are then written by saveTrainTestData using
    ``filename[:-4] + '_' + index_new_data + '.csv'`` as the base name.
    Prints the number of unique domains and the number of URLs that fall
    under the selected test domains.

    Args:
        filename: path of the input csv file; expected to end in a
            4-character extension such as ".csv".
        percent: percentage of unique domains (not of URLs) to use for test.
        index_new_data: string tag inserted into the output file names.
    """
    urls_per_domain = getUniqueDomains_urls(filename)
    domains = list(urls_per_domain)
    print ("TOTAL OF UNIQUE DOMAINS: " + str(len(domains)))

    # Empirical notes from the authors' datasets:
    #   fakenews: 32% of the domains gave 1863 URLs, about 19.6% of the data
    #   msmnews:  19% of the domains gave 1885 URLs, about 19.8% of the data
    total_test = math.floor(len(domains)*percent/100)

    # Random split; get_consecutive_domain offers a fixed (head/tail) split.
    train_domains, test_domains = get_random_domain(domains, total_test)

    # Number of URLs that belong to the selected test domains.
    test_url_count = sum(urls_per_domain[name] for name in test_domains)

    update_filename = filename[:-4]+'_'+index_new_data+'.csv'

    saveTrainTestData(filename, update_filename, train_domains)
    print ("TOTAL OF URLS: " + str(test_url_count)+"\n")


# In [4]:
# path_data = "/Users/.../Fakenews/paper_fn/Experiments/data/politicsFN/"
# showInfoData(path_data+'fakenews_politic.csv')
# showInfoData(path_data+'realnews_politic.csv')

# In [5]:
# getTraningData(path_data+'fakenews_politic.csv', 27, 'REMOVE1')
# getTraningData(path_data+'realnews_politic.csv', 20, 'REMOVE1')