In [1]:
import pandas as pd
import numpy as np
import os
import regex as re
import string
import csv
from collections import OrderedDict
import tensorflow as tf
import math

In [2]:
# The following class and two functions are taken from Wikipedia: https://en.wikipedia.org/wiki/Trie#Algorithms

class Node:
    """One node of a character trie.

    A freshly created node has no children and carries no value.
    """

    def __init__(self):
        # Value associated with the key ending at this node; stays None until
        # something is stored here.
        self.value = None
        # Outgoing edges: single character -> child Node. A plain dict is used,
        # so the lexicographic-sorting trick described in the Wikipedia
        # article's "Sorting" section is not available with this layout.
        self.children = dict()

def find(node, key):
    """Return the value stored for `key` below `node`.

    The trie is walked one character at a time. None is returned as soon as a
    character has no matching child; otherwise the value of the node reached
    after the last character is returned (which is itself None when nothing
    was stored there).
    """
    current = node
    for symbol in key:
        branches = current.children
        if symbol not in branches:
            return None
        current = branches[symbol]
    return current.value
    
def insert(root, string, value):
    """Store `value` under the key `string` in the trie rooted at `root`.

    Existing nodes along the key's path are reused; a new Node is created for
    every character that has no child yet. The value of the node reached after
    the last character is overwritten with `value`. An empty key stores the
    value on `root` itself.
    """
    current = root
    for symbol in string:
        branches = current.children
        if symbol not in branches:
            # First missing character: from here on every node is new.
            branches[symbol] = Node()
        current = branches[symbol]

    # The terminal node carries the payload.
    current.value = value

# The following two functions were written by the notebook authors to extend the trie with bulk lookup and in-place value updates.
    
def find_multiple(node, keys):
    """Look up several keys in the trie rooted at `node`.

    Args:
        node: root node of the trie; only its `children` mapping and `value`
            attribute (and those of its descendants) are read.
        keys: iterable of string keys.

    Returns:
        A list with one entry per key, in the order the keys were given. Each
        entry is the value stored for that key, or None when the key is not
        present in the trie.

    The previous implementation silently skipped characters that had no
    matching child and kept walking, so a missing key returned the value of
    whichever node the partial walk happened to stop on. Missing keys now
    yield None, which is what `find` returns for them.
    """
    values = []
    for key in keys:
        current = node
        for char in key:
            current = current.children.get(char)
            if current is None:
                # Key is not in the trie; stop instead of skipping the char.
                break
        values.append(None if current is None else current.value)
    return values

def update(node, key, difference):
    """Add `difference` to the value stored for `key` in the trie at `node`.

    Args:
        node: root node of the trie.
        key: string key whose value should change.
        difference: amount added (with `+=`) to the stored value.

    Raises:
        KeyError: if `key` has no path in the trie or no value is stored for
            it. The previous implementation skipped characters without a
            matching child and then modified whatever node it had reached,
            silently corrupting the value of a different key.
    """
    for char in key:
        child = node.children.get(char)
        if child is None:
            raise KeyError(key)
        node = child
    if node.value is None:
        # Path exists only as a prefix of other keys; nothing to update.
        raise KeyError(key)
    node.value += difference

In [3]:
# The sets exist only to cross-check results against the tries.
listtopics = set()
listplaces = set()
listwords = set()
# One slot per article, filled in reading order with
# [topics trie, list of places, words trie]; unused slots stay None.
article_tries = [None]*21812
# Number of <TOPICS> / <PLACES> sections that contain no <D> entry
cntnotop = 0
cntnoplc = 0
# Corpus-wide tries: topic -> count, place -> count, body word -> count
trieTopics = Node()
trieLoc = Node()
WordCount = Node()

articleCount = 0


def _count_key(trie, key):
    """Add one to the count stored for `key` in `trie`, inserting it with 1 if absent."""
    if find(trie, key) is None:
        insert(trie, key, 1)
    else:
        update(trie, key, 1)


for i in range(0, 22):
    # The corpus files are named reut2-000.sgm ... reut2-021.sgm.
    filename = 'reut2-%03d.sgm' % i
    path = filename
    # Read inside a context manager so each file handle is closed promptly.
    with open(path, 'rb') as file:
        data = file.read()
    # Every article is wrapped in a <REUTERS ...> ... </REUTERS> element.
    # `re` is the third-party `regex` module here, hence `overlapped=True`.
    x = re.findall(r'<REUTERS(.*?)</REUTERS>', data.decode("windows-1252"), re.DOTALL, overlapped=True)

    for article in x:
        # Created once per article (not per <TOPICS> match) so an article
        # without a <TOPICS> section gets an empty trie rather than reusing
        # the previous article's trie.
        article_topics = Node()
        for topic_section in re.findall(r'<TOPICS>(.*?)</TOPICS>', article, re.DOTALL, overlapped=True):
            # The capture group keeps the <D> tags out of the topic names.
            topics = re.findall(r'<D>(.*?)</D>', topic_section, re.DOTALL, overlapped=True)
            if len(topics) == 0:
                cntnotop = cntnotop + 1
            for topic in topics:
                _count_key(trieTopics, topic)
                _count_key(article_topics, topic)
                listtopics.add(topic)

        article_places = []
        for place_section in re.findall(r'<PLACES>(.*?)</PLACES>', article, re.DOTALL, overlapped=True):
            places = re.findall(r'<D>(.*?)</D>', place_section, re.DOTALL, overlapped=True)
            if len(places) == 0:
                cntnoplc = cntnoplc + 1
            for place in places:
                _count_key(trieLoc, place)
                article_places.append(place)
                listplaces.add(place)

        article_words = Node()
        for body_text in re.findall(r'<BODY>(.*?)</BODY>', article, re.DOTALL, overlapped=True):
            # Whitespace-separated, lower-cased tokens; punctuation is kept.
            for token in body_text.split():
                word = token.lower()
                _count_key(WordCount, word)
                _count_key(article_words, word)
                listwords.add(word)

        entry = [article_topics, article_places, article_words]
        if articleCount < len(article_tries):
            article_tries[articleCount] = entry
        else:
            # More articles than preallocated slots: grow instead of failing.
            article_tries.append(entry)
        articleCount += 1

# end of main for loop for all files

# The sets are turned into lists so they can be passed to find_multiple.
# NOTE(review): the order of list(set_of_str) depends on string hashing, which
# Python randomizes per process unless PYTHONHASHSEED is fixed. These lists
# are used further down as the column order of the network inputs/targets, so
# that order is not reproducible across kernel restarts -- consider sorted().
listtopics = list(listtopics)
listplaces = list(listplaces)
listwords = list(listwords)
#print(listtopics)
print(find_multiple(trieTopics, listtopics))
#print(listplaces)
#print(find_multiple(trieLoc, listplaces))
#print("Data objects with no entries for topics: " + str(cntnotop))
#print("Data objects with no entries for places: " + str(cntnoplc))

[137, 190, 2, 3, 54, 65, 120, 513, 2, 3, 634, 25, 306, 9, 552, 7, 801, 63, 35, 29, 32, 2, 3, 3, 3987, 18, 3, 44, 163, 8, 184, 2, 12, 27, 3, 7, 135, 35, 76, 1, 628, 1, 254, 78, 65, 1, 1, 75, 15, 3, 2, 21, 3, 4, 6, 305, 7, 1, 17, 15, 192, 2448, 1, 32, 2, 3, 7, 8, 4, 6, 116, 41, 130, 4, 3, 1, 112, 1, 51, 14, 1, 84, 1, 11, 4, 1, 51, 3, 33, 1, 17, 1, 63, 8, 25, 76, 37, 28, 1, 27, 114, 67, 3, 43, 27, 2, 2, 4, 17, 21, 69, 4, 217, 1, 35, 67, 2, 6, 145, 10]


In [4]:
# Spot checks of lookups on the tries follow.

print(find(node=trieLoc, key="usa"))

12542


In [5]:
# Count stored in the corpus-wide topic trie for "sugar".
print(find(node=trieTopics, key="sugar"))

184


In [6]:
# Bulk lookup: results come back in the order the keys are given.
print(find_multiple(node=trieLoc, keys=['usa', 'west-germany']))

[12542, 567]


In [7]:
# Corpus-wide counts for two sample body words.
print(find(node=WordCount, key='agriculture'))
print(find(node=WordCount, key='a'))

849
50596


In [8]:
# Number of distinct topics (120 on the full corpus).
print(len(listtopics))
trieVals = find_multiple(trieTopics, listtopics)
#print (trieVals)

# Disabled export of the topic counts to CSV, kept for reference only.
# NOTE(review): `topictrie` is not defined anywhere in the visible notebook.
#topictrie[0] = listtopics
#topictrie[1] = trieVals
#print (topictrie[1])
#with open('output_trie_topics.csv', 'w') as csvfile:
#    fieldnames = ['topic', 'value']
#    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#    writer.writeheader()
#    for i in range(len(listtopics)):
#        writer.writerow({'topic': topictrie[0][i], 'value': topictrie[1][i]})
#np.savetxt("output_trei_topics.csv", topictrie, delimiter=",")

# Length of one article's word vector: one entry per distinct word.
print(len(find_multiple(article_tries[0][2], listwords)))

120
104121


In [9]:
# Disabled code: the whole cell is one string literal, so nothing below runs;
# the notebook merely echoes the string as the cell's output.
# The quoted code would write one row per article to output_article_words.csv.
# NOTE(review): before re-enabling, note that (a) article_tries[i][0] is a
# Node as built by the parsing cell, and Node defines no iteration, so the
# first inner `for` would fail; (b) find_multiple is called on
# article_tries[i][2] before the `!= None` check, which fails for the unused
# (None) slots of article_tries.
'''
print (len(article_tries))
print (article_tries[1][0])
with open('output_article_words.csv', 'w') as csvfile:
    #fieldnames = ['article#', 'topics', 'places', listwords]
    #writer = csv.writer(csvfile)
    #writer.writeheader()
    for i in range(len(article_tries)):
        topicString = ''
        placeString = ''
        valueString = ''
        wordVals = find_multiple(article_tries[i][2], listwords)
        if article_tries[i] != None:
            for string in article_tries[i][0]:
                topicString +=  string + ' '
            for string in article_tries[i][1]:
                placeString +=  string + ' '
            for value in wordVals:
                valueString +=  str(value) + ', '
        csvfile.write(str(i) + ',' + topicString + ', ' + placeString + ', ' + valueString + '\n')
    '''

"\nprint (len(article_tries))\nprint (article_tries[1][0])\nwith open('output_article_words.csv', 'w') as csvfile:\n    #fieldnames = ['article#', 'topics', 'places', listwords]\n    #writer = csv.writer(csvfile)\n    #writer.writeheader()\n    for i in range(len(article_tries)):\n        topicString = ''\n        placeString = ''\n        valueString = ''\n        wordVals = find_multiple(article_tries[i][2], listwords)\n        if article_tries[i] != None:\n            for string in article_tries[i][0]:\n                topicString +=  string + ' '\n            for string in article_tries[i][1]:\n                placeString +=  string + ' '\n            for value in wordVals:\n                valueString +=  str(value) + ', '\n        csvfile.write(str(i) + ',' + topicString + ', ' + placeString + ', ' + valueString + '\n')\n    "

In [10]:
def weight_variable(shape, myname):
    """Create a tf.Variable of the given shape for use as a weight matrix.

    The initial value is drawn from a truncated normal distribution with a
    standard deviation of 0.1. `myname` is given to the initial-value op; the
    tf.Variable itself is created without an explicit name.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1, name=myname))


def bias_variable(shape, myname):
    """Create a tf.Variable of the given shape for use as a bias vector.

    Every element starts at 0.1. `myname` is given to the constant used as the
    initial value; the tf.Variable itself is created without an explicit name.
    """
    return tf.Variable(tf.constant(0.1, shape=shape, name=myname))

In [11]:
# NETWORK
# Two fully connected layers: word-count vector -> 50 hidden units -> one
# output per topic.
# NOTE(review): 104121 and 120 are hard-coded. They equal the vector lengths
# printed a few cells above (len of the per-article word vector and
# len(listtopics)) and must be changed by hand if the corpus changes.

# 1st layer
# NOTE(review): this rebinds `x`, which the parsing cell used for the list of
# article strings of the last file read.
x = tf.placeholder(tf.float32, shape=[None, 104121], name="x")
y_input = tf.placeholder(tf.float32, shape=[None, 120], name="y_input")

W = weight_variable([104121, 50], "W")
b = bias_variable([50], "b")
# Hidden activation: leaky ReLU of the affine transform x*W + b.
y_1 = tf.nn.leaky_relu(tf.add(tf.matmul(x, W), b))

# 2nd layer: linear output (no activation), exposed in the graph as "out".
W2 = weight_variable([50, 120], "W2")
b2 = bias_variable([120], "b2")
y_test = tf.add(tf.matmul(y_1, W2), b2, name="out")

In [None]:

# Loss, optimizer and metric ops for the network.

# Element-wise squared error between targets and network outputs.
ssd = tf.squared_difference(y_input, y_test)
# NOTE(review): `test` is not referenced anywhere else in the visible notebook.
test = tf.squeeze(ssd)
#divres = tf.div(test, ugh2)
# Despite its name this is the mean squared error, not a cross-entropy.
cross_entropy = tf.reduce_mean(ssd)  # switched ssd in for divres
tv = tf.trainable_variables()
# Pairs of (gradient of the loss w.r.t. variable, variable).
grads = list(zip(tf.gradients(cross_entropy, tv), tv))
# NOTE(review): the loop only rebinds `variable_3` to each gradient in turn;
# neither `grads` nor `variable_3` is used later in the visible notebook.
for grad, variables in grads:
    variable_3 = grad
tf.summary.scalar('loss', cross_entropy)
train_step = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(cross_entropy, name="gradDescent")
# switched adam to gradientdescent and removed epsilon of e-3

# Index of the largest output / largest target per row, and whether they agree.
myind = tf.argmax(y_test,1)
myindTrue = tf.argmax(y_input,1)
myacc = tf.equal(myind,myindTrue)

# Squared error clipped to [1e-7, 1e6] before thresholding.
some = tf.clip_by_value(ssd, clip_value_min=1e-7, clip_value_max=1e6)  # switched ssd in for divres
# NOTE(review): an earlier comment here said "changed 1e-5 to 1", but the
# threshold in the code is 1e-5 -- confirm which value is intended.
condition = tf.less(some, 1e-5)
# Selects ones where the condition holds and zeros elsewhere (same dtype as
# `condition`); the cast on the next line turns that into 1.0 / 0.0.
correct_prediction = tf.where(condition, tf.ones_like(condition), tf.zeros_like(condition))
# "Accuracy" here is the fraction of individual output elements whose clipped
# squared error is below the threshold, averaged over all rows and outputs.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="acc")
tf.summary.scalar('acc', accuracy)
merge = tf.summary.merge_all()
saver = tf.train.Saver()
tf.add_to_collection("optimizer", train_step)


In [None]:
# Train the network on a handful of articles and log validation metrics.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True


def _count_vector(trie, keys):
    """Return the counts stored in `trie` for `keys` as floats, 0.0 for absent keys."""
    return [0.0 if count is None else float(count)
            for count in find_multiple(trie, keys)]


with tf.Session(config=config) as sess:
    train_writer2 = tf.summary.FileWriter('validation', sess.graph)
    try:
        sess.run(tf.global_variables_initializer())

        # Input setup. The first `test_size` articles form the validation
        # set, the following `train_size` articles the training set. With the
        # values below this is article 0 for validation and articles 1-5 for
        # training, the same split the former hard-coded `i % 6` test made.
        train_size = 5  # 1234
        test_size = 1  # 26
        xans, yans = [], []    # training inputs / targets
        xans2, yans2 = [], []  # validation inputs / targets
        for i in range(train_size + test_size):
            # article_tries[i] is [topics trie, places list, words trie].
            word_counts = _count_vector(article_tries[i][2], listwords)
            topic_counts = _count_vector(article_tries[i][0], listtopics)
            if i < test_size:
                xans2.append(word_counts)
                yans2.append(topic_counts)
            else:
                xans.append(word_counts)
                yans.append(topic_counts)
        xans = np.array(xans, dtype=np.float64)
        yans = np.array(yans, dtype=np.float64)
        xans2 = np.array(xans2, dtype=np.float64)
        yans2 = np.array(yans2, dtype=np.float64)
        print (yans2.shape)

        validation_feed = {x: xans2, y_input: yans2}
        loss = 1e10  # lowest validation loss seen so far
        for epochs in range(50000):
            # Validation summary is written on every step.
            summary2, acc = sess.run([merge, accuracy], feed_dict=validation_feed)
            train_writer2.add_summary(summary2, epochs)
            if epochs % 100 == 0:
                train_accuracy, something, true_val, c, ind, ind2 = sess.run(
                    [accuracy, y_test, y_input, cross_entropy, myind, myindTrue],
                    feed_dict=validation_feed)
                if c < loss:
                    # Checkpoint whenever the validation loss improves.
                    saver.save(sess, './my_test_model', global_step=epochs)
                    loss = c
                # Fraction of outputs within 1e-5 absolute error (the TF
                # `accuracy` op thresholds the squared error instead).
                numpy_accuracy = (abs(something - true_val) <= 1e-5).mean()
                try:
                    # NOTE(review): both "training accuracy" figures are
                    # computed on the validation feed, not the training data.
                    print('step %d, training accuracy %g' % (epochs, train_accuracy))
                    print('step %d, training accuracy %g' % (epochs, numpy_accuracy))
                    # ind / ind2 hold one index per validation row; format the
                    # first explicitly instead of relying on implicit
                    # array-to-scalar conversion (deprecated in NumPy 1.25).
                    print('ind %d, index true %g' % (ind[0], ind2[0]))
                    print('validation accuracy %g' % acc)
                    print (something)
                    print(true_val)
                    print (c)
                except OSError:
                    # Presumably best-effort logging: a failing stdout should
                    # not abort training.
                    pass
            sess.run(train_step, feed_dict={x: xans, y_input: yans})
    finally:
        # Flush pending summaries and release the event file.
        train_writer2.close()

(1, 120)
step 0, training accuracy 0
step 0, training accuracy 0
ind 13, index true 38
validation accuracy 0
[[-28.004955   -36.05069    -39.54859     54.315006   -43.037235
   -0.62411064  41.20946    -13.107423     6.4129944   58.424145
  -28.280302     8.317188   -48.961704    76.446884    17.745432
   18.456127    11.046552   -22.714123    47.311596   -17.628038
  -30.585367   -26.024021    -9.144962   -35.689087   -16.983421
   19.775702   -28.402918   -55.63756    -64.623436   -45.54339
  -21.64617    -34.45556     -5.0017285   15.071354   -29.257824
   43.0074      24.334518   -12.5821705   44.428917     9.7956295
   44.53062      5.8382545   35.630676   -37.30334     54.442898
   38.77463     -4.5728207  -27.08182      4.8756275    6.895401
  -22.54658     19.933401   -15.4204445   -6.1629744   27.597912
   -5.2793875    8.19855     31.800007   -59.23969      7.4925656
   -6.5478363   10.117939   -27.3233     -17.006014    -1.2787085
   -0.21877614  22.83069    -34.624634   -42

step 400, training accuracy 0
step 400, training accuracy 0
ind 13, index true 38
validation accuracy 0
[[ -7.217411   -31.770046   -48.865845    21.26835    -39.048904
  -19.63005     22.07645    -21.323116    -6.493018    30.175007
   -8.385236    12.06858    -17.73897     65.24854     -6.9264855
   16.513475    -2.4067469  -15.365644    15.031938     0.24078074
  -28.38701    -27.424725    -8.514429   -21.057318     3.0856087
   30.613695   -13.6502     -17.020988   -45.930702   -17.382935
  -11.496711   -20.389402    -3.2066057   10.028146    -7.118074
   27.165688    11.286044   -15.332037    35.192413    -3.6082878
   25.732311     9.787906    19.795368   -53.35007     35.518448
   15.681291    -0.11028124 -24.042841    35.13948     -6.65495
   -8.243175    15.205106    -7.330502   -11.925693    10.849756
    0.90104854  -4.071379    26.741333   -43.718796     8.950721
   -6.294257    30.003279   -38.581196   -23.061457    17.012436
   -0.408923     7.6027756  -26.333916   -35.55

step 800, training accuracy 0
step 800, training accuracy 0
ind 13, index true 38
validation accuracy 0
[[ -7.6396     -32.584488   -49.69337     21.857725   -39.489616
  -19.597107    22.944754   -21.842455    -6.734906    30.933182
   -9.570794    11.18803    -17.468567    66.07489     -6.8890224
   18.126411    -2.5111558  -16.197018    16.159956    -0.35115814
  -28.94453    -26.947578    -8.463364   -21.691105     3.0249786
   30.826418   -13.952406   -16.520657   -46.274353   -18.864311
  -12.264576   -20.37561     -3.6793163   10.436292    -7.263895
   26.302738    11.351015   -15.123293    35.897686    -3.6609516
   26.15624      9.579005    20.455149   -53.813343    35.144325
   16.176174    -0.15431778 -24.268278    35.853283    -6.338696
   -7.927994    14.972657    -6.532302   -12.635058    11.592648
    1.7636684   -3.583511    27.267887   -45.026        9.707953
   -6.5417585   30.104929   -39.421803   -22.480173    17.642569
   -0.76684946   7.6362557  -26.963339   -35.7

In [None]:
# Topic names in the order used to build the topic-count vectors fed as
# training targets (find_multiple(..., listtopics) in the training cell).
print(listtopics)