# Import Statements

In [1]:
import nltk
from nltk.stem import PorterStemmer

import pandas as pd
import numpy as np
import re
import string
import csv
import os

In [2]:
# Module-level state shared by the parsing functions below.
topics = list()       # topic labels of the article currently being read
prim_key = -1         # NEWID of the article currently being read (-1 until one is seen)
vocabDict = dict()    # word list per topic, filled in by project_add_to_dict

# Create required directories

In [3]:
# Make sure both output folders exist before any document or vocabulary
# file is written into them.
for directory in ("files", "vocab"):
    if not os.path.exists(directory):
        os.makedirs(directory)

# Import topics data

In [4]:
# Read the topic label file; header=None makes pandas treat the first line as
# data instead of column names.
data_topics_csv = pd.read_csv("reuters/all-topics-strings.lc.txt", header = None)
# Kept as an np.matrix on purpose: later cells compare against it with == and
# read single cells from its rows as topic[0,0].
data_topics = np.matrix(data_topics_csv)

# Trim surrounding whitespace from every label in column 0 so that equality
# tests against the topic names parsed from the articles can match.
for ptr in range(0, data_topics.shape[0]):
    data_topics[ptr, 0] = data_topics[ptr, 0].strip()

In [5]:
#print data_topics
#for topic in data_topics[:,0]:
#    vocab_file = open()

# Extract Tag method

In [6]:
def project_extract_tag(temp_line, tag_name):
    """Pull the value(s) of one tag out of a single line of an .sgm file.

    Parameters
    ----------
    temp_line : str
        The line holding the tag (callers pass it already stripped).
    tag_name : str
        'ID' to read the NEWID attribute of a ``<REUTERS ...>`` line, or
        'TOPICS' to read the ``<D>...</D>`` entries of a ``<TOPICS>`` line.

    Returns
    -------
    str, list of str, or None
        The NEWID value for 'ID'; a list of the non-empty topic names for
        'TOPICS' (empty list when the tag has no entries); None for any
        other ``tag_name``.

    Raises
    ------
    IndexError
        For 'ID' when the line contains no ``NEWID="`` attribute.
    """
    if tag_name == 'ID':
        # Stop at the closing quote so the result stays correct even when
        # other attributes follow NEWID on the same line.
        return temp_line.split('NEWID="')[1].split('"')[0]

    if tag_name == 'TOPICS':
        # Turn the wrapper into <D> separators and drop every closing tag,
        # leaving only "<D>name<D>name..." to split on.
        flattened = (temp_line.replace('<TOPICS>', '<D>')
                              .replace('</TOPICS>', '</D>')
                              .replace('</D>', ''))
        # A real list (not a lazy filter object) so callers can take len()
        # and iterate it more than once on any Python version.
        return [name for name in flattened.split('<D>') if name]

    return None


# Text Normalize method

In [7]:
def project_text_normalize(temp_line):
    """Turn a block of raw text into a list of normalised, stemmed tokens.

    Steps, in order: lower-case, drop ``<...>`` tags, replace digit runs with
    'number', URLs with 'httpaddr', e-mail addresses with 'emailaddr' and
    dollar signs with 'dollar', remove punctuation, split on whitespace,
    strip any remaining non-alphanumeric characters, discard empty tokens
    and stem each token with nltk's PorterStemmer.

    Parameters
    ----------
    temp_line : str
        The text to normalise; may span several lines.

    Returns
    -------
    list of str
        The stemmed tokens in their original order (empty for empty input).
    """
    # Lower-case first so every later step only deals with one case.
    temp_line = temp_line.lower()

    # Strip tags: '<', anything that is not an angle bracket, then '>'.
    # A space is substituted so the words on either side are not glued
    # together ("foo<br>bar" must not become "foobar").
    temp_line = re.sub(r'<[^<>]+>', ' ', temp_line)

    # Collapse every run of digits into the single token 'number'.
    temp_line = re.sub(r'[0-9]+', 'number', temp_line)

    # Replace URLs and e-mail addresses by placeholder tokens.
    temp_line = re.sub(r'(http|https)://\S*', 'httpaddr', temp_line)
    temp_line = re.sub(r'\S+@\S+', 'emailaddr', temp_line)

    # Spell out dollar signs before punctuation removal would delete them.
    temp_line = re.sub(r'[$]+', 'dollar', temp_line)

    # Remove punctuation. A regex character class is used instead of
    # str.translate(None, ...) because that call form exists only for
    # Python 2 byte strings.
    temp_line = re.sub('[' + re.escape(string.punctuation) + ']', '', temp_line)

    # Split on ANY whitespace: splitting on ' ' alone keeps newlines inside
    # tokens, which then fuses the last word of one line with the first
    # word of the next once non-alphanumerics are stripped.
    words = temp_line.split()

    # Drop whatever non-alphanumeric characters are left, then empty tokens.
    words = [re.sub(r'[^a-zA-Z0-9]', '', word) for word in words]
    words = [word for word in words if word]

    # One stemmer instance is enough for the whole token list.
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]

# Add to dictionary method

In [8]:
def project_add_to_dict(vocab, topics):
    """Merge a document's words into the vocabulary of each of its topics.

    For every topic the words of ``vocab`` that are not yet recorded are
    appended, in first-seen order, to that topic's list in the module-level
    ``vocabDict``. Entries are keyed by the topic's row index in
    ``data_topics`` converted to ``str`` (the key project_save_vocab looks up).

    Parameters
    ----------
    vocab : iterable of str
        Tokens of one document; duplicates are allowed.
    topics : iterable of str
        Topic labels of that document. Labels that do not occur in
        ``data_topics`` are skipped.

    Returns
    -------
    None
        ``vocabDict`` is updated in place.
    """
    global vocabDict

    # Materialise once: vocab is walked once per topic.
    vocab = list(vocab)

    for topic in topics:
        rows = np.where(data_topics == topic)[0]
        if len(rows) == 0:
            # Label is not in the topic list, so there is no row to key it under.
            continue

        # Read and write with the SAME key type. Looking the entry up with
        # the integer index while storing it under str(index) never finds
        # the existing list, so every document would replace the topic's
        # vocabulary instead of extending it.
        key = str(rows[0])
        word_list = vocabDict.get(key)
        if word_list is None:
            word_list = []

        # Set for O(1) membership tests; the list keeps first-seen order.
        seen = set(word_list)
        for word in vocab:
            if word not in seen:
                seen.add(word)
                word_list.append(word)

        vocabDict[key] = word_list
        

# Save document method

In [9]:
def project_save_document(doc_id, vocab, topics):
    """Record one document: update the topic vocabularies and write its CSV.

    The file ``files/doc_<doc_id>.csv`` gets two rows: the document's tokens,
    then a 0/1 vector with one entry per row of ``data_topics`` where 1 marks
    the topics assigned to the document.

    Parameters
    ----------
    doc_id : str or int
        Identifier used in the output file name.
    vocab : iterable of str
        Tokens of the document.
    topics : iterable of str
        Topic labels of the document.

    Returns
    -------
    None
    """
    # Both arguments are used more than once below, so pin them down as lists.
    vocab = list(vocab)
    topics = list(topics)

    project_add_to_dict(vocab, topics)

    # Build the label vector before touching the file system, so a failure
    # here cannot leave a half-written document behind.
    # Comparing the (rows x 1) topic table with the list of labels
    # broadcasts, and np.where(...)[0] yields every row matching any label.
    Y = np.zeros((data_topics.shape[0], 1))
    Y[np.where(data_topics == topics)[0]] = 1
    Y = Y.flatten()

    # 'with' closes the file even if a write raises.
    with open('files/doc_' + str(doc_id) + '.csv', 'w') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(vocab)
        writer.writerow(Y)
    

# Read datafile method

In [10]:
def project_read_file(temp_fileline):
    """Scan the lines of one .sgm file and save every article that has topics.

    The scan keeps the module-level ``prim_key`` (from the latest
    ``<REUTERS`` line) and ``topics`` (from the latest ``<TOPICS>`` line).
    When a ``<BODY>`` is reached while ``topics`` is non-empty, the text up
    to the line containing ``</BODY>`` is collected, every occurrence of
    'Reuter' is removed, the text is normalised with project_text_normalize
    and handed to project_save_document; ``topics`` is then reset.

    Parameters
    ----------
    temp_fileline : list of str
        All lines of the file, as returned by ``readlines()``.

    Returns
    -------
    None
    """
    global prim_key
    global topics

    total = len(temp_fileline)
    line_num = 0

    # An explicit index (rather than "for ... in range") lets the body
    # reader below really advance the scan instead of re-visiting the
    # body lines afterwards.
    while line_num < total:
        temp_line = temp_fileline[line_num]
        line_num += 1

        if temp_line.startswith('<REUTERS'):
            prim_key = project_extract_tag(temp_line.strip(), 'ID')

        if temp_line.startswith('<TOPICS>'):
            topics = list(project_extract_tag(temp_line.strip(), 'TOPICS'))

        # Articles without topics are not saved, so their bodies are skipped.
        if len(topics) == 0 or '<BODY>' not in temp_line:
            continue

        after_open = temp_line.split('<BODY>', 1)[1]
        if '</BODY>' in after_open:
            # Opening and closing tag share a line: the body is what lies between.
            body = after_open.split('</BODY>', 1)[0]
        else:
            # Collect whole lines up to (not including) the line that holds
            # </BODY>. The bounds check stops cleanly at end of file when
            # the closing tag is missing; what was read is still used.
            pieces = [after_open]
            while line_num < total and '</BODY>' not in temp_fileline[line_num]:
                pieces.append(temp_fileline[line_num])
                line_num += 1
            body = ''.join(pieces)

        body = body.replace('Reuter', '')
        vocab = project_text_normalize(body)
        project_save_document(prim_key, vocab, topics)
        # Reset so nothing more is saved until the next <TOPICS> line.
        topics = []

    

# Save Vocab Dictionary

In [11]:
def project_save_vocab():
    """Write one vocabulary CSV per topic into the ``vocab`` folder.

    For every row of ``data_topics`` the file ``vocab/Vocab<topic>.csv`` is
    written with a single row: the topic name followed by the words stored
    for it in ``vocabDict`` (just the topic name when no words were stored).
    A confirmation line is printed once all files are written.

    Returns
    -------
    None
    """
    # Iterating a column of the np.matrix yields 1x1 matrices, hence [0,0].
    for topic in data_topics[:,0]:
        topic_name = topic[0,0]

        # vocabDict is keyed by the topic's row index as a string.
        row_index = np.where(data_topics == topic)[0][0]
        word_list = vocabDict.get(str(row_index))

        if word_list is not None:
            row = [topic_name] + list(word_list)
        else:
            row = [topic_name]

        # 'with' closes the file even if the write raises.
        with open('vocab/Vocab' + str(topic_name) + '.csv', 'w') as vocab_file:
            csv.writer(vocab_file).writerow(row)

    # Call form with a single argument prints the same text on Python 2 and 3.
    print("Created Vocab Files.")

# Main method

In [12]:
if __name__ == "__main__":
    # The 22 input files are numbered reut2-000.sgm through reut2-021.sgm.
    files = ['reut2-%03d.sgm' % file_no for file_no in range(22)]
    # For a quick trial run, process only the first file:
    #files=['reut2-000.sgm']
    for temp_filename in files:
        # Call form with a single argument prints the same text on Python 2 and 3.
        print("Parsing File: " + str(temp_filename))
        # 'with' releases the handle even if reading fails; the lines are
        # fully loaded before parsing starts, so the file can be closed first.
        with open("reuters/" + temp_filename, 'r') as temp_file:
            temp_fileline = temp_file.readlines()
        project_read_file(temp_fileline)

    project_save_vocab()

Parsing File: reut2-000.sgm
Parsing File: reut2-001.sgm
Parsing File: reut2-002.sgm
Parsing File: reut2-003.sgm
Parsing File: reut2-004.sgm
Parsing File: reut2-005.sgm
Parsing File: reut2-006.sgm
Parsing File: reut2-007.sgm
Parsing File: reut2-008.sgm
Parsing File: reut2-009.sgm
Parsing File: reut2-010.sgm
Parsing File: reut2-011.sgm
Parsing File: reut2-012.sgm
Parsing File: reut2-013.sgm
Parsing File: reut2-014.sgm
Parsing File: reut2-015.sgm
Parsing File: reut2-016.sgm
Parsing File: reut2-017.sgm
Parsing File: reut2-018.sgm
Parsing File: reut2-019.sgm
Parsing File: reut2-020.sgm
Parsing File: reut2-021.sgm
Created Vocab Files.
