In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import re
import xml.sax.saxutils as saxutils

from bs4 import BeautifulSoup

from gensim.models.word2vec import Word2Vec

from multiprocessing import cpu_count

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

from pandas import DataFrame

from sklearn.model_selection import train_test_split
import random
import numpy as np
import pandas as pd

# General constants (modify them according to you environment)

In [3]:
# Set Numpy random seed
random.seed(1000)

# Newsline folder and format
data_folder = './reuters21578/'

sgml_number_of_files = 22
sgml_file_name_template = 'reut2-NNN.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}

# Word2Vec number of features
num_features = 500
# Limit each newsline to a fixed number of words
document_max_num_words = 100
# Selected categories
selected_categories = ['pl_usa', 'to_earn', 'to_acq', 'pl_uk', 'pl_japan', 'pl_canada', 'to_money-fx', 'to_crude', 'to_grain', 'pl_west-germany']


# Prepare documents and categories

In [4]:
# Create category dataframe

# Read all categories
category_data = []

for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0], 
                                  0])

# Create category dataframe
news_categories = DataFrame(data=category_data, columns=['Name', 'Type', 'Newslines'])

In [5]:
news_categories

Unnamed: 0,Name,Type,Newslines
0,to_acq,Topics,0
1,to_alum,Topics,0
2,to_austdlr,Topics,0
3,to_austral,Topics,0
4,to_barley,Topics,0
...,...,...,...
667,ex_stse,Exchanges,0
668,ex_tose,Exchanges,0
669,ex_tse,Exchanges,0
670,ex_wce,Exchanges,0


In [6]:
def update_frequencies(categories):
    for category in categories:
        idx = news_categories[news_categories.Name == category].index[0]
        f = news_categories._get_value(idx, 'Newslines')
        news_categories._set_value(idx, 'Newslines', f+1)
    
def to_category_vector(categories, target_categories):
    vector = np.zeros(len(target_categories)).astype(np.float32)
    
    for i in range(len(target_categories)):
        if target_categories[i] in categories:
            vector[i] = 1.0
    
    return vector

In [7]:
# Parse SGML files
document_X = {}
document_Y = {}

def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)

# Iterate all files
for i in range(sgml_number_of_files):
    if i < 10:
        seq = '00' + str(i)
    else:
        seq = '0' + str(i)
        
    file_name = sgml_file_name_template.replace('NNN', seq)
    print('Reading file: %s' % file_name)
    
    with open(data_folder + file_name, 'rb') as file:
        content = BeautifulSoup(file.read().lower())
        
        for newsline in content('reuters'):
            document_categories = []
            
            # News-line Id
            document_id = newsline['newid']
            
            # News-line text
            document_body = strip_tags(str(newsline('text')[0].body)).replace('reuter\n&#3;', '')
            document_body = unescape(document_body)
            
            # News-line categories
            topics = newsline.topics.contents
            places = newsline.places.contents
            people = newsline.people.contents
            orgs = newsline.orgs.contents
            exchanges = newsline.exchanges.contents
            
            for topic in topics:
                document_categories.append('to_' + strip_tags(str(topic)))
                
            for place in places:
                document_categories.append('pl_' + strip_tags(str(place)))
                
            for person in people:
                document_categories.append('pe_' + strip_tags(str(person)))
                
            for org in orgs:
                document_categories.append('or_' + strip_tags(str(org)))
                
            for exchange in exchanges:
                document_categories.append('ex_' + strip_tags(str(exchange)))
                
            # Create new document    
            update_frequencies(document_categories)
            
            document_X[document_id] = document_body
            document_Y[document_id] = to_category_vector(document_categories, selected_categories)

Reading file: reut2-000.sgm
Reading file: reut2-001.sgm
Reading file: reut2-002.sgm
Reading file: reut2-003.sgm
Reading file: reut2-004.sgm
Reading file: reut2-005.sgm
Reading file: reut2-006.sgm
Reading file: reut2-007.sgm
Reading file: reut2-008.sgm
Reading file: reut2-009.sgm
Reading file: reut2-010.sgm
Reading file: reut2-011.sgm
Reading file: reut2-012.sgm
Reading file: reut2-013.sgm
Reading file: reut2-014.sgm
Reading file: reut2-015.sgm
Reading file: reut2-016.sgm
Reading file: reut2-017.sgm
Reading file: reut2-018.sgm
Reading file: reut2-019.sgm
Reading file: reut2-020.sgm
Reading file: reut2-021.sgm


In [8]:
type(document_X)

dict

In [9]:
document_Y

{'1': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '2': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '3': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '4': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '5': array([1., 0., 0., 0., 0., 0., 0., 0., 1., 0.], dtype=float32),
 '6': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], dtype=float32),
 '7': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '8': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '9': array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '10': array([1., 0., 1., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '11': array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '12': array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '13': array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '14': array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 '15': array([1

# Top 20 categories (by number of newslines)

In [10]:
news_categories.sort_values(by='Newslines', ascending=False, inplace=True)
news_categories.head(20)

Unnamed: 0,Name,Type,Newslines
296,pl_usa,Places,12542
35,to_earn,Topics,3987
0,to_acq,Topics,2448
293,pl_uk,Places,1489
219,pl_japan,Places,1138
166,pl_canada,Places,1104
73,to_money-fx,Topics,801
28,to_crude,Topics,634
45,to_grain,Topics,628
302,pl_west-germany,Places,567


In [11]:
header = ['text'] + selected_categories

In [12]:
Y = pd.DataFrame(np.array(list(document_Y.values()))[:,:10], columns=['pl_usa', 'to_earn', 'to_acq', 'pl_uk', 'pl_japan', 'pl_canada', 'to_money-fx', 'to_crude', 'to_grain', 'pl_west-germany'])
Y['id']=Y.index

In [13]:
X = pd.DataFrame(np.array(list(document_X.values())), columns=['text'])
X['id']=X.index

In [14]:
X.columns

Index(['text', 'id'], dtype='object')

In [15]:
df=pd.merge(left=X, right=Y, on='id').drop('id', axis=1)

In [17]:
df['y']=df.iloc[:,1:10].sum(axis=1)

In [18]:
df.loc[df['y'] != 0,:].iloc[:,:10].head(5)

Unnamed: 0,text,pl_usa,to_earn,to_acq,pl_uk,pl_japan,pl_canada,to_money-fx,to_crude,to_grain
0,showers continued throughout the week in\nthe ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,standard oil co and bp north america\ninc said...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,texas commerce bancshares inc's texas\ncommerc...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bankamerica corp is not under\npressure to act...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,the u.s. agriculture department\nreported the ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [21]:
df.loc[df['y'] != 0,:].iloc[:2000,:10].to_csv('reuters_10class_2000.csv', index=False)