In [7]:
import re
import os
from os.path import join, dirname
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from glob import glob
stop_words = set(stopwords.words('english'))

def fix(fname):
    reg = re.compile('^[a-zA-Z\-]*\:')
    #fname = '20news-bydate/20news-bydate-train/alt.atheism/49960'
    outfname = 'fixed-'+fname
    os.makedirs(dirname(outfname), exist_ok=True)
    lines = []
    with open(fname, errors='ignore') as f:        
        for line in f:
            line = line.strip()
            if line != '' and reg.match(line) is None and not line.startswith('>'):
                lines.append(line)
    s = '\n'.join(lines)
    clean_text = clean(s)
    with open(outfname, 'w') as outf:
        outf.write(' '.join(clean_text))
    
                    
                    
def clean(text):   
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    words = [word for word in words if 3 <= len(word) <=11]
    # filter out stop words
    words = [w for w in words if not w in stop_words]
    return words
        

In [2]:
from glob import glob
files = glob("20news-bydate/20news-bydate-*/**/*")
for i, file in enumerate(files):
    if(i%1000 == 0):
        print(file, i, len(files))
    fix(file)

20news-bydate/20news-bydate-test/talk.politics.mideast/77197 0 18846
20news-bydate/20news-bydate-test/comp.sys.mac.hardware/52243 1000 18846
20news-bydate/20news-bydate-test/comp.os.ms-windows.misc/10160 2000 18846
20news-bydate/20news-bydate-test/sci.crypt/15822 3000 18846
20news-bydate/20news-bydate-test/rec.motorcycles/104861 4000 18846
20news-bydate/20news-bydate-test/comp.sys.ibm.pc.hardware/60940 5000 18846
20news-bydate/20news-bydate-test/talk.politics.guns/54696 6000 18846
20news-bydate/20news-bydate-test/misc.forsale/76582 7000 18846
20news-bydate/20news-bydate-train/talk.politics.mideast/75958 8000 18846
20news-bydate/20news-bydate-train/comp.sys.mac.hardware/51831 9000 18846
20news-bydate/20news-bydate-train/rec.sport.baseball/104442 10000 18846
20news-bydate/20news-bydate-train/rec.sport.hockey/54723 11000 18846
20news-bydate/20news-bydate-train/sci.crypt/15270 12000 18846
20news-bydate/20news-bydate-train/talk.politics.misc/178571 13000 18846
20news-bydate/20news-bydate-tr

In [23]:
from os.path import basename, dirname
class Vectorizer:
    def __init__(self, base_folder, word_map=None, word_list=None):
        self.base_folder = base_folder
        if(word_map is None or word_list is None):
            self.word_map, self.word_list = self.build_map()
        else: 
            self.word_map, self.word_list = word_map, word_list
        print(len(self.word_list))
    
    def build_map(self):
        files = glob(self.base_folder+'**/*')
        words = {}
        for file in files:
            with open(file) as f:
                tokens = f.read().strip().split()
                for t in tokens:
                    old = words.get(t, 0)
                    words[t] = old+1
        words = {k: v for k, v in words.items() if v > 50}
        return {w: i for i, (w, c) in enumerate(words.items())}, [k for k, v in words.items()]
    
    def get_topic(self, file):
        return basename(dirname(file))
    
    def vectorize(self, outfname):
        files = glob(self.base_folder+'**/*')
        nwords = len(self.word_map)
        
        def build_vector(tokens):
            tmp = [0]*nwords
            for t in tokens:
                if t in self.word_map:
                    tmp[self.word_map[t]] += 1
            return tmp
        with open(outfname, 'w') as outf:  
            outf.write(','.join(self.word_list+['x_class']))
            outf.write('\n')
            for i, file in enumerate(files):
                if i%1000 == 0:
                    print(i, file, len(files))
                with open(file) as f:
                    tokens = f.read().strip().split()
                    vector = build_vector(tokens)
                    vector.append(self.get_topic(file))
                    outf.write(','.join(map(str, vector)))
                    outf.write('\n')
            
                
        
        
        

         

In [25]:
train = Vectorizer('fixed-20news-bydate/20news-bydate-train/')

3611


In [26]:
test = Vectorizer('fixed-20news-bydate/20news-bydate-test/', word_map=train.word_map, word_list=train.word_list)

3611


In [28]:
train.vectorize('news_topic_train.csv')

0 fixed-20news-bydate/20news-bydate-train/talk.politics.mideast/75895 11314
1000 fixed-20news-bydate/20news-bydate-train/rec.autos/101605 11314
2000 fixed-20news-bydate/20news-bydate-train/alt.atheism/51252 11314
3000 fixed-20news-bydate/20news-bydate-train/comp.os.ms-windows.misc/9790 11314
4000 fixed-20news-bydate/20news-bydate-train/rec.sport.hockey/53977 11314
5000 fixed-20news-bydate/20news-bydate-train/sci.med/58859 11314
6000 fixed-20news-bydate/20news-bydate-train/rec.motorcycles/103222 11314
7000 fixed-20news-bydate/20news-bydate-train/comp.graphics/38563 11314
8000 fixed-20news-bydate/20news-bydate-train/comp.sys.ibm.pc.hardware/60199 11314
9000 fixed-20news-bydate/20news-bydate-train/talk.politics.guns/54460 11314
10000 fixed-20news-bydate/20news-bydate-train/soc.religion.christian/20872 11314
11000 fixed-20news-bydate/20news-bydate-train/talk.religion.misc/82779 11314


In [30]:
test.vectorize('news_topic_test.csv')

0 fixed-20news-bydate/20news-bydate-test/talk.politics.mideast/77197 7532
1000 fixed-20news-bydate/20news-bydate-test/comp.sys.mac.hardware/52243 7532
2000 fixed-20news-bydate/20news-bydate-test/comp.os.ms-windows.misc/10160 7532
3000 fixed-20news-bydate/20news-bydate-test/sci.crypt/15822 7532
4000 fixed-20news-bydate/20news-bydate-test/rec.motorcycles/104861 7532
5000 fixed-20news-bydate/20news-bydate-test/comp.sys.ibm.pc.hardware/60940 7532
6000 fixed-20news-bydate/20news-bydate-test/talk.politics.guns/54696 7532
7000 fixed-20news-bydate/20news-bydate-test/misc.forsale/76582 7532


In [19]:
import pandas as pd

In [31]:
df = pd.read_csv('data/news_topic_test.csv.gz')

In [32]:
df

Unnamed: 0,article,hernlem,brad,writes,lebanese,resistance,forces,bomb,israeli,occupation,...,missions,baalke,adaptec,firearm,larson,lunar,orbiter,clh,wolverine,x_class
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
1,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
2,1,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
3,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
5,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
6,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
7,1,0,0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
8,0,0,0,1,0,1,7,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
9,2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast


In [21]:
df

Unnamed: 0,article,hernlem,brad,writes,lebanese,resistance,forces,bomb,israeli,occupation,...,missions,baalke,adaptec,firearm,larson,lunar,orbiter,clh,wolverine,x_class
0,1,5,2,1,4,2,2,1,6,3,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
3,2,0,0,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
5,1,0,0,2,0,0,0,0,7,2,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
7,0,0,0,1,1,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
9,2,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,talk.politics.mideast
