In [1]:
import re
import os
import nltk
import numpy as np
import pandas as pd
import data_helpers as dh
from textblob import Word
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk import download as nltk_download
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups

stemmer = SnowballStemmer("english")
stop_words=stopwords.words('english')
nltk_download('reuters')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package reuters to /home/silas/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/silas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/silas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/silas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### load data (labeled)

#### news20
- https://archive.ics.uci.edu/ml/datasets/Twenty+Newsgroups

In [2]:
def load_newsgroups():
    
    categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
    newsgroups = fetch_20newsgroups(categories=categories)
    y_true= [newsgroups.target_names[idx] for idx in newsgroups.target]
    # y_true = newsgroups.target
    data = pd.DataFrame({'text': newsgroups.data, 'labels':y_true})
    
    return data

#### Reuters
- https://archive.ics.uci.edu/ml/datasets/Reuters-21578+Text+Categorization+Collection
- (description: https://martin-thoma.com/nlp-reuters/)
- http://www.nltk.org/book/ch02.html

In [3]:
# http://www.vision.caltech.edu/Image_Datasets/Caltech101/
# https://martin-thoma.com/nlp-reuters/

n_classes = 90
labels = reuters.categories()


# def load_reuters(config={}):
def load_reuters():
    """
    Load the Reuters dataset.

    Returns
    -------
    Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
     
    train_lst = [reuters.raw(doc_id) for doc_id in train]
    test_lst = [reuters.raw(doc_id) for doc_id in test]
    text = train_lst + test_lst
    
    train_labels = [reuters.categories(doc_id) for doc_id in train]
    test_labels = [reuters.categories(doc_id) for doc_id in test]
    labels = train_labels + test_labels
    
    data = pd.DataFrame({'text':text, 'labels':labels})
    
    return data


#### BBC news
- http://mlg.ucd.ie/datasets/bbc.html

In [4]:
def load_bbc():
    
    os.chdir('/home/silas/MIDS/W266/W266-final-project/')
    cwd = os.getcwd()
    os.chdir(cwd+"/Input/bbc")
    folders = ["business","entertainment","politics","sport","tech"]
    
    x = []
    y = []

    for i in folders:
        files = os.listdir(i)
        for text_file in files:
            file_path = i + "/" +text_file
            # print("reading file:", file_path)
            with open(file_path, 'r') as f:
                try:
                    data = f.readlines()
                except:
                    pass
            data = ' '.join(data)
            x.append(data)
            y.append(i)
   

    df_bbc = pd.DataFrame({'text': x, 'label': y})

    return df_bbc
#        print('writing csv flie ...')
#        df_bbc.to_csv('../bbc_dataset.csv', index=False)

#### Output

In [5]:
df_newsgroups = load_newsgroups()
df_reuters = load_reuters()
df_bbc = load_bbc()

FileNotFoundError: [Errno 2] No such file or directory: '/home/silas/MIDS/W266/W266-final-project/'

### Pre-processing

In [None]:
def tokenizer(text):
    words = nltk.word_tokenize(text)
    return words

In [None]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [None]:
def pre_processing_steps(df):
    
    """
    Tokenizes, removes stop words, stems (requires df "text" labeled)
    Args: dataframe
    Returns: dataframe with processed text
    """
    filtered_tokens=[]
    for doc in df.text:
        token = nltk.word_tokenize(doc)
        filtered_tokens_temp=[]
        for tok in token:
            if tok not in stop_words:
                clean = clean_str(tok)
                if re.search('[a-zA-Z]', clean):
                    filtered_tokens_temp.append(stemmer.stem(clean))
        filtered_tokens.append(", ".join(filtered_tokens_temp))
    df['processed_text'] = filtered_tokens    
    return df

### Save

In [None]:
for df,df_name in zip([df_newsgroups,df_reuters,df_bbc],['newsgroups','reuters','bbc']):
    pre_processing_steps(df).to_pickle("./{}.pkl".format(df_name))

### Load

In [None]:
df_newsgroups = pd.read_pickle("./newsgroups.pkl")
df_reuters = pd.read_pickle("./reuters.pkl")
df_bbc = pd.read_pickle("./bbc.pkl")

### Train, test, split

In [None]:
def train_test_splitter(df,holdout=.2):    
    mask = np.random.rand(len(df)) < 1 - holdout
    train,test = df[mask],df[~mask]
    return train, test

In [None]:
train, test = train_test_splitter(df)

In [None]:
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.feature_extraction.text import TfidfVectorizer