In [1]:
import re
import os
import io
import nltk
import requests
import numpy as np
import pandas as pd
# import data_helpers as dh
from textblob import Word
from zipfile import ZipFile
from nltk.tag import pos_tag
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk import download as nltk_download
from text_processing import preprocess_spacy, clean_str
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups

# Fetch the NLTK resources *before* anything reads them: stopwords.words()
# below raises LookupError on a machine where the 'stopwords' corpus has not
# been downloaded yet. nltk.download is a no-op for packages already present.
nltk.download('reuters')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Shared helpers used by pre_processing_steps().
stemmer = SnowballStemmer("english")
stop_words = stopwords.words('english')

# Directory the datasets are read from and the processed pickles are written to.
# Must end with a slash: it is concatenated with file names as a plain string.
full_data_path = '/home/silas/final_project/Data/'

[nltk_data] Downloading package reuters to /home/silas/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/silas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/silas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/silas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Base data directory (trailing slash required: file names are appended to it
# with plain string concatenation).
full_data_path = '/home/silas/final_project/Data/'

### load data (labeled)

#### climate data

In [3]:
def load_climate(path='/home/silas/final_project/Data/ieapandm.json'):
    """
    Load the climate JSON file and label every row as class 1.

    Args:
        path: location of the JSON file; defaults to the original
            hard-coded location.

    Returns:
        DataFrame with two columns: 'texts' (the file's 'text' field) and
        'labels' (the constant 1 for every row).
    """
    # `orient` is passed by keyword and spelled 'columns' (the documented
    # value). NOTE(review): recent pandas releases accept only the path
    # positionally, so the former positional 'column' argument fails there.
    raw = pd.read_json(path, orient='columns')

    # Keep only the text, attach the constant label, and rename both columns
    # to the names shared by all loaders in this notebook.
    data = (
        raw[['text']]
        .assign(label=1)
        .rename(columns={'text': 'texts', 'label': 'labels'})
    )

    return data

#### spam

 - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [4]:
def load_spam():
    """
    Download the UCI SMS Spam Collection and return it as a DataFrame.

    Returns:
        DataFrame with 'texts' (the message body) and 'labels' (integer
        category codes of the first tab-separated field; pandas assigns codes
        in sorted category order, so presumably ham -> 0 and spam -> 1).

    Raises:
        requests.HTTPError: if the download does not return a success status.
    """
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'

    # Fail loudly on a bad response instead of handing an HTML error page to
    # ZipFile, and do not hang forever on a stalled connection.
    response = requests.get(zip_url, timeout=60)
    response.raise_for_status()

    # Close the archive as soon as the single member has been read.
    with ZipFile(io.BytesIO(response.content)) as archive:
        raw = archive.read('SMSSpamCollection')

    # Drop every non-ASCII character, then split into non-empty lines.
    text = raw.decode().encode('ascii', errors='ignore').decode()
    lines = [line for line in text.split('\n') if len(line) >= 1]

    # Each line is "<label>\t<message>". Split only on the first tab so a
    # message that itself contains a tab is not truncated.
    rows = [line.split('\t', 1) for line in lines]
    df = pd.DataFrame(rows)

    label_codes = pd.Categorical(df[0]).codes
    data = pd.DataFrame({'texts': df[1], 'labels': label_codes})

    return data

#### news20
- https://archive.ics.uci.edu/ml/datasets/Twenty+Newsgroups

In [5]:
def load_newsgroups():
    """
    Fetch the 20 Newsgroups posts for a fixed list of categories.

    Returns:
        DataFrame with 'texts' (raw post text) and 'labels' (the category
        *name* of each post, not its integer target).
    """
    # NOTE(review): 19 categories are listed; 'comp.os.ms-windows.misc' is
    # absent. Confirm whether that omission is intentional.
    wanted = [
        'alt.atheism', 'comp.graphics',
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
        'comp.windows.x', 'misc.forsale', 'rec.autos',
        'rec.motorcycles', 'rec.sport.baseball',
        'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
        'sci.med', 'sci.space', 'soc.religion.christian',
        'talk.politics.guns', 'talk.politics.mideast',
        'talk.politics.misc', 'talk.religion.misc',
    ]
    bunch = fetch_20newsgroups(categories=wanted)

    # Translate each integer target into its category name.
    names = bunch.target_names
    label_names = [names[i] for i in bunch.target]

    return pd.DataFrame({'texts': bunch.data, 'labels': label_names})

#### Reuters
- https://archive.ics.uci.edu/ml/datasets/Reuters-21578+Text+Categorization+Collection
- (description: https://martin-thoma.com/nlp-reuters/)
- http://www.nltk.org/book/ch02.html

In [6]:
# http://www.vision.caltech.edu/Image_Datasets/Caltech101/
# https://martin-thoma.com/nlp-reuters/

# All category names known to the NLTK Reuters corpus reader.
labels = reuters.categories()
# NOTE(review): presumably the number of Reuters categories; it is not read
# by any other cell visible in this notebook - confirm before relying on it.
n_classes = 90


# def load_reuters(config={}):
def load_reuters():
    """
    Load the NLTK Reuters corpus into a single DataFrame.

    Training documents come first, followed by test documents; the split
    itself is not recorded in the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'texts' (raw document text) and 'labels' (the first category
        reported for the document; any further categories of a multi-label
        document are dropped).
    """
    documents = reuters.fileids()
    train_ids = [d for d in documents if d.startswith('training/')]
    test_ids = [d for d in documents if d.startswith('test/')]

    # One pass over a single id list keeps texts and labels aligned.
    ordered_ids = train_ids + test_ids
    texts = [reuters.raw(doc_id) for doc_id in ordered_ids]
    first_categories = [reuters.categories(doc_id)[0] for doc_id in ordered_ids]

    data = pd.DataFrame({'texts': texts, 'labels': first_categories})

    return data


#### BBC news
- http://mlg.ucd.ie/datasets/bbc.html
- http://mlg.ucd.ie/files/datasets/bbc.zip

In [7]:
def load_bbc(path='/home/silas/final_project/W266-final-project-/Input/bbc_dataset.csv'):
    """
    Load the BBC news CSV and rename its columns to the shared schema.

    Args:
        path: location of the CSV file; defaults to the original hard-coded
            location. The file must contain 'news' and 'type' columns.

    Returns:
        DataFrame with 'texts' (from 'news') and 'labels' (from 'type').
    """
    df = pd.read_csv(path)
    # Bracket access instead of attribute access: it cannot be confused with
    # a DataFrame attribute or method of the same name.
    df_bbc = pd.DataFrame({'texts': df['news'], 'labels': df['type']})

    return df_bbc
#        print('writing csv file ...')
#        df_bbc.to_csv('../bbc_dataset.csv', index=False)

#### Web of Science
- https://data.mendeley.com/datasets/9rw3vkcfy4/6#file-bac53024-9266-4a46-b9fe-c193dfeb0b7a

In [8]:
def load_science(data_dir=None):
    """
    Load the Web of Science WOS5736 texts and labels.

    Args:
        data_dir: directory that contains the 'WOS5736' folder. Defaults to
            the notebook-level `full_data_path`.

    Returns:
        DataFrame with 'texts' (one line of X.txt per row, trailing newline
        kept as in the file) and 'labels' (the matching line of Y.txt with
        newlines removed, left as a string).
    """
    if data_dir is None:
        data_dir = full_data_path

    x_path = os.path.join(data_dir, 'WOS5736', 'X.txt')
    y_path = os.path.join(data_dir, 'WOS5736', 'Y.txt')

    # Context managers close both files even if reading fails part-way.
    with open(x_path, 'r') as file_x:
        texts = list(file_x)
    with open(y_path, 'r') as file_y:
        labels = [line.replace('\n', '') for line in file_y]

    data = pd.DataFrame({'texts': texts, 'labels': labels})
    return data

#### Output

In [9]:
# Optional to run:
df_climate = load_climate()
df_spam = load_spam()
df_newsgroups = load_newsgroups()
df_reuters = load_reuters()
df_bbc = load_bbc()
df_science = load_science()

### Pre-processing

In [10]:
def pre_processing_steps(df):
    """
    Tokenize, drop stop words, clean and stem every document in `df.texts`.

    Args:
        df: DataFrame with a 'texts' column of strings.

    Returns:
        The same DataFrame object (modified in place) with a new
        'processed_text' column holding one list of stemmed tokens per row.
    """
    # Set membership is O(1) per token; the module-level stop_words is a list.
    stop_set = set(stop_words)
    has_letter = re.compile('[a-zA-Z]')

    processed_docs = []
    for doc in df.texts:
        kept = []
        for tok in nltk.word_tokenize(doc):
            # NOTE(review): the stop-word test runs on the raw token, before
            # clean_str(); capitalised stop words may slip through - confirm.
            if tok in stop_set:
                continue
            clean = clean_str(tok)
            # Keep only tokens that still contain at least one letter.
            if has_letter.search(clean):
                kept.append(stemmer.stem(clean))
        processed_docs.append(kept)

    df['processed_text'] = processed_docs
    return df

In [1]:
def pre_processing_steps_spacy(df):
    """
    Add a 'processed_text' column produced by preprocess_spacy(df).

    The DataFrame passed in is modified in place and also returned.
    """
    processed = preprocess_spacy(df)
    df['processed_text'] = processed
    return df

### Process and Save

In [13]:
# Preprocess each dataset, drop rows whose processed text is a duplicate, and
# pickle the result as <full_data_path><name>.pkl.
for df_name, df in [
    ('climate', df_climate),
    ('spam', df_spam),
    ('newsgroups', df_newsgroups),
    ('reuters', df_reuters),
    ('bbc', df_bbc),
    ('science', df_science),
]:
    print("Preprocessing {} dataset...".format(df_name))
    (
        pre_processing_steps_spacy(df)
        .drop_duplicates(subset='processed_text')
        .to_pickle(full_data_path + '{}.pkl'.format(df_name))
    )

Preprocessing climate dataset...
Preprocessing spam dataset...
Preprocessing newsgroups dataset...
Preprocessing reuters dataset...
Preprocessing bbc dataset...
Preprocessing science dataset...


### Load

In [14]:
# examples to load
#df_spam = pd.read_pickle(full_data_path + "spam.pkl")
#df_newsgroups = pd.read_pickle(full_data_path + "newsgroups.pkl")
#df_reuters = pd.read_pickle(full_data_path + "reuters.pkl")
#df_bbc = pd.read_pickle(full_data_path + "bbc.pkl")