# Newsgroups

In [8]:
import numpy as np, pandas as pd
import os, sys
import math
import shutil
import uuid
import zipfile
import string
import random

# Paths and Variables

In [9]:
# Base name used to build every output file name in this notebook.
dataset_name: str = "newsgroups"

In [10]:
# Working directories. NOTE(review): input_dir is not referenced by any cell
# visible in this notebook -- confirm whether it is still needed.
input_dir, output_dir = './raw', './processed'

# Plain-CSV output locations: full data set, train split, test split, and
# test labels. All of them live directly under output_dir.
outp_fname, outp_train_fname, outp_test_fname, outp_test_key_fname = (
    os.path.join(output_dir, csv_name)
    for csv_name in (
        f'{dataset_name}.csv',
        f'{dataset_name}_train.csv',
        f'{dataset_name}_test.csv',
        f'{dataset_name}_test_key.csv',
    )
)

# Get data from sklearn datasets

In [17]:
from sklearn.datasets import fetch_20newsgroups

# Load the 'all' subset with headers, footers and quoted replies stripped
# from every post.
newsgroups_all = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# Earlier variant, kept for reference:
# newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers'))

In [18]:
# Split the fetched object into three parallel sequences:
# post texts, integer label indices, and source file paths.
documents = newsgroups_all.data
targets = [*newsgroups_all.target]
fnames = [*newsgroups_all.filenames]
# Show the three lengths side by side; they are expected to match.
tuple(len(seq) for seq in (documents, targets, fnames))

(18846, 18846, 18846)

In [19]:
# Column names for the assembled frame: document id, class label, post text.
id_col, target_col, text_col = "id", "newsgroup", "text"

In [20]:
# Assemble the frame column-by-column so each column keeps its natural dtype.
# Building from a list of rows and transposing forces every column to object
# dtype and materialises a 3 x N intermediate frame first.
data = pd.DataFrame({
    # The document id is the source file name without its directory part.
    id_col: [os.path.basename(path) for path in fnames],
    target_col: targets,
    text_col: documents,
})
data.head()

Unnamed: 0,id,newsgroup,text
0,54367,10,\n\nI am sure some bashers of Pens fans are pr...
1,60215,3,My brother is in the market for a high-perform...
2,76120,17,\n\n\n\n\tFinally you said what you dream abou...
3,60771,3,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,51882,4,1) I have an old Jasmine drive which I cann...


In [21]:
# Lookup table from integer class index to newsgroup name.
newsgroups_idx_to_name = dict(enumerate(newsgroups_all.target_names))
# Replace the numeric labels in the target column with their names.
data[target_col] = data[target_col].map(newsgroups_idx_to_name)
data.head()

Unnamed: 0,id,newsgroup,text
0,54367,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...
1,60215,comp.sys.ibm.pc.hardware,My brother is in the market for a high-perform...
2,76120,talk.politics.mideast,\n\n\n\n\tFinally you said what you dream abou...
3,60771,comp.sys.ibm.pc.hardware,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,51882,comp.sys.mac.hardware,1) I have an old Jasmine drive which I cann...


# Prepare Data

In [22]:
# Some documents are empty strings. Record each document's character count
# in a helper column so those rows can be filtered out next.
data['char_len'] = data['text'].map(len)
data.head()

Unnamed: 0,id,newsgroup,text,char_len
0,54367,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...,712
1,60215,comp.sys.ibm.pc.hardware,My brother is in the market for a high-perform...,324
2,76120,talk.politics.mideast,\n\n\n\n\tFinally you said what you dream abou...,1678
3,60771,comp.sys.ibm.pc.hardware,\nThink!\n\nIt's the SCSI card doing the DMA t...,781
4,51882,comp.sys.mac.hardware,1) I have an old Jasmine drive which I cann...,666


In [23]:
# Keep only rows whose text has at least one character.
data = data.loc[data['char_len'].gt(0)]
data.shape

(18466, 4)

In [24]:
# The helper length column has served its purpose; remove it again.
data = data.drop(columns=['char_len'])

In [25]:
# Drop duplicates: keep only the first row seen for each id (ids are the
# source file basenames).
# Reassign rather than using inplace=True: `data` comes from a boolean filter,
# and mutating such a frame in place can emit SettingWithCopyWarning.
# NOTE(review): the recorded outputs show 18466 rows before this step and
# 13643 + 1516 = 15159 rows after the train/test split, so roughly 3300 rows
# share a basename and are dropped here. Confirm these are true duplicates and
# not distinct posts whose file names collide across newsgroups.
data = data.drop_duplicates(subset=[id_col], keep='first')

# Shuffle Data

In [26]:
# Randomly reorder every row (frac=1 samples the whole frame); the fixed
# random_state makes the ordering repeatable.
data = data.sample(
    frac=1,
    random_state=42,
)
data.head()

Unnamed: 0,id,newsgroup,text
15243,83468,talk.religion.misc,\n\nOops! Quite right. I got so busy that I sa...
11948,103342,rec.autos,You guys are correct. The Bricklin was produc...
5446,21397,soc.religion.christian,"Hi Damon, No matter what system or explanatio..."
11842,103058,rec.autos,Here is a story. I bought a car about two wee...
11975,60938,comp.sys.ibm.pc.hardware,"==== BEGIN REPOST ====\n\nIBM, Apple, Motorola..."


# Insert Id Column

In [27]:
# Fallback only: when the frame has no id column yet, add a 0..N-1 integer id
# as the first column and print a preview.
if id_col not in data.columns:
    N = len(data.index)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Utility to Save DF as a zipped file

In [28]:
def save_df_to_zipped_csv(df, ftype=None, name=None, out_dir=None):
    """Write `df` as a single CSV file inside a zip archive.

    The archive is written to `<out_dir>/<name>[_<ftype>].zip` and contains
    one member named `<name>[_<ftype>].csv`. The DataFrame index is not
    written.

    Args:
        df: DataFrame to serialise.
        ftype: Optional tag appended to the file name as `_<ftype>`
            (e.g. "train"). When None, no suffix is added.
        name: Base file name. Defaults to the notebook-level `dataset_name`.
        out_dir: Destination directory. Defaults to the notebook-level
            `output_dir`. It is created if it does not exist yet.

    Returns:
        None.
    """
    # Fall back to the notebook globals only when no override is given, so
    # existing calls such as save_df_to_zipped_csv(data, "train") are unchanged.
    base = dataset_name if name is None else name
    target_dir = output_dir if out_dir is None else out_dir

    suffix = '' if ftype is None else f'_{ftype}'
    stem = f'{base}{suffix}'

    # The target directory may not exist on a fresh checkout; create it up
    # front so the write below does not fail on a missing path.
    os.makedirs(target_dir, exist_ok=True)

    # archive_name sets the name of the CSV member stored inside the zip.
    compression_opts = dict(method='zip', archive_name=f'{stem}.csv')
    df.to_csv(
        os.path.join(target_dir, f'{stem}.zip'),
        index=False,
        compression=compression_opts,
    )

# Save Main Data File

In [29]:
# Plain-CSV export, currently disabled:
# data.to_csv(outp_fname, index=False)

# Write the full cleaned data set as a zipped CSV (no file-type suffix).
save_df_to_zipped_csv(df=data, ftype=None)

# Train Test Split

In [30]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for the test split.
test_size = 0.1

data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)

# The answer key keeps only id + label for the test rows; the test file
# itself then has the label column removed.
data_test_key = data_test.loc[:, [id_col, target_col]].copy()
data_test = data_test.drop(target_col, axis=1)
print(*(part.shape for part in (data_train, data_test, data_test_key)))

# Plain-CSV exports, currently disabled:
# data_train.to_csv(outp_train_fname, index=False)
# data_test.to_csv(outp_test_fname, index=False)
# data_test_key.to_csv(outp_test_key_fname, index=False)

(13643, 3) (1516, 2) (1516, 2)


In [31]:
# Write each split as its own zipped CSV, tagged by split name.
for split_frame, split_tag in (
    (data_train, "train"),
    (data_test, "test"),
    (data_test_key, "test_key"),
):
    save_df_to_zipped_csv(split_frame, split_tag)