In [16]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import copy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import contractions
import random
from collections import Counter


[nltk_data] Downloading package stopwords to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Newsgroup categories used in this notebook. A category's integer label is
# simply its position in this list, so the lookup tables below are derived
# from it rather than written out by hand.
class_names = [
    "comp.graphics",
    "sci.med",
    "talk.politics.misc",
    "rec.sport.hockey",
    "sci.space",
]
class_labels = list(range(len(class_names)))
class_name_to_label = {name: label for label, name in enumerate(class_names)}
class_label_to_name = dict(enumerate(class_names))
# Root directory holding one sub-directory per newsgroup.
data_folder = "./data/20_newsgroups/20_newsgroups"

In [4]:
def splitData(class_names, data_folder, train_ratio, seed=None):
    '''
        Randomly split the documents of every class into train and test sets.

        Parameters
        ----------
        class_names : sequence of str
            Names of the sub-directories of `data_folder`, one per class.
        data_folder : str
            Directory containing one sub-directory per class.
        train_ratio : float
            Fraction of each class's documents assigned to the train set;
            the count is truncated with int().
        seed : int or None, optional
            When given, the file names are sorted and shuffled with a private
            random.Random(seed), so the split is reproducible. When None
            (default) the module-level `random` state and the directory
            listing order are used.

        Returns
        -------
        list of dict
            One {'train': [...], 'test': [...]} dict of file names per class,
            in the same order as `class_names`.
    '''
    # Either the shared module-level generator or an isolated, seeded one.
    rng = random if seed is None else random.Random(seed)

    class_wise_data = []
    for class_name in class_names:
        file_names = os.listdir(os.path.join(data_folder, class_name))
        if seed is not None:
            # os.listdir order is arbitrary; sorting makes the seeded shuffle
            # independent of the file system.
            file_names.sort()
        # Sampling the whole population yields a shuffled copy.
        shuffled_docs = rng.sample(file_names, len(file_names))
        n_train = int(train_ratio * len(file_names))
        class_wise_data.append({
            'train': shuffled_docs[:n_train],
            'test': shuffled_docs[n_train:],
        })
    return class_wise_data

In [5]:
# Seed the module-level RNG that splitData draws from so the 80/20 train/test
# split is the same on every Restart & Run All.
# NOTE(review): full reproducibility also assumes os.listdir returns the files
# in the same order between runs - confirm on the target file system.
random.seed(42)
class_wise_data = splitData(class_names, data_folder, 0.8)

In [6]:
def check_alnum(tok):
    '''
        Return `tok` with every character that is not a letter or digit
        (as judged by str.isalnum) dropped; the remaining characters keep
        their original order.
    '''
    return ''.join(filter(str.isalnum, tok))

def remove_punct(tok):
    '''
        Return `tok` with every character listed in string.punctuation
        deleted. Other characters, including whitespace, are kept.
    '''
    # A translation table with only a "delete" set removes those characters.
    deletion_table = str.maketrans('', '', string.punctuation)
    return tok.translate(deletion_table)

def remove_blank_space(tok):
    '''
        Return `tok` with every space character (' ') deleted. Other
        whitespace such as tabs or newlines is left in place.
    '''
    return tok.replace(' ', '')
def preprocess(text):
    '''
        Convert raw document text into a list of cleaned tokens.

        Steps, in order:
          1. lower-case the text,
          2. expand contractions with contractions.fix,
          3. tokenize with nltk's word_tokenize,
          4. strip non-alphanumeric characters from each token (check_alnum),
          5. drop tokens that became empty or that are English stop words.

        Parameters
        ----------
        text : str
            Raw document text.

        Returns
        -------
        list of str
            The surviving tokens in document order (duplicates kept).
    '''
    text = contractions.fix(text.lower())

    # A set gives constant-time membership tests; a list would be scanned
    # linearly for every token of every document.
    stop_words = set(stopwords.words('english'))

    final_tokens = []
    for raw_tok in word_tokenize(text):
        tok = check_alnum(raw_tok)
        # check_alnum already removed all punctuation and spaces, so the only
        # remaining clean-up is discarding tokens that ended up empty.
        if tok and tok not in stop_words:
            final_tokens.append(tok)

    return final_tokens


In [45]:
# def readClassFiles(class_file_paths):
#     class_doc_tokens = []
#     for fpath in class_file_paths:
#         f = open(fpath, 'r', encoding='utf-8', errors='ignore')
#         ftxt_unprocessed = f.read()
#         doc_toks = preprocess(ftxt_unprocessed)
#         class_doc_tokens.append(doc_toks)
#     return class_doc_tokens

def _read_and_preprocess(label, doc_names):
    '''
        Read each named document of class `label` and return its tokens.

        Files are looked up under data_folder/<class name>/ using the
        module-level `data_folder` and `class_label_to_name`.
        Returns a list with one token list per document, in input order.
    '''
    class_dir = os.path.join(data_folder, class_label_to_name[label])
    docs_tokens = []
    for doc in tqdm(doc_names):
        # The context manager closes the handle even if reading fails, so
        # thousands of documents do not leave thousands of open files.
        with open(os.path.join(class_dir, doc), encoding='utf-8', errors='ignore') as f:
            ftxt_unproc = f.read()
        docs_tokens.append(preprocess(ftxt_unproc))
    return docs_tokens


def process_data(class_wise_data, class_labels):
    '''
        Tokenize every train/test document and gather per-class statistics.

        Parameters
        ----------
        class_wise_data : sequence or mapping indexed by label
            class_wise_data[label] is a dict with 'train' and 'test' lists of
            document file names (the structure returned by splitData).
        class_labels : iterable
            Labels to process.

        Returns
        -------
        class_wise_tokens : dict
            label -> {'train': [token list per doc], 'test': [token list per doc]}
        class_wise_train_unique_tokens : dict
            label -> list of the distinct tokens seen in that class's train docs
        class_wise_train_tfs : dict
            label -> {token: number of occurrences in that class's train docs}
    '''
    class_wise_train_unique_tokens = {}
    # Keyed by the labels actually requested rather than a fixed range(5), so
    # any number of classes works.
    class_wise_tokens = {label: {'train': [], 'test': []} for label in class_labels}
    class_wise_train_tfs = {}

    for label in class_labels:
        print(f"--- Calculating for label = {label} ---")

        print(">> reading through train files and preprocessing")
        class_train_tokens = _read_and_preprocess(label, class_wise_data[label]['train'])
        print("--- Done")
        class_wise_tokens[label]['train'] = class_train_tokens

        print(">> reading through files and preprocessing")
        class_test_tokens = _read_and_preprocess(label, class_wise_data[label]['test'])
        print("--- Done")
        class_wise_tokens[label]['test'] = class_test_tokens

        print(">> Processing all tokens")
        # Flatten the per-document token lists into one list for the class.
        all_class_train_tokens = [tok for doc_toks in class_train_tokens for tok in doc_toks]
        print("--- Done")

        print(">> Calculating class-wise TF")
        class_wise_train_tfs[label] = dict(Counter(all_class_train_tokens))
        print("--- Done")

        class_wise_train_unique_tokens[label] = list(set(all_class_train_tokens))
        print("\n--------------------------\n\n")

    return class_wise_tokens, class_wise_train_unique_tokens, class_wise_train_tfs

def compute_icf(class_wise_unq_toks, labels=None):
    '''
        Compute the inverse class frequency (ICF) of every token.

        ICF(tok) = log10(num_classes / number of classes containing tok)

        Parameters
        ----------
        class_wise_unq_toks : mapping or sequence indexed by label
            class_wise_unq_toks[label] is an iterable of the tokens that occur
            in that class. Duplicates within a class are counted once.
        labels : iterable or None, optional
            Labels to consider. Defaults to the module-level `class_labels`.

        Returns
        -------
        dict
            token -> ICF value, for every token present in at least one of
            the considered classes.
    '''
    if labels is None:
        labels = class_labels
    labels = list(labels)

    # Count, for each token, how many classes contain it. Converting each
    # class's tokens to a set counts a token once per class and avoids the
    # linear list scan a per-token membership test would need.
    classes_containing = Counter()
    for label in labels:
        classes_containing.update(set(class_wise_unq_toks[label]))

    num_classes = len(labels)
    return {
        tok: np.log10(num_classes / present)
        for tok, present in classes_containing.items()
    }

In [43]:
# Read and preprocess every train/test document, then keep three per-class
# results: the token lists, the distinct train tokens, and the train term counts.
class_wise_tokens, class_wise_train_unique_tokens, class_wise_train_tfs = process_data(class_wise_data, class_labels)

--- Calculating for label = 0 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:04<00:00, 165.03it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 187.86it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------


--- Calculating for label = 1 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:05<00:00, 148.70it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 163.86it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------


--- Calculating for label = 2 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:06<00:00, 114.94it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 105.25it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------


--- Calculating for label = 3 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:06<00:00, 133.29it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 160.57it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------


--- Calculating for label = 4 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:06<00:00, 132.67it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 142.83it/s]

--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------







In [46]:
# Inverse class frequency of each train token:
# log10(number of classes / number of classes whose train set contains it).
term_icfs = compute_icf(class_wise_train_unique_tokens)

Counter({'a': 3, 'b': 2, 'c': 1, 'd': 1, 'e': 1})
[3, 1, 1, 1, 0, 0]


In [None]:
# def feature_selection