## Estudio de los datasets

In [1]:
import re

class Dataset:
    """
    Loader for two text-classification corpora stored as plain-text files,
    one document per line: 20 Newsgroups and SearchSnippets.

    Each ``fetch_*`` method returns a dict with three keys:
      * ``'data'``: list of cleaned document strings (see ``clean_str``),
      * ``'target'``: list of integer labels, indices into ``'target_names'``,
      * ``'target_names'``: list of category names for that corpus.

    File paths are class attributes, relative to the working directory; they
    can be overridden on the class or on an instance.
    """

    # Upper bound on the whitespace-separated tokens of a raw line, with the
    # category label counted as one token. Longer lines are skipped, and so are
    # lines with a single token (no text) or none (blank).
    max_tokens_per_line = 301

    # Values accepted by the ``subset`` argument of the ``fetch_*`` methods.
    _subsets = ("train", "test", "all")

    def clean_str(self, string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

        Characters outside ``A-Za-z0-9(),!?'`` and the backtick become spaces,
        English clitics ('s, 've, n't, 're, 'd, 'll) and the punctuation
        marks ``, ! ( ) ?`` are split off as separate tokens, whitespace runs
        are collapsed, and the result is stripped and lower-cased.

        Unlike the referenced original, the replacements for ``(``, ``)`` and
        ``?`` are plain characters. The original replacement strings carried a
        backslash that ``re.sub`` copied verbatim into the output.

        :param string: raw text.
        :return: the cleaned, lower-cased text with tokens separated by
            single spaces.
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " 's", string)
        string = re.sub(r"\'ve", " 've", string)
        string = re.sub(r"n\'t", " n't", string)
        string = re.sub(r"\'re", " 're", string)
        string = re.sub(r"\'d", " 'd", string)
        string = re.sub(r"\'ll", " 'll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        # Only the *pattern* needs escaping; the replacement is literal text.
        string = re.sub(r"\(", " ( ", string)
        string = re.sub(r"\)", " ) ", string)
        string = re.sub(r"\?", " ? ", string)
        string = re.sub(r"\s{2,}", " ", string)

        return string.strip().lower()

    def _read_labeled_file(self, path, target_names, label_first, encoding=None):
        """
        Read a one-document-per-line file into parallel data/target lists.

        :param path: path of the file to read.
        :param target_names: category names; a label is the index of the
            line's category in this list.
        :param label_first: True when the category is the first token of the
            line, False when it is the last one.
        :param encoding: passed to ``open``; None means the platform default.
        :return: ``(data, target)`` with the cleaned texts and integer labels.
        :raises ValueError: if a line's category is not in ``target_names``.
        """
        data = []
        target = []

        with open(path, encoding=encoding) as file:
            for line in file:
                tokens_count = len(line.split())

                if not 1 < tokens_count <= self.max_tokens_per_line:
                    continue

                if label_first:
                    category, text = line.split(None, 1)
                else:
                    text, category = line.rsplit(None, 1)

                data.append(self.clean_str(text))
                target.append(target_names.index(category))

        return data, target

    def _fetch(self, subset, reader, path_train, path_test, target_names):
        """
        Build the dataset dict for ``subset`` using ``reader`` on the given paths.

        :raises ValueError: if ``subset`` is not 'train', 'test' or 'all'.
        """
        if subset not in self._subsets:
            # Fail loudly instead of handing back a dict whose 'data' and
            # 'target' are None, which only breaks later, far from the typo.
            raise ValueError(
                "subset can only be 'train', 'test' or 'all', got {!r}".format(subset)
            )

        data, target = [], []
        # 'all' is the train split followed by the test split.
        if subset in ('train', 'all'):
            split_data, split_target = reader(path_train)
            data, target = data + split_data, target + split_target
        if subset in ('test', 'all'):
            split_data, split_target = reader(path_test)
            data, target = data + split_data, target + split_target

        return {'data': data, 'target': target, 'target_names': target_names}

    #################################################################
    ##################### 20 Newsgroups #############################
    #################################################################

    path_train_20newsgroups = "../20NewsGroup/20ng-train-stemmed.txt"
    path_test_20newsgroups = "../20NewsGroup/20ng-test-stemmed.txt"

    target_names_20newsgroups = [
        "alt.atheism",
        "comp.graphics",
        "comp.os.ms-windows.misc",
        "comp.sys.ibm.pc.hardware",
        "comp.sys.mac.hardware",
        "comp.windows.x",
        "misc.forsale",
        "rec.autos",
        "rec.motorcycles",
        "rec.sport.baseball",
        "rec.sport.hockey",
        "sci.crypt",
        "sci.electronics",
        "sci.med",
        "sci.space",
        "soc.religion.christian",
        "talk.politics.guns",
        "talk.politics.mideast",
        "talk.politics.misc",
        "talk.religion.misc"
    ]

    def read_20newsgroups_file(self, path_test_20newsgroups):
        """
        Read a 20 Newsgroups file whose lines are ``<category> <text>``.

        :param path_test_20newsgroups: path of the file to read. Despite the
            name (kept for backward compatibility) it is used for the train
            file as well.
        :return: ``(data, target)``: cleaned texts and indices into
            ``target_names_20newsgroups``.
        """
        # NOTE(review): opened with the platform default encoding, unlike the
        # SearchSnippets reader (utf8) - confirm the file's actual encoding.
        return self._read_labeled_file(
            path_test_20newsgroups,
            self.target_names_20newsgroups,
            label_first=True,
        )

    def fetch_20newsgroups(self, subset = "train"):
        """
        Load the 20 Newsgroups corpus.

        :param subset: 'train', 'test' or 'all' (train followed by test).
        :return: dict with 'data', 'target' and 'target_names'.
        :raises ValueError: if ``subset`` is none of the accepted values.
        """
        return self._fetch(
            subset,
            self.read_20newsgroups_file,
            self.path_train_20newsgroups,
            self.path_test_20newsgroups,
            self.target_names_20newsgroups,
        )

    #################################################################
    ##################### SearchSnippets ############################
    #################################################################

    path_train_search_snippets = "../SearchSnippets/train.txt"
    path_test_search_snippets = "../SearchSnippets/test.txt"

    target_names_search_snippets = [
        "business",
        "computers",
        "culture-arts-entertainment",
        "education-science",
        "engineering",
        "health",
        "politics-society",
        "sports"
    ]

    def read_search_snippets_file(self, path_test_search_snippets):
        """
        Read a SearchSnippets file (utf8) whose lines are ``<text> <category>``.

        :param path_test_search_snippets: path of the file to read. Despite
            the name (kept for backward compatibility) it is used for the
            train file as well.
        :return: ``(data, target)``: cleaned texts and indices into
            ``target_names_search_snippets``.
        """
        return self._read_labeled_file(
            path_test_search_snippets,
            self.target_names_search_snippets,
            label_first=False,
            encoding="utf8",
        )

    def fetch_search_snippets(self, subset = "train"):
        """
        Load the SearchSnippets corpus.

        :param subset: 'train', 'test' or 'all' (train followed by test).
        :return: dict with 'data', 'target' and 'target_names'.
        :raises ValueError: if ``subset`` is none of the accepted values.
        """
        return self._fetch(
            subset,
            self.read_search_snippets_file,
            self.path_train_search_snippets,
            self.path_test_search_snippets,
            self.target_names_search_snippets,
        )

### 20 Newsgroups

In [None]:
# Summary statistics for the 20 Newsgroups corpus: structure, one example,
# split sizes, document length (in tokens) and vocabulary size.
dataset = Dataset()
print("Dataset structure:")
train_dataset = dataset.fetch_20newsgroups(subset = 'train')
print(train_dataset.keys())
print("\n")
print("Dataset example:")
print(train_dataset['data'][0])
print("\n")
print("Train dataset:")
print("Set size: {}".format(len(train_dataset['data'])))
print("\n")

print("Test dataset:")
test_dataset = dataset.fetch_20newsgroups(subset = 'test')
print("Set size: {}".format(len(test_dataset['data'])))
print("\n")

complete_dataset = train_dataset['data'] + test_dataset['data']

# Tokenize each document once; both length statistics reuse the counts.
document_lengths = [len(document.split()) for document in complete_dataset]
mean_length = sum(document_lengths)/len(document_lengths)
max_length = max(document_lengths)

print('Mean Lenght: {}'.format(round(mean_length, 1)))
print('Max Lenght: {}'.format(max_length))
print("\n")

# De-duplicate with a dict (constant-time membership) instead of scanning a
# growing list for every token, which is quadratic in the vocabulary size.
# dict.fromkeys keeps first-seen order, so `vocabulary` is the same list the
# list-based loop would have produced.
vocabulary = list(dict.fromkeys(
    word for document in complete_dataset for word in document.split()
))

print('Vocabulary Size: {}'.format(len(vocabulary)))

Dataset structure:
dict_keys(['data', 'target', 'target_names'])


Dataset example:
univers violat separ church state dmn kepler unh edu king becom philosoph philosoph becom king write recent ra order and resist care appar post religi flyer entitl soul scroll thought religion spiritu and matter soul insid bathroom stall door school univers hampshir sort newslett assembl hall director campu pose question spiritu each issu and solicit respons includ issu pretti vagu assum put christian care not mention jesu bibl heard defend doesn support religion thi state univers and strong support separ church and state enrag can thi sound scream for parodi give copi your friendli neighbourhood subgeniu preacher luck run mental mincer and hand you back outrag offens and gut bustingli funni parodi you can past origin can stool scroll thought religion spiritu and matter colon you can us thi text wipe mathew


Train dataset:
Set size: 10443


Test dataset:
Set size: 6972


Mean Lenght: 94.2
Max Lenght: 3

### SearchSnippets

In [112]:
# Summary statistics for the SearchSnippets corpus: structure, one example,
# split sizes, document length (in tokens) and vocabulary size.
dataset = Dataset()
print("Dataset structure:")
train_dataset = dataset.fetch_search_snippets(subset = 'train')
print(train_dataset.keys())
print("\n")
print("Dataset example:")
print(train_dataset['data'][0])
print("\n")
print("Train dataset:")
print("Set size: {}".format(len(train_dataset['data'])))
print("\n")

print("Test dataset:")
test_dataset = dataset.fetch_search_snippets(subset = 'test')
print("Set size: {}".format(len(test_dataset['data'])))
print("\n")

complete_dataset = train_dataset['data'] + test_dataset['data']

# Tokenize each document once; both length statistics reuse the counts.
document_lengths = [len(document.split()) for document in complete_dataset]
mean_length = sum(document_lengths)/len(document_lengths)
max_length = max(document_lengths)

print('Mean Lenght: {}'.format(round(mean_length, 1)))
print('Max Lenght: {}'.format(max_length))
print("\n")

# De-duplicate with a dict (constant-time membership) instead of scanning a
# growing list for every token, which is quadratic in the vocabulary size.
# dict.fromkeys keeps first-seen order, so `vocabulary` is the same list the
# list-based loop would have produced.
vocabulary = list(dict.fromkeys(
    word for document in complete_dataset for word in document.split()
))

print('Vocabulary Size: {}'.format(len(vocabulary)))

Dataset structure:
dict_keys(['data', 'target', 'target_names'])


Dataset example:
manufacture manufacturer directory directory china taiwan products manufacturers directory taiwan china products manufacturer direcory exporter directory supplier directory suppliers


Train dataset:
Set size: 10060


Test dataset:
Set size: 2280


Mean Lenght: 18.1
Max Lenght: 50


Vocabulary Size: 29257
