# COMP 550 Project - Data Cleaning 

In [1]:
# Import block
import os
import pickle
import nltk
import re
from nltk.corpus import stopwords
from collections import Counter
nltk.download('punkt') # link to documentation on punkt tokenizers: https://www.nltk.org/_modules/nltk/tokenize/punkt.html

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Samy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data and Storage - 20 Newsgroup
### This notebook assumes that you've downloaded the [20 newsgroups dataset](https://archive.ics.uci.edu/ml/datasets/Twenty+Newsgroups) and unpacked the tarball in the same directory as this notebook. 

### You should therefore have a folder called "20_newsgroups" in the same directory as this notebook. The "20_newsgroups" folder should have 20 subfolders ("comp.os.ms-windows.misc", "soc.religion.christian", "rec.sport.baseball", etc...), each containing a long list of files.

# File Format - 20 Newsgroup
### Fortunately, the files are all raw text. Unfortunately, their headers can vary quite a lot. Take the following examples:

##### Sample file #1 from alt.atheism

In [None]:
# Print the raw text of alt.atheism post 51128 so its header layout can be inspected.
sample_path = os.path.abspath(os.path.join(os.getcwd(), "20_newsgroups", "alt.atheism", "51128"))
with open(sample_path) as atheism_sample_file_1:
    print(atheism_sample_file_1.read())

##### Sample file #2 from alt.atheism

In [None]:
# Print the raw text of alt.atheism post 49960 so its header layout can be inspected.
sample_path = os.path.abspath(os.path.join(os.getcwd(), "20_newsgroups", "alt.atheism", "49960"))
with open(sample_path) as atheism_sample_file_2:
    print(atheism_sample_file_2.read())

##### Sample file #1 from rec.autos

In [None]:
# Print the raw text of rec.autos post 101553 so its header layout can be inspected.
sample_path = os.path.abspath(os.path.join(os.getcwd(), "20_newsgroups", "rec.autos", "101553"))
with open(sample_path) as auto_sample_file_1:
    print(auto_sample_file_1.read())

##### Sample file #2 from rec.autos

In [None]:
# Print the raw text of rec.autos post 103338 so its header layout can be inspected.
sample_path = os.path.abspath(os.path.join(os.getcwd(), "20_newsgroups", "rec.autos", "103338"))
with open(sample_path) as auto_sample_file_2:
    print(auto_sample_file_2.read())

# File Format (cont'd)
### The four examples above each had a "Subject" line in their headers, and it might be useful to use the subsequent words in our analysis. I however believe that we should only do so if we can do it consistently (for every file in every newsgroup), so I checked that every file in the 20 newsgroups had a line starting with "Subject:".

In [None]:
# Census over the 20 newsgroups folders: report every file in which no line
# contains "subject:" (case-insensitive) and print the totals at the end.
# The '\\?\' prefix put in front of the absolute path is the Windows
# extended-length path prefix.
root_dir = '\\\\?\\' + os.path.abspath(os.path.join(os.getcwd(), '20_newsgroups'))
subdirectory_list = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
file_count = 0
missing_subject_line_count = 0
total_files_count = 0
for subdir in subdirectory_list:
    subdir_path = os.path.join(root_dir, subdir)
    # Directory entries whose name contains '.onetoc2' are never opened.
    files = [os.path.join(subdir_path, entry) for entry in os.listdir(subdir_path) if '.onetoc2' not in entry]
    total_files_count += len(files)
    for current_file in files:
        with open(current_file, 'r') as input_file:
            contents = input_file.readlines()
        # any() stops at the first matching line, like the explicit break it replaces.
        found_subject_line = any("subject:" in text.lower() for text in contents)
        if not found_subject_line:
            print(f"{current_file} didn't have a subject line.")
            missing_subject_line_count += 1
        file_count += 1

print(f"examined a total of {file_count} files out of {total_files_count}, {missing_subject_line_count} of which didn't have a subject line.")

### OK, so every file has a subject line that we can use!

# Content Extraction
### Some heuristics:
1. The subject line for every file will be considered as valid content.
2. All lines below the "Lines: #" line will be considered as valid content. Some invalid lines may be included (e.g. "NNTP-Posting-Host: punisher.caltech.edu", or "sandvik@newton.apple.com (Kent Sandvik) writes:"), but we can process those lines' contents afterwards with NLTK or another English dictionary API.

### super_dictionary is a dictionary with subdirectory::dict() key::value pairs. The values start off as empty dictionaries, but they are eventually populated with filename::list(valid lines) key::value pairs. 

### In case that's not clear, super_dictionary is supposed to end up looking like a tree like this:

    super_dictionary = {

        'alt.atheism' : {

                            '49960' : [ list of all the valid lines in file 49960 in the 'alt.atheism' subdirectory ],

                            '51060' : [ list of all the valid lines in file 51060 in the 'alt.atheism' subdirectory ],

                            ...

                        },

        'comp.graphics' : {

                              '37261': [ list of all the valid lines in file 37261 in the 'comp.graphics' subdirectory ],

                              '37913': [ list of all the valid lines in file 37913 in the 'comp.graphics' subdirectory ],

                              ...

                          } ,

        ...

    }

In [None]:
# Two-level content store, keyed by newsgroup name. Every newsgroup starts with
# its own empty dict, which is later filled with filename -> list-of-lines entries.
super_dictionary = {
    newsgroup_name: {}
    for newsgroup_name in (
        'alt.atheism',
        'comp.graphics',
        'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
        'comp.windows.x',
        'misc.forsale',
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey',
        'sci.crypt',
        'sci.electronics',
        'sci.med',
        'sci.space',
        'soc.religion.christian',
        'talk.politics.guns',
        'talk.politics.mideast',
        'talk.politics.misc',
        'talk.religion.misc',
    )
}

### Populating super_dictionary 

In [None]:
# Fill super_dictionary[newsgroup][file name] with that file's content lines:
# the first line containing "lines: " (case-insensitive) and every line after
# it, each with trailing whitespace removed. A file without such a line (or an
# empty file) is stored with an empty list.
root_dir = '\\\\?\\' + os.path.abspath(os.path.join(os.getcwd(), '20_newsgroups'))
file_count = 0
subdirectory_list = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
for subdir in subdirectory_list:
    # Directory entries whose name contains '.onetoc2' are never opened.
    files = [ os.path.join(root_dir, subdir, f) for f in os.listdir(os.path.join(root_dir, subdir)) if '.onetoc2' not in f ]
    for current_file in files:
        with open(current_file,'r') as input_file:
            contents = input_file.readlines()

        # Locate the landmark explicitly. The previous version reused the
        # enumerate() index after the loop, which is undefined for an empty
        # first file and stale (left over from the previous file) for any
        # later empty file.
        landmark_index = next(
            (index for index, line in enumerate(contents) if "lines: " in line.lower()),
            None,
        )
        if landmark_index is None:
            lines_with_valid_content = []
        else:
            lines_with_valid_content = [line.rstrip() for line in contents[landmark_index:]]

        super_dictionary[subdir][os.path.basename(current_file)] = lines_with_valid_content

        file_count += 1

print(f"examined a total of {file_count} files.")

### Visually validating that the super_dictionary's end-value lists have the correct content

In [None]:
# Spot check: print the stored content lines of alt.atheism file 49960.
alt_atheism_dictionary = super_dictionary["alt.atheism"]
sample_content_lines = alt_atheism_dictionary["49960"]
print('\n'.join(sample_content_lines))

### Pickling super_dictionary into 20_newsgroup_content_dictionary.pickle in the root directory

In [None]:
# Write super_dictionary to 20_newsgroup_content_dictionary.pickle (relative to
# the current working directory) using the highest pickle protocol available.
with open('20_newsgroup_content_dictionary.pickle', 'wb') as handle:
    pickle.dump(super_dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Disabled round-trip check: reload the pickle and compare it with super_dictionary.
#with open('20_newsgroup_content_dictionary.pickle', 'rb') as handle:
#    b = pickle.load(handle)
#
#print(super_dictionary == b)
#del b

# Data Cleaning
### For every leaf node (file) in the super_dictionary, we need to "clean" the leaf node's list of valid lines. 
### This can be done in a number of ways: here I've used NLTK's punkt tokenizer (https://www.nltk.org/_modules/nltk/tokenize/punkt.html), as it can tokenize with non-alphanumeric characters as well as with spaces. See the toy example below.

In [4]:
def has_some_alphanumeric_characters(line):
    """Return True when *line* contains at least one ASCII letter (a-z or A-Z).

    Despite the name, digits are not considered: a string made up only of
    digits and/or punctuation yields False.
    """
    return bool(re.search('[a-zA-Z]', line))

def tokenize(list_of_lines, verbose=False):
    """Tokenize every line with NLTK and return the kept tokens as one flat list.

    Each line is split with nltk.word_tokenize; a token is kept only when
    has_some_alphanumeric_characters() accepts it.

    Args:
        list_of_lines: iterable of strings, one per line of text.
        verbose: when true, print each line next to the tokens kept from it.

    Returns:
        A list of the kept tokens, in line order and then in token order.
    """
    tokenized_list = []
    for line in list_of_lines:
        kept_tokens = list(filter(has_some_alphanumeric_characters, nltk.word_tokenize(line)))
        if verbose:
            print('The line "{}" becomes:\n{}\n\n'.format(str(line), ', '.join(kept_tokens)))
        tokenized_list.extend(kept_tokens)
    return tokenized_list

In [None]:
# Four content lines taken from /alt.atheism/51128, used to demonstrate tokenize().
toy_example_list_of_valid_lines = [
    "NNTP-Posting-Host: punisher.caltech.edu",
    "sandvik@newton.apple.com (Kent Sandvik) writes:",
    ">>To borrow from philosophy, you don't truly understand the color red",
    ">>until you have seen it.",
]

# verbose=True makes tokenize() print every line next to the tokens kept from it.
tokenized_toy_example = tokenize(toy_example_list_of_valid_lines, verbose=True)

print(f"\nThe tokens in the toy example are: {', '.join(tokenized_toy_example)}")

### As mentioned above, there are a number of different ways to clean this sort of data. Another (possibly equally or more useful) method would be to tokenize by spaces and entirely remove any token containing characters that are neither alphanumeric nor punctuation marks (specifically apostrophes and hyphens).

In [None]:
# Show the built-in help text for collections.Counter.
help(Counter)

In [None]:
# Count how many times each token occurs in the toy example.
tokens_and_frequencies = Counter(tokenized_toy_example)
# Bare expression: evaluates to the (token, count) pairs ordered from most to least common.
tokens_and_frequencies.most_common()

### This was all done with the toy_example, let's now do it with the 20_newsgroup data


### Finding the (token, frequency) tuples over the entire 20 newsgroup data set
# NOTE: you don't need to run this if you have the *20_newsgroup_tokens_and_frequencies.pickle* file

In [None]:
# Build data_20_newsgroup_tokens: one flat list holding every token kept by
# tokenize() across all files of all newsgroups in the pickled content dictionary.
root_dir = '\\\\?\\' + os.path.abspath(os.path.join(os.getcwd(), '20_newsgroups'))
subdirectory_list = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
with open('20_newsgroup_content_dictionary.pickle', 'rb') as handle:
    data_20_newsgroup = pickle.load(handle)

# Layout of data_20_newsgroup:
#
#     {
#         'alt.atheism':   {'49960': [valid lines of file 49960], '51060': [...], ...},
#         'comp.graphics': {'37261': [valid lines of file 37261], '37913': [...], ...},
#         ...
#     }

data_20_newsgroup_tokens = []

for topic_name, topic_dictionary in data_20_newsgroup.items():
    for filename, content_lines_in_filename in topic_dictionary.items():
        print(f"iterating over {topic_name} , {filename}")

        # content_lines_in_filename is a list of untokenized line strings;
        # tokenize() turns it into a flat list of tokens.
        tokenized_content_lines = tokenize(content_lines_in_filename)

        # adding the tokens to the corpus-wide token list
        data_20_newsgroup_tokens.extend(tokenized_content_lines)


In [None]:
# Collapse the corpus-wide token list into token -> occurrence count and pickle the result.
data_20_newsgroup_tokens_and_frequencies = Counter(data_20_newsgroup_tokens)
with open('20_newsgroup_tokens_and_frequencies.pickle', 'wb') as handle:
    pickle.dump(data_20_newsgroup_tokens_and_frequencies, handle)

In [None]:
# Reload the pickled token/frequency counter into `data`.
with open('20_newsgroup_tokens_and_frequencies.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [None]:
# Print the first ten (token, count) pairs of the in-memory counter and of the
# reloaded one so the two can be compared by eye.
print(list(data_20_newsgroup_tokens_and_frequencies.items())[0:10])
print(list(data.items())[0:10])

### Repeating the (token, frequency) computing code but in a topic-specific manner

# DON'T NEED TO DO THIS IF YOU HAVE THE *20_newsgroup_tokens_and_frequencies_by_topics_dictionary.pickle* FILE

In [None]:
# Per-topic token statistics: for every newsgroup, tokenize all of its files
# and store the resulting (token, frequency) pairs as a list of tuples.
with open('20_newsgroup_content_dictionary.pickle', 'rb') as handle:
    data_20_newsgroup = pickle.load(handle)
# The reloaded pickle is expected to equal the super_dictionary built earlier in this session.
assert data_20_newsgroup == super_dictionary

# Layout of data_20_newsgroup:
#
#     {
#         'alt.atheism':   {'49960': [valid lines of file 49960], '51060': [...], ...},
#         'comp.graphics': {'37261': [valid lines of file 37261], '37913': [...], ...},
#         ...
#     }

# Every topic starts with an empty list, replaced below by its (token, frequency) tuples.
data_20_newsgroup_tokens_by_topics = {
    newsgroup_name: []
    for newsgroup_name in (
        'alt.atheism',
        'comp.graphics',
        'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
        'comp.windows.x',
        'misc.forsale',
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey',
        'sci.crypt',
        'sci.electronics',
        'sci.med',
        'sci.space',
        'soc.religion.christian',
        'talk.politics.guns',
        'talk.politics.mideast',
        'talk.politics.misc',
        'talk.religion.misc',
    )
}

for topic_name, topic_dictionary in data_20_newsgroup.items():
    print(f"iterating over {topic_name}")

    # Gather the tokens of every file in this topic into one list.
    tokens_in_this_topics_files = []
    for content_lines_in_filename in topic_dictionary.values():
        tokens_in_this_topics_files.extend(tokenize(content_lines_in_filename))

    this_topics_tokens_and_frequencies = Counter(tokens_in_this_topics_files)
    data_20_newsgroup_tokens_by_topics[topic_name] = list(this_topics_tokens_and_frequencies.items())


In [None]:
# Save the per-topic (token, frequency) lists, then reload them to check the round trip.
with open('20_newsgroup_tokens_and_frequencies_by_topics_dictionary.pickle', 'wb') as handle:
    pickle.dump(data_20_newsgroup_tokens_by_topics, handle)
with open('20_newsgroup_tokens_and_frequencies_by_topics_dictionary.pickle', 'rb') as handle:
    x = pickle.load(handle)
# Prints True when the reloaded object equals the in-memory one.
print((x == data_20_newsgroup_tokens_by_topics))

# Data and Storage - Industry Sector
### This notebook assumes that you've downloaded the [industry sector dataset](https://people.cs.umass.edu/~mccallum/data.html) and unpacked the tarball in the same directory as this notebook. 

### You should therefore have a folder called "sector" in the same directory as this notebook. The "sector" folder should have 12 subfolders ("energy.sector", "capital.goods.sector", etc...), each containing a long list of subfolders, themselves containing html-like documents.

# File Format - Industry Sector
### Trying to find a good "landmark line".

In [None]:
# Walk every file under ./sector and count those that contain a line with
# 'content-type: text/html' (case-insensitive), the candidate landmark line.
# Files without it are collected in landmarkless_files and listed at the end.
root_dir = '\\\\?\\' + os.path.abspath(os.path.join(os.getcwd(), 'sector'))
openable_file_count = 0
file_count = 0
landmark_count = 0
strangely_encoded_files = []  # not filled by this cell; kept so the name stays defined
annoying_notebook_files_count = 0
landmarkless_files = []

for dirname, dirnames, filenames in os.walk(root_dir):
    # Entries whose name contains '.onetoc2' are counted and then ignored.
    annoying_notebook_files = [ x for x in filenames if '.onetoc2' in x ]
    annoying_notebook_files_count += len(annoying_notebook_files)
    filenames = [ x for x in filenames if '.onetoc2' not in x ]

    file_count += len(filenames)
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        # errors='ignore' drops undecodable bytes instead of raising.
        with open(file_path, 'r', errors='ignore') as infile:
            contents = infile.readlines()
        found_landmark = any('content-type: text/html' in line.lower() for line in contents)
        if found_landmark:
            landmark_count += 1
        else:
            # Record the real path; plain dirname+filename dropped the
            # separator between directory and file name.
            landmarkless_files.append(file_path)
        openable_file_count += 1


print(f"examined {openable_file_count}/{file_count} and counted {landmark_count} 'Content-type: text/html' lines")
print('found {} notebook files (ignoring them)'.format(str(annoying_notebook_files_count)))
print('these {} files did not have the landmark:\n\n{}'.format( str(len(landmarkless_files)), '\n\n'.join(landmarkless_files)))

### It seems like the line "Content-type: text/html" is a good landmark line, so now I'm going to initialize and populate the sector_super_dictionary.
# NOTE: this dictionary has one level more than the previous super_dictionary

In [2]:
key_subtopic_map = {
    "100" : "sector\\basic.materials.sector\\chemical.manufacturing.industry",
    "101" : "sector\\basic.materials.sector\\chemicals.plastics.and.rubber.industry",
    "102" : "sector\\basic.materials.sector\\containers.and.packaging.industry",
    "103" : "sector\\basic.materials.sector\\fabricated.plastic.and.rubber.industry",
    "104" : "sector\\basic.materials.sector\\forestry.and.wood.products.industry",
    "105" : "sector\\basic.materials.sector\\gold.and.silver.industry",
    "106" : "sector\\basic.materials.sector\\iron.and.steel.industry",
    "107" : "sector\\basic.materials.sector\\metal.and.mining.industry",
    "108" : "sector\\basic.materials.sector\\misc.fabricated.products.industry",
    "109" : "sector\\basic.materials.sector\\paper.and.paper.products.industry",
    "110" : "sector\\capital.goods.sector\\aerospace.and.defense.industry",
    "111" : "sector\\capital.goods.sector\\construction.sector\\construction-raw.materials.industry",
    "112" : "sector\\capital.goods.sector\\construction.sector\\construction-supplies.and.fixtures.industry",
    "113" : "sector\\capital.goods.sector\\construction.sector\\construction.and.agricultural.machinery.industry",
    "114" : "sector\\capital.goods.sector\\construction.sector\\construction.services.industry",
    "115" : "sector\\capital.goods.sector\\misc.capital.goods.industry",
    "116" : "sector\\capital.goods.sector\\mobile.homes.and.rvs.industry",
    "117" : "sector\\conglomerates.industry",
    "118" : "sector\\consumer.cyclical.sector\\appliance.and.tool.industry",
    "119" : "sector\\consumer.cyclical.sector\\audio.and.video.equipment.industry",
    "120" : "sector\\consumer.cyclical.sector\\auto.sector\\auto.and.truck.manufacturers.industry",
    "121" : "sector\\consumer.cyclical.sector\\auto.sector\\auto.and.truck.parts.industry",
    "122" : "sector\\consumer.cyclical.sector\\footwear.industry",
    "123" : "sector\\consumer.cyclical.sector\\furniture.and.fixtures.industry",
    "124" : "sector\\consumer.cyclical.sector\\jewelry.and.silverware.industry",
    "125" : "sector\\consumer.cyclical.sector\\photography.industry",
    "126" : "sector\\consumer.cyclical.sector\\recreational.products.industry",
    "127" : "sector\\consumer.cyclical.sector\\textiles.non.apparel.industry",
    "128" : "sector\\consumer.cyclical.sector\\tires.industry",
    "129" : "sector\\consumer.non-cyclical.sector\\beverages.sector\\alcoholic.beverages.industry",
    "130" : "sector\\consumer.non-cyclical.sector\\beverages.sector\\non.alcoholic.beverages.industry",
    "131" : "sector\\consumer.non-cyclical.sector\\crops.industry",
    "132" : "sector\\consumer.non-cyclical.sector\\fish.livestock.industry",
    "133" : "sector\\consumer.non-cyclical.sector\\food.processing.industry",
    "134" : "sector\\consumer.non-cyclical.sector\\office.supplies.industry",
    "135" : "sector\\consumer.non-cyclical.sector\\personal.and.household.products.industry",
    "136" : "sector\\consumer.non-cyclical.sector\\tobacco.industry",
    "137" : "sector\\energy.sector\\coal.industry",
    "138" : "sector\\energy.sector\\oil.and.gas.integrated.industry",
    "139" : "sector\\energy.sector\\oil.and.gas.operations.industry",
    "140" : "sector\\energy.sector\\oil.well.services.and.equipment.industry",
    "141" : "sector\\financial.sector\\banking.sector\\money.center.banks.industry",
    "142" : "sector\\financial.sector\\banking.sector\\regional.banks.industry",
    "143" : "sector\\financial.sector\\banking.sector\\s.and.ls.savings.banks.industry",
    "144" : "sector\\financial.sector\\consumer.financial.services.industry",
    "145" : "sector\\financial.sector\\insurance.sector\\accident.and.health.insurance.industry",
    "146" : "sector\\financial.sector\\insurance.sector\\life.insurance.industry",
    "147" : "sector\\financial.sector\\insurance.sector\\misc.insurance.industry",
    "148" : "sector\\financial.sector\\insurance.sector\\property.and.casualty.insurance.industry",
    "149" : "sector\\financial.sector\\investment.services.industry",
    "150" : "sector\\financial.sector\\misc.financial.services.industry",
    "151" : "sector\\healthcare.sector\\biotechnology.and.drugs.industry",
    "152" : "sector\\healthcare.sector\\healthcare.facilities.industry",
    "153" : "sector\\healthcare.sector\\major.drugs.industry",
    "154" : "sector\\healthcare.sector\\medical.equipment.and.supplies.industry",
    "155" : "sector\\services.sector\\advertising.industry",
    "156" : "sector\\services.sector\\broadcasting.and.cable.tv.industry",
    "157" : "sector\\services.sector\\business.services.industry",
    "158" : "sector\\services.sector\\casinos.and.gambling.industry",
    "159" : "sector\\services.sector\\communications.services.industry",
    "160" : "sector\\services.sector\\hotels.and.motels.industry",
    "161" : "sector\\services.sector\\law.sector\\immigration.law.industry",
    "162" : "sector\\services.sector\\law.sector\\international.law.industry",
    "163" : "sector\\services.sector\\law.sector\\maritime.law.industry",
    "164" : "sector\\services.sector\\law.sector\\trade.law.industry",
    "165" : "sector\\services.sector\\motion.pictures.industry",
    "166" : "sector\\services.sector\\personal.services.industry",
    "167" : "sector\\services.sector\\printing.and.publishing.industry",
    "168" : "sector\\services.sector\\printing.services.industry",
    "169" : "sector\\services.sector\\real.estate.operations.industry",
    "170" : "sector\\services.sector\\recreational.activities.industry",
    "171" : "sector\\services.sector\\rental.and.leasing.industry",
    "172" : "sector\\services.sector\\restaurants.industry",
    "173" : "sector\\services.sector\\retail.sector\\retail.apparel.industry",
    "174" : "sector\\services.sector\\retail.sector\\retail.catalog.and.mail.order.industry",
    "175" : "sector\\services.sector\\retail.sector\\retail.department.and.discount.industry",
    "176" : "sector\\services.sector\\retail.sector\\retail.drugs.industry",
    "177" : "sector\\services.sector\\retail.sector\\retail.grocery.industry",
    "178" : "sector\\services.sector\\retail.sector\\retail.home.improvement.industry",
    "179" : "sector\\services.sector\\retail.sector\\retail.specialty.industry",
    "180" : "sector\\services.sector\\retail.sector\\retail.technology.industry",
    "181" : "sector\\services.sector\\schools.industry",
    "182" : "sector\\services.sector\\security.systems.and.services.industry",
    "183" : "sector\\services.sector\\waste.management.services.industry",
    "184" : "sector\\technology.sector\\communications.equipment.industry",
    "185" : "sector\\technology.sector\\computer.sector\\computer.hardware.industry",
    "186" : "sector\\technology.sector\\computer.sector\\computer.networks.industry",
    "187" : "sector\\technology.sector\\computer.sector\\computer.peripherals.industry",
    "188" : "sector\\technology.sector\\computer.sector\\computer.services.industry",
    "189" : "sector\\technology.sector\\computer.sector\\computer.storage.devices.industry",
    "190" : "sector\\technology.sector\\computer.sector\\software.and.programming.industry",
    "191" : "sector\\technology.sector\\electronic.instruments.and.controls.industry",
    "192" : "sector\\technology.sector\\office.equipment.industry",
    "193" : "sector\\technology.sector\\scientific.and.technical.instruments.industry",
    "194" : "sector\\technology.sector\\semiconductors.industry",
    "195" : "sector\\transportation.sector\\air.courier.industry",
    "196" : "sector\\transportation.sector\\airline.industry",
    "197" : "sector\\transportation.sector\\misc.transportation.industry",
    "198" : "sector\\transportation.sector\\railroad.industry",
    "199" : "sector\\transportation.sector\\trucking.industry",
    "200" : "sector\\transportation.sector\\water.transportation.industry",
    "201" : "sector\\utilities.sector\\electric.utilities.industry",
    "202" : "sector\\utilities.sector\\natural.gas.industry",
    "203" : "sector\\utilities.sector\\water.utilities.industry"
}

# One (initially empty) token list per subtopic key of key_subtopic_map.
data_industry_sector_tokens_by_topics = {subtopic_key: [] for subtopic_key in key_subtopic_map}

### Populating sector_super_dictionary

In [6]:
# For every subtopic in key_subtopic_map, tokenize the content of each of its
# files and store the subtopic's (token, frequency) pairs in
# data_industry_sector_tokens_by_topics. All tokens are also accumulated in
# all_tokens_in_data_set. A file's content is the first line containing
# "content-type: text/html" (case-insensitive) plus every line after it; a file
# without such a line contributes no tokens.
root_dir = '\\\\?\\' + os.path.abspath(os.path.join(os.getcwd()))
file_count = 0
all_tokens_in_data_set = []

for key, subtopic_path in key_subtopic_map.items():
    tokens_in_this_subtopics_files = []
    for file in os.listdir(os.path.abspath(os.path.join(root_dir, subtopic_path ))):
        # errors='ignore' drops undecodable bytes instead of raising.
        with open(os.path.abspath(os.path.join(root_dir, subtopic_path, file )), 'r', errors='ignore') as inputfile:
            contents = inputfile.readlines()

        # Locate the landmark explicitly. The previous version reused the
        # enumerate() index after the loop, which is undefined for an empty
        # first file and stale (left over from the previous file) for any
        # later empty file.
        landmark_index = next(
            (index for index, line in enumerate(contents) if "content-type: text/html" in line.lower()),
            None,
        )
        if landmark_index is None:
            lines_with_valid_content = []
        else:
            lines_with_valid_content = [line.rstrip() for line in contents[landmark_index:]]

        tokenized_content_lines = tokenize(lines_with_valid_content) # returns a list

        # adding the tokens to both the per-subtopic list and the dataset-wide list
        tokens_in_this_subtopics_files.extend(tokenized_content_lines)
        all_tokens_in_data_set.extend(tokenized_content_lines)

    this_subtopics_tokens_and_frequencies = Counter(tokens_in_this_subtopics_files)
    this_subtopic_tokens_and_frequencies_as_list_of_tuples = list(this_subtopics_tokens_and_frequencies.items())

    data_industry_sector_tokens_by_topics[key] = this_subtopic_tokens_and_frequencies_as_list_of_tuples


In [8]:
# Bare expression: evaluates to the (token, frequency) list stored under key "100".
data_industry_sector_tokens_by_topics["100"]

[('Content-type', 77),
 ('text/html', 132),
 ('Generated', 6),
 ('by', 85),
 ('the', 764),
 ('Home', 55),
 ('Page', 41),
 ('Wizard', 6),
 ('CompuServe', 6),
 ('Inc.', 152),
 ('Last', 8),
 ('Updated', 6),
 ('Jul', 6),
 ('html', 54),
 ('The', 128),
 ('following', 31),
 ('HTML', 124),
 ('tags', 21),
 ('are', 117),
 ('header', 10),
 ('and', 1039),
 ('title', 70),
 ('These', 17),
 ('allows', 10),
 ('you', 89),
 ('to', 554),
 ('specify', 7),
 ('a', 898),
 ('for', 308),
 ('this', 68),
 ('page', 16),
 ('head', 57),
 ('NuCO2', 1),
 ('/title', 55),
 ('/head', 54),
 ('BODY', 51),
 ('BACKGROUND=', 15),
 ('Bubbles.gif', 5),
 ('CENTER', 144),
 ('TABLE', 38),
 ('BORDER', 5),
 ('TR', 388),
 ('TD', 807),
 ('STRONG', 43),
 ('FONT', 156),
 ('SIZE=3', 3),
 ('Best', 2),
 ('viewed', 2),
 ('with', 161),
 ('Netscape', 4),
 ('href=', 824),
 ('http', 239),
 ('//home.netscape.com/comprod/upgrades/index.html', 1),
 ('IMG', 219),
 ('SRC=', 130),
 ('aninet30.gif', 1),
 ('/STRONG', 42),
 ('/FONT', 161),
 ('SIZE', 16

In [13]:
# Dataset-wide token frequencies, kept both as a Counter and as a list of
# (token, frequency) tuples; the printed number is the count of distinct tokens.
all_tokens_and_token_frequencies_in_data_set = Counter(all_tokens_in_data_set)
all_tokens_and_token_frequencies_in_data_set_as_list_of_tuples = list(
    all_tokens_and_token_frequencies_in_data_set.items()
)
print(len(all_tokens_and_token_frequencies_in_data_set_as_list_of_tuples))

239860


In [9]:
# Print the number of subtopics, then for each subtopic the size of its
# (token, frequency) list and, from the second subtopic on, whether that list
# equals the previous subtopic's list.
print(len(data_industry_sector_tokens_by_topics))
previous_key = None
for current_key in data_industry_sector_tokens_by_topics:
    print(len(data_industry_sector_tokens_by_topics[current_key]))
    if previous_key is not None:
        print(data_industry_sector_tokens_by_topics[current_key] == data_industry_sector_tokens_by_topics[previous_key])
    previous_key = current_key

104
8653
6813
False
6761
False
5922
False
6270
False
7578
False
8224
False
6474
False
9927
False
6812
False
6247
False
2984
False
6957
False
7416
False
6290
False
6407
False
5162
False
8163
False
7880
False
9061
False
6968
False
6533
False
6472
False
7239
False
7718
False
7218
False
7698
False
7530
False
2969
False
6410
False
3248
False
3236
False
4435
False
8574
False
6369
False
6344
False
4092
False
5732
False
8637
False
6659
False
7325
False
8884
False
6786
False
6706
False
7474
False
5895
False
7276
False
6682
False
8367
False
9518
False
7282
False
7642
False
9636
False
6979
False
6876
False
5320
False
8120
False
7091
False
9195
False
11086
False
7808
False
11876
False
10774
False
13593
False
7826
False
9189
False
4965
False
9141
False
6646
False
8372
False
11158
False
7993
False
7318
False
7591
False
6298
False
6797
False
8021
False
7383
False
6188
False
7646
False
8696
False
8082
False
8863
False
9294
False
8083
False
9503
False
7776
False
7176
False
7588
False
7670
False
9161
Fa

In [14]:
# Write the three industry-sector results to disk, each in its own pickle file:
# the per-subtopic (token, frequency) lists, the key -> subtopic path mapping,
# and the dataset-wide (token, frequency) list.
for pickle_name, payload in (
    ('industry_sector_tokens_and_frequencies_by_topics_dictionary.pickle', data_industry_sector_tokens_by_topics),
    ('industry_sector_key_subtopic_mapping_dictionary.pickle', key_subtopic_map),
    ('industry_sector_tokens_and_frequencies_across_dataset_list_of_tuples.pickle', all_tokens_and_token_frequencies_in_data_set_as_list_of_tuples),
):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(payload, handle)