# Building Industry to NAICS Data Set for Supervised Learning

https://www.census.gov/eos/www/naics/

In [1]:
import pandas as pd
import numpy as np
import re
import string
import difflib
import sys
import time 
import pickle 

pd.set_option('display.max_colwidth', 200)

In [2]:
# Excel file with 2017 NAICS codes 
# Source: https://www.census.gov/eos/www/naics/downloadables/downloadables.html
df = pd.read_excel('2017_NAICS_Descriptions.xlsx')
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2041 entries, 0 to 2195
Data columns (total 3 columns):
Code           2041 non-null object
Title          2041 non-null object
Description    2041 non-null object
dtypes: object(3)
memory usage: 63.8+ KB


In [3]:
df['Title'] = df['Title'].str.strip()
df['Title'] = df['Title'].apply(lambda title: title[:-1] if title.endswith('T') else title)

df['Description'] = df['Description'].apply(lambda desc: desc[:desc.find('Cross-Reference')] if 'Cross-Reference' in desc else desc)
df['Description'] = df['Description'].str.strip()

In [4]:
index = 0
print(df.iloc[index].Title)
print('*'*len(df.iloc[index].Title) + '\n')
print(df.iloc[index].Description)

Agriculture, Forestry, Fishing and Hunting
******************************************

The Sector as a Whole

The Agriculture, Forestry, Fishing and Hunting sector comprises establishments primarily engaged in growing crops, raising animals, harvesting timber, and harvesting fish and other animals from a farm, ranch, or their natural habitats.

The establishments in this sector are often described as farms, ranches, dairies, greenhouses, nurseries, orchards, or hatcheries.  A farm may consist of a single tract of land or a number of separate tracts which may be held under different tenures.  For example, one tract may be owned by the farm operator and another rented.  It may be operated by the operator alone or with the assistance of members of the household or hired employees, or it may be operated by a partnership, corporation, or other type of organization. When a landowner has one or more tenants, renters, croppers, or managers, the land operated by each is considered a farm.

The 

### Parse the Alphabetical Index of the NAICS Document For More Mappings

Source: https://www.census.gov/eos/www/naics/2017NAICS/2017_NAICS_Manual.pdf

In [5]:
import PyPDF2

pdfFileObj = open('naics_book.pdf','rb')     #'rb' for read binary mode
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print('Number of pages:', pdfReader.numPages)

Number of pages: 963


In [6]:
# Extract the index code descriptions
start_page = 654
end_page = 963

pageObj = pdfReader.getPage(start_page)    
text = pageObj.extractText()
text

"653\n \n \ncensus.gov/naics\n \nAlphabetic Index\n \n \n311611\n \nAbattoirs\n \n621410\n \nAbortion clinics\n \n334519\n \nAbrasion testing machines manufacturing\n \n339114\n \nAbrasive points, wheels, and disks, dental, \nmanufacturing\n \n327910\n \nAbrasive products manufacturing\n \n212322\n \nAbrasive sand quarrying and/or \nbeneficiating\n \n212399\n \nAbrasive stones (e.g., emery, grindstones, \nhones, pumice) mining and/or beneficiating\n \n423840\n \nAbrasives merchant wholesalers\n \n212399\n \nAbrasives, natural, mining and/or \nbeneficiating\n \n322121\n \nAbsorbent paper stock manufacturing\n \n332420\n \nAbsorbers, gas, heavy gauge metal, \nmanufacturing\n \n334513\n \nAbsorption analyzers, industrial process \ntype (e.g., infrared), manufacturing\n \n237310\n \nAbutment construction\n \n611691\n \nAcademic tutoring services\n \n611310\n \nAcademies, college or university\n \n611110\n \nAcademies, elementary or secondary\n \n611210\n \nAcademies, junior college\n \n611

In [7]:
# extract code to industry mappings
pattern = re.compile(r'\n(\d{6})\n\s+\n(.*)\n')
data = re.findall(pattern, text)
data[:5]

[('311611', 'Abattoirs'),
 ('621410', 'Abortion clinics'),
 ('334519', 'Abrasion testing machines manufacturing'),
 ('339114', 'Abrasive points, wheels, and disks, dental, '),
 ('327910', 'Abrasive products manufacturing')]

In [8]:
codes = []
titles = []

for page in range(start_page, end_page):
    pageObj = pdfReader.getPage(page)    
    text = pageObj.extractText()
    matches = re.findall(pattern, text)
    for match in matches:
        code, title = match
        codes.append(code)
        # strip the title of end punctuation
        title = title.strip(string.punctuation + string.whitespace)
        titles.append(title)

In [9]:
# Append the appendix data to the excel file
apendix_df = pd.DataFrame({'Code': codes, 'Title': titles})
out_df = df.append(apendix_df, ignore_index=True)
out_df.tail()

Unnamed: 0,Code,Description,Title
22066,339992,,Zithers and parts manufacturing
22067,925120,,Zoning boards and commissions
22068,712130,,Zoological gardens
22069,712130,,Zoos
22070,111219,,"Zucchini farming, field, bedding plant and"


In [10]:
# Save the results
out_df.to_csv('data.csv')

###  Tokenization

Source: https://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html

In [6]:
import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin


class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, special=None,
                 punct=None, lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.special    = special
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(word_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip(string.punctuation) if self.strip else token
                
                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue
                    
                # If token is a number continue
                if token.isdecimal():
                    continue
                
                # If token is a special word, continue
                if self.special and token in self.special:
                    continue
                
                    
                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                    
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

### Build a Vocabulary Based on The NAICS document

In [3]:
# Load data
df = pd.read_csv('data.csv', index_col=0)
print(df.info())

titles = df['Title'].dropna()
descriptions = df['Description'].dropna()

sentences = pd.concat([titles, descriptions], axis=0)
preprocessor = NLTKPreprocessor(special=[
    'company', 'business', 'manufacturing', 'facility',
    'start',   'build',    
])

descriptions = preprocessor.transform(descriptions)
titles = preprocessor.transform(titles)

labels = set(word for lst in titles for word in lst)
ordinary = set(word for lst in descriptions for word in lst)
vocabulary = labels.union(ordinary)

'liability' in labels

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22071 entries, 0 to 22070
Data columns (total 3 columns):
Code           22071 non-null object
Description    2041 non-null object
Title          22071 non-null object
dtypes: object(3)
memory usage: 689.7+ KB
None


True

### Search API for Assigning NAICS codes to User Input

In [4]:
from google import google

In [5]:
user_df = pd.read_csv('user_data.csv', header=None)[0]
user_df = user_df.str.strip().dropna()
tokens = preprocessor.transform(user_df)

In [6]:
def scrape(text, patterns, limit=5):
    """Scrape NAICS codes for `text` from returned Google Search page.
    """
    results = google.search(text + ' ' + 'NAICS code', 1)
    result = []
    for pattern in patterns:
        matches = [pattern.search(result.name) for result in results]
        codes = [match.group(1) for match in matches if match is not None]
        result.extend(codes[:limit-len(result)])
        
        if len(result) == limit:
            break
    
    if len(result) < limit:
        result.extend([None for i in range(limit-len(result))])
    
    return result

In [None]:
results = []

# Regexp for scraping the Google Search page
STATCAT = re.compile(r'NAICS\s+Code\s+(\d+)')
CENSUS = re.compile(r'NAICS:\s+(\d+)')
patterns = [STATCAT, CENSUS]

# In case downloading is interrupted, 
# start from this sample.
start=7200
for j, words in enumerate(tokens[start:]):
    if j % 100 == 0 and j > 0:
        with open('pickles/%d.p' % (j+start), 'wb') as f:
            print('writing results to file...')
            pickle.dump(results, f)
            results = []
            
    text = ' '.join(words)
    print('%d: %s --' % ((j+start), text), end=' ')
    
    tick = time.time()
    codes = scrape(text, patterns)
    if any(val is not None for val in codes):
        results.append((text, codes))
        tock = time.time()
        print('%.2f sec' % (tock-tick))
        continue
    
    print('Closer look at', text)
    # use the words most relevant to the NAICS encoding
    processed = []
    for word in words:
        similar = difflib.get_close_matches(word, vocabulary, n=1)
        processed.extend(similar)
        
    if not processed:
        continue
        
    stride = max(1, len(processed) // 3)
    for i in range(0, len(processed), stride):
        text = ' '.join(processed[i:i+stride])
        print('%d: %s --' % ((j+start), text), end=' ')
        codes = scrape(text, patterns)
        results.append((text, codes))  
    
    tock = time.time()
    
    print('%.2f sec' % (tock-tick))

### Load User Text To NAICS Data

In [26]:
import os
import operator

from collections import Counter

In [27]:
directory = './pickles'
data = []

for pfile in os.listdir(directory):
    path = os.path.join(directory, pfile)
    with open(path, 'rb') as f:
        data.extend(pickle.load(f))

In [28]:
# Truncate the NAICS codes to the first 5 (industry level) digits
n = 5
results = []

for query, code_list in data:
    truncodes = [code[:n] for code in code_list if code is not None]
    counts = Counter(truncodes)
    if len(counts) > 0:
        try:
            likely = max(counts.items(), key=operator.itemgetter(1))[0]
            results.append((query, likely))
        except ValueError as e:
            pass

In [None]:
user_df = pd.DataFrame(results, columns=['Description', 'Code'])
print(user_df.info())
user_df.head()

In [30]:
user_df.to_csv('naics-google-search-v1.csv')