# Small Dataset Based on 76K Companies and Their Inferred NAICS Codes

In [8]:
import pandas as pd
import os
import numpy as np
import concurrent.futures as cf
from glob import glob


# Show every column when a DataFrame is displayed (no truncation with "...").
pd.options.display.max_columns = None

In [15]:
# Root folder that is searched for the input CSV files.
path = '/Users/ramonperez/Dropbox/Burning Glass/Data/companies_76k/'
# Output folder for this analysis.
# NOTE(review): not referenced by any cell visible here, and the string starts
# with a literal '~' -- consumers that need an absolute path must expand it
# themselves (e.g. os.path.expanduser).
path_out = '~/Dropbox/Burning Glass/Analysis/approach_8'

In [16]:
# Columns to load from every CSV file; all other columns are skipped.
col_names = ['EmployerClean', 'JobDate', 'ConsolidatedInferredNAICS']

# Read every selected column as text. The builtin `str` replaces the `np.str`
# alias, which was deprecated in NumPy 1.20 and removed in 1.24 (it raised
# AttributeError there). Deriving the mapping from `col_names` keeps the two
# definitions in sync.
dtypes = dict.fromkeys(col_names, str)

In [17]:
# Collect every CSV that sits one level down, inside a sub-folder of `path`
# whose name starts with "fil". glob returns matches in arbitrary order.
files = glob(
    os.path.join(path, 'fil*', '*.csv')
)
# Quick sanity check: how many files matched, and a sample of their paths.
(len(files), files[:5])

(72,
 ['/Users/ramonperez/Dropbox/Burning Glass/Data/companies_76k/filtered_data_12/data_filtered_11.csv',
  '/Users/ramonperez/Dropbox/Burning Glass/Data/companies_76k/filtered_data_15/data_filtered_48.csv',
  '/Users/ramonperez/Dropbox/Burning Glass/Data/companies_76k/filtered_data_14/data_filtered_04.csv',
  '/Users/ramonperez/Dropbox/Burning Glass/Data/companies_76k/filtered_data_13/data_filtered_27.csv',
  '/Users/ramonperez/Dropbox/Burning Glass/Data/companies_76k/filtered_data_07/data_filtered_00.csv'])

In [None]:
%%time


def get_files(file):
    """Load a single CSV file into a DataFrame.

    Only the columns listed in the module-level ``col_names`` are read, using
    the dtypes given by the module-level ``dtypes`` mapping.

    Parameters
    ----------
    file : path of the CSV file to read (passed straight to ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame with the selected columns.
    """
    read_options = dict(usecols=col_names, dtype=dtypes, low_memory=False)
    return pd.read_csv(file, **read_options)


# Read the files in parallel, one task per file, using a pool of worker
# processes. Leaving the `with` block waits for all submitted reads to finish;
# `results` is a lazy iterator that yields the frames in the order of `files`.
with cf.ProcessPoolExecutor() as pool:
    results = pool.map(get_files, files)

# Stack all per-file frames into one and renumber the rows 0..n-1.
df = pd.concat(results, ignore_index=True)
df.head()

In [None]:
# Number of distinct employer names in the combined data.
df['EmployerClean'].nunique()

In [None]:
# For every (employer, NAICS) pairing, count the rows that have a JobDate.
# The result is a Series indexed by the two grouping columns.
grouped_df = (
    df
    .groupby(by=['EmployerClean', 'ConsolidatedInferredNAICS'])['JobDate']
    .count()
)
grouped_df.head()

In [None]:
# Size of the grouped result, i.e. how many groups it contains.
grouped_df.shape

In [None]:
# Turn the grouped counts back into a flat table with one row per
# (employer, NAICS) pairing; the count itself is not used from here on.
df = grouped_df.reset_index().drop(columns='JobDate')

# Helper column: character length of the NAICS value. Sorting descending on
# (employer, length) puts each employer's longest NAICS value first.
df['langth_of_words'] = df['ConsolidatedInferredNAICS'].map(len)
df = df.sort_values(by=['EmployerClean', 'langth_of_words'], ascending=False)

# Handle the literal 'Unknown' label separately from the other NAICS values,
# keeping only the first (longest) row per employer on each side.
unk_mask = df['ConsolidatedInferredNAICS'].eq('Unknown')
series_unknown = df.loc[unk_mask].drop_duplicates('EmployerClean')
df_no_unk = df.loc[~unk_mask].drop_duplicates('EmployerClean')

# Known values are listed first, so the final de-duplication keeps an
# 'Unknown' row only for employers that have no other NAICS value.
df_w_unk = (
    pd.concat([df_no_unk, series_unknown], ignore_index=True)
    .drop_duplicates('EmployerClean')
)

df_w_unk.shape

In [None]:
# Preview the first 20 rows of the one-row-per-employer table.
df_w_unk.head(20)

In [None]:
# Save the employer -> NAICS table without the helper length column and
# without the row index. The file name is relative, so it is written to the
# current working directory.
(
    df_w_unk
    .drop(columns='langth_of_words')
    .to_csv('companies76k_inf_naics.csv', index=False)
)