In [None]:
import pandas as pd
import requests
import json

In [3]:
# Fetch SIC codes from SEC EDGAR for each ticker

# 1) Build the ticker -> CIK lookup from the SEC's published ticker file.
# The SEC requires a User-Agent header on every request.
headers = {"User-Agent": "rami.younes@umontreal.ca"}
tickers = pd.read_json(
    "https://www.sec.gov/files/company_tickers.json",
    storage_options=headers,
).T  # transpose so each company becomes one row

# Upper-cased ticker -> CIK zero-padded to 10 digits, e.g. { 'AAPL': '0000320193', ... }
map_cik = {
    symbol.upper(): f"{int(cik_number):010d}"
    for symbol, cik_number in zip(tickers['ticker'], tickers['cik_str'])
}

def get_sic_for_ticker(tic):
    """Look up the SIC code and SIC description of a ticker on SEC EDGAR.

    The ticker is resolved to a CIK through the module-level ``map_cik``
    dictionary, then the company's EDGAR submissions JSON is fetched using
    the module-level ``headers``.

    Parameters
    ----------
    tic : str
        Ticker symbol. Surrounding whitespace is ignored and the match
        against ``map_cik`` is case-insensitive.

    Returns
    -------
    tuple
        ``(sic, description)``. ``sic`` is the code as a string. Either
        element is ``None`` when EDGAR reports no value. Both are ``None``
        when the ticker has no entry in ``map_cik`` or EDGAR answers 404
        for that CIK.

    Raises
    ------
    requests.HTTPError
        For any non-404 error status, so a failed request surfaces as an
        HTTP error instead of an obscure JSON decoding failure.
    """
    cik = map_cik.get(tic.strip().upper())
    if not cik:
        return None, None

    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    response = requests.get(url, headers=headers, timeout=20)
    if response.status_code == 404:
        # No submissions record for this CIK: treat it like an unknown ticker.
        return None, None
    # Any other error status is a real failure; do not try to parse its body.
    response.raise_for_status()

    payload = response.json()
    # Normalise missing or empty fields to None so both values can be
    # filtered uniformly with dropna()/notna() downstream.
    sic = str(payload.get("sic") or "")
    desc = payload.get("sicDescription") or None
    return sic if sic else None, desc

# The input CSV is expected to provide a 'tic' column of US tickers.
df = pd.read_csv("../data/invst_univ_companies.csv")

# One EDGAR lookup per row; non-string tickers (e.g. NaN) get an empty (None, None) result.
sic_pairs = [
    get_sic_for_ticker(ticker) if isinstance(ticker, str) else (None, None)
    for ticker in df['tic']
]
# Split the list of (sic, description) pairs into two parallel columns.
df['SIC'], df['SIC_desc'] = zip(*sic_pairs)

# 2) Extract SIC2: the first two characters of the SIC code string
df['SIC2'] = df['SIC'].str[:2]



In [4]:
# Sanity check: (rows, columns) of df at this point in the notebook.
df.shape

(3338, 5)

In [5]:
# 3) One-hot encode SIC2 dummies
# get_dummies skips NaN by default, so rows without a SIC2 get 0 in every dummy column.
sic2 = pd.get_dummies(df['SIC2'], prefix='SIC2', dtype='int8')
# Drop any dummy columns left over from a previous run of this cell first;
# otherwise re-running the cell would append a duplicate set of SIC2_* columns.
df = pd.concat([df.drop(columns=sic2.columns, errors='ignore'), sic2], axis=1)

In [6]:
# 4) Save
# Only the five listed columns are exported (the one-hot dummies are left out),
# and rows missing a value in any of them are excluded from the file.
export_columns = ['tic', 'conm', 'SIC', 'SIC_desc', 'SIC2']
df[export_columns].dropna().to_csv("../data/invst_univ_companies_with_sic.csv", index=False)

In [13]:
# === Build mapping from SIC2 dummy column name -> list of SIC_desc for that SIC2 group ===
# Keep only rows that have both a SIC2 code and a description, and trim
# surrounding whitespace so descriptions differing only by padding collapse.
tmp = df.loc[df['SIC2'].notna() & df['SIC_desc'].notna(), ['SIC2', 'SIC_desc']].copy()
tmp['SIC_desc'] = tmp['SIC_desc'].astype(str).str.strip()

# Aggregate the unique descriptions of each SIC2 group, sorted alphabetically
sic2_to_descs = (
    tmp.groupby('SIC2')['SIC_desc']
       .apply(lambda s: sorted(set(s)))
       .to_dict()
)

# Re-key by dummy column name, e.g. "SIC2_28": [...]. The prefix mirrors
# pd.get_dummies(..., prefix='SIC2'), so each key matches a column of df.
# (Previously the keys were copied unchanged, leaving bare codes like "28".)
sic2_dummy_desc_map = {f"SIC2_{code}": descs for code, descs in sic2_to_descs.items()}

# Optional: persist the map for auditability
# pd.Series(sic2_dummy_desc_map, dtype=object).to_json("sic2_dummy_desc_map.json", indent=2)

In [None]:
# Persist the SIC2 description map to disk as a single JSON document.
with open('../data/sic2_dummy_description_map.json', 'w') as map_file:
    map_file.write(json.dumps(sic2_dummy_desc_map))

## Explore the pulled data

In [7]:
# Frequency of each (SIC, SIC2, SIC_desc) combination; dropna=False keeps the
# all-NaN combination so rows with no SIC data are counted as well.
df[['SIC', 'SIC2','SIC_desc']].value_counts(dropna=False)

SIC   SIC2  SIC_desc                                
NaN   NaN   NaN                                         1334
2834  28    Pharmaceutical Preparations                  136
6022  60    State Commercial Banks                        88
7372  73    Services-Prepackaged Software                 83
6021  60    National Commercial Banks                     55
                                                        ... 
2330  23    Women's, Misses':  and Juniors Outerwear       1
3713  37    Truck & Bus Bodies                             1
3715  37    Truck Trailers                                 1
3716  37    Motor Homes                                    1
2520  25    Office Furniture                               1
Name: count, Length: 343, dtype: int64

In [8]:
# (rows, columns) of df; the column count includes the SIC2_* dummy columns once they have been appended.
df.shape

(3338, 73)

In [9]:
# Frequency of each (SIC, SIC2, SIC_desc) combination, NaN combinations included.
# NOTE(review): this repeats an identical value_counts cell earlier in the notebook — candidate for removal.
df[['SIC', 'SIC2','SIC_desc']].value_counts(dropna=False)

SIC   SIC2  SIC_desc                                
NaN   NaN   NaN                                         1334
2834  28    Pharmaceutical Preparations                  136
6022  60    State Commercial Banks                        88
7372  73    Services-Prepackaged Software                 83
6021  60    National Commercial Banks                     55
                                                        ... 
2330  23    Women's, Misses':  and Juniors Outerwear       1
3713  37    Truck & Bus Bodies                             1
3715  37    Truck Trailers                                 1
3716  37    Motor Homes                                    1
2520  25    Office Furniture                               1
Name: count, Length: 343, dtype: int64

In [15]:
# Count rows per (SIC_desc, SIC) pair within each SIC2 group.
# dropna() is called without a subset, so a row with a missing value in ANY
# column of df is excluded before grouping, not only rows missing SIC fields.
df.dropna().groupby('SIC2')[['SIC_desc', 'SIC']].value_counts()

SIC2  SIC_desc                                                SIC 
01    Agricultural Production-Crops                           0100    2
02    Agricultural Prod-Livestock & Animal Specialties        0200    1
07    Agricultural Services                                   0700    2
10    Gold and Silver Ores                                    1040    7
      Metal Mining                                            1000    5
                                                                     ..
87    Services-Management Services                            8741    2
      Services-Testing Laboratories                           8734    2
      Services-Engineering Services                           8711    1
      Services-Engineering, Accounting, Research, Management  8700    1
89    Services-Services, NEC                                  8900    1
Name: count, Length: 340, dtype: int64

In [11]:
# Descriptions of every row whose two-digit SIC prefix is '10'.
df.loc[df['SIC2'] == '10', 'SIC_desc']

324             Metal Mining
617             Metal Mining
779     Gold and Silver Ores
860     Gold and Silver Ores
867             Metal Mining
969             Metal Mining
1575    Gold and Silver Ores
1995    Gold and Silver Ores
2090    Gold and Silver Ores
2453    Gold and Silver Ores
2838            Metal Mining
2948    Gold and Silver Ores
Name: SIC_desc, dtype: object

In [None]:
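# NOTE(review): dead code kept for reference only. `sic_dummies` is not defined
# in the visible cells, and casting SIC/SIC2 to Int64 would discard the leading
# zeros of codes such as '0100' / '01'.
# sic_dummies['SIC']= sic_dummies['SIC'].astype('Int64')
# sic_dummies['SIC2']= sic_dummies['SIC2'].astype('Int64')
# sic_dummies.to_csv('../data/invst_univ_companies_with_sic.csv', index=False)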