### Import SEC Data File Created in Project2_SEC_Filings Notebook

In [1]:
import csv
import io
import os
import urllib.request

import pandas as pd

In [2]:
# Load the export written by the Project2_SEC_Filings notebook and reduce it to
# the identifying / location columns this notebook works with.
sec_path = 'sec_df_export.csv'
location_columns = ['cik', 'name', 'countryba', 'stprba', 'cityba', 'zipba', 'form', 'latitude', 'longitude']

sec_df = (
    pd.read_csv(sec_path)[location_columns]
    # Rows with no 'stprba' value are discarded.
    .dropna(axis=0, how="any", subset=['stprba'])
    # Exact repeats across the selected columns collapse to a single row.
    .drop_duplicates()
    .sort_values(by="cik", ascending=True)
)
sec_df.head()

Unnamed: 0,cik,name,countryba,stprba,cityba,zipba,form,latitude,longitude
1734,1750,AAR CORP,US,IL,WOOD DALE,60191,10-K,41.98568,-87.97991
1735,1800,ABBOTT LABORATORIES,US,IL,ABBOTT PARK,60064-3500,10-K,Missing,Missing
2335,1961,WORLDS INC,US,MA,BROOKLINE,2445,10-K,42.336193,-71.1295
1163,2098,ACME UNITED CORP,US,CT,FAIRFIELD,6824,10-K,41.147858,-73.25259
4226,2178,"ADAMS RESOURCES & ENERGY, INC.",US,TX,HOUSTON,77027,10-K,29.75137,-95.45202


In [3]:
# (rows, columns) of the filtered, de-duplicated sec_df before tickers are merged in.
sec_df.shape

(5044, 9)

In [4]:
# Location of the SEC's JSON file of company tickers; it is read into stock_tickers below.
sec_json = 'https://www.sec.gov/files/company_tickers.json'

In [5]:
# SEC's fair-access policy asks automated clients to declare a User-Agent, and
# requests carrying the default Python-urllib agent (which is what pandas sends
# when handed a URL) are typically answered with HTTP 403. The file is therefore
# fetched explicitly with a declared agent and the text is passed to pandas.
# TODO: set the SEC_USER_AGENT environment variable to "<name> <contact email>";
# the fallback below is only a placeholder.
sec_user_agent = os.environ.get('SEC_USER_AGENT', 'Project2-SEC-Notebook admin@example.com')
ticker_request = urllib.request.Request(sec_json, headers={'User-Agent': sec_user_agent})
with urllib.request.urlopen(ticker_request, timeout=30) as response:
    ticker_json_text = response.read().decode('utf-8')

# pandas parses the payload with one company per column; transposing turns each
# company into a row carrying 'cik_str', 'ticker' and 'title'.
stock_tickers = pd.read_json(io.StringIO(ticker_json_text)).transpose()

In [6]:
# Key the ticker table by CIK so it can be joined to sec_df through its index.
stock_tickers = stock_tickers.set_index('cik_str')
stock_tickers.head()

Unnamed: 0_level_0,ticker,title
cik_str,Unnamed: 1_level_1,Unnamed: 2_level_1
320193,AAPL,Apple Inc.
789019,MSFT,MICROSOFT CORP
1018724,AMZN,AMAZON COM INC
1652044,GOOG,Alphabet Inc.
1326801,FB,Facebook Inc


In [7]:
# Remove repeated rows. drop_duplicates compares column values only (here
# 'ticker' and 'title'); the CIK index does not take part in the comparison.
stock_tickers = stock_tickers.drop_duplicates()
stock_tickers.shape

(10862, 2)

In [8]:
# Attach ticker and title to every filing row by CIK. The left join keeps rows
# whose CIK is absent from the ticker table (their ticker/title become NaN).
# 'name' and 'form' are dropped afterwards; 'title' from the ticker table is the
# company-name column that remains.
sec_df = (
    sec_df
    .merge(stock_tickers, how="left", left_on="cik", right_index=True)
    .sort_values(by='cik', ascending=True)
    .drop(columns=['name', 'form'])
)
sec_df.head()

Unnamed: 0,cik,countryba,stprba,cityba,zipba,latitude,longitude,ticker,title
1734,1750,US,IL,WOOD DALE,60191,41.98568,-87.97991,AIR,AAR CORP
1735,1800,US,IL,ABBOTT PARK,60064-3500,Missing,Missing,ABT,ABBOTT LABORATORIES
2335,1961,US,MA,BROOKLINE,2445,42.336193,-71.1295,WDDD,WORLDS INC
1163,2098,US,CT,FAIRFIELD,6824,41.147858,-73.25259,ACU,ACME UNITED CORP
4226,2178,US,TX,HOUSTON,77027,29.75137,-95.45202,AE,"ADAMS RESOURCES & ENERGY, INC."


In [9]:
# Put the identifying columns first, followed by the address and coordinates.
column_order = ['cik', 'ticker', 'title', 'countryba', 'stprba', 'cityba', 'zipba', 'latitude', 'longitude']
sec_df = sec_df[column_order]

sec_df.head()

Unnamed: 0,cik,ticker,title,countryba,stprba,cityba,zipba,latitude,longitude
1734,1750,AIR,AAR CORP,US,IL,WOOD DALE,60191,41.98568,-87.97991
1735,1800,ABT,ABBOTT LABORATORIES,US,IL,ABBOTT PARK,60064-3500,Missing,Missing
2335,1961,WDDD,WORLDS INC,US,MA,BROOKLINE,2445,42.336193,-71.1295
1163,2098,ACU,ACME UNITED CORP,US,CT,FAIRFIELD,6824,41.147858,-73.25259
4226,2178,AE,"ADAMS RESOURCES & ENERGY, INC.",US,TX,HOUSTON,77027,29.75137,-95.45202


In [10]:
# Swap the SEC field names for shorter, readable ones. 'cik' and 'ticker' keep
# their names, so they need no entry in the mapping.
friendly_names = {
    'countryba': 'country',
    'stprba': 'state',
    'cityba': 'city',
    'zipba': 'zipcode',
    'latitude': 'lat',
    'longitude': 'lon',
    'title': 'name',
}
sec_df = sec_df.rename(columns=friendly_names)

sec_df.head()

Unnamed: 0,cik,ticker,name,country,state,city,zipcode,lat,lon
1734,1750,AIR,AAR CORP,US,IL,WOOD DALE,60191,41.98568,-87.97991
1735,1800,ABT,ABBOTT LABORATORIES,US,IL,ABBOTT PARK,60064-3500,Missing,Missing
2335,1961,WDDD,WORLDS INC,US,MA,BROOKLINE,2445,42.336193,-71.1295
1163,2098,ACU,ACME UNITED CORP,US,CT,FAIRFIELD,6824,41.147858,-73.25259
4226,2178,AE,"ADAMS RESOURCES & ENERGY, INC.",US,TX,HOUSTON,77027,29.75137,-95.45202


In [11]:
# Size check after the merge: the left join emits one row per matching ticker, so a
# CIK with several tickers appears several times and the row count can grow.
sec_df.shape

(6055, 9)

In [12]:
# Rows whose latitude still holds the "Missing" placeholder string are set
# aside, keyed by ticker, so their coordinates can be filled in separately.
mask1 = sec_df['lat'].eq("Missing")
missing_df = sec_df.loc[mask1].set_index('ticker')
missing_df.head()

Unnamed: 0_level_0,cik,name,country,state,city,zipcode,lat,lon
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ABT,1800,ABBOTT LABORATORIES,US,IL,ABBOTT PARK,60064-3500,Missing,Missing
APD,2969,AIR PRODUCTS & CHEMICALS INC /DE/,US,PA,ALLENTOWN,18195-1501,Missing,Missing
ALX,3499,ALEXANDERS INC,US,NJ,PARAMUS,7652,Missing,Missing
ALCO,3545,"ALICO, INC.",US,FL,"FT. MYERS,",33913,Missing,Missing
AEPPL,4904,AMERICAN ELECTRIC POWER CO INC,US,OH,COLUMBUS,43215,Missing,Missing


In [13]:
# Number of rows whose latitude is still the "Missing" placeholder.
missing_df.shape

(1406, 8)

In [14]:
# One-off export of the rows lacking coordinates, left disabled.
# NOTE(review): index=False would omit the 'ticker' index set on missing_df, yet
# missing_df2.csv (read in the next cell) has a 'ticker' column - confirm how that
# file was produced before re-enabling this line.
# missing_df.to_csv('missing_df.csv', index=False)

In [15]:
# Read the separately prepared coordinates for the rows that were missing them.
# NOTE(review): how missing_df2.csv was produced is not shown in this notebook.
missing_coordinates_path = 'missing_df2.csv'

# Only ticker and the two coordinates are kept. The coordinates are renamed to
# 'latitude'/'longitude' so they do not clash with the 'lat'/'lon' columns of
# missing_df when the two frames are placed side by side.
missing_coordinates_df = (
    pd.read_csv(missing_coordinates_path)[['ticker', 'lat', 'lon']]
    .set_index('ticker')
    .rename(columns={'lat': 'latitude', 'lon': 'longitude'})
)
missing_coordinates_df

Unnamed: 0_level_0,latitude,longitude
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
ABT,Missing,Missing
APD,40.6028,-75.4698
ALX,40.9473,-74.0710
ALCO,26.6167,-81.8333
AEPPL,39.9833,-82.9833
...,...,...
GNRSW,40.8202,-73.4680
GNRS,40.8202,-73.4680
GNRSU,40.8202,-73.4680
OVV,Missing,Missing


In [16]:
# Should report the same number of rows as missing_df, since the two frames are joined side by side next.
missing_coordinates_df.shape

(1406, 2)

In [17]:
# Place the looked-up coordinates next to the company details (rows are matched
# through the shared 'ticker' index), discard the placeholder 'lat'/'lon'
# columns, and give the new coordinates the standard 'lat'/'lon' names.
# NOTE(review): tickers may repeat or be NaN after the earlier left merge; with
# such an index the side-by-side concat presumably relies on both frames listing
# the tickers in the same order - confirm missing_df2.csv preserves row order.
updated_coord_df = (
    pd.concat([missing_df, missing_coordinates_df], axis=1)
    .drop(columns=['lat', 'lon'])
    .rename(columns={'latitude': 'lat', 'longitude': 'lon'})
)
updated_coord_df

Unnamed: 0_level_0,cik,name,country,state,city,zipcode,lat,lon
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ABT,1800,ABBOTT LABORATORIES,US,IL,ABBOTT PARK,60064-3500,Missing,Missing
APD,2969,AIR PRODUCTS & CHEMICALS INC /DE/,US,PA,ALLENTOWN,18195-1501,40.6028,-75.4698
ALX,3499,ALEXANDERS INC,US,NJ,PARAMUS,7652,40.9473,-74.0710
ALCO,3545,"ALICO, INC.",US,FL,"FT. MYERS,",33913,26.6167,-81.8333
AEPPL,4904,AMERICAN ELECTRIC POWER CO INC,US,OH,COLUMBUS,43215,39.9833,-82.9833
...,...,...,...,...,...,...,...,...
GNRSW,1790665,Greenrose Acquisition Corp.,US,NY,WOODBURY,11797,40.8202,-73.4680
GNRS,1790665,Greenrose Acquisition Corp.,US,NY,WOODBURY,11797,40.8202,-73.4680
GNRSU,1790665,Greenrose Acquisition Corp.,US,NY,WOODBURY,11797,40.8202,-73.4680
OVV,1792580,Ovintiv Inc.,US,CO,DENVER,80202,Missing,Missing


In [18]:
# Row count should match missing_df; only the coordinate columns were swapped.
updated_coord_df.shape

(1406, 8)

In [19]:
# Keep only the rows that already have coordinates and key them by ticker, the
# same layout as updated_coord_df, so the two pieces can be stacked.
map1 = sec_df['lat'].ne("Missing")
sec_df = sec_df.loc[map1].set_index('ticker')
sec_df

Unnamed: 0_level_0,cik,name,country,state,city,zipcode,lat,lon
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AIR,1750,AAR CORP,US,IL,WOOD DALE,60191,41.98568,-87.97991
WDDD,1961,WORLDS INC,US,MA,BROOKLINE,2445,42.336193,-71.1295
ACU,2098,ACME UNITED CORP,US,CT,FAIRFIELD,6824,41.147858,-73.25259
AE,2178,"ADAMS RESOURCES & ENERGY, INC.",US,TX,HOUSTON,77027,29.75137,-95.45202
BKTI,2186,BK Technologies Corp,US,FL,WEST MELBOURNE,32904,28.096567,-80.6853
...,...,...,...,...,...,...,...,...
VTRS,1792044,Viatris Inc,US,NY,NEW YORK,10017,40.75042,-73.97361
SLQT,1794783,"SelectQuote, Inc.",US,KS,OVERLAND PARK,66211,38.917656,-94.66206
LSACU,1796129,LifeSci Acquisition Corp.,US,NY,NEW YORK,10019,40.76501,-73.98288
LSAC,1796129,LifeSci Acquisition Corp.,US,NY,NEW YORK,10019,40.76501,-73.98288


In [20]:
# Stack the rows that always had coordinates on top of the rows whose
# coordinates were filled in; join="inner" keeps only columns present in both.
# reset_index turns the 'ticker' index back into an ordinary column.
complete_sec_df = pd.concat([sec_df, updated_coord_df], join="inner").reset_index()
complete_sec_df.head()


Unnamed: 0,ticker,cik,name,country,state,city,zipcode,lat,lon
0,AIR,1750,AAR CORP,US,IL,WOOD DALE,60191,41.98568,-87.97991
1,WDDD,1961,WORLDS INC,US,MA,BROOKLINE,2445,42.336193,-71.1295
2,ACU,2098,ACME UNITED CORP,US,CT,FAIRFIELD,6824,41.147858,-73.25259
3,AE,2178,"ADAMS RESOURCES & ENERGY, INC.",US,TX,HOUSTON,77027,29.75137,-95.45202
4,BKTI,2186,BK Technologies Corp,US,FL,WEST MELBOURNE,32904,28.096567,-80.6853


In [21]:
# Expect the same row count sec_df had before it was split into located / missing parts.
complete_sec_df.shape

(6055, 9)

In [22]:
# Drop rows whose coordinates are still the "Missing" placeholder. Both
# coordinate columns are screened, not just 'lat', because both are converted
# with pd.to_numeric afterwards and a leftover "Missing" string in either one
# would make that conversion raise.
mask1 = complete_sec_df['lat'].ne("Missing") & complete_sec_df['lon'].ne("Missing")

# dropna then removes rows with a null in any column, e.g. rows whose CIK found
# no ticker in the earlier left merge. It is chained on the filtered result
# rather than applied in place to a slice of the original frame.
complete_sec_df = complete_sec_df.loc[mask1].dropna(axis=0, how="any")
complete_sec_df

Unnamed: 0,ticker,cik,name,country,state,city,zipcode,lat,lon
0,AIR,1750,AAR CORP,US,IL,WOOD DALE,60191,41.98568,-87.97991
1,WDDD,1961,WORLDS INC,US,MA,BROOKLINE,2445,42.336193,-71.1295
2,ACU,2098,ACME UNITED CORP,US,CT,FAIRFIELD,6824,41.147858,-73.25259
3,AE,2178,"ADAMS RESOURCES & ENERGY, INC.",US,TX,HOUSTON,77027,29.75137,-95.45202
4,BKTI,2186,BK Technologies Corp,US,FL,WEST MELBOURNE,32904,28.096567,-80.6853
...,...,...,...,...,...,...,...,...,...
6049,WBQNL,1785494,Woodbridge Liquidation Trust,US,FL,FORT LAUDERDALE,33301,26.1242,-80.1436
6050,GNRSW,1790665,Greenrose Acquisition Corp.,US,NY,WOODBURY,11797,40.8202,-73.4680
6051,GNRS,1790665,Greenrose Acquisition Corp.,US,NY,WOODBURY,11797,40.8202,-73.4680
6052,GNRSU,1790665,Greenrose Acquisition Corp.,US,NY,WOODBURY,11797,40.8202,-73.4680


In [23]:
# Convert the coordinates from strings to numbers and store the CIK as text.
# assign() builds a new frame; existing columns keep their positions.
complete_sec_df = complete_sec_df.assign(
    lat=pd.to_numeric(complete_sec_df['lat']),
    lon=pd.to_numeric(complete_sec_df['lon']),
    cik=complete_sec_df['cik'].astype('str'),
)

In [24]:
# Confirm the dtypes (lat/lon should now be float64) and that no nulls remain.
complete_sec_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5602 entries, 0 to 6054
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ticker   5602 non-null   object 
 1   cik      5602 non-null   object 
 2   name     5602 non-null   object 
 3   country  5602 non-null   object 
 4   state    5602 non-null   object 
 5   city     5602 non-null   object 
 6   zipcode  5602 non-null   object 
 7   lat      5602 non-null   float64
 8   lon      5602 non-null   float64
dtypes: float64(2), object(7)
memory usage: 437.7+ KB


In [25]:
# Write the cleaned table (tickers plus numeric coordinates) for downstream use;
# the positional row index is not written.
output_path = 'clean_sec_companies_with_tickers_coord.csv'
complete_sec_df.to_csv(output_path, index=False)