# Sample EDA of patent and SBIR dataframes

In [1]:
import ast
import io

import pandas as pd
import requests

## EDA Example 1: Find the list of SBIR companies and see if any companies on that list also received a patent

### Get the list of companies from the SBIR data

In [2]:
# URL of the public SBIR award data (a CSV download, not an Excel file).
url="https://data.www.sbir.gov/awarddatapublic/award_data.csv"

# Fail fast on network problems: without a timeout a stalled connection blocks
# the kernel indefinitely, and without raise_for_status() the body of an HTTP
# error response would be passed to read_csv as if it were the data.
response = requests.get(url, timeout=60)
response.raise_for_status()
s = response.content

# Decode the payload as UTF-8 and parse it in one pass (low_memory=False).
sbirdf=pd.read_csv(io.StringIO(s.decode('utf-8')), low_memory=False)

#sbirdf.head(2)

In [3]:
# Number of award rows that were loaded.
print(sbirdf.shape[0])

201856


In [4]:
# Work on an independent copy of the company-name column so that later
# de-duplication does not touch sbirdf itself.
companyds = sbirdf['Company'].copy()

# info() prints its summary itself and returns None; wrapping it in display()
# only adds a stray "None" after the summary, so call it directly.
companyds.info()
display(companyds.head())

<class 'pandas.core.series.Series'>
RangeIndex: 201856 entries, 0 to 201855
Series name: Company
Non-Null Count   Dtype 
--------------   ----- 
201851 non-null  object
dtypes: object(1)
memory usage: 1.5+ MB


None

0                                 2PI INC
1                  9 CORNER SOLUTIONS LLC
2             ADAMAS NANOTECHNOLOGIES INC
3                           ADDIGURU, LLC
4    ADVANCED CERAMICS MANUFACTURING, LLC
Name: Company, dtype: object

In [5]:
# Number of company entries before de-duplication.
companyds.shape[0]

201856

In [6]:
# Keep the first occurrence of each company name (rebinding the name rather
# than mutating the Series in place).
companyds = companyds.drop_duplicates()

In [7]:
# Number of distinct company entries left after de-duplication.
companyds.shape[0]

31520

In [8]:
### Get the list of companies from the Patent data

In [9]:
# Location of the preprocessed patent/entity table (a CSV file).
filename_patents = './preprocessed_files/patents_entities.csv'

# Load the CSV into a pandas DataFrame.
df_patents = pd.read_csv(filepath_or_buffer=filename_patents)

# Preview the first two rows.
df_patents.iloc[:2]

Unnamed: 0.1,Unnamed: 0,country,doc-number,date,application-reference,title,assignee,inventors,abstract_entities,claim_entities
0,0,US,20230225235,20230720,"{'country': 'US', 'doc-number': 17754513, 'dat...","AGRICULTURAL TRENCH DEPTH SYSTEMS, METHODS, AN...","{'orgname': 'Precision Planting LLC', 'city': ...","[{'last-name': 'Sloneker', 'first-name': 'Dill...","['depth trench', 'depth trench open row unit',...","['surface row unit', 'body gauge wheel arm', '..."
1,1,US,20230225236,20230720,"{'country': 'US', 'doc-number': 18007883, 'dat...",Agricultural Attachment for Cultivating Row Crops,{'orgname': 'Amazonen-Werke H. Dreyer SE & Co....,"[{'last-name': 'RESCH', 'first-name': 'Rainer'...","['vehicle', 'row-detection device', 'design', ...","['vehicle base', 'steering command', 'cultivat..."


In [10]:
# Number of patent rows that were loaded.
df_patents.shape[0]

7234

In [16]:
# Independent copy of the raw assignee column (one stringified dict per patent).
patentds = df_patents.loc[:, 'assignee'].copy()

In [11]:
#print(patentds[166])
#patentds.dropna()
#patentds.drop_duplicates(inplace=True)
#len(patentds) # this prints 4292. Don't go this route: dropping rows leaves gaps in the index, which breaks any later loop that looks rows up by label over range(len(patentds)).

In [17]:
# Sanity check: show the organisation name of the first assignee record.
# Each assignee cell holds the string form of a Python dict, so parse it with
# ast.literal_eval instead of splitting on ':' and ',' -- the split approach
# truncates names that contain a comma or colon and strips apostrophes.
print(ast.literal_eval(patentds.iloc[0]).get('orgname'))

Precision Planting LLC


In [18]:
def _assignee_orgname(raw):
    """Return the organisation name stored in one raw assignee cell, or None.

    Each cell is expected to hold the string form of a Python dict such as
    "{'orgname': 'Precision Planting LLC', 'city': ...}". The cell is parsed
    with ast.literal_eval so that names containing commas, colons or
    apostrophes come back intact.

    Returns None when the cell is not a string (e.g. NaN), cannot be parsed,
    is not a dict, or has no non-empty string under 'orgname' (this covers the
    empty '{}' record).
    """
    if not isinstance(raw, str):
        return None
    try:
        record = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError):
        # Malformed cell: skip it rather than abort the whole extraction.
        return None
    if not isinstance(record, dict):
        return None
    orgname = record.get('orgname')
    if not isinstance(orgname, str):
        return None
    return orgname.strip() or None


# Iterate over the values themselves rather than looking rows up by
# patentds[x]; this keeps working when the Series index has gaps (for example
# after dropna() or drop_duplicates()).
pcds = [name for name in map(_assignee_orgname, patentds) if name is not None]

In [19]:
# Spot-check the first extracted name, then report how many were extracted.
first_company = pcds[0]
print(first_company)
len(pcds)

Precision Planting LLC


7233

In [23]:
# Wrap the extracted names in a Series called "Company" so that it shares its
# name with the SBIR company Series.
patentds2 = pd.Series(data=pcds, name="Company")

# First entry, as a quick check.
patentds2.loc[0]

'Precision Planting LLC'

In [27]:
# Keep one entry per distinct patent-holder name (rebinding rather than
# mutating in place), then report how many remain.
patentds2 = patentds2.drop_duplicates()
len(patentds2)

4060

In [28]:
def _company_key(names):
    """Return a comparison key per name: casefolded with whitespace collapsed."""
    return names.astype(str).str.casefold().str.split().str.join(' ')


# An exact-string merge only pairs names that are spelled identically in both
# sources, so the same company written in a different letter case (the SBIR
# preview shows mostly upper-case names) or with different spacing is missed.
# Compare on a normalised key instead; every exact match is still a match.
sbir_names = companyds.dropna()
patent_keys = set(_company_key(patentds2.dropna()))
is_patent_holder = _company_key(sbir_names).isin(patent_keys)

# Same shape as the merge result: a one-column 'Company' frame with a fresh
# 0..n-1 index. The SBIR spelling is kept so the names can still be used to
# filter sbirdf['Company'] directly.
winning_companies = (
    sbir_names[is_patent_holder]
    .to_frame(name='Company')
    .reset_index(drop=True)
)

In [29]:
# Number of companies found in both datasets.
winning_companies.shape[0]

6

In [31]:
# Show the companies found in both datasets (at most the first six rows).
# The heading previously misspelled "SBIR" as "SIBRS".
print("These companies have both SBIR awards and patents")
display(winning_companies.head(6))

These companies have both SIBRS and Patents


Unnamed: 0,Company
0,Beirobotics LLC
1,Ultra Safe Nuclear Corporation
2,Andluca Technologies Inc.
3,FURCIFER INC.
4,Kurt J. Lesker Company
5,Nanosys


In [39]:
# Array of the matched company names, used below to filter the SBIR awards.
wclist = winning_companies.loc[:, 'Company'].values

In [42]:
# SBIR award rows for the matched companies, reduced to company name and award
# year; row and column selection happen in a single .loc call.
is_winner = sbirdf['Company'].isin(wclist)
sbirdf.loc[is_winner, ['Company', 'Award Year']]

Unnamed: 0,Company,Award Year
1482,Beirobotics LLC,2022
6726,Ultra Safe Nuclear Corporation,2022
7726,Andluca Technologies Inc.,2021
7727,Andluca Technologies Inc.,2021
9317,FURCIFER INC.,2021
21889,Andluca Technologies Inc.,2019
27910,Ultra Safe Nuclear Corporation,2019
33678,Ultra Safe Nuclear Corporation,2018
39675,Ultra Safe Nuclear Corporation,2017
45077,Ultra Safe Nuclear Corporation,2016
