In [1]:
import pandas as pd
import numpy as np

In [2]:
company = pd.read_csv('Company Descriptions.csv')
company

Unnamed: 0,company_name,company_short_description,company_description
0,Codementor,Codementor is an online marketplace connecting...,Codementor provides live 1:1 help for software...
1,AgShift,AgShift is designing world's most advanced aut...,AgShift solution blends Deep Learning with Com...
2,Shipsi,Shipsi empowers any retailer with the ability ...,Shipsi empowers any retailer with the ability ...
3,OpenNews,"OpenNews helps a global network of developers,...","We're helping a global network of developers, ..."
4,Biobot Analytics,Biobot Analytics analyzes city sewage to estim...,Biobot Analytics analyzes sewage to estimate o...
...,...,...,...
19960,Powermat Technologies,Powermat Technologies develops wireless energy...,Powermat Technologies is a developer of wirele...
19961,Properly,The Ultimate Turnover Tool for Vacation Rental...,Properly is a visual checklist tool that lets ...
19962,Bid Ops,Bid Ops accelerates business partnerships betw...,
19963,Tavolo,Tavolo is an online retailer and destination f...,Tavolo offers an online store that enables its...


In [3]:
company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19965 entries, 0 to 19964
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   company_name               19965 non-null  object
 1   company_short_description  19965 non-null  object
 2   company_description        19237 non-null  object
dtypes: object(3)
memory usage: 468.1+ KB


In [4]:
# importing missing value in company_description with short decription
company.company_description = company.company_description.fillna(company.company_short_description)

In [5]:
company = company.drop(['company_short_description'],axis=1)

In [6]:
company.head()

Unnamed: 0,company_name,company_description
0,Codementor,Codementor provides live 1:1 help for software...
1,AgShift,AgShift solution blends Deep Learning with Com...
2,Shipsi,Shipsi empowers any retailer with the ability ...
3,OpenNews,"We're helping a global network of developers, ..."
4,Biobot Analytics,Biobot Analytics analyzes sewage to estimate o...


# Text preprocessing

In [7]:
company.company_description = company.company_description.apply(lambda x: x.lower())

In [8]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def remove_punctuation(text):
    text_nopunctuation="".join([c for c in text if c not in string.punctuation])
    return text_nopunctuation

In [10]:
company.company_description = company.company_description.apply(remove_punctuation)

In [11]:
def remove_digit(text):
    result = ''.join([i for i in text if not i.isdigit()])
    return result

In [12]:
company.company_description = company.company_description.apply(remove_digit)

In [13]:
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
#applying function to the column
company.company_description = company.company_description.apply(lambda x: tokenization(x))

In [14]:
#importing nlp library
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
#applying the function
company.company_description = company.company_description.apply(lambda x:remove_stopwords(x))

In [15]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
company.company_description = company.company_description.apply(lambda x:lemmatizer(x))

In [16]:
company.company_description = company.company_description.apply(lambda x: ' '.join(x))

In [17]:
company

Unnamed: 0,company_name,company_description
0,Codementor,codementor provides live help for software de...
1,AgShift,agshift solution blends deep learning with com...
2,Shipsi,shipsi empowers any retailer with the ability ...
3,OpenNews,were helping a global network of developers jo...
4,Biobot Analytics,biobot analytics analyzes sewage to estimate o...
...,...,...
19960,Powermat Technologies,powermat technologies is a developer of wirele...
19961,Properly,properly is a visual checklist tool that lets ...
19962,Bid Ops,bid ops accelerates business partnerships betw...
19963,Tavolo,tavolo offers an online store that enables its...


In [18]:
# Import library
from keybert import KeyBERT

kw_model = KeyBERT()

In [19]:
company.company_description[0]

'codementor provides live  help for software development  we’re making it easy for developers to connect with experts via screen sharing video and chat  there are two ways to get help ondemand live  expert help and longterm dedicated mentorship \n\nbuild projects faster with ondemand help for topics including ruby python php javascript csshtml ios swift and more codementor helps you overcome key challenges with timely advice and speeds up your development process'

In [20]:
keywords=kw_model.extract_keywords(company.company_description,top_n=20)

In [21]:
keywords

[[('codementor', 0.5403),
  ('developers', 0.4031),
  ('mentorship', 0.3955),
  ('software', 0.3319),
  ('swift', 0.3225),
  ('development', 0.3224),
  ('ios', 0.3212),
  ('projects', 0.3024),
  ('chat', 0.269),
  ('build', 0.2616),
  ('php', 0.2384),
  ('screen', 0.2363),
  ('ondemand', 0.2355),
  ('sharing', 0.2316),
  ('javascript', 0.2291),
  ('expert', 0.2281),
  ('helps', 0.2153),
  ('live', 0.2082),
  ('dedicated', 0.2035),
  ('making', 0.2025)],
 [('agshift', 0.3913),
  ('software', 0.3842),
  ('learning', 0.3584),
  ('vision', 0.3491),
  ('images', 0.3458),
  ('deep', 0.343),
  ('inspections', 0.3306),
  ('defects', 0.309),
  ('image', 0.3085),
  ('models', 0.2992),
  ('accuracy', 0.2901),
  ('quality', 0.2792),
  ('inspect', 0.2653),
  ('produce', 0.2602),
  ('curated', 0.2552),
  ('sample', 0.251),
  ('blends', 0.2483),
  ('supply', 0.2362),
  ('realworld', 0.2361),
  ('data', 0.229)],
 [('shipsi', 0.6162),
  ('logistics', 0.3932),
  ('shipping', 0.3858),
  ('retailer', 0.36

In [22]:
company['keywords'] = keywords

In [23]:
company

Unnamed: 0,company_name,company_description,keywords
0,Codementor,codementor provides live help for software de...,"[(codementor, 0.5403), (developers, 0.4031), (..."
1,AgShift,agshift solution blends deep learning with com...,"[(agshift, 0.3913), (software, 0.3842), (learn..."
2,Shipsi,shipsi empowers any retailer with the ability ...,"[(shipsi, 0.6162), (logistics, 0.3932), (shipp..."
3,OpenNews,were helping a global network of developers jo...,"[(developers, 0.5045), (journalism, 0.4972), (..."
4,Biobot Analytics,biobot analytics analyzes sewage to estimate o...,"[(biobot, 0.4356), (wastewater, 0.4233), (opio..."
...,...,...,...
19960,Powermat Technologies,powermat technologies is a developer of wirele...,"[(powermat, 0.4932), (powermats, 0.4902), (cha..."
19961,Properly,properly is a visual checklist tool that lets ...,"[(cleaning, 0.3997), (cleaners, 0.3594), (prop..."
19962,Bid Ops,bid ops accelerates business partnerships betw...,"[(partnerships, 0.5327), (bid, 0.481), (suppli..."
19963,Tavolo,tavolo offers an online store that enables its...,"[(tavolo, 0.585), (store, 0.3677), (products, ..."


In [24]:
def keyword_preprocess(list1):
    return [item for t in list1 for item in t if type(item)==str]

In [25]:
company['keywords'] = company['keywords'].apply(keyword_preprocess)

In [26]:
company

Unnamed: 0,company_name,company_description,keywords
0,Codementor,codementor provides live help for software de...,"[codementor, developers, mentorship, software,..."
1,AgShift,agshift solution blends deep learning with com...,"[agshift, software, learning, vision, images, ..."
2,Shipsi,shipsi empowers any retailer with the ability ...,"[shipsi, logistics, shipping, retailer, delive..."
3,OpenNews,were helping a global network of developers jo...,"[developers, journalism, journalists, hackers,..."
4,Biobot Analytics,biobot analytics analyzes sewage to estimate o...,"[biobot, wastewater, opioids, sewage, analytic..."
...,...,...,...
19960,Powermat Technologies,powermat technologies is a developer of wirele...,"[powermat, powermats, charging, airport, durac..."
19961,Properly,properly is a visual checklist tool that lets ...,"[cleaning, cleaners, properly, properlys, chec..."
19962,Bid Ops,bid ops accelerates business partnerships betw...,"[partnerships, bid, suppliers, buyers, busines..."
19963,Tavolo,tavolo offers an online store that enables its...,"[tavolo, store, products, san, buy, cooking, c..."


In [27]:
keys = company['keywords'].tolist()

In [28]:
len(keys)

19965

In [29]:
industry = pd.read_excel("Industry Segments - Top 10 Keywords.xlsx")
industry

Unnamed: 0,Industry segment,Tags
0,Aerospace and defense,"security, systems, video, surveillance, servic..."
1,Agriculture and forestry,"Service, cleantech, water, agriculture, traits..."
2,Biopharmaceuticals,"Developer, treatment, drug, diseases, technolo..."
3,Business support services,"Service, platform, online, management, data, m..."
4,Communications and networking,"Service, wireless, network, data, internet, ap..."
5,Construction and civil engineering,"Service, cleantech, water, energy, waste, trea..."
6,Consumer information services,"Online, users, web, service, platform, social,..."
7,Electronics and computer hardware,"Technology, storage, energy, systems, applicat..."
8,Financial institutions and services,"Service, financial, payment, online, platform,..."
9,Food and beverage,"Food, organic, tea, beverages, ingredients, na..."


In [30]:
#pip install openpyxl

In [31]:
industry['Tags'][0].split(',')

['security',
 ' systems',
 ' video',
 ' surveillance',
 ' service',
 ' aircraft',
 ' military',
 ' system',
 ' technology',
 ' flight']

In [32]:
def text_preprocess(text):
    text = text.lower()
    return [i.strip() for i in text.split(',')]

In [33]:
industry['Tags'] = industry['Tags'].apply(text_preprocess)
industry

Unnamed: 0,Industry segment,Tags
0,Aerospace and defense,"[security, systems, video, surveillance, servi..."
1,Agriculture and forestry,"[service, cleantech, water, agriculture, trait..."
2,Biopharmaceuticals,"[developer, treatment, drug, diseases, technol..."
3,Business support services,"[service, platform, online, management, data, ..."
4,Communications and networking,"[service, wireless, network, data, internet, a..."
5,Construction and civil engineering,"[service, cleantech, water, energy, waste, tre..."
6,Consumer information services,"[online, users, web, service, platform, social..."
7,Electronics and computer hardware,"[technology, storage, energy, systems, applica..."
8,Financial institutions and services,"[service, financial, payment, online, platform..."
9,Food and beverage,"[food, organic, tea, beverages, ingredients, n..."


In [35]:
# industries = []

# for i in range(company.shape[0]):
#     for j in range(industry.shape[0]):
#             if len(set(company['keywords'][i]).intersection(set(industry['Tags'][j]))) >= 5:
                
#                 industries.append(industry['Industry segment'])
            
    
        

      

In [36]:
for i,j in enumerate(industry['Tags']):
    print(j)

['security', 'systems', 'video', 'surveillance', 'service', 'aircraft', 'military', 'system', 'technology', 'flight']
['service', 'cleantech', 'water', 'agriculture', 'traits', 'food', 'plant', 'crop', 'irrigation', 'agricultural']
['developer', 'treatment', 'drug', 'diseases', 'technology', 'disease', 'cancer', 'drugs', 'development', 'therapeutics']
['service', 'platform', 'online', 'management', 'data', 'marketing', 'advertising', 'technology', 'web', 'software']
['service', 'wireless', 'network', 'data', 'internet', 'applications', 'technology', 'networks', 'mobile', 'systems']
['service', 'cleantech', 'water', 'energy', 'waste', 'treatment', 'technology', 'environment', 'systems', 'management']
['online', 'users', 'web', 'service', 'platform', 'social', 'site', 'content', 'mobile', 'search']
['technology', 'storage', 'energy', 'systems', 'applications', 'data', 'developer', 'power', 'efficiency', 'devices']
['service', 'financial', 'payment', 'online', 'platform', 'credit', 'mobil