In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# LOADING AND EXPLORING DATA

In [11]:
# load the data 

data = pd.read_csv('data.csv', encoding='latin1')
data.head()


Unnamed: 0,Name,Website,Organization Id,Top Level Category,Secondary Category,Employee Count,Description,Sourcscrub Description,Description.1
0,ClosingLock,closinglock.com,223865172,,,21,Closinglock is a company that provides wire tr...,,Real Estate Wire Fraud Prevention Solution
1,"Daloopa, Inc.",daloopa.com,284044302,IT Management,Data Extraction,226,Discover how the power of over 100 AI algorith...,"Daloopa, Inc. is the only AI solution for inve...",AI-driven enterprise data that can be trusted
2,"Daloopa, Inc.",daloopa.com,284044302,IT Management,Data Extraction,226,Discover how the power of over 100 AI algorith...,"Daloopa, Inc. is the only AI solution for inve...",AI-driven enterprise data that can be trusted
3,UpSmith,upsmith.com,283999461,,,11,Transforming the future of work.,"UpSmith, Inc. is a provider of a skilled labor...",Significantly enhancing lives by giving people...
4,Equal Ventures,equal.vc,160422940,,,5,Equal Ventures is a venture capital firm that ...,,


In [7]:
data.info() # check the data type and missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56376 entries, 0 to 56375
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Name                    56376 non-null  object
 1   Website                 56376 non-null  object
 2   Organization Id         56376 non-null  int64 
 3   Top Level Category      51602 non-null  object
 4   Secondary Category      51602 non-null  object
 5   Employee Count          56376 non-null  int64 
 6   Description             55138 non-null  object
 7   Sourcscrub Description  56073 non-null  object
 8   Description.1           39389 non-null  object
dtypes: int64(2), object(7)
memory usage: 3.9+ MB


In [8]:
# checks the number of rows and columns of the data
data.shape 

(56376, 9)

In [9]:
data.describe() # check the summary statistics of the data

Unnamed: 0,Organization Id,Employee Count
count,56376.0,56376.0
mean,287260200.0,571.954608
std,21090210.0,9012.273009
min,1514169.0,1.0
25%,288182200.0,4.0
50%,290489600.0,20.0
75%,291417800.0,75.0
max,294547800.0,829808.0


In [10]:
data.describe(include='object') 

Unnamed: 0,Name,Website,Top Level Category,Secondary Category,Description,Sourcscrub Description,Description.1
count,56376,56376,51602,51602,55138,56073,39389
unique,55781,55767,1363,5141,54996,55711,39273
top,Cortex,avalara.com,Vertical Industry,Talent Management,This domain may be for sale!,"Asana, Inc. is a software company that develop...",Dental Treatment Planning Presentation Softwar...
freq,6,4,13905,1819,9,4,3


In [13]:
# get the column names
data.columns  


Index(['Name', 'Website', 'Organization Id', 'Top Level Category',
       'Secondary Category', 'Employee Count', 'Description',
       'Sourcscrub Description', 'Description.1'],
      dtype='object')

In [14]:
# check for missing values
data.isnull().sum()

Name                          0
Website                       0
Organization Id               0
Top Level Category         4774
Secondary Category         4774
Employee Count                0
Description                1238
Sourcscrub Description      303
Description.1             16987
dtype: int64

In [16]:
data[data.isnull().any(axis=1)] # check the rows with missing values

Unnamed: 0,Name,Website,Organization Id,Top Level Category,Secondary Category,Employee Count,Description,Sourcscrub Description,Description.1
0,ClosingLock,closinglock.com,223865172,,,21,Closinglock is a company that provides wire tr...,,Real Estate Wire Fraud Prevention Solution
3,UpSmith,upsmith.com,283999461,,,11,Transforming the future of work.,"UpSmith, Inc. is a provider of a skilled labor...",Significantly enhancing lives by giving people...
4,Equal Ventures,equal.vc,160422940,,,5,Equal Ventures is a venture capital firm that ...,,
6,Sadie Blue Software,sadiebluesoftware.com,224638688,,,5,Sadie Blue Software is the maker of Agility Bl...,,"All your work, under control"
7,QuikData,quikdata.com,289080423,,,7,QuikData | Simply Powerful eDiscovery: Easy eD...,,
...,...,...,...,...,...,...,...,...,...
56352,CannGen Insurance Services,canngenins.com,282401082,,,42,"CannGen Insurance Services, LLC is a leading p...","CannGen Insurance Services, LLC offers indepen...",
56356,CMX,cmx1.com,282401304,,,80,CMX is a platform that helps businesses achiev...,"Compliancemetrix, Inc. (CMX) is a provider of ...",
56364,Strava,strava.com,282423306,,,488,Strava is an American internet service for tra...,"Strava, Inc. is an online network where runner...",Online network connecting the global community...
56367,Too Good To Go,toogoodtogo.com,282401416,,,1428,The app that lets you rescue delicious food fr...,Too Good To Go ApS (TGTG) is an environmental ...,App that offers a solution to sellers and indi...


In [34]:
# check for duplicates
duplicate_data = data.duplicated()
print(duplicate_data.sum())

158


# Data Cleaning

In [22]:
# rows with missing description in all 3 columns
nan_descriptions = data[data[['Description', 'Sourcscrub Description', 'Description.1']].isnull().all(axis=1)]
nan_descriptions

Unnamed: 0,Name,Website,Organization Id,Top Level Category,Secondary Category,Employee Count,Description,Sourcscrub Description,Description.1


In [27]:
# drop rows with no description
data.drop(nan_descriptions.index, inplace=True)

# fill NaN values with unknown
data['Top Level Category'] = data['Top Level Category'].fillna('Unknown')
data['Secondary Category'] = data['Secondary Category'].fillna('Unknown')
data['Description'] = data['Description'].fillna('No Description')
data['Sourcscrub Description'] = data['Sourcscrub Description'].fillna('No Description')
data['Description.1'] = data['Description.1'].fillna('No Description')
data.head()

Unnamed: 0,Name,Website,Organization Id,Top Level Category,Secondary Category,Employee Count,Description,Sourcscrub Description,Description.1
0,ClosingLock,closinglock.com,223865172,Unknown,Unknown,21,Closinglock is a company that provides wire tr...,No Description,Real Estate Wire Fraud Prevention Solution
1,"Daloopa, Inc.",daloopa.com,284044302,IT Management,Data Extraction,226,Discover how the power of over 100 AI algorith...,"Daloopa, Inc. is the only AI solution for inve...",AI-driven enterprise data that can be trusted
2,"Daloopa, Inc.",daloopa.com,284044302,IT Management,Data Extraction,226,Discover how the power of over 100 AI algorith...,"Daloopa, Inc. is the only AI solution for inve...",AI-driven enterprise data that can be trusted
3,UpSmith,upsmith.com,283999461,Unknown,Unknown,11,Transforming the future of work.,"UpSmith, Inc. is a provider of a skilled labor...",Significantly enhancing lives by giving people...
4,Equal Ventures,equal.vc,160422940,Unknown,Unknown,5,Equal Ventures is a venture capital firm that ...,No Description,No Description


In [35]:
# remove duplicates
data.drop_duplicates(inplace=True)
print(data.duplicated().sum()) # should be zero

0


In [44]:
# rows with same organization id
duplicates = data[data.duplicated(subset=['Name', 'Website'], keep=False)]
print(duplicates)


                         Name          Website  Organization Id  \
5825                    EBANX        ebanx.com        291589887   
44476              Hospitable   hospitable.com        289332430   
44478              Hospitable   hospitable.com        291587302   
45253        Trade Ideas, LLC  trade-ideas.com        283119238   
45388  Yoxel, LLC dba Aurinko       aurinko.io        287719430   
45440            Level Access  levelaccess.com        288618696   
45973        Trade Ideas, LLC  trade-ideas.com        288325851   
47091  Yoxel, LLC dba Aurinko       aurinko.io        287719477   
48116                   EBANX        ebanx.com        283480909   
51113            Level Access  levelaccess.com        286579384   
55008          Less Paper Co.  lesspaperco.com        283658716   
55172          Less Paper Co.  lesspaperco.com        282408432   

       Top Level Category                          Secondary Category  \
5825             Commerce                              

In [49]:
filtered_data = data[data['Top Level Category'] == 'Unknown'] | (data['Secondary Category'] == 'Unknown')
print(filtered_data)

# Feature Engineering

In [29]:
# changing the data type of the columns
data['Top Level Category'] = data['Top Level Category'].astype('category') # change to category data type
data['Secondary Category'] = data['Secondary Category'].astype('category') # change to category data type
print(data.dtypes)

Name                        object
Website                     object
Organization Id              int64
Top Level Category        category
Secondary Category        category
Employee Count               int64
Description                 object
Sourcscrub Description      object
Description.1               object
dtype: object


In [40]:
# adding a unified description column
data['Unified Description'] = data['Description'] + ' ' + data['Sourcscrub Description'] + ' ' + data['Description.1']
print(data['Unified Description'])

0        Closinglock is a company that provides wire tr...
1        Discover how the power of over 100 AI algorith...
3        Transforming the future of work. UpSmith, Inc....
4        Equal Ventures is a venture capital firm that ...
5        AutoLeadStar is automotive's first and leading...
                               ...                        
56371    North American Bancard is a payments technolog...
56372    Datavant is a company that connects the larges...
56373    Chess.com is the premier online chess communit...
56374    Dutchie is a leading technology partner for ca...
56375    Kajabi is an all-in-one knowledge commerce pla...
Name: Unified Description, Length: 56214, dtype: object


In [43]:
# remove no decription from the unified description column
data['Unified Description'] = data['Description'].replace('No Description', '') + ' ' + data['Sourcscrub Description'].replace('No Description', '') + ' ' + data['Description.1'].replace('No Description', '')
print(data['Unified Description'])

0        Closinglock is a company that provides wire tr...
1        Discover how the power of over 100 AI algorith...
3        Transforming the future of work. UpSmith, Inc....
4        Equal Ventures is a venture capital firm that ...
5        AutoLeadStar is automotive's first and leading...
                               ...                        
56371    North American Bancard is a payments technolog...
56372    Datavant is a company that connects the larges...
56373    Chess.com is the premier online chess communit...
56374    Dutchie is a leading technology partner for ca...
56375    Kajabi is an all-in-one knowledge commerce pla...
Name: Unified Description, Length: 56214, dtype: object
