# group data by Top categary and then run similarity analysis

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english')) 
lemmatizer = WordNetLemmatizer()


# function to tokenize and lemmatize text
def preprocess_text(text):
    # tokenize text
    tokens = word_tokenize(text.lower())
    # remove stopwords
    clean_tokens = []
    for word in tokens:
        if word.isalnum() and word not in stop_words:
            clean_tokens.append(word)
    # lemmatize text
    lemmatized_tokens = []
    for word in clean_tokens:
        lemmatized_tokens.append(lemmatizer.lemmatize(word))
        
    return ' '.join(lemmatized_tokens)

In [4]:
# read data
data = pd.read_csv('cleaned_data.csv')

In [5]:
# Preprocess the unified description (or choose one description column)
data['cleaned_description'] = data['Unified Description'].dropna().apply(preprocess_text)

# Check for missing categories
print(data['Top Level Category'].isnull().sum())
print(data['Secondary Category'].isnull().sum())


# Preview the dataset
data.head()

0
0


Unnamed: 0,Name,Website,Organization Id,Top Level Category,Secondary Category,Employee Count,Description,Sourcscrub Description,Description.1,Unified Description,cleaned_description
0,ClosingLock,closinglock.com,223865172,Unknown,Unknown,21,Closinglock is a company that provides wire tr...,No Description,Real Estate Wire Fraud Prevention Solution,Closinglock is a company that provides wire tr...,closinglock company provides wire transfer sec...
1,"Daloopa, Inc.",daloopa.com,284044302,IT Management,Data Extraction,226,Discover how the power of over 100 AI algorith...,"Daloopa, Inc. is the only AI solution for inve...",AI-driven enterprise data that can be trusted,Discover how the power of over 100 AI algorith...,discover power 100 ai algorithm automate inves...
2,UpSmith,upsmith.com,283999461,Unknown,Unknown,11,Transforming the future of work.,"UpSmith, Inc. is a provider of a skilled labor...",Significantly enhancing lives by giving people...,"Transforming the future of work. UpSmith, Inc....",transforming future work upsmith provider skil...
3,Equal Ventures,equal.vc,160422940,Unknown,Unknown,5,Equal Ventures is a venture capital firm that ...,No Description,No Description,Equal Ventures is a venture capital firm that ...,equal venture venture capital firm focus bridg...
4,Fullpath,fullpath.com,288156669,Vertical Industry,Automotive,164,AutoLeadStar is automotive's first and leading...,"AutoLeadStar, Inc. doing business as Fullpath ...",Developed a retail engagement platform for aut...,AutoLeadStar is automotive's first and leading...,autoleadstar automotive first leading customer...


In [6]:
data.columns

Index(['Name', 'Website', 'Organization Id', 'Top Level Category',
       'Secondary Category', 'Employee Count', 'Description',
       'Sourcscrub Description', 'Description.1', 'Unified Description',
       'cleaned_description'],
      dtype='object')

# Feature Engineering

In [7]:
# Normalize Employee Count
scaler = MinMaxScaler()
data['Employee Count Scaled'] = scaler.fit_transform(data[['Employee Count']].fillna(0))

# Preprocess Descriptions using cleaned_description
tfidf = TfidfVectorizer(stop_words='english')
descriptions_tfidf = tfidf.fit_transform(data['cleaned_description'].fillna(''))

In [36]:
# Group by Top-Level Category
grouped_data_top = data.groupby(['Top Level Category'])
print(len(grouped_data_top))

1364


In [24]:
# Split and lower case the 'Top Level Category' column
split_category = data['Top Level Category'].str.lower().str.split(';')

# Explode the lists into separate rows
exploded_category = split_category.explode()

# Count the occurrences of each category
category_counts = exploded_category.value_counts()

# Print the counts
print(category_counts)


Top Level Category
vertical industry     14385
unknown                4713
marketing              4531
erp                    4189
hr                     4064
                      ...  
 data privacy             1
storage                   1
 staffing services        1
 other services           1
security hardware         1
Name: count, Length: 76, dtype: int64


In [40]:
data = data.drop(columns=['Flattened Categories'])

print(data.columns)

Index(['Name', 'Website', 'Organization Id', 'Top Level Category',
       'Secondary Category', 'Employee Count', 'Description',
       'Sourcscrub Description', 'Description.1', 'Unified Description',
       'cleaned_description', 'Employee Count Scaled'],
      dtype='object')


In [13]:
def compute_similarity(group):
    results = []
    group_list = group.reset_index().to_dict('records')  # Convert DataFrame to list of dictionaries
    n = len(group_list)
    
    for i in range(n):
        for j in range(i + 1, n):
            a, b = group_list[i], group_list[j]
            
            # Categories Similarity (Jaccard Similarity using sets)
            categories_a = set(a['Top Level Category'].split('; ') + a['Secondary Category'].split('; '))
            categories_b = set(b['Top Level Category'].split('; ') + b['Secondary Category'].split('; '))
            
            # Check for 'Unknown' categories
            if 'Unknown' in categories_a or 'Unknown' in categories_b:
                category_similarity = 0  # Default to 0 if any category is 'Unknown'
                weight = 0  # Weight is 0 if any category is 'Unknown'
            else:
                category_similarity = len(categories_a.intersection(categories_b)) / len(categories_a.union(categories_b))
                weight = 0.4  # Standard weight when categories are valid
            
            # Description Similarity (Cosine Similarity)
            desc_a = descriptions_tfidf[a['index']].toarray()  # Access vector by DataFrame index
            desc_b = descriptions_tfidf[b['index']].toarray()  # Access vector by DataFrame index
            desc_similarity = cosine_similarity(desc_a, desc_b).item()
            
            # Employee Count Similarity
            employee_similarity = 1 - abs(a['Employee Count Scaled'] - b['Employee Count Scaled'])
            
            # Composite Similarity
            composite_similarity = (category_similarity * weight +
                                     desc_similarity * 0.4 +
                                     employee_similarity * 0.2)
            
            results.append({
                'Item A': a['Name'],
                'Item B': b['Name'],
                'Category Similarity': category_similarity,
                'Description Similarity': desc_similarity,
                'Employee Similarity': employee_similarity,
                'Composite Similarity': composite_similarity
            })
    
    return pd.DataFrame(results)


In [45]:
if unknown_group is not None:
    # Create a small subset (e.g., first 10 rows)
    test_subset = unknown_group.head(10)  # Adjust size as needed
    print(f"Testing on subset of size: {len(test_subset)}")
    
    # Compute similarity on the subset
    result = compute_similarity(test_subset)
    
    # Print the results in a readable format
    for index, row in result.iterrows():
        print(f"Item A: {row['Item A']}, Item B: {row['Item B']}, Composite Similarity: {row['Composite Similarity']}")
else:
    print("No group found with category 'Unknown'")


Testing on subset of size: 10
Item A: ClosingLock, Item B: UpSmith, Composite Similarity: 0.2029461696464465
Item A: ClosingLock, Item B: Equal Ventures, Composite Similarity: 0.20484914325987644
Item A: ClosingLock, Item B: Sadie Blue Software, Composite Similarity: 0.1999961436816031
Item A: ClosingLock, Item B: QuikData, Composite Similarity: 0.1999966257214027
Item A: ClosingLock, Item B: StructureFlow, Composite Similarity: 0.20220461877918092
Item A: ClosingLock, Item B: DigitalOwl INST, Composite Similarity: 0.20341067432341695
Item A: ClosingLock, Item B: Ganaz, Composite Similarity: 0.2026550379621859
Item A: ClosingLock, Item B: Proxima, Composite Similarity: 0.2023053435609491
Item A: ClosingLock, Item B: Syzl, Composite Similarity: 0.20212085172407493
Item A: UpSmith, Item B: Equal Ventures, Composite Similarity: 0.19999855388060117
Item A: UpSmith, Item B: Sadie Blue Software, Composite Similarity: 0.20291747316567238
Item A: UpSmith, Item B: QuikData, Composite Similarity