In [1]:
import os
import pdb
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from tqdm import tqdm

In [2]:
# Directory holding the processed CSV files. Set the WALIDCASE_DATA_DIR
# environment variable to run the notebook on another machine; the fallback is
# the original local path, so existing runs read the same files as before.
DATA_DIR = Path(os.environ.get(
    'WALIDCASE_DATA_DIR',
    r'C:\Users\imran\DataspellProjects\WalidCase\data\processed',
))

startups = pd.read_csv(DATA_DIR / 'startups_clean_noents.csv')
# The industries file is tab-separated, unlike the startups file.
industry_data = pd.read_csv(DATA_DIR / 'industries_clean.csv', sep='\t')

In [3]:
# Remove every row that has at least one missing value, in place, from both tables.
startups.dropna(axis=0, how='any', inplace=True)
industry_data.dropna(axis=0, how='any', inplace=True)

In [4]:
class TopicModelling:
    """
    Derives a string of top-ranked words for every row of a text column.

    The column is vectorised with TF-IDF (English stop words removed). A separate
    LatentDirichletAllocation model is then fitted on each individual TF-IDF row
    and the highest-weighted vocabulary terms of that model's first topic are kept.
    """

    def __init__(self, dataframe, column_name):
        """
        :param dataframe: DataFrame holding the texts; it is modified in place by append_top_words
        :param column_name: string, name of the text column to analyse
        """
        self.dataframe = dataframe
        self.column_name = column_name

    def calculate_tfidf(self):
        """
        Fits a TF-IDF vectorizer on the configured column.

        The fitted vectorizer and the resulting matrix are stored on the instance
        as `vectorizer` and `tfidf_matrix`.

        :return: the TF-IDF matrix, one row per DataFrame row
        """
        vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = vectorizer.fit_transform(self.dataframe[self.column_name])
        self.vectorizer = vectorizer
        return self.tfidf_matrix

    def append_top_words(self, n_topics=1, n_words=10, random_state=42):
        """
        Adds a 'top_words' column holding the top words of every row.

        The TF-IDF matrix is computed on demand if calculate_tfidf has not been
        called yet. Only the first topic of each fitted model is read, so with
        n_topics above 1 the remaining topics are ignored.

        :param n_topics: int, number of LDA components fitted per row, default is 1
        :param n_words: int, number of words kept per row, default is 10
        :param random_state: seed passed to every LDA model, default is 42
        :return: the DataFrame given at construction, with the 'top_words' column set
        """
        if not hasattr(self, 'tfidf_matrix'):
            self.calculate_tfidf()

        # The vocabulary is the same for every row, so look it up once.
        feature_names = self.vectorizer.get_feature_names_out()
        n_rows = self.tfidf_matrix.shape[0]

        top_words = []
        for row in tqdm(iter(self.tfidf_matrix), total=n_rows):
            self.lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
            # Only components_ is needed afterwards, so fitting is enough.
            self.lda.fit(row)
            top_words.append(self.__get_top_words(n_words=n_words, feature_names=feature_names))

        self.dataframe['top_words'] = top_words
        return self.dataframe

    def __get_top_words(self, n_words=10, feature_names=None):
        """
        Returns the highest-weighted words of the first topic of `self.lda`.

        :param n_words: int, number of words to return, default is 10
        :param feature_names: optional vocabulary array; read from the vectorizer when omitted
        :return: string with the words separated by single spaces, strongest first
        """
        if feature_names is None:
            feature_names = self.vectorizer.get_feature_names_out()
        # Descending order of topic weight, cut to the requested length.
        ranked = self.lda.components_[0].argsort()[::-1][:n_words]
        return " ".join(feature_names[i] for i in ranked)


In [5]:
# Top words are extracted from the startup descriptions in 'cb_description'.
topic_modelling = TopicModelling(dataframe=startups, column_name='cb_description')

In [6]:
# Build the TF-IDF matrix, then derive the ten top words of every row from it.
topic_modelling.calculate_tfidf()
startups = topic_modelling.append_top_words(
    n_topics=1,
    n_words=10,
    random_state=42,
)

3999it [01:31, 43.67it/s]


In [7]:
# Preview the first rows, including the newly added 'top_words' column.
startups.head()

Unnamed: 0.1,Unnamed: 0,id,name,cb_description,top_words
0,0,1820,0xKYC,modular knowledge system identity credential m...,knowledge ofac sanction zkps credential reimag...
1,1,1536,100ms,live video infrastructure platform provide sub...,infrastructure video seamlessly virtual world ...
2,2,3640,10X-Genomics,create revolutionary dna sequence technology h...,sequence shre subtle variation overlook tiny s...
3,3,9594,111Skin,commit positive luxury skincare push boundary ...,skincare philanthropic boundary female ethical...
4,4,4697,1715Labs,company establish commercialise zooniverse tec...,zooniverse commercialise establish technology ...


In [15]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pdb import set_trace


class Embedding:
    """
    A class to generate embeddings for startups and industries using specified language models and pooling methods.

    Typical call order: generate_embeddings(startup=True), generate_embeddings(startup=False),
    assign_industry(), update_dataframe().
    """

    def __init__(self, startups, column_name, industries, llm='bert', pool='max', sentence_transformer=False):

        """
        Initializes the Embedding class with specified language models and pooling methods.

        :param startups: DataFrame containing startup data; the text is read from `column_name`
        :param column_name: string, name of the startup column whose text is embedded
        :param industries: DataFrame containing industry data with 'industry' and 'keywords' columns
        :param llm: string, key of the language model to use for generating embeddings, default is 'bert';
                    ignored when sentence_transformer is True
        :param pool: string, the pooling method ('max', 'avg' or 'concat'), default is 'max';
                     not used when sentence_transformer is True
        :param sentence_transformer: bool, whether to use a sentence transformer model, default is False
        :raises KeyError: if `llm` is not a known key and sentence_transformer is False
        """

        self.startups = startups
        self.industries = industries
        self.column_name = column_name
        self.sentence_transformer = sentence_transformer
        self.pool = pool
        self.llm = {
            'bert': 'bert-base-uncased',
            'gpt2': 'gpt2',
            'gpt': 'openai-gpt',
            'roberta': 'roberta-base',
            'distilbert': 'distilbert-base-uncased',
            # XLNet is published as cased checkpoints only.
            'xlnet': 'xlnet-base-cased',
            'electra': 'google/electra-base-discriminator',
            'industry_classifier': 'sampathkethineedi/industry-classification'
        }
        if not sentence_transformer:
            if llm not in self.llm:
                raise KeyError(f"unknown llm {llm!r}; choose one of {sorted(self.llm)}")
            self.model = AutoModel.from_pretrained(self.llm[llm])
            self.tokenizer = AutoTokenizer.from_pretrained(self.llm[llm])
        else:
            self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


    def generate_embeddings(self, startup=True):
        """
        Generates embeddings for startups or industries using the specified language model and pooling method.

        Startup text comes from `column_name`, industry text from 'keywords'. The result
        replaces `self.startups` or `self.industries`. Embeddings are attached by row
        position, so repeated ids cannot duplicate rows, and an 'embeddings' column left
        over from an earlier call is overwritten.

        :param startup: bool, if True, generates embeddings for startups, if False, generates embeddings for industries
        :return: copy of the input DataFrame with a fresh 0..n-1 index and an 'embeddings' column of lists
        """
        texts = self.startups if startup else self.industries
        text_column = self.column_name if startup else 'keywords'

        embeddings_list = []
        for description in tqdm(texts[text_column], total=len(texts)):
            if self.sentence_transformer:
                embeddings = self.model.encode(description)
            else:
                # One text per call, so no padding is requested: padded positions would
                # otherwise take part in the pooling, and tokenizers that define no pad
                # token cannot pad at all.
                inputs = self.tokenizer(description, return_tensors="pt", truncation=True, max_length=60)
                # Inference only; skip building the autograd graph.
                with torch.no_grad():
                    outputs = self.model(**inputs)
                embeddings = self.pooling(outputs.last_hidden_state,
                                          attention_mask=inputs.get('attention_mask'))

            embeddings_list.append(embeddings.tolist())

        merged_df = texts.drop(columns=['embeddings'], errors='ignore').reset_index(drop=True)
        # An object Series keeps each embedding as one list-valued cell.
        merged_df['embeddings'] = pd.Series(embeddings_list, index=merged_df.index, dtype=object)

        if startup:
            self.startups = merged_df
        else:
            self.industries = merged_df

        return merged_df



    def assign_industry(self, num_labels=3):
        """
        Assigns top industries to startups based on their cosine similarity to the industry embeddings.

        Requires an 'embeddings' column on both `self.startups` and `self.industries`.
        The result is also stored as `self.assigned_industries`.

        :param num_labels: int, the number of top industries to assign to each startup, default is 3;
                           values below 1 assign none, values above the industry count assign all
        :return: list of lists containing dictionaries with assigned industries and their similarity scores,
                 best match first
        """
        industry_names = self.industries['industry'].tolist()
        startup_vectors = [np.asarray(x).flatten() for x in self.startups['embeddings']]

        if len(startup_vectors) == 0 or len(industry_names) == 0:
            self.assigned_industries = [[] for _ in startup_vectors]
            return self.assigned_industries

        # Build both matrices once and compare all pairs in a single call.
        industry_matrix = np.array([np.asarray(x).flatten() for x in self.industries['embeddings']])
        similarity_matrix = cosine_similarity(np.array(startup_vectors), industry_matrix)

        top_k = min(max(int(num_labels), 0), len(industry_names))

        self.assigned_industries = []
        for similarities in similarity_matrix:
            best_first = np.argsort(similarities)[::-1][:top_k]
            self.assigned_industries.append(
                [{'industry': industry_names[index], 'score': similarities[index]} for index in best_first]
            )

        return self.assigned_industries

    def pooling(self, last_hidden_states, attention_mask=None):
        """
        Applies the specified pooling method to the given last hidden states tensor.

        Pooling runs over dim 1 of the tensor. 'concat' joins the max-pooled and the
        average-pooled vectors, in that order, along dim 1.

        :param last_hidden_states: tensor, the last hidden states from the language model
        :param attention_mask: optional tensor matching the first two dims of `last_hidden_states`;
                               positions where it is 0 are excluded from the pooling. When omitted,
                               every position is pooled.
        :return: NumPy array of pooled embeddings
        :raises ValueError: if `self.pool` is not 'max', 'avg' or 'concat'
        """
        if self.pool not in ('max', 'avg', 'concat'):
            raise ValueError('pool must be either max, avg or concat')

        keep = None
        if attention_mask is not None:
            # Trailing axis added so the mask broadcasts over the hidden dimension.
            keep = attention_mask.to(last_hidden_states.device).bool().unsqueeze(-1)

        pooled = []
        if self.pool in ('max', 'concat'):
            if keep is None:
                pooled.append(torch.max(last_hidden_states, dim=1).values)
            else:
                # Masked positions get the lowest representable value so they never win the max.
                floor = torch.finfo(last_hidden_states.dtype).min
                pooled.append(last_hidden_states.masked_fill(~keep, floor).max(dim=1).values)
        if self.pool in ('avg', 'concat'):
            if keep is None:
                pooled.append(torch.mean(last_hidden_states, dim=1))
            else:
                weights = keep.to(last_hidden_states.dtype)
                # clamp avoids a division by zero for a fully masked row.
                pooled.append((last_hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1))

        self.pooled_embeds = pooled[0] if len(pooled) == 1 else torch.cat(pooled, dim=1)
        return self.pooled_embeds.detach().cpu().numpy()

    def update_dataframe(self):
        """
        Updates the startup and industry DataFrames with assigned industries and their similarity scores.

        Adds 'industry<k>' and 'score<k>' columns (scores rounded to 3 decimals) to
        `self.startups`, with None where a startup has fewer than k assignments, and
        removes the 'embeddings' column from both frames in place.

        :return: DataFrame with updated startups data
        :raises AttributeError: if assign_industry has not been called yet
        """
        if not hasattr(self, 'assigned_industries'):
            raise AttributeError('assign_industry must be called before update_dataframe')

        # default=0 keeps an empty assignment list from raising in max().
        max_industries = max((len(x) for x in self.assigned_industries), default=0)

        for i in range(max_industries):
            self.startups[f'industry{i + 1}'] = [x[i]['industry'] if i < len(x) else None for x in self.assigned_industries]
            self.startups[f'score{i + 1}'] = [np.round(x[i]['score'], 3) if i < len(x) else None for x in self.assigned_industries]

        # errors='ignore' lets this method be called more than once.
        self.startups.drop(columns=['embeddings'], inplace=True, errors='ignore')
        self.industries.drop(columns=['embeddings'], inplace=True, errors='ignore')

        return self.startups


In [16]:
# `startups` now has a 'top_words' column for every startup. Embed those words
# with a sentence transformer; only the first ten startups are used here.
embedder = Embedding(
    startups=startups.iloc[:10],
    column_name='top_words',
    industries=industry_data,
    sentence_transformer=True,
)

In [17]:
# Embed the startup texts first, then the industry keywords. Each call stores its
# result on `embedder`; the second call's return value is what this cell displays.
embedder.generate_embeddings(startup=True)
embedder.generate_embeddings(startup=False)

hi


10it [00:03,  3.29it/s]


hi


102it [00:24,  4.25it/s]


Unnamed: 0,id,industry,keywords,embeddings
0,0,neuro,neurology signal neuron memory network cogniti...,"[-0.008603241294622421, -0.09912002831697464, ..."
1,1,procurement,source supply chain proposal supplier negotiat...,"[-0.08471089601516724, 0.0118993716314435, 0.0..."
2,2,greentech,biofuel solar renewable sustainability geother...,"[0.04065382108092308, 0.07953336834907532, 0.0..."
3,3,social impact,empowerment volunteer justice activism social ...,"[0.0023484171833842993, 0.012581953778862953, ..."
4,4,esports,streaming competition game virtual tournament ...,"[0.01624584011733532, -0.006599493324756622, -..."
...,...,...,...,...
97,113,data storage,backup center hardware solution,"[-0.1259157359600067, 0.0014534399379044771, -..."
98,114,generative ai,gin augmentation adversarial generative,"[-0.11226412653923035, -0.05784289166331291, 0..."
99,117,extremism,violence radicalization right speech hate far ...,"[0.0543396957218647, 0.04583355411887169, -0.0..."
100,119,network infrastructure,sdn router optic wan switch backbone,"[-0.012510308064520359, -0.05666343495249748, ..."


In [18]:
# Rank the industries for every startup by cosine similarity and keep the top 3,
# then write them back as industry<k>/score<k> columns. update_dataframe also
# drops the 'embeddings' column from both frames.
embedder.assign_industry(num_labels=3)
new_df = embedder.update_dataframe()

In [19]:
# Show the startups together with their assigned industries and similarity scores.
new_df

Unnamed: 0.1,Unnamed: 0,id,name,cb_description,top_words,industry1,score1,industry2,score2,industry3,score3
0,0,1820,0xKYC,modular knowledge system identity credential m...,knowledge ofac sanction zkps credential reimag...,payments,0.375,cybersecurity,0.351,energy efficiency,0.284
1,1,1536,100ms,live video infrastructure platform provide sub...,infrastructure video seamlessly virtual world ...,esports,0.471,telecoms,0.309,gaming,0.305
2,2,3640,10X-Genomics,create revolutionary dna sequence technology h...,sequence shre subtle variation overlook tiny s...,genomics,0.29,biotech,0.228,longevity,0.204
3,3,9594,111Skin,commit positive luxury skincare push boundary ...,skincare philanthropic boundary female ethical...,beauty,0.471,social impact,0.336,sextech,0.333
4,4,4697,1715Labs,company establish commercialise zooniverse tec...,zooniverse commercialise establish technology ...,professional services,0.289,media,0.277,creator economy,0.271
5,5,473,1stdibs,internet company offer marketplace rare desira...,pursuit respected jewelry collector desirable ...,e-commerce,0.357,logistics,0.331,fashion,0.317
6,6,7956,1v1Me,application allow user play match favorite vid...,play favorite match cash member meet video gam...,esports,0.511,social networks,0.339,sharing economy,0.289
7,7,9457,21sportsgroup,online sport good retailer offer selection run...,athlete sport retail triathlete triathlon wurt...,sport & wellness,0.416,esports,0.302,fashion,0.271
8,8,4477,23andMe,human genome research company enable user stud...,ancestry genealogy trait searchable inherit ge...,genomics,0.391,longevity,0.264,biotech,0.211
9,9,917,24Symbols,solution read digital book read device interne...,read internet ereader conection device laptop ...,iot,0.429,industrial iot,0.255,telecoms,0.247
