In [70]:
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import warnings
from tqdm import tqdm_notebook as tqdm
from sentence_transformers import SentenceTransformer
import torch
from sklearn.decomposition import TruncatedSVD


class Embedding:
    """
    Generate text embeddings for startups and industries and match them by similarity.

    Embeddings come either from a Hugging Face transformer, whose token states are
    reduced with the configured pooling method, or from a sentence-transformers model.
    """

    def __init__(self, startups, industries, llm='bert', pool='max', sentence_transformer=False, sent='sentence-transformers/all-MiniLM-L6-v2',
                 both=False):

        """
        Initializes the Embedding class with specified language models and pooling methods.

        :param startups: DataFrame containing startup data with 'id' and 'cb_description' columns
        :param industries: DataFrame containing industry data with 'id', 'industry' and 'keywords' columns
        :param llm: string, key into ``self.llm`` naming the transformer checkpoint, default is 'bert'
        :param pool: string, pooling method for transformer token states ('max', 'avg' or 'concat'), default is 'max'
        :param sentence_transformer: bool, if True embeddings are produced by the sentence-transformers model, default is False
        :param sent: string, name of the sentence-transformers checkpoint to load
        :param both: bool, if True both the transformer and the sentence-transformers model are loaded;
                     ``sentence_transformer`` still selects which one ``generate_embeddings`` uses
        :raises KeyError: if ``llm`` is not a key of ``self.llm`` and a transformer has to be loaded
        """

        self.startups = startups
        self.industries = industries
        # Always a plain bool flag; the sentence-transformers model itself lives in ``self.sentence_model``.
        self.sentence_transformer = sentence_transformer
        self.pool = pool
        self.both = both
        self.llm = {
            'bert': 'bert-base-uncased',
            'gpt2': 'gpt2',
            'gpt': 'openai-gpt',
            'roberta': 'roberta-base',
            'distilbert': 'distilbert-base-uncased',
            'xlnet': 'xlnet-base-uncased',
            'electra': 'google/electra-base-discriminator',
            'industry_classifier': 'sampathkethineedi/industry-classification'
        }
        self.sentence_model = None
        # Filled by assign_industry(); initialised so update_dataframe() is safe to call first.
        self.assigned_industries = []
        self.__initialize_models(both, llm, sentence_transformer, sent)
        self.svd = TruncatedSVD(n_components=384)

    def __initialize_models(self, both, llm, sentence_transformer, sent):
        """
        Loads the model(s) required by the chosen configuration.

        :param both: bool, load the transformer and the sentence-transformers model
        :param llm: string, key into ``self.llm``
        :param sentence_transformer: bool, load the sentence-transformers model instead of the transformer
        :param sent: string, sentence-transformers checkpoint name
        """
        if both or not sentence_transformer:
            self.model = AutoModel.from_pretrained(self.llm[llm])
            self.tokenizer = AutoTokenizer.from_pretrained(self.llm[llm])

        if both or sentence_transformer:
            # Stored separately so the boolean ``self.sentence_transformer`` flag is never
            # overwritten by a model object (which made generate_embeddings call ``.encode``
            # on the Hugging Face model when both=True).
            self.sentence_model = SentenceTransformer(sent)
            if not both:
                # Without a transformer loaded, ``self.model`` keeps pointing at the sentence encoder.
                self.model = self.sentence_model

    def generate_embeddings(self, startup=True):
        """
        Generates embeddings for startups or industries using the specified language model and pooling method.

        The result replaces ``self.startups`` / ``self.industries`` so later steps can read
        the 'embeddings' column.

        :param startup: bool, if True, generates embeddings for startups, if False, generates embeddings for industries
        :return: DataFrame with generated embeddings merged with the original input DataFrame
        """
        texts = self.startups if startup else self.industries
        text_column = 'cb_description' if startup else 'keywords'
        embeddings_list = []

        # ``total`` lets the progress bar show completion; iterrows() alone has no length.
        for _, row in tqdm(texts.iterrows(), total=len(texts)):
            description = row[text_column]
            if self.sentence_transformer:
                embeddings = self.sentence_model.encode(description)
            else:
                inputs = self.tokenizer.encode_plus(description, return_tensors="pt", truncation=True, padding="max_length", max_length=60)
                # Inference only: skip building the autograd graph.
                with torch.no_grad():
                    outputs = self.model(**inputs)
                embeddings = self.pooling(outputs.last_hidden_state)

            embeddings_list.append({'id': row['id'], 'embeddings': embeddings.tolist()})

        embeddings_df = pd.DataFrame(embeddings_list)
        # Drop a stale 'embeddings' column from a previous run; otherwise the merge would
        # produce 'embeddings_x' / 'embeddings_y' and break assign_industry().
        base_df = texts.drop(columns=['embeddings'], errors='ignore')
        merged_df = pd.merge(base_df, embeddings_df, on='id', how='left')

        if startup:
            self.startups = merged_df
        else:
            self.industries = merged_df

        return merged_df

    def assign_industry(self, num_labels=3, metric='cosine'):
        """
        Assigns top industries to startups based on their similarity to the industry embeddings.

        Requires ``generate_embeddings`` to have been run for both startups and industries.

        :param num_labels: int, the number of top industries to assign to each startup, default is 3
        :param metric: string, one of 'cosine', 'euclidean', 'manhattan', default is 'cosine'
        :return: list (one entry per startup) of lists of dicts with keys 'industry', 'score' and 'confidence'
        :raises ValueError: if ``metric`` is not supported or ``num_labels`` is negative
        """
        if metric not in ['cosine', 'euclidean', 'manhattan']:
            raise ValueError("metric must be one of 'cosine', 'euclidean', 'manhattan'")
        if num_labels < 0:
            raise ValueError("num_labels must be non-negative")

        # Cosine is a similarity (larger is better); the other two are distances (smaller is better).
        if metric == 'cosine':
            score_fn, higher_is_better = cosine_similarity, True
        elif metric == 'euclidean':
            score_fn, higher_is_better = euclidean_distances, False
        else:
            score_fn, higher_is_better = manhattan_distances, False

        # Loop-invariant: build the industry matrix and name list once.
        industry_embeddings = np.array([np.array(x).flatten() for x in self.industries['embeddings']])
        industry_names = self.industries['industry'].tolist()

        self.assigned_industries = []
        for startup_embedding in self.startups['embeddings']:
            startup_embedding = np.array(startup_embedding).flatten()
            scores = score_fn([startup_embedding], industry_embeddings)[0]

            ranked = np.argsort(scores)
            if higher_is_better:
                # Reverse first, then slice, so num_labels=0 yields nothing instead of everything.
                ranked = ranked[::-1]
            top_industry_indices = ranked[:num_labels]

            # Calculate confidence scores for the top industries
            top_scores = [scores[index] for index in top_industry_indices]
            confidences = self.__calculate_confidence(*top_scores)

            # Include confidence scores in the top_industries dictionaries
            top_industries = [{'industry': industry_names[index], 'score': scores[index], 'confidence': confidence}
                              for index, confidence in zip(top_industry_indices, confidences)]

            self.assigned_industries.append(top_industries)

        return self.assigned_industries

    def pooling(self, embeddings_tensor):
        """
        Reduces a tensor along dimension 1 according to ``self.pool``.

        NOTE(review): the reduction covers every position along dim 1, so if the input was
        padded (generate_embeddings pads to max_length=60) padded positions take part too.

        :param embeddings_tensor: torch tensor with at least two dimensions
        :return: numpy array; for 'concat' the max- and mean-pooled results are joined along dim 1
        :raises ValueError: if ``self.pool`` is not 'max', 'avg' or 'concat'
        """
        if self.pool == 'max':
            pooled_embeds = torch.max(embeddings_tensor, dim=1).values
        elif self.pool == 'avg':
            pooled_embeds = torch.mean(embeddings_tensor, dim=1)
        elif self.pool == 'concat':
            max_pooling = torch.max(embeddings_tensor, dim=1).values
            average_pooling = torch.mean(embeddings_tensor, dim=1)
            pooled_embeds = torch.cat((max_pooling, average_pooling), dim=1)
        else:
            raise ValueError('pool must be either max, avg, or concat')
        return pooled_embeds.detach().numpy()

    def update_dataframe(self):
        """
        Builds a copy of the startups DataFrame with the assigned industries and scores added.

        For each rank ``k`` an 'industry{k}' and a 'score{k}' column (rounded to 3 decimals)
        is added; startups with fewer than ``k`` assignments get None in those columns.
        ``self.startups`` and ``self.industries`` are left untouched.

        :return: DataFrame copy of the startups data with the extra columns
        """
        df = self.startups.copy()
        # default=0 keeps this working when nothing has been assigned yet.
        max_industries = max((len(x) for x in self.assigned_industries), default=0)

        for i in range(max_industries):
            df[f'industry{i + 1}'] = [x[i]['industry'] if i < len(x) else None for x in self.assigned_industries]
            # np.round accepts both NumPy scalars and plain Python floats.
            df[f'score{i + 1}'] = [np.round(x[i]['score'], 3) if i < len(x) else None for x in self.assigned_industries]

        return df

    def __calculate_confidence(self, *scores):
        """
        Calculates each score's share of the sum of all given scores.

        For distance metrics a smaller score is the better match, so there the best match
        receives the smallest share.

        :param scores: the scores of the selected industries, any number of values
        :return: tuple with one share per input score (empty if no scores are given);
                 every share is NaN when the scores sum to zero
        """
        if not scores:
            return ()

        total = sum(scores)
        if total == 0:
            # The share is undefined; return NaN instead of dividing by zero.
            return tuple(float('nan') for _ in scores)

        return tuple(score / total for score in scores)




In [71]:
# Load the cleaned startup table and keep only the id, name and description columns.
startups = pd.read_csv(
    r'C:\Users\imran\DataspellProjects\WalidCase\data\processed\startups_clean_noents_1560.csv'
).loc[:, ['id', 'name', 'cb_description']]

# Load the industry table (one row per industry with its keyword list).
industries = pd.read_csv(
    r'C:\Users\imran\DataspellProjects\WalidCase\data\processed\no_crm_GPT4_generated_keywords.csv'
)


In [72]:
# Use the row index as the 'id' column, which Embedding.generate_embeddings reads and merges on.
industries['id'] = industries.index

In [73]:
# Embed the first 50 startups and all industries with BERT, max-pooling the token states.
embedding = Embedding(startups.head(50), industries, llm='bert', pool='max')
embedding.generate_embeddings(startup=True)
# Last expression of the cell: its return value (the industries frame with embeddings) is the cell output.
embedding.generate_embeddings(startup=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, row in tqdm(texts.iterrows()):


0it [00:00, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, row in tqdm(texts.iterrows()):


0it [00:00, ?it/s]

Unnamed: 0,industry,keywords,id,embeddings
0,Telecoms,"carrier services, satellite communication, fix...",0,"[[1.6293150186538696, 0.9383618235588074, 1.07..."
1,Mobile,"mobile applications, mobile devices, mobile op...",1,"[[0.7876855134963989, 1.2005103826522827, 1.30..."
2,Communications,"real-time communication, chat applications, vi...",2,"[[0.7855498790740967, 0.6936728954315186, 1.23..."
3,Network Infrastructure,"content delivery network, network function vir...",3,"[[0.7913244366645813, 0.44820863008499146, 1.1..."
4,5G Networks,"5G radio access network, millimeter wave, 5G s...",4,"[[0.38631683588027954, 0.4769960045814514, 0.9..."
...,...,...,...,...
112,Video/Audio,"media production, video platforms, audio engin...",112,"[[1.0657232999801636, 0.7541917562484741, 1.05..."
113,Genomics,"functional genomics, gene expression, epigenom...",113,"[[1.0262608528137207, 0.6551767587661743, 1.40..."
114,Longevity,"aging biology, lifespan extension, rejuvenatio...",114,"[[1.128448486328125, 0.9876158237457275, 0.855..."
115,Gut Microbiome,"intestinal microbiota, intestinal ecology, gut...",115,"[[0.8957403898239136, 0.845739483833313, 0.607..."


In [74]:
# Rank industries for every startup by cosine similarity and keep the top 3 (returned as a list of lists of dicts).
assigned_industries_cosine = embedding.assign_industry(num_labels=3, metric='cosine')
# Spread those matches into industryN / scoreN columns on a copy of the startups frame.
df_cosine = embedding.update_dataframe()



In [78]:
# Display the startups together with their top cosine-similarity industries.
df_cosine

Unnamed: 0,id,name,cb_description,industry1,score1,industry2,score2,industry3,score3
0,1820,0xKYC,modular knowledge system identity credential m...,Generative AI,0.92,InsurTech,0.919,RegTech,0.918
1,3640,10X-Genomics,create revolutionary dna sequence technology h...,Genomics,0.918,Business Intelligence,0.913,Longevity,0.907
2,9594,111Skin,commit positive luxury skincare push boundary ...,Fashion,0.904,Carbon Removal,0.897,Sharing Economy,0.895
3,473,1stdibs,internet company offer marketplace rare desira...,E-commerce,0.904,Fashion,0.9,Creator Economy,0.899
4,7956,1v1Me,application allow user play match favorite vid...,Esports,0.891,Gaming,0.89,Creator Economy,0.889
5,9457,21sportsgroup,online sport good retailer offer selection run...,Longevity,0.91,LegalTech,0.909,InsurTech,0.908
6,917,24Symbols,solution read digital book read device interne...,Data Centers,0.903,Connected Home,0.903,Connected Life,0.902
7,5470,2GIS,international technology company develop servi...,Sharing Economy,0.908,PropTech,0.907,Creator Economy,0.906
8,5958,352-Medical-Group,local organisation trust healthcare profession...,Healthcare,0.902,Health Tech,0.901,Death Tech,0.894
9,4004,365-Response,response unique software service organisation ...,Autonomous Driving,0.918,Data Infrastructure,0.915,IoT,0.915


In [16]:
# Remove the raw embedding vectors in place so the frame is readable when displayed.
# Running this cell a second time raises KeyError because the column is already gone.
df_cosine.drop(columns=['embeddings'], inplace=True)

In [17]:
# Display the frame again, now without the 'embeddings' column.
# NOTE(review): the execution counts are out of order and the recorded scores differ from the
# earlier display, so this output presumably comes from a different model run — confirm.
df_cosine

Unnamed: 0,id,name,cb_description,industry1,score1,industry2,score2,industry3,score3
0,1820,0xKYC,modular knowledge system identity credential m...,RegTech,0.359,DeFi,0.339,InfoSec,0.325
1,3640,10X-Genomics,create revolutionary dna sequence technology h...,Genomics,0.438,Life Sciences,0.306,Deep Tech,0.262
2,9594,111Skin,commit positive luxury skincare push boundary ...,Beauty,0.483,SexTech,0.337,Happiness & Wellbeing,0.328
3,473,1stdibs,internet company offer marketplace rare desira...,E-commerce,0.443,Fashion,0.43,Beauty,0.397
4,7956,1v1Me,application allow user play match favorite vid...,Social Networks,0.295,Gaming,0.279,Entertainment,0.26
5,9457,21sportsgroup,online sport good retailer offer selection run...,Fashion,0.441,E-commerce,0.389,Retail,0.355
6,917,24Symbols,solution read digital book read device interne...,Mobile,0.221,EdTech,0.219,Telecoms,0.172
7,5470,2GIS,international technology company develop servi...,Mobile,0.419,Telecoms,0.412,Health Tech,0.401
8,5958,352-Medical-Group,local organisation trust healthcare profession...,Healthcare,0.519,Health Tech,0.433,InsurTech,0.331
9,4004,365-Response,response unique software service organisation ...,Productivity,0.312,Circular Economy,0.304,Future of Work,0.297


In [15]:
# NOTE(review): as recorded, this cell fails with AttributeError because the name is bound
# to a plain list, which has no .drop(). In this file assign_industry() returns a list
# and update_dataframe() returns the DataFrame — presumably the DataFrames were meant
# here; confirm before re-running.
assigned_industries_cosine.drop(columns=['embeddings'], inplace=True)
assigned_industries_euclidean.drop(columns=['embeddings'], inplace=True)
assigned_industries_manhattan.drop(columns=['embeddings'], inplace=True)

AttributeError: 'list' object has no attribute 'drop'

In [8]:
#merge all three dataframes on id and keep only the columns with industries and scores
assigned_industries = assigned_industries_cosine.merge(assigned_industries_euclidean, on='id', how='left')
assigned_industries = assigned_industries.merge(assigned_industries_manhattan, on='id', how='left')
assigned_industries

Unnamed: 0,id,name_x,cb_description_x,industry1_x,score1_x,industry2_x,score2_x,industry3_x,score3_x,name_y,...,industry3_y,score3_y,name,cb_description,industry1,score1,industry2,score2,industry3,score3
0,1820,0xKYC,modular knowledge system identity credential m...,RegTech,0.359,DeFi,0.339,InfoSec,0.325,0xKYC,...,InfoSec,1.162,0xKYC,modular knowledge system identity credential m...,RegTech,17.414,Crypto,17.744,Security,17.834
1,3640,10X-Genomics,create revolutionary dna sequence technology h...,Genomics,0.438,Life Sciences,0.306,Deep Tech,0.262,10X-Genomics,...,Deep Tech,1.215,10X-Genomics,create revolutionary dna sequence technology h...,Genomics,16.785,Life Sciences,17.979,Deep Tech,18.731
2,9594,111Skin,commit positive luxury skincare push boundary ...,Beauty,0.483,SexTech,0.337,Happiness & Wellbeing,0.328,111Skin,...,Happiness & Wellbeing,1.159,111Skin,commit positive luxury skincare push boundary ...,Beauty,15.619,SexTech,17.938,Happiness & Wellbeing,17.994
3,473,1stdibs,internet company offer marketplace rare desira...,E-commerce,0.443,Fashion,0.43,Beauty,0.397,1stdibs,...,Beauty,1.098,1stdibs,internet company offer marketplace rare desira...,Fashion,16.47,E-commerce,16.604,Creator Economy,16.879
4,7956,1v1Me,application allow user play match favorite vid...,Social Networks,0.295,Gaming,0.279,Entertainment,0.26,1v1Me,...,Entertainment,1.216,1v1Me,application allow user play match favorite vid...,Social Networks,18.454,Entertainment,18.53,Gaming,18.887
5,9457,21sportsgroup,online sport good retailer offer selection run...,Fashion,0.441,E-commerce,0.389,Retail,0.355,21sportsgroup,...,Retail,1.136,21sportsgroup,online sport good retailer offer selection run...,Fashion,16.454,E-commerce,17.474,Sport & Wellness,17.722
6,917,24Symbols,solution read digital book read device interne...,Mobile,0.221,EdTech,0.219,Telecoms,0.172,24Symbols,...,Telecoms,1.287,24Symbols,solution read digital book read device interne...,Mobile,19.303,EdTech,19.557,FamilyTech,19.835
7,5470,2GIS,international technology company develop servi...,Mobile,0.419,Telecoms,0.412,Health Tech,0.401,2GIS,...,Health Tech,1.094,2GIS,international technology company develop servi...,Mobile,16.537,Health Tech,16.761,Telecoms,16.852
8,5958,352-Medical-Group,local organisation trust healthcare profession...,Healthcare,0.519,Health Tech,0.433,InsurTech,0.331,352-Medical-Group,...,InsurTech,1.157,352-Medical-Group,local organisation trust healthcare profession...,Healthcare,15.212,Health Tech,16.581,MedTech,17.797
9,4004,365-Response,response unique software service organisation ...,Productivity,0.312,Circular Economy,0.304,Future of Work,0.297,365-Response,...,Future of Work,1.185,365-Response,response unique software service organisation ...,Productivity,18.048,Circular Economy,18.29,Developer Tools,18.345
