In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def preprocess_text(text):
    if pd.isna(text):
        return ''
    return str(text).lower().strip()

def calculate_similarity_scores(investors_df, founder_data):
    # Preprocess descriptions
    investors_df['processed_description'] = investors_df['description'].apply(preprocess_text)
    founder_desc = preprocess_text(founder_data['description'])
    
    # Combine industries and description for better matching
    investors_df['combined_text'] = investors_df['processed_description'] + ' ' + investors_df['industries'].fillna('')
    
    # Create document list
    documents = investors_df['combined_text'].tolist()
    documents.append(founder_desc)
    
    # TF-IDF Vectorization with specific parameters
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),  # Include bigrams
        min_df=1,            # Include all terms
        max_df=0.9          # Exclude terms that appear in more than 90% of documents
    )
    
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
        
        # Calculate cosine similarity
        cosine_similarities = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])[0]
        
        # Normalize scores to 0-1 range
        normalized_scores = (cosine_similarities - cosine_similarities.min()) / \
                          (cosine_similarities.max() - cosine_similarities.min() + 1e-10)
        
        return normalized_scores
        
    except Exception as e:
        print(f"Error in similarity calculation: {e}")
        return np.zeros(len(investors_df))

# Load and process data
investor_file_path = '/Users/sasanksasi/Downloads/project/VertexAi/dataset.csv'
investors_df = pd.read_csv(investor_file_path)

founder_data = {
    'name': 'John Doe',
    'description': 'A passionate entrepreneur in the field of artificial intelligence and machine learning.',
    'industry': 'Artificial Intelligence'
}

# Filter relevant investors
filtered_investors = investors_df[
    investors_df['industries'].str.contains(founder_data['industry'], case=False, na=False)
]

# Calculate similarity scores
similarity_scores = calculate_similarity_scores(filtered_investors, founder_data)
filtered_investors['similarity_score'] = similarity_scores

# Sort and display results
sorted_investors = filtered_investors.sort_values(by='similarity_score', ascending=False)

# Display top matches with scores
print("\nTop matches:")
print(sorted_investors[['company_name', 'investor_type', 'industries', 'similarity_score']].head())

# Save results
new_file_path = '/Users/sasanksasi/Downloads/project/VertexAi/Phase1_dataset.csv'
sorted_investors.to_csv(new_file_path, index=False)


Top matches:
                 company_name                               investor_type  \
97                DIVEdigital                             Venture Capital   
144                     TRIVE                                    Micro VC   
148                  IncuVest                Angel Group, Venture Capital   
12            Promus Ventures                                    Micro VC   
43   Boeing HorizonX Ventures  Corporate Venture Capital, Venture Capital   

                                            industries  similarity_score  
97   Artificial Intelligence, Machine Learning, Man...          1.000000  
144  Analytics, Artificial Intelligence, Business I...          0.593795  
148  Advanced Materials, Artificial Intelligence, A...          0.474880  
12   Aerospace, AgTech, Artificial Intelligence, Fi...          0.418227  
43   Advanced Materials, Artificial Intelligence, C...          0.399897  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  investors_df['processed_description'] = investors_df['description'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  investors_df['combined_text'] = investors_df['processed_description'] + ' ' + investors_df['industries'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
from groq import Groq
import time
from tqdm import tqdm

# Load environment variables
load_dotenv()
groq_client = Groq(api_key=os.getenv('GROQ_API_KEY'))

class InvestorAnalyzer:
    def __init__(self, groq_client):
        self.groq_client = groq_client
        
    def analyze_investor(self, founder_data, investor):
        prompt = f"""
        Analyze the compatibility between this founder and investor:

        Founder:
        {founder_data['description']}
        Industry Focus: {founder_data['industry']}

        Investor:
        Company: {investor['company_name']}
        Type: {investor['investor_type']}
        Description: {investor['description']}
        Industries: {investor['industries']}
        Location: {investor['location']}

        Provide a structured analysis with:
        1. Score (0-100)
        2. Brief explanation
        
        Format: <score>|<explanation>
        """

        try:
            completion = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="mixtral-8x7b-32768",
                temperature=0.3,
                max_tokens=150
            )
            response = completion.choices[0].message.content.strip()
            
            # Parse response
            score, explanation = response.split('|')
            return float(score), explanation.strip()
            
        except Exception as e:
            print(f"Error analyzing {investor['company_name']}: {str(e)}")
            return 0, "Analysis failed"

    def process_investors(self, investors_df, founder_data):
        results = []
        
        # Process each investor with progress bar
        for _, investor in tqdm(investors_df.iterrows(), total=len(investors_df)):
            score, explanation = self.analyze_investor(founder_data, investor)
            
            results.append({
                'company_name': investor['company_name'],
                'investor_type': investor['investor_type'],
                'location': investor['location'],
                'industries': investor['industries'],
                'groq_score': score,
                'explanation': explanation,
                'similarity_score': investor['similarity_score']
            })
            
            # Rate limiting
            time.sleep(1)
            
        return pd.DataFrame(results)

def main():
    # Load dataset
    investors_df = pd.read_csv('/Users/sasanksasi/Downloads/project/VertexAi/Phase1_dataset.csv')
    
    # Sample founder data
    founder_data = {
        'name': 'John Doe',
        'description': 'Building an AI-powered healthcare diagnostics platform using computer vision and machine learning.',
        'industry': 'Healthcare AI'
    }
    
    # Initialize analyzer
    analyzer = InvestorAnalyzer(groq_client)
    
    # Process investors
    results_df = analyzer.process_investors(investors_df, founder_data)
    
    # Calculate final score (weighted combination)
    results_df['final_score'] = (
        results_df['similarity_score'] * 0.3 + 
        results_df['groq_score'] * 0.7
    )
    
    # Sort and display results
    final_results = results_df.sort_values('final_score', ascending=False)
    
    # Display top matches
    print("\nTop Investor Matches:")
    print(final_results[['company_name', 'investor_type', 'groq_score', 'final_score', 'explanation']].head())
    print(final_results.columns)
    
    # Save results
    final_results.to_csv('investor_matches.csv', index=False)

if __name__ == "__main__":
    main()

  0%|          | 0/8 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 12%|█▎        | 1/8 [00:01<00:11,  1.61s/it]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 25%|██▌       | 2/8 [00:03<00:09,  1.57s/it]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 38%|███▊      | 3/8 [00:04<00:07,  1.58s/it]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 50%|█████     | 4/8 [00:06<00:06,  1.57s/it]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 62%|██████▎   | 5/8 [00:07<00:04,  1.58s/it]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 75%|███████▌  | 6/8 [00:09<00:03,  1.64s/it]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
 88%|████████▊ | 7/8 


Top Investor Matches:
            company_name                 investor_type  groq_score  \
1                  TRIVE                      Micro VC        85.0   
2               IncuVest  Angel Group, Venture Capital        85.0   
3        Promus Ventures                      Micro VC        85.0   
5          Acton Capital               Venture Capital        85.0   
7  Govin Capital Pte Ltd               Venture Capital        85.0   

   final_score                                        explanation  
1    59.678138  TRIVE is a micro VC firm that focuses on inves...  
2    59.642464  IncuVest is a Singapore-based early and growth...  
3    59.625468  Promus Ventures is a strong match for the foun...  
5    59.535985  Acton Capital has a strong focus on tech-enabl...  
7    59.500000  Govin Capital Pte Ltd appears to be a highly c...  
Index(['company_name', 'investor_type', 'location', 'industries', 'groq_score',
       'explanation', 'similarity_score', 'final_score'],
      dtyp


