Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import os

## Three types of datasets were used to generate relevant context for the RAG model: news and press-release data, data on deals done by Indian PE/VC firms, and data on deals done by international PE/VC firms

Importing preprocessed news data

In [None]:
# Load the preprocessed news / press-release table from the working directory and display it.
news_df = pd.read_csv('final_news_press.csv')
news_df

Loading the data for indian firms

In [None]:
def read_excel_files_in_folder(folder_path, skiprows=14):
    """Read every Excel workbook in a folder into one combined DataFrame.

    Each workbook is read after skipping its first ``skiprows`` rows, and the
    file name (without extension) is stored in an ``Institution`` column.

    Args:
        folder_path: Directory containing the ``.xlsx`` / ``.xls`` files.
        skiprows: Number of leading rows to skip in every workbook before the
            table starts. Defaults to 14, the value this notebook always used.

    Returns:
        pandas.DataFrame: The rows of all workbooks, concatenated with a fresh
        0..n-1 index.

    Raises:
        ValueError: If the folder contains no Excel workbooks.
    """
    # Sort for a reproducible row order (os.listdir order is arbitrary), match
    # the extension case-insensitively, and skip the '~$...' lock files Excel
    # creates next to a workbook that is currently open.
    excel_files = sorted(
        file for file in os.listdir(folder_path)
        if file.lower().endswith(('.xlsx', '.xls')) and not file.startswith('~$')
    )
    if not excel_files:
        # pd.concat([]) would raise a much less helpful "No objects to concatenate".
        raise ValueError(f"No Excel files (.xlsx/.xls) found in {folder_path!r}")

    dfs = []
    for file in excel_files:
        df = pd.read_excel(os.path.join(folder_path, file), skiprows=skiprows)  # just the table
        df['Institution'] = os.path.splitext(file)[0]  # file name identifies the firm
        dfs.append(df)

    # Concatenate all DataFrames into a single DataFrame
    return pd.concat(dfs, ignore_index=True)

# Specify the folder path where Excel files are located
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs where this folder exists.
folder_path = r"D:\Python\dataset_rag\indpe"

# Combine every workbook in the folder into one frame and display it.
indian_pe = read_excel_files_in_folder(folder_path)
indian_pe

In [None]:
# Remove the deal identifier column; nothing later in the notebook reads it.
indian_pe.drop(columns=['MnA Deal PermId'], inplace=True)

In [None]:
# Add 'Year' and 'month' columns sliced out of the text form of 'Investment Date'
# (characters 0-3 and 5-6). Assumes the text looks like 'YYYY-MM-DD...' — TODO confirm.
investment_date_text = indian_pe['Investment Date'].astype(str)
indian_pe['Year'] = [text[0:4] for text in investment_date_text]
indian_pe['month'] = [text[5:7] for text in investment_date_text]

In [None]:
indian_pe

In [None]:
# The international firms' deals were already consolidated into one CSV, so a single read is enough.
foreign_pe = pd.read_csv("PE_VC Data - Sheet1.csv")

In [None]:
foreign_pe

In [None]:
# Add 'Year' (last four characters) and 'month' (text before the first '/') from the
# text form of 'Investment Date'; values without a '/' get an empty month.
# Assumes dates look like 'M/D/YYYY' — TODO confirm.
foreign_date_text = foreign_pe['Investment Date'].astype(str)
foreign_pe['Year'] = [text[-4:] for text in foreign_date_text]
foreign_pe['month'] = [text.split('/')[0] if '/' in text else '' for text in foreign_date_text]

In [None]:
# Stack the Indian and international deal tables into one frame with a fresh 0..n-1 index.
# Columns present in only one of the two inputs are filled with NaN for the other's rows.
pevc_df = pd.concat([indian_pe,foreign_pe], ignore_index=True) # Concatenating Indian and international firms data

In [None]:
# Drop columns that are not needed from here on; 'Region' is rebuilt later from 'Nation'.
pevc_df.drop(columns=['Region', 'Deal Id', 'Investment Date'], inplace=True)

In [None]:
# Specifying geographic region for countries

# Region name -> list of 'Nation' values assigned to that region; get_region() searches
# these lists in order and returns the first region whose list contains the country.
# NOTE(review): the region names are later joined against the summary column
# 'Investee Company World Sub Location', so their spelling (e.g. "SouthEast Asia") must
# match that column exactly — confirm against the data.
categories = {
      "North America": ["United States", "Canada", "Mexico"],
      # NOTE(review): includes Nordic and Southern European countries while the
      # "Northern Europe" / "Southern Europe" lists below are empty — confirm intended.
      "Western Europe": ["United Kingdom", "Germany", "Sweden", "Netherlands", "Belgium", "Ireland", "Italy", "Portugal", "France", "Spain", "Switzerland", "Austria", "Luxembourg", "Denmark", "Norway", "Finland", "Czech Republic"],
      # NOTE(review): Greece is listed under Middle East — confirm this matches the data source's taxonomy.
      "Middle East": ["Saudi Arabia", "Israel", "United Arab Emirates", "Greece",'Turkey'],
      "Southern Asia": ["India"],
      "Northern Europe": [],  # empty: no country maps to this region
      "SouthEast Asia": ["Indonesia", "Singapore", "Philippines",'Malaysia','Thailand','Vietnam'],
      "East Asia": ["China (Mainland)", "Taiwan", "Japan", "Hong Kong",'South Korea'],
      "Caribbean": ["Antigua and Barbuda","Cayman Islands",'Bermuda'],
      "South America": ["Brazil", "Argentina", "Peru", "Chile"],
      "Southern Europe": [],  # empty: no country maps to this region
      "Western Africa": ["Nigeria"],
      "Pacific": ["New Zealand", "Australia"],
      "Eastern Europe": ["Russia",'Poland','Estonia'],
      "Southern Africa": ["South Africa"],
      "Eastern Africa": ["Kenya"]
  }

In [None]:
# Mapping each country to a geographic region

def get_region(country):
    """Look up the geographic region a country belongs to.

    Args:
        country: Country name as a string.

    Returns:
        The first key of ``categories`` whose country list contains
        ``country``, or None when no list contains it.
    """
    matching_regions = (
        region_name
        for region_name, member_countries in categories.items()
        if country in member_countries
    )
    return next(matching_regions, None)


# Derive each deal's region from its 'Nation' value (None for unmapped nations).
pevc_df['Region'] = pevc_df['Nation'].apply(get_region)

In [None]:
pevc_df['Institution'].unique() # List of firms in consideration

In [None]:
def _month_number_to_name(value):
    """Return the full month name for a month number, or '' when it is missing or blank."""
    text = '' if pd.isnull(value) else str(value).strip()
    if not text:
        # Blank months are produced upstream for dates that could not be split.
        # They are not null, so the old check let them through; pd.to_datetime('')
        # yields NaT and NaT.strftime raises ValueError.
        return ''
    return pd.to_datetime(text, format='%m').strftime('%B')


# Convert month numbers ('3', '03') to month names ('March'); missing months stay ''.
pevc_df['month'] = pevc_df['month'].apply(_month_number_to_name)


In [None]:
# Build the 'Month-Year' join key, e.g. 'March 2021'.
# Rows with an empty month end up with a leading space (' 2021').
month_text = pevc_df['month'].astype(str)
year_text = pevc_df['Year'].astype(str)
pevc_df['Month-Year'] = month_text + ' ' + year_text

The preprocessed News data will be appended to our pe/vc dataframe

In [None]:
# Keep only the news about the selected firms by excluding the two companies below.
excluded_companies = ['Vanguard', 'JPMorgan Chase & Co.']
news_df = (
    news_df[~news_df['company'].isin(excluded_companies)]
    # Renumber rows without first materialising the old index as a column; this avoids
    # in-place edits on a filtered slice and never touches a real column named 'index'.
    .reset_index(drop=True)
)
news_df

In [None]:
# Standardise the news frame: rename 'company' to 'Institution' and respell 'BlackRock'
# as 'Blackrock' (presumably the spelling used in the deal data — verify).
news_df = news_df.rename(columns={'company': 'Institution'})
news_df['Institution'] = news_df['Institution'].replace({'BlackRock': 'Blackrock'})

Now the news data and the investment data will be merged on the basis of the name of the firm that the news is about and the month-year in which that news was released

In [None]:
# Left join: every deal is kept; a deal whose firm has several news rows in the same
# month-year is repeated once per news row (they are collapsed again by the groupby below).
news_pevc = pevc_df.merge(news_df, how='left', on=['Institution', 'Month-Year'])

In [None]:
# Columns that together identify one deal row.
deal_key_columns = [
    'Investee Company', 'Round Equity Total, MM', 'Fund Name',
    'TRBC Industry', 'Stage', 'Status', 'Nation', 'Institution', 'Year',
    'month', 'Region', 'Month-Year',
]

# Collapse the repeated deal rows back to one row per deal, joining all non-null
# news snippets of that deal into a single comma-separated string.
# NOTE(review): groupby's default dropna=True excludes every row with a missing value
# in any key column (e.g. 'Region' is None for nations absent from `categories`) —
# confirm that dropping those deals is intended.
news_pevc_grouped = (
    news_pevc
    .groupby(deal_key_columns)['LLM snippet']
    .agg(lambda snippets: ', '.join(snippets.dropna()))
    .reset_index()
    .drop_duplicates()  # removing duplicates
)

Since the data is in tabular format, it is not ready to be fed into the RAG model directly. Therefore, a method was devised in which a general string template conveys the same information as each row, but as a text-based sentence.

In [None]:
# Columns of the grouped deal/news frame that contain valuable information.
# The year column is named 'Year' (capitalised) in that frame; 'year' matched no column.
selected_columns = ['Investee Company', 'Round Equity Total, MM', 'Fund Name', 'TRBC Industry', 'Stage', 'Status', 'Nation', 'Institution', 'Year', 'month', 'Region', 'Month-Year', 'LLM snippet']

def generate_context_sentence(row):
    """Describe one deal row as a block of plain-English sentences.

    Args:
        row (pandas.Series): A row providing 'Investee Company', 'Fund Name',
            'Stage', 'TRBC Industry', 'Nation', 'Month-Year', 'Institution'
            and 'LLM snippet'.

    Returns:
        str: The generated context string (it ends with a trailing space).
    """
    sentences = [
        f"{row['Investee Company']} which is a startup company secured venture funding "
        f"from the institutional investor {row['Fund Name']} during the {row['Stage']} stage.",
        f"The company, operating in the {row['TRBC Industry']} industry and based in the "
        f"nation of {row['Nation']}, received investment in month of {row['Month-Year']}.",
        f"{row['Institution']} was involved in the funding round.",
        f"Some news snippits about {row['Institution']} in the month of {row['Month-Year']}: "
        f"{row['LLM snippet']} ",
    ]
    return ' '.join(sentences)

# Build one context string per deal row and store it in a new 'Context' column.
news_pevc_grouped['Context'] = news_pevc_grouped.apply(generate_context_sentence, axis=1)

Apart from this, we also had summary data for each firm, broken down by year of investment, stage of investment, and the country, region and industry in which the firm invested.

This data was loaded and preprocessed for both Indian and international firms; the loading method is a little different for each because the data was not available in the same structure.

In [None]:
# sheet_name=None loads every sheet of the workbook into a dict of {sheet name: DataFrame}.
df_dict_foreign = pd.read_excel('Firm_Investment_Profile_2024_02_17_23_23_20.xlsx', sheet_name=None)

# Give every sheet the same firm column name that the rest of the notebook uses.
for sheet_df in df_dict_foreign.values():
    sheet_df.rename(columns={'Company Name': 'Institution'}, inplace=True)

In [None]:
def read_excel_files_in_folder_2(folder_path, sheet_name='Sheet1', skiprows=8):
    """Read one named sheet from every Excel workbook in a folder and combine them.

    Each sheet is read after skipping its first ``skiprows`` rows, and the file
    name (without extension) is stored in an ``Institution`` column. Rows whose
    first column equals "Total" are removed from the combined result.

    Args:
        folder_path: Directory containing the ``.xlsx`` / ``.xls`` files.
        sheet_name: Name of the sheet to read from each workbook.
        skiprows: Number of leading rows to skip in every sheet before the
            table starts. Defaults to 8, the value this notebook always used.

    Returns:
        pandas.DataFrame: The combined rows of all workbooks without the
        "Total" rows (the index keeps gaps where those rows were removed).

    Raises:
        ValueError: If the folder contains no Excel workbooks.
    """
    # Sort for a reproducible row order (os.listdir order is arbitrary), match
    # the extension case-insensitively, and skip the '~$...' lock files Excel
    # creates next to a workbook that is currently open.
    excel_files = sorted(
        file for file in os.listdir(folder_path)
        if file.lower().endswith(('.xlsx', '.xls')) and not file.startswith('~$')
    )
    if not excel_files:
        # pd.concat([]) would raise a much less helpful "No objects to concatenate".
        raise ValueError(f"No Excel files (.xlsx/.xls) found in {folder_path!r}")

    dfs = []
    for file in excel_files:
        df = pd.read_excel(os.path.join(folder_path, file), sheet_name=sheet_name, skiprows=skiprows)
        df['Institution'] = os.path.splitext(file)[0]  # file name identifies the firm
        dfs.append(df)

    # Concatenate all DataFrames into a single DataFrame
    concatenated_df = pd.concat(dfs, ignore_index=True)
    # Drop rows labelled "Total" in the first column (presumably per-file summary rows — verify).
    return concatenated_df[concatenated_df.iloc[:, 0] != "Total"]

In [None]:
# NOTE(review): absolute, machine-specific Windows path.
folder_path = r"D:\Python\dataset_rag\indpe_info"
# Sheets to read from every workbook; each name is passed to the reader as `sheet_name`.
selected_sheet_name = ['History','Industry','Stage','Status','Nation','World_Location']

# One combined DataFrame per sheet name, keyed by that sheet name.
df_dict_indian = {
    sheet: read_excel_files_in_folder_2(folder_path, sheet_name=sheet)
    for sheet in selected_sheet_name
}

In [None]:
# For every sheet loaded for the Indian firms, stack it on top of the sheet with the same
# name from the international workbook. A sheet name missing there raises KeyError.
df_dict_combined = {
    sheet: pd.concat([indian_df, df_dict_foreign[sheet]], ignore_index=True)
    for sheet, indian_df in df_dict_indian.items()
}

Applying the similar context generation functions in these dataframes

In [None]:
# Numeric summary columns that are removed from each summary frame once its
# context sentence has been generated from them.
columns_to_drop = ['Num of Investments', 'Sum of Investments, MM', 'Avg by Company, MM', 'Num of Companies']

def generate_context_Stage(row):
    """Render one row of the per-stage summary as a context sentence.

    The replacements mapping previously used the key 'Num' twice, so the
    companies count overwrote the investments count and the sentence reported
    the number of companies as the number of investments. The two counts now
    have separate placeholders.

    Args:
        row: Mapping or pandas.Series with the keys 'Stage', 'Institution',
            'Num of Investments', 'Sum of Investments, MM',
            'Avg by Company, MM' and 'Num of Companies'.

    Returns:
        str: The filled-in template (including its surrounding newlines and
        indentation).
    """
    template = '''
    context about how has the institutional investor invested in the category of {Stage} stage investments, {Company_Name} made {Num_of_Investments} investments, totaling {Sum} million, averaging {Avg} million per company. {Num_of_Companies} companies received funding in total. This is data is about how the investor has invested across all years, regions, industries, nations, etc. and should be read separately.
    '''
    # Placeholders and replacements:
    replacements = {
        'Stage': row['Stage'],
        'Num_of_Investments': row['Num of Investments'],
        'Sum': row['Sum of Investments, MM'],
        'Avg': row['Avg by Company, MM'],
        'Num_of_Companies': row['Num of Companies'],
        'Company_Name': row['Institution']
    }

    # Generating the context string:
    return template.format(**replacements)

# Add the sentence column, then remove the numeric columns it was built from.
# NOTE: not re-runnable — after the drop, the columns read by generate_context_Stage are gone.
stage_summary = df_dict_combined['Stage']
stage_summary['Stage_Context'] = stage_summary.apply(generate_context_Stage, axis=1)
stage_summary.drop(columns=columns_to_drop, inplace=True)


In [None]:
def generate_context_history(row):
    """Render one row of the per-year summary as a context sentence.

    Args:
        row: Mapping or pandas.Series with the keys 'Year', 'Institution',
            'Num of Investments', 'Sum of Investments, MM',
            'Avg by Company, MM' and 'Num of Companies'.

    Returns:
        str: The filled-in template (including its surrounding newlines and
        indentation).
    """
    sentence = '''
    context about how has the institutional investor in question invested in the year of {Year}, {Company_Name} made {Num_of_Investments} investments, totaling {Sum_of_Investments_MM} million, averaging {Avg_by_Company_MM} million per company. {Num_of_Companies} companies received funding in total. This is data is about how the investor has invested across all stages, regions, industries, nations, etc. and should be read separately.
    '''
    # Fill each placeholder straight from the corresponding summary column.
    return sentence.format(
        Year=row['Year'],
        Company_Name=row['Institution'],
        Num_of_Investments=row['Num of Investments'],
        Sum_of_Investments_MM=row['Sum of Investments, MM'],
        Avg_by_Company_MM=row['Avg by Company, MM'],
        Num_of_Companies=row['Num of Companies'],
    )

# Add the sentence column, then remove the numeric columns it was built from.
# NOTE: not re-runnable — after the drop, the columns read by generate_context_history are gone.
history_summary = df_dict_combined['History']
history_summary['History_Context'] = history_summary.apply(generate_context_history, axis=1)
history_summary.drop(columns=columns_to_drop, inplace=True)

In [None]:
def generate_context_industry(row):
    """Render one row of the per-industry summary as a context sentence.

    The replacements mapping previously used the key 'Num' twice, so the
    companies count overwrote the investments count and the sentence reported
    the number of companies as the number of investments. The two counts now
    have separate placeholders.

    Args:
        row: Mapping or pandas.Series with the keys
            'Investee Company TRBC Economic Sector', 'Institution',
            'Num of Investments', 'Sum of Investments, MM',
            'Avg by Company, MM' and 'Num of Companies'.

    Returns:
        str: The filled-in template (including its surrounding newlines and
        indentation).
    """
    template = '''
    context about how has the institutional investor invested in the {Investee} industry, {Company_Name} made {Num_of_Investments} investments, totaling {Sum} million, averaging {Avg} million per company. {Num_of_Companies} companies received funding in total. This is data is about how the investor has invested across all years, regions, stages, nations, etc. and should be read separately.
    '''
    # Placeholders and replacements:
    replacements = {
        'Investee': row['Investee Company TRBC Economic Sector'],
        'Num_of_Investments': row['Num of Investments'],
        'Sum': row['Sum of Investments, MM'],
        'Avg': row['Avg by Company, MM'],
        'Num_of_Companies': row['Num of Companies'],
        'Company_Name': row['Institution']
    }

    # Generating the context string:
    return template.format(**replacements)

# Add the sentence column, then remove the numeric columns it was built from.
# NOTE: not re-runnable — after the drop, the columns read by generate_context_industry are gone.
industry_summary = df_dict_combined['Industry']
industry_summary['Industry_Context'] = industry_summary.apply(generate_context_industry, axis=1)
industry_summary.drop(columns=columns_to_drop, inplace=True)

In [None]:
df_dict_combined['Industry']

In [None]:
def generate_context_nation(row):
    """Render one row of the per-nation summary as a context sentence.

    The replacements mapping previously used the key 'Num' twice, so the
    companies count overwrote the investments count and the sentence reported
    the number of companies as the number of investments. The two counts now
    have separate placeholders.

    Args:
        row: Mapping or pandas.Series with the keys 'Investee Company Nation',
            'Institution', 'Num of Investments', 'Sum of Investments, MM',
            'Avg by Company, MM' and 'Num of Companies'.

    Returns:
        str: The filled-in template (including its surrounding newlines and
        indentation).
    """
    template = '''
    context about how has the institutional investor invested in the country of {Nation}, {Company_Name} made {Num_of_Investments} investments, totaling {Sum} million, averaging {Avg} million per company. {Num_of_Companies} companies received funding in total. This is data is about how the investor has invested across all years, regions, industries, stages, etc. and should be read separately.
    '''
    # Placeholders and replacements:
    replacements = {
        'Nation': row['Investee Company Nation'],
        'Num_of_Investments': row['Num of Investments'],
        'Sum': row['Sum of Investments, MM'],
        'Avg': row['Avg by Company, MM'],
        'Num_of_Companies': row['Num of Companies'],
        'Company_Name': row['Institution']
    }

    # Generating the context string:
    return template.format(**replacements)

# Add the sentence column, then remove the numeric columns it was built from.
# NOTE: not re-runnable — after the drop, the columns read by generate_context_nation are gone.
nation_summary = df_dict_combined['Nation']
nation_summary['Nation_Context'] = nation_summary.apply(generate_context_nation, axis=1)
nation_summary.drop(columns=columns_to_drop, inplace=True)

In [None]:
def generate_context_World_Location(row):
    """Render one row of the per-region summary as a context sentence.

    Args:
        row: Mapping or pandas.Series with the keys
            'Investee Company World Sub Location', 'Institution',
            'Num of Investments', 'Sum of Investments, MM',
            'Avg by Company, MM' and 'Num of Companies'.

    Returns:
        str: The filled-in template (including its surrounding newlines and
        indentation).
    """
    sentence = '''
    context about how has the institutional investor in question invested in the region of {World_Location}, {Company_Name} made {Num_of_Investments} investments, totaling {Sum_of_Investments_MM} million, averaging {Avg_by_Company_MM} million per company. {Num_of_Companies} companies received funding in total. This is data is about how the investor has invested across all years, stages, industries, nations, etc. and should be read separately.
    '''
    # Template placeholder -> summary column that supplies its value.
    placeholder_to_column = {
        'World_Location': 'Investee Company World Sub Location',
        'Num_of_Investments': 'Num of Investments',
        'Sum_of_Investments_MM': 'Sum of Investments, MM',
        'Avg_by_Company_MM': 'Avg by Company, MM',
        'Num_of_Companies': 'Num of Companies',
        'Company_Name': 'Institution',
    }
    values = {placeholder: row[column] for placeholder, column in placeholder_to_column.items()}
    return sentence.format(**values)

# Add the sentence column, then remove the numeric columns it was built from.
# NOTE: not re-runnable — after the drop, the columns read by generate_context_World_Location are gone.
world_location_summary = df_dict_combined['World_Location']
world_location_summary['World_Location_Context'] = world_location_summary.apply(generate_context_World_Location, axis=1)
world_location_summary.drop(columns=columns_to_drop, inplace=True)

In [None]:
df_dict_combined['History']

In [None]:
# Save a checkpoint of the grouped deal/news frame to the working directory.
# The row index is written as the first CSV column because index=False is not passed.
news_pevc_grouped.to_csv('prelim.csv') # Saving the dataframe in csv format

In [None]:
# Columns whose values make up the keyword string of each context.
selected_columns = ['Investee Company', 'TRBC Industry', 'Stage', 'Status', 'Nation', 'Institution', 'Year', 'month', 'Region', 'Month-Year']

# 'Context key' is the ' - '-separated text of those columns, one string per row.
key_parts = news_pevc_grouped[selected_columns].astype(str)
news_pevc_grouped['Context key'] = key_parts.agg(' - '.join, axis=1)

In [None]:
news_pevc_grouped

In [None]:
news_pevc_grouped.columns

In [None]:
df_dict_combined.keys()

Merging each of the summary dataframe with the main dataframe

In [None]:
# Attach each firm's per-year summary sentence to its deals (left join keeps every deal).
# NOTE(review): both 'Year' columns are compared as text; a numeric year stored as float
# would become '2020.0' and never match '2020' — confirm the summary sheet's dtype.
deals_by_year = news_pevc_grouped.astype({'Institution': str, 'Year': str})
history_by_year = df_dict_combined['History'].astype({'Institution': str, 'Year': str})
Hist_merge = deals_by_year.merge(history_by_year, how='left', on=['Institution', 'Year'])

In [None]:
# Attach each firm's per-industry summary sentence to its deals (left join keeps every deal).
# NOTE(review): the deal column is 'TRBC Industry' while the summary column is a TRBC
# *economic sector*; rows only match where the two texts are identical — verify coverage.
deals_by_industry = Hist_merge.astype({'Institution': str, 'TRBC Industry': str})
industry_lookup = df_dict_combined['Industry'].astype(
    {'Institution': str, 'Investee Company TRBC Economic Sector': str}
)
Ind_merge = deals_by_industry.merge(
    industry_lookup,
    how='left',
    left_on=['Institution', 'TRBC Industry'],
    right_on=['Institution', 'Investee Company TRBC Economic Sector'],
)

In [None]:
# Attach each firm's per-stage summary sentence to its deals (left join keeps every deal).
deals_by_stage = Ind_merge.astype({'Institution': str, 'Stage': str})
stage_lookup = df_dict_combined['Stage'].astype({'Institution': str, 'Stage': str})
Stage_merge = deals_by_stage.merge(
    stage_lookup,
    how='left',
    left_on=['Institution', 'Stage'],
    right_on=['Institution', 'Stage'],
)

In [None]:
# Attach each firm's per-nation summary sentence to its deals (left join keeps every deal).
deals_by_nation = Stage_merge.astype({'Institution': str, 'Nation': str})
nation_lookup = df_dict_combined['Nation'].astype(
    {'Institution': str, 'Investee Company Nation': str}
)
Nation_merge = deals_by_nation.merge(
    nation_lookup,
    how='left',
    left_on=['Institution', 'Nation'],
    right_on=['Institution', 'Investee Company Nation'],
)

In [None]:
# Attach each firm's per-region summary sentence to its deals (left join keeps every deal).
# NOTE(review): 'Region' holds the keys of the `categories` dict; they only match where
# 'Investee Company World Sub Location' uses exactly the same region names — verify.
deals_by_region = Nation_merge.astype({'Institution': str, 'Region': str})
region_lookup = df_dict_combined['World_Location'].astype(
    {'Institution': str, 'Investee Company World Sub Location': str}
)
Final_merge = deals_by_region.merge(
    region_lookup,
    how='left',
    left_on=['Institution', 'Region'],
    right_on=['Institution', 'Investee Company World Sub Location'],
)

In [None]:
Final_merge.columns

In [None]:
Final_merge

In [None]:
# Retain just the textual context columns for the RAG model.
# .copy() makes context_df an independent frame, so adding columns to it later is not a
# chained assignment on a slice of Final_merge (which triggers SettingWithCopyWarning).
context_df = Final_merge[['Context key','Context','History_Context','Industry_Context','Stage_Context','Nation_Context','World_Location_Context']].copy()

In [None]:
# Save the per-column context strings to the working directory.
# The row index is written as the first CSV column because index=False is not passed.
context_df.to_csv('Combined.csv') # saving in csv format

Creating a single string that holds all the information

In [None]:
# Join the deal sentence and the five summary sentences of each row into one string.
# Summaries that found no match in the left merges are NaN; they are skipped so the
# literal text 'nan' never ends up in the context fed to the RAG model.
context_columns = ['Context', 'History_Context', 'Industry_Context', 'Stage_Context', 'Nation_Context', 'World_Location_Context']
# assign() returns a new frame, so this never writes into a slice of another DataFrame.
context_df = context_df.assign(
    Combined_Context=context_df[context_columns].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )
)

Finalizing the data that will be used for our rag model

In [None]:
# Final RAG input: the combined context text plus the keyword string that identifies it.
Fin_df = context_df[['Combined_Context','Context key']]

In [None]:
Fin_df.shape

In [None]:
# Write the final RAG input to the working directory.
# The row index is written as the first CSV column because index=False is not passed.
Fin_df.to_csv('Final_output.csv')

In [None]:
Fin_df