## Importing the Libraries ##

In [1]:
# pip install faker

In [2]:
from faker import Faker
import numpy as np
import pandas as pd
import random
from typing import List, Dict

## Building the class for data generation ##

In [3]:
# Fix the seed of both global random number generators (the standard
# library's and NumPy's legacy global one) so repeated runs draw the same
# numbers.
# NOTE(review): Faker presumably draws from its own generator, so these two
# seeds may not make the generated company names reproducible -- confirm,
# and consider seeding Faker as well.
_SEED = 42
random.seed(_SEED)
np.random.seed(_SEED)

In [10]:
class csvBuilder():
    """Build a synthetic startup-funding dataset as a pandas DataFrame.

    The dataset is produced in four steps, normally run through
    :meth:`generate`:

    1. :meth:`build_dataframe` - company name, founding date, industry,
       headquarters and founder experience.
    2. :meth:`variation_data_information` - funding amount, number of
       funding rounds, investor count and last funding date.
    3. :meth:`target_column` - a random ``status`` label
       (``'success'`` / ``'fail'``) whose odds depend on funding.
    4. :meth:`missing_vals` - blanks out 5% of three columns.

    All numeric sampling goes through NumPy's global ``np.random`` state,
    so seed it before calling if repeatable output is needed.
    NOTE(review): company names come from Faker, which presumably uses its
    own generator - confirm whether it needs to be seeded separately.
    """

    # Outcome labels and the (success, fail) odds used by target_column().
    _OUTCOMES = ['success', 'fail']
    _ODDS_HIGH = [0.6, 0.4]      # funding > 20M and more than 3 rounds
    _ODDS_MEDIUM = [0.25, 0.75]  # funding > 5M
    _ODDS_LOW = [0.15, 0.85]     # everything else

    def __init__(self,sample_size: int, industries: List[str], prob_indus: List[float], countries: List[str], prob_countries: List[float], funding_means: Dict[str,int]):
        """Store the sampling configuration; no data is generated yet.

        Args:
            sample_size: Number of rows (companies) to generate.
            industries: Industry labels to sample from.
            prob_indus: Sampling probability for each entry of ``industries``.
            countries: Headquarters labels to sample from.
            prob_countries: Sampling probability for each entry of
                ``countries``.
            funding_means: Base funding amount per industry label; looked up
                for every generated row.
        """
        self.companies = Faker()
        self.sample_size = sample_size
        self.industries = industries
        self.prob_indus = prob_indus
        self.countries = countries
        self.prob_countries = prob_countries
        self.funding_means = funding_means
        self.data = None

    def _require_data(self) -> pd.DataFrame:
        """Return the working frame, or fail clearly if it was never built."""
        if self.data is None:
            raise RuntimeError(
                "No data available: call build_dataframe() or generate() first."
            )
        return self.data

    def build_dataframe(self)-> pd.DataFrame:
        """Create the base frame with ``sample_size`` rows and store it.

        Columns: ``company_name`` (Faker), ``founded_on`` (a random day
        between 2010-01-01 and 2022-12-31), ``industry`` and
        ``headquarters`` (weighted choices) and ``founder_experience``
        (exponential with scale 5, truncated to an integer).

        Returns:
            The newly created DataFrame (also kept in ``self.data``).
        """
        n = self.sample_size
        # Dict order matters: it fixes the order of the np.random draws.
        self.data = pd.DataFrame(
            {
                'company_name': [self.companies.company() for _ in range(n)],
                'founded_on': pd.to_datetime(
                    np.random.choice(pd.date_range('2010-01-01', '2022-12-31'), size=n)
                ),
                'industry': np.random.choice(self.industries, size=n, p=self.prob_indus),
                'headquarters': np.random.choice(self.countries, size=n, p=self.prob_countries),
                'founder_experience': np.random.exponential(scale=5, size=n).astype(int),
            }
        )
        return self.data

    def variation_data_information(self) -> pd.DataFrame:
        """Add funding-related columns derived from industry and experience.

        For each row the funding is
        ``base * (1 + min(experience, 15) * 0.05) + noise`` where ``base`` is
        ``funding_means[industry]`` and ``noise`` is normal with standard
        deviation ``0.3 * base``; the result is floored at 1,000,000.
        Rounds and investor counts grow with funding and are clipped to
        1..8 and 1..20 respectively. ``last_funding_date`` is 1 to just
        under 10 years after ``founded_on``, capped at 2025-03-31.

        Returns:
            ``self.data`` with ``total_funding_usd``, ``num_funding_rounds``,
            ``investors_count`` and ``last_funding_date`` added.

        Raises:
            RuntimeError: If the base frame has not been built yet.
            KeyError: If a row's industry is missing from ``funding_means``.
        """
        data = self._require_data()
        self.investors = []
        self.rounds = []
        self.funding = []

        # Iterate over values, not index labels, so the method does not
        # depend on the frame having a default 0..n-1 index.
        rows = zip(data['industry'].to_numpy(), data['founder_experience'].to_numpy())
        for industry, exp in rows:
            base = self.funding_means[industry]
            # Draw order (normal, exponential, poisson) is kept per row so
            # seeded output stays stable.
            noise = np.random.normal(0, base * 0.3)
            multiplier = 1 + min(exp, 15) * 0.05

            fund = max(1_000_000, int(base * multiplier + noise))

            n_rounds = int(np.clip(int(np.random.exponential(scale=2) + fund / 15_000_000), 1, 8))
            n_investors = int(np.clip(int(np.random.poisson(3) + fund / 10_000_000), 1, 20))

            self.funding.append(fund)
            self.rounds.append(n_rounds)
            self.investors.append(n_investors)

        data['total_funding_usd'] = self.funding
        data['num_funding_rounds'] = self.rounds
        data['investors_count'] = self.investors

        offsets = pd.to_timedelta(
            np.random.randint(365, 365 * 10, size=len(data)), unit='D'
        )
        # Cap so that no funding date lies beyond the dataset's cut-off day.
        data['last_funding_date'] = (data['founded_on'] + offsets).clip(
            upper=pd.Timestamp('2025-03-31')
        )

        return self.data

    def target_column(self) -> pd.DataFrame:
        """Add the random ``status`` label, with odds driven by funding.

        Success probability is 0.6 when funding exceeds 20M and there are
        more than 3 rounds, 0.25 when funding exceeds 5M, and 0.15 otherwise.

        Returns:
            ``self.data`` with the ``status`` column added.

        Raises:
            RuntimeError: If the base frame has not been built yet.
        """
        data = self._require_data()
        statuses = []
        for funding, n_rounds in zip(data['total_funding_usd'], data['num_funding_rounds']):
            if funding > 20_000_000 and n_rounds > 3:
                odds = self._ODDS_HIGH
            elif funding > 5_000_000:
                odds = self._ODDS_MEDIUM
            else:
                odds = self._ODDS_LOW
            # One draw per row, in row order, as a plain Python string.
            statuses.append(str(np.random.choice(self._OUTCOMES, p=odds)))

        # Single assignment instead of cell-by-cell writes; also creates the
        # column when the frame is empty.
        data['status'] = statuses
        return self.data

    def missing_vals(self) -> pd.DataFrame:
        """Blank out a random 5% of rows in three columns, independently.

        Affects ``founder_experience``, ``last_funding_date`` and
        ``industry``. Integer columns are converted to float first because
        an integer column cannot hold NaN and newer pandas versions refuse
        the implicit upcast on assignment.

        Returns:
            ``self.data`` with missing values injected.

        Raises:
            RuntimeError: If the base frame has not been built yet.
        """
        data = self._require_data()
        for col in ['founder_experience', 'last_funding_date', 'industry']:
            chosen = data.sample(frac=0.05).index
            if pd.api.types.is_integer_dtype(data[col].dtype):
                data[col] = data[col].astype('float64')
            data.loc[chosen, col] = np.nan

        return self.data

    def generate(self) -> pd.DataFrame:
        """Run all four generation steps in order and return the result."""
        self.build_dataframe()
        self.variation_data_information()
        self.target_column()
        self.missing_vals()
        return self.data