In [2]:
import pandas as pd

# 날짜 컬럼 방지
def read_csv_auto_dates(path):
    # 1) 전체 파일을 먼저 읽기
    temp = pd.read_csv(path, low_memory=False)
    
    # 2) _at 로 끝나는 컬럼 자동 탐지
    date_cols = [col for col in temp.columns if col.endswith('_at')]
    
    # 3) 다시 날짜 컬럼을 datetime 으로 읽기
    return pd.read_csv(path, parse_dates=date_cols, low_memory=False)

# Load Files (전처리)
acq = read_csv_auto_dates("./clean_data_v1/clean_acquisitions_final.csv")
deg = read_csv_auto_dates("./clean_data_v1/clean_degrees_final.csv")
frs = read_csv_auto_dates("./clean_data_v1/clean_fr_final.csv")
fds = read_csv_auto_dates("./clean_data_v1/clean_funds_final.csv")
inv = read_csv_auto_dates("./clean_data_v1/clean_investments_final.csv")
ipo = read_csv_auto_dates("./clean_data_v1/clean_ipos_final.csv")
mil = read_csv_auto_dates("./clean_data_v1/clean_milestones_final.csv")
obj = read_csv_auto_dates("./clean_data_v1/clean_objects_final.csv")
peo = read_csv_auto_dates("./clean_data_v1/clean_people_final.csv")
off = read_csv_auto_dates("./clean_data_v1/clean_offices_final.csv")
rel = read_csv_auto_dates("./clean_data_v1/clean_relationships_final.csv")

# Load Files (작업용 전처리)
company = read_csv_auto_dates("./entity_type/objects_company.csv")
finorg = read_csv_auto_dates("./entity_type/objects_finorg.csv")
person = read_csv_auto_dates("./entity_type/objects_person.csv")
product = read_csv_auto_dates("./entity_type/objects_product.csv")

# statistic join load
startup_profile = read_csv_auto_dates("./statistic_join/cy_startup_profile.csv")
founder_master = read_csv_auto_dates("./statistic_join/founder_master.csv")
success_master = read_csv_auto_dates("./statistic_join/success_master.csv")
investments_master = read_csv_auto_dates("./statistic_join/investments_master.csv")
vc_profile = read_csv_auto_dates("./statistic_join/vc_profile.csv")

print("="*60)
print("Dataset 로드 완료!")
print("="*60)

Dataset 로드 완료!


## vc 유형 군집: vc의 투자 패턴
grain: vc

[투자 스타트업 특성] 

- 스타트업 산업, 지역, 투자한 창업자 학위(평균), 학교(집계 어떻게 하냐)

[투자 방식]

- 투자 라운드(빈도수 가장 높은걸로), 평균 투자 간격, 총 투자 스타트업 수, 연 평균 투자 건수, 한 스타트업에 재투자 하는 비율


In [25]:
vc_profile

Unnamed: 0.1,Unnamed: 0,investor_cfp_id,description,country_code,obj_city_fixed,first_investment_at,last_investment_at,investment_rounds,invested_companies,relationships,...,is_past,sequence,cat_rel_title,title_diversity,degrees_p_id,cat_degrees_degree_type_vc,degree_level_vc,is_degree_missing_vc,cat_degrees_subject_vc,institution_normalized_vc
0,0,f:1,VC firm,USA,menlo park,2000-03-01,2013-12-05,307.0,196.0,71.0,...,0.0,1.0,Partner,13.0,p:32797,Master’s degree,3.0,0.0,,harvard university
1,1,f:2,,USA,redwood city,2003-11-01,2013-06-06,52.0,36.0,90.0,...,0.0,1.0,Board,16.0,,,,,,
2,2,f:4,,USA,palo alto,2000-02-15,2013-12-05,479.0,289.0,119.0,...,0.0,1.0,Partner,19.0,,,,,,
3,3,f:5,,USA,palo alto,2003-02-01,2013-11-18,64.0,52.0,7.0,...,0.0,2.0,Director,2.0,p:167018,Master’s degree,3.0,0.0,,
4,4,f:6,Multi-Stage Venture Capital,USA,san francisco,2005-07-01,2013-12-04,130.0,102.0,32.0,...,0.0,1.0,Partner,14.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7666,7666,f:12841,,USA,portland,2013-12-11,2013-12-11,1.0,1.0,1.0,...,1.0,1.0,Partner,1.0,p:101547,Bachelor’s degree,2.0,0.0,Marketing / Communications,iowa university
7667,7667,f:15067,,USA,eden prairie,2013-12-10,2013-12-10,1.0,1.0,0.0,...,,,,,,,,,,
7668,7668,f:15069,,,,2013-12-11,2013-12-11,1.0,1.0,0.0,...,,,,,,,,,,
7669,7669,f:2248,,,,NaT,NaT,,,,...,,,,,,,,,,


In [3]:
# investment_rounds: 참여한 총 라운드 수 
# invested_companies: 총 투자했던 회사 수 
# relationships: vc회사의 총 직원 수 
# title_diversity: 해당 직원 구성 다양성 

In [4]:
# 머신러닝을 위한 vc profile

base_cols = [
    "investor_cfp_id",  
    # vc가 투자한 스타트업의 산업 (비율)       
    # VC가 투자한 스타트업의 위치 (비율)
    "first_investment_at",
    "last_investment_at",
    "investment_rounds",        # vc가 참여한 총 라운드 수
    "invested_companies",       # 총 투자 스타트업 수 
    "relationships"            # vc회사의 총 직원 수
]

base_cols = [c for c in base_cols if c in vc_profile.columns]
vc_base = vc_profile[base_cols].copy()

In [5]:
import numpy as np

vc_base["first_investment_at"] = pd.to_datetime(vc_base["first_investment_at"], errors="coerce")
vc_base["last_investment_at"]  = pd.to_datetime(vc_base["last_investment_at"],  errors="coerce")

# 투자 활동 기간(년)
vc_base["investment_span_years"] = (
    (vc_base["last_investment_at"] - vc_base["first_investment_at"]).dt.days / 365.25
)

# 0/결측 보정
vc_base["investment_span_years_adj"] = vc_base["investment_span_years"].where(
    vc_base["investment_span_years"].notna() & (vc_base["investment_span_years"] > 0),
    1.0
)
# 연 평균 투자 라운드 수 = 투자를 얼마나 자주 하느냐
vc_base["avg_annual_rounds"] = np.where(
    vc_base["investment_span_years"] > 0,
    vc_base["investment_rounds"] / vc_base["investment_span_years"],
    np.nan
)

# 연 평균 신규 투자 회사 수 = 포트폴리오 확장 속도
vc_base["avg_annual_companies"] = np.where(
    vc_base["investment_span_years"] > 0,
    vc_base["invested_companies"] / vc_base["investment_span_years"],
    np.nan
)


# 평균 투자 간격(개월) - 추정치
span_days = (vc_base["last_investment_at"] - vc_base["first_investment_at"]).dt.days

vc_base["avg_invest_gap_months_est"] = np.where(
    (vc_base["investment_rounds"] > 1) & (span_days > 0),
    (span_days / (vc_base["investment_rounds"] - 1)) / 30.44,
    np.nan
)

vc_base

Unnamed: 0,investor_cfp_id,first_investment_at,last_investment_at,investment_rounds,invested_companies,relationships,investment_span_years,investment_span_years_adj,avg_annual_rounds,avg_annual_companies,avg_invest_gap_months_est
0,f:1,2000-03-01,2013-12-05,307.0,196.0,71.0,13.763176,13.763176,22.305898,14.240899,0.539688
1,f:2,2003-11-01,2013-06-06,52.0,36.0,90.0,9.596167,9.596167,5.418830,3.751498,2.257736
2,f:4,2000-02-15,2013-12-05,479.0,289.0,119.0,13.804244,13.804244,34.699474,20.935591,0.346522
3,f:5,2003-02-01,2013-11-18,64.0,52.0,7.0,10.795346,10.795346,5.928481,4.816891,2.056087
4,f:6,2005-07-01,2013-12-04,130.0,102.0,32.0,8.427105,8.427105,15.426413,12.103801,0.783852
...,...,...,...,...,...,...,...,...,...,...,...
7666,f:12841,2013-12-11,2013-12-11,1.0,1.0,1.0,0.000000,1.000000,,,
7667,f:15067,2013-12-10,2013-12-10,1.0,1.0,0.0,0.000000,1.000000,,,
7668,f:15069,2013-12-11,2013-12-11,1.0,1.0,0.0,0.000000,1.000000,,,
7669,f:2248,NaT,NaT,,,,,1.000000,,,


In [6]:
# # inv ←> objects merge
# # inv에서 f: 만 필터링 
# inv_f = inv.loc[
#     inv['investor_cfp_id'].astype(str).str.startswith('f:', na=False)
# ].copy()

# inv_obj = (
#     inv_f
#     .merge(obj, how='left', left_on='investor_cfp_id', right_on='objects_cfpr_id')
# )
# print(f"df_1 행수:{inv_obj.shape}")

In [7]:
# investment 테이블 기준으로 vc가 투자한 스타트업의 산업을 볼 수 있는 테이블 구성

# 1) VC 투자만 필터링 (f:)
inv_vc = inv.loc[
    inv['investor_cfp_id'].astype(str).str.startswith('f:', na=False)
].copy()

# 2) funding_rounds와 조인해서 스타트업 id 얻기
inv_frs = (
    inv_vc
    .merge(
        frs[['funding_round_id', 'fr_c_id']],
        how='left',
        on='funding_round_id'
    )
)

# object 테이블과 조인해서 산업 가져오기
inv_fr_obj = (
    inv_frs
    .merge(
        obj[['objects_cfpr_id', 'obj_category_filled', 'obj_city_fixed']],
        how='left',
        left_on='fr_c_id',
        right_on='objects_cfpr_id'
    )
)

# 테이블 붙이기 
vc_startup_industry = (
    inv_fr_obj[[
        'investor_cfp_id',
        'fr_c_id',
        'obj_category_filled',
        'obj_city_fixed'
    ]]
)

# vc가 투자한 산업, 지역 들어있음 !
vc_startup_industry


Unnamed: 0,investor_cfp_id,fr_c_id,obj_category_filled,obj_city_fixed
0,f:1,c:4,news,san francisco
1,f:2,c:4,news,san francisco
2,f:4,c:5,social,menlo park
3,f:1,c:5,social,menlo park
4,f:5,c:5,social,menlo park
...,...,...,...,...
61398,f:173,c:15847,security,san francisco
61399,f:1290,c:15847,security,san francisco
61400,f:73,c:15847,security,san francisco
61401,f:15098,c:70841,software,austin


In [8]:
# vc가 투자한 스타트업의 산업 (최빈값)
import numpy as np

# 최빈값 찾기
def safe_mode(s):
    s = s.dropna()
    if len(s) == 0:
        return np.nan
    return s.value_counts().idxmax()

# 1위 비중 계산
def top1_share(s):
    s = s.dropna()
    if len(s) == 0:
        return np.nan
    vc = s.value_counts()
    return vc.max() / vc.sum()


In [9]:
# vc 스타트업 산업별 집계
vc_industry = (
    vc_startup_industry
    .groupby("investor_cfp_id")
    .apply(lambda g: pd.Series({
        "startup_industry_top1": safe_mode(g["obj_category_filled"]),
        "startup_industry_top1_share": top1_share(g["obj_category_filled"]),
        "startup_industry_count": g["obj_category_filled"].nunique()
    }))
    .reset_index()
)

vc_industry


  .apply(lambda g: pd.Series({


Unnamed: 0,investor_cfp_id,startup_industry_top1,startup_industry_top1_share,startup_industry_count
0,f:1,software,0.198697,32.0
1,f:10,enterprise,0.172414,11.0
2,f:100,web,1.000000,1.0
3,f:10000,biotech,0.750000,2.0
4,f:10001,biotech,1.000000,1.0
...,...,...,...,...
7666,f:999,advertising,0.266667,9.0
7667,f:9991,education,0.200000,5.0
7668,f:9997,software,1.000000,1.0
7669,f:9998,fashion,0.200000,5.0


In [10]:
# vc 스타트업 지역별 집계
vc_city = (
    vc_startup_industry
    .groupby("investor_cfp_id")
    .apply(lambda g: pd.Series({
        "startup_city_top1": safe_mode(g["obj_city_fixed"]),
        "startup_city_top1_share": top1_share(g["obj_city_fixed"]),
        "startup_city_count": g["obj_city_fixed"].nunique()
    }))
    .reset_index()
)

vc_city


  .apply(lambda g: pd.Series({


Unnamed: 0,investor_cfp_id,startup_city_top1,startup_city_top1_share,startup_city_count
0,f:1,san francisco,0.237458,61.0
1,f:10,san diego,0.428571,19.0
2,f:100,san francisco,1.000000,1.0
3,f:10000,fort worth,0.250000,4.0
4,f:10001,fort worth,1.000000,1.0
...,...,...,...,...
7666,f:999,hamburg,0.916667,2.0
7667,f:9991,berkeley,0.400000,3.0
7668,f:9997,stamford,1.000000,1.0
7669,f:9998,new york,0.800000,2.0


In [11]:
# vc_base에 붙이기 (산업, 지역)
vc_base = vc_base.merge(vc_industry, how="left", on="investor_cfp_id")
vc_base = vc_base.merge(vc_city, how="left", on="investor_cfp_id")

In [12]:
# 회사(스타트업)별 대표 학력 요약 테이블
# vc 투자만 필터 
inv_vc = inv.loc[
    inv["investor_cfp_id"].astype(str).str.startswith("f:", na=False)
].copy()

# 필요한 컬럼만
rel_col = rel[["rel_cf_id", "rel_p_id", "cat_rel_title"]].copy()

# Founder만
rel_founder = rel_col.loc[
    rel_col["cat_rel_title"] == "Founder"
].copy()

# 결측/중복 제거
rel_founder = rel_founder.dropna(subset=["rel_cf_id", "rel_p_id"]).drop_duplicates()

inv_rel = inv_vc.merge(
    rel_founder,
    how="left",
    left_on="invested_c_id",   # 투자받은 스타트업(회사) id
    right_on="rel_cf_id"       # 관계 테이블의 회사 id
)

inv_rel[["investor_cfp_id", "invested_c_id", "rel_p_id"]]


# degree 에서 필요한 컬럼 추출
deg_col = deg[["degrees_p_id", "institution", "degree_level"]].copy()
deg_col = deg_col.dropna(subset=["degrees_p_id"]).drop_duplicates()

# degrees 붙이기
inv_rel_deg = inv_rel.merge(
    deg_col,
    how="left",
    left_on="rel_p_id",        # founder person id
    right_on="degrees_p_id"    # degrees person id
)


# (VC, startup)별로 최고학위 founder 1명만 남기기
inv_rel_deg_sorted = inv_rel_deg.sort_values(
    ["investor_cfp_id", "invested_c_id", "degree_level", "institution"],
    ascending=[True, True, False, True],
    na_position="last"
)

vc_startup_rep = (
    inv_rel_deg_sorted
    .drop_duplicates(subset=["investor_cfp_id", "invested_c_id"])
    [["investor_cfp_id", "invested_c_id", "degree_level", "institution"]]
    .copy()
)

vc_startup_rep

Unnamed: 0,investor_cfp_id,invested_c_id,degree_level,institution
57865,f:1,c:1088,,
2591,f:1,c:1101,3.0,Tel Aviv University
4380,f:1,c:1102,,
24423,f:1,c:11042,2.0,University of Kansas
47477,f:1,c:11391,4.0,University of Michigan Law School
...,...,...,...,...
73564,f:9998,c:152816,,
83938,f:9998,c:191881,2.0,University of Chicago
80277,f:9998,c:193389,,
96932,f:9998,c:278633,,


In [13]:
# VC별 평균 founder 학위
vc_degree = (
    vc_startup_rep
    .groupby("investor_cfp_id")["degree_level"]
    .mean()
    .reset_index(name="founder_degree_level_mean")
)

# VC별 자주 투자한 학교
vc_school = (
    vc_startup_rep
    .groupby("investor_cfp_id")["institution"]
    .agg(lambda s: s.dropna().value_counts().idxmax()
         if s.dropna().size else np.nan)
    .reset_index(name="founder_institution_top1")
)


In [None]:
# vc_base에 붙이기 (학위, 학교)
vc_base = vc_base.merge(vc_degree, how="left", on="investor_cfp_id")
vc_base = vc_base.merge(vc_school, how="left", on="investor_cfp_id")

In [15]:
# VC–스타트업별 투자 횟수 세기
vc_startup_invest_cnt = (
    inv
    .groupby(["investor_cfp_id", "invested_c_id"])
    .size()
    .reset_index(name="invest_cnt")
)

# 재투자 여부 플래그 만들기
vc_startup_invest_cnt["is_follow_on"] = (
    vc_startup_invest_cnt["invest_cnt"] >= 2
).astype(int)

# VC별 재투자율 계산
vc_followon = (
    vc_startup_invest_cnt
    .groupby("investor_cfp_id")
    .agg(
        followon_startup_cnt=("is_follow_on", "sum"),
        total_startup_cnt=("invested_c_id", "nunique")
    )
    .reset_index()
)

vc_followon["followon_ratio"] = (
    vc_followon["followon_startup_cnt"] /
    vc_followon["total_startup_cnt"]
)

vc_followon


Unnamed: 0,investor_cfp_id,followon_startup_cnt,total_startup_cnt,followon_ratio
0,c:10020,0,1,0.0
1,c:10022,0,3,0.0
2,c:10071,0,1,0.0
3,c:10085,0,1,0.0
4,c:100936,0,1,0.0
...,...,...,...,...
17147,p:99790,0,1,0.0
17148,p:99826,0,3,0.0
17149,p:99827,0,1,0.0
17150,p:99898,0,1,0.0


In [16]:
# vc_base에 붙이기 (산업, 지역)
vc_base = vc_base.merge(vc_followon, how="left", on="investor_cfp_id")

In [17]:
frs

Unnamed: 0,funding_round_id,fr_c_id,funded_at,funding_round_type,funding_round_code,raised_amount_usd,pre_money_valuation_usd,post_money_valuation_usd,participants,is_first_round,is_last_round,funded_year,funded_quarter,cat_fr_type,num_fr_type,log_participants,is_fr_raised_private
0,1,c:4,2006-12-01,series-b,b,8500000.0,,,2,0,0,2006.0,2006Q4,series-b,2,1.098612,0
1,2,c:5,2004-09-01,angel,angel,500000.0,,,2,0,1,2004.0,2004Q3,seed,0,1.098612,0
2,3,c:5,2005-05-01,series-a,a,12700000.0,115000000.0,,3,0,0,2005.0,2005Q2,series-a,1,1.386294,0
3,4,c:5,2006-04-01,series-b,b,27500000.0,525000000.0,,4,0,0,2006.0,2006Q2,series-b,2,1.609438,0
4,5,c:7299,2006-05-01,series-b,b,10500000.0,,,2,0,0,2006.0,2006Q2,series-b,2,1.098612,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52923,57948,c:211890,2013-12-12,series-a,a,3000000.0,,,1,1,1,2013.0,2013Q4,series-a,1,0.693147,0
52924,57949,c:267427,2010-02-06,venture,partial,570000.0,,,0,0,1,2010.0,2010Q1,venture,99,0.000000,0
52925,57950,c:261728,2010-02-06,venture,unattributed,2184100.0,,,0,0,1,2010.0,2010Q1,venture,99,0.000000,0
52926,57951,c:285864,2013-12-12,series-a,a,790783.0,,,0,1,1,2013.0,2013Q4,series-a,1,0.000000,0


In [18]:
# vc별로 자주 투자한 라운드
# VC 투자만
inv_vc = inv.loc[
    inv["investor_cfp_id"].astype(str).str.startswith("f:", na=False)
].copy()

# funding_rounds 붙이기 (라운드 타입 가져오기)
inv_vc_round = inv_vc.merge(
    frs[["funding_round_id", "cat_fr_type"]], 
    how="left",
    on="funding_round_id"
)

# vc별로 자주 투자한 라운드
vc_round_mode = (
    inv_vc_round
    .groupby("investor_cfp_id")
    .apply(lambda g: pd.Series({
        "favorite_round_type": safe_mode(g["cat_fr_type"])
    }))
    .reset_index()
)

vc_base = vc_base.merge(
    vc_round_mode,
    how="left",
    on="investor_cfp_id"
)

  .apply(lambda g: pd.Series({


In [19]:
vc_base

Unnamed: 0,investor_cfp_id,first_investment_at,last_investment_at,investment_rounds,invested_companies,relationships,investment_span_years,investment_span_years_adj,avg_annual_rounds,avg_annual_companies,...,startup_industry_count,startup_city_top1,startup_city_top1_share,startup_city_count,founder_degree_level_mean,founder_institution_top1,followon_startup_cnt,total_startup_cnt,followon_ratio,favorite_round_type
0,f:1,2000-03-01,2013-12-05,307.0,196.0,71.0,13.763176,13.763176,22.305898,14.240899,...,32.0,san francisco,0.237458,61.0,2.846154,Stanford University,76,196,0.387755,series-c+
1,f:2,2003-11-01,2013-06-06,52.0,36.0,90.0,9.596167,9.596167,5.418830,3.751498,...,17.0,san francisco,0.423077,21.0,2.666667,"University of California, Berkeley",10,36,0.277778,series-a
2,f:4,2000-02-15,2013-12-05,479.0,289.0,119.0,13.804244,13.804244,34.699474,20.935591,...,30.0,san francisco,0.165217,104.0,2.843478,Stanford University,125,289,0.432526,series-c+
3,f:5,2003-02-01,2013-11-18,64.0,52.0,7.0,10.795346,10.795346,5.928481,4.816891,...,20.0,sunnyvale,0.107692,36.0,2.823529,Harvard University,12,52,0.230769,series-c+
4,f:6,2005-07-01,2013-12-04,130.0,102.0,32.0,8.427105,8.427105,15.426413,12.103801,...,31.0,san francisco,0.404959,32.0,2.593220,Stanford University,21,102,0.205882,series-a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7666,f:12841,2013-12-11,2013-12-11,1.0,1.0,1.0,0.000000,1.000000,,,...,1.0,kirkland,1.000000,1.0,,,0,1,0.000000,venture
7667,f:15067,2013-12-10,2013-12-10,1.0,1.0,0.0,0.000000,1.000000,,,...,1.0,chicago,1.000000,1.0,2.000000,Southern Methodist University - Cox School of ...,0,1,0.000000,venture
7668,f:15069,2013-12-11,2013-12-11,1.0,1.0,0.0,0.000000,1.000000,,,...,1.0,melville,1.000000,1.0,,CW Post,0,1,0.000000,series-a
7669,f:2248,NaT,NaT,,,,,1.000000,,,...,1.0,san diego,1.000000,1.0,,,0,1,0.000000,series-b


## vc_base 컬럼 설명
investor_cfp_id: vc id

first_investment_at / last_investment_at: vc의 첫 투자 마지막 투자

investment_rounds: vc가 참여한 총 라운드 수

invested_companies: vc가 총 투자한 회사 수

relationships: vc회사의 총 직원 수 

investment_span_years: 투자 활동 기간(년) -> investment_span_years_adj: 결측 보정한 거

avg_annual_rounds: 연 평균 투자 라운드 수 = 투자를 얼마나 자주 하느냐

avg_annual_companies: 연 평균 신규 투자 회사 수 = 포트폴리오 확장 속도

avg_invest_gap_months_est: 평균 투자 간격(개월)

startup_industry_top1: 가장 많이 투자하는 산업 1위

startup_industry_top1_share: VC의 전체 투자 중, 가장 많이 투자한 산업이 차지하는 비율

startup_industry_count: VC가 투자한 스타트업 산업의 개수

startup_city_top1: 가장 많이 투자하는 도시 1위

startup_city_top1_share: VC의 전체 투자 중, 가장 많이 투자한 도시이 차지하는 비율

startup_city_count: VC가 투자한 스타트업 도시의 개수

founder_degree_level_mean: VC가 투자한 회사의 창업자 학위 레벨 (평균)

founder_institution_top1: 가장 많이 투자하는 학교 1위

followon_ratio: 동일 기업의 재투자율

favorite_round_type: 가장 많이 투자하는 라운드

In [23]:
vc_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7671 entries, 0 to 7670
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   investor_cfp_id              7671 non-null   object        
 1   first_investment_at          7592 non-null   datetime64[ns]
 2   last_investment_at           7592 non-null   datetime64[ns]
 3   investment_rounds            7600 non-null   float64       
 4   invested_companies           7600 non-null   float64       
 5   relationships                7600 non-null   float64       
 6   investment_span_years        7592 non-null   float64       
 7   investment_span_years_adj    7671 non-null   float64       
 8   avg_annual_rounds            4162 non-null   float64       
 9   avg_annual_companies         4162 non-null   float64       
 10  avg_invest_gap_months_est    4162 non-null   float64       
 11  startup_industry_top1        7657 non-null 

In [20]:
# csv 쓰기 (데이터 프레임 편집, 분석한 데이터 프레임 저장)
vc_base.to_csv('vc_base.csv', index=False)
print("전처리된 데이터를 vc_base.csv 파일로 저장했습니다.")

전처리된 데이터를 vc_base.csv 파일로 저장했습니다.
