In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
# Install a blanket 'ignore' filter so no warning category is shown
warnings.simplefilter('ignore')

# Allow up to 100 columns to be rendered when a DataFrame is displayed
pd.options.display.max_columns = 100

In [2]:
processed_path = 'cleaned_data_with_target.csv'

# Load the processed dataset into `df`.
# If the file is missing, report the path and re-raise: every later cell reads
# `df`, so swallowing the error here would only defer the failure to a
# confusing NameError further down the notebook.
try:
    df = pd.read_csv(processed_path)
except FileNotFoundError:
    print(f"Error: File not found at {processed_path}")
    raise
else:
    print("Processed data loaded successfully!")
    print(f"Dataset shape: {df.shape}")

Processed data loaded successfully!
Dataset shape: (38398, 15)


In [3]:
# Parse both date columns; values that cannot be parsed are coerced to NaT
date_columns = ['founded_at', 'last_funding_at']
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Whole days elapsed between founding and the last funding date.
# NOTE(review): this is negative whenever founded_at is later than
# last_funding_at; such rows are kept by the dropna below.
funding_gap = df['last_funding_at'] - df['founded_at']
df['company_age_days'] = funding_gap.dt.days

# Discard rows whose age is undefined (either date missing or unparseable)
df.dropna(subset=['company_age_days'], inplace=True)

print("Created 'company_age_days' feature.")
df[date_columns + ['company_age_days']].head()

Created 'company_age_days' feature.


Unnamed: 0,founded_at,last_funding_at,company_age_days
0,2012-06-01,2012-06-30,29.0
2,2012-10-26,2012-08-09,-78.0
3,2011-04-01,2011-04-01,0.0
4,2014-01-01,2014-09-26,268.0
5,2011-10-10,2013-05-31,599.0


In [4]:
print(f"Number of unique markets before simplification: {df['market'].nunique()}")

# How many of the most frequent markets keep their own category
n_top_markets = 15

# Get the most frequent markets (value_counts() ignores missing values)
top_markets = df['market'].value_counts().nlargest(n_top_markets).index

# Vectorised membership test instead of a per-row Python lambda: keep the
# market where it is one of the top ones, otherwise (including missing
# values) collapse it into 'Other'.
# NOTE(review): the labels appear to carry leading whitespace (e.g.
# ' Software') — confirm whether they should be stripped upstream.
is_top_market = df['market'].isin(top_markets)
df['market_simplified'] = df['market'].where(is_top_market, 'Other')

print(f"Number of unique markets after simplification: {df['market_simplified'].nunique()}")
print("\nNew market distribution:")
print(df['market_simplified'].value_counts())

Number of unique markets before simplification: 714
Number of unique markets after simplification: 16

New market distribution:
market_simplified
Other                    14561
 Software                 3328
 Biotechnology            2375
 Mobile                   1402
 E-Commerce               1175
 Curated Web              1094
 Enterprise Software       966
 Health Care               817
 Advertising               778
 Hardware + Software       767
 Games                     758
 Clean Technology          684
 Social Media              624
 Health and Wellness       588
 Education                 572
 Finance                   547
Name: count, dtype: int64


In [5]:
# Define which columns are numerical and which are categorical
numerical_features = ['funding_total_usd', 'funding_rounds', 'seed', 'venture', 'round_A', 'round_B', 'round_C', 'company_age_days']
categorical_features = ['market_simplified', 'country_code']

# Separate features (X) and target (y).
# Take an explicit copy: X is a column selection of df, and filling it through
# `X[col].fillna(..., inplace=True)` is chained in-place assignment, which
# raises SettingWithCopy/FutureWarning and has no effect under pandas
# Copy-on-Write. Working on a copy and assigning the result back is reliable
# and leaves df untouched.
X = df[numerical_features + categorical_features].copy()
y = df['acquired_within_1_year']

# Fill any remaining NaN values in numerical columns with the column median.
# NOTE(review): the median is computed over every row present here; if a
# train/test split happens later, consider imputing inside the model pipeline.
for col in numerical_features:
    X[col] = X[col].fillna(X[col].median())

# Fill NaNs in categorical features with the literal label 'missing'
for col in categorical_features:
    X[col] = X[col].fillna('missing')

print("Features and target variable are ready.")

Features and target variable are ready.


In [6]:
# Place the target next to the feature columns, aligned on the row index
final_df_for_modeling = pd.concat(objs=[X, y], axis='columns')

# Write the combined table to disk without the index column
output_path = 'final_modeling_data.csv'
final_df_for_modeling.to_csv(path_or_buf=output_path, index=False)

# Report where the file went and how large the table is, then preview it
print(f"Final dataset for modeling saved to {output_path}")
print(f"Shape of the final dataset: {final_df_for_modeling.shape}")
final_df_for_modeling.head()

Final dataset for modeling saved to final_modeling_data.csv
Shape of the final dataset: (31036, 11)


Unnamed: 0,funding_total_usd,funding_rounds,seed,venture,round_A,round_B,round_C,company_age_days,market_simplified,country_code,acquired_within_1_year
0,1750000.0,1.0,1750000.0,0.0,0.0,0.0,0.0,29.0,Other,USA,0
2,40000.0,1.0,40000.0,0.0,0.0,0.0,0.0,-78.0,Other,EST,0
3,1500000.0,1.0,1500000.0,0.0,0.0,0.0,0.0,0.0,Other,GBR,0
4,60000.0,2.0,0.0,0.0,0.0,0.0,0.0,268.0,Other,USA,0
5,7000000.0,1.0,0.0,7000000.0,0.0,7000000.0,0.0,599.0,Software,USA,0
