In [12]:
# =========================
# 1. IMPORT LIBRARIES
# =========================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib

# =========================
# 2. LOAD DATASET
# =========================
data = pd.read_csv("startup data.csv")

print("Dataset shape:", data.shape)
print(data.head())

# =========================
# 3. HANDLE MISSING VALUES
# =========================
data = data.dropna()
print("After dropping nulls:", data.shape)

# =========================
# 4. ENCODE TARGET COLUMN (IMPORTANT)
# =========================
# Only encode 'status' if it is string

if data["status"].dtype == "object":
    le_target = LabelEncoder()
    data["status"] = le_target.fit_transform(data["status"])

# =========================
# 5. SELECT ONLY 10 IMPORTANT FEATURES
# =========================
important_features = [
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'relationships',
    'funding_rounds',
    'funding_total_usd',
    'milestones',
    'avg_participants',
    'is_top500'
]

# Make sure all features exist
missing = [col for col in important_features if col not in data.columns]
if missing:
    print("Missing columns:", missing)
else:
    print("All required columns are present.")

X = data[important_features]
y = data["status"]

print("Selected Features:", important_features)
print("Feature shape:", X.shape)

# =========================
# 6. TRAIN TEST SPLIT
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Training samples:", X_train.shape)
print("Testing samples:", X_test.shape)

# =========================
# 7. MODEL TRAINING
# =========================
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# =========================
# 8. PREDICTION
# =========================
y_pred = model.predict(X_test)

# =========================
# 9. EVALUATION
# =========================
acc = accuracy_score(y_test, y_pred)
print("\nAccuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# =========================
# 10. SAVE MODEL
# =========================
joblib.dump(model, "random_forest_model.pkl")

print("\nModel saved successfully as random_forest_model.pkl")


Dataset shape: (923, 49)
   Unnamed: 0 state_code   latitude   longitude zip_code       id  \
0        1005         CA  42.358880  -71.056820    92101   c:6669   
1         204         CA  37.238916 -121.973718    95032  c:16283   
2        1001         CA  32.901049 -117.192656    92121  c:65620   
3         738         CA  37.320309 -122.050040    95014  c:42668   
4        1002         CA  37.779281 -122.419236    94105  c:65806   

            city              Unnamed: 6               name  labels  ...  \
0      San Diego                     NaN        Bandsintown       1  ...   
1      Los Gatos                     NaN          TriCipher       1  ...   
2      San Diego      San Diego CA 92121              Plixi       1  ...   
3      Cupertino      Cupertino CA 95014  Solidcore Systems       1  ...   
4  San Francisco  San Francisco CA 94105     Inhale Digital       0  ...   

  object_id has_VC has_angel has_roundA  has_roundB  has_roundC  has_roundD  \
0    c:6669      0      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from google.colab import files
files.download("random_forest_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>