<a href="https://colab.research.google.com/github/shigee2004/Startup-revenue-predictor/blob/main/startup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import joblib
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os
# Display the working directory contents to confirm the model/data files are present
os.listdir()


['.config',
 'Preprocessed_time_series_data.csv',
 'lstm_funding_model.keras',
 'xgboost_startup_model_v3.json',
 'preprocessed_startup_dataset.csv',
 'sample_data']

In [None]:
# Restore the two saved models from the working directory:
# the XGBoost booster (JSON) and the Keras LSTM.
xgb_model = xgb.Booster(model_file="xgboost_startup_model_v3.json")
lstm_model = load_model("lstm_funding_model.keras")


In [None]:
# Load the time-series dataset that feeds the LSTM.
# NOTE(review): the file name suggests it is already preprocessed — confirm
# which steps were applied upstream before adding more here.
df_lstm = pd.read_csv("Preprocessed_time_series_data.csv")


In [None]:
# Define LSTM features: the df_lstm columns that are scaled and windowed below.
# NOTE(review): column order presumably has to match the order used when the
# LSTM was trained — confirm against the training notebook.
lstm_features = ["Funding Amount (M$)", "Number of Investors", "Investor Reputation",
                 "Economic Indicator", "Sector Growth Rate (%)", "Estimated Valuation (M$)"]


In [None]:
# Min-max scale the LSTM feature columns, overwriting them in df_lstm.
# NOTE(review): this fits a *new* MinMaxScaler on the inference data instead of
# reusing the scaler fitted at training time, so the per-column min/max may
# differ from what the model saw — TODO persist the training scaler and load it here.
lstm_scaler = MinMaxScaler()
df_lstm[lstm_features] = lstm_scaler.fit_transform(df_lstm[lstm_features])


In [None]:

# Sequence creation for LSTM
def create_sequences(data, seq_length=12):
    """Slice ``data`` into overlapping windows of ``seq_length`` consecutive rows.

    Window ``i`` is ``data[i:i + seq_length]``. ``len(data) - seq_length``
    windows are produced, so the last window ends one row before the end of
    ``data`` (the final row is never part of any returned window's tail).

    Args:
        data: Array-like indexed by time step along its first axis.
        seq_length: Number of consecutive rows per window; must be positive.

    Returns:
        ``np.ndarray`` of shape ``(n_windows, seq_length, *data.shape[1:])``.
        When ``data`` has too few rows to form a window, an empty array with
        that same trailing shape is returned (``n_windows == 0``) instead of
        a shapeless ``(0,)`` array.

    Raises:
        ValueError: If ``seq_length`` is not positive.
    """
    if seq_length <= 0:
        raise ValueError(f"seq_length must be positive, got {seq_length}")

    data = np.asarray(data)
    n_windows = max(len(data) - seq_length, 0)
    if n_windows == 0:
        # Keep the rank consistent so callers can still rely on .shape[1:].
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)
    return np.stack([data[i:i + seq_length] for i in range(n_windows)])


In [None]:
# Window the scaled feature matrix using create_sequences' default length (12 rows per window)
X_lstm_seq = create_sequences(df_lstm[lstm_features].values)


In [None]:
# Predict the next funding value from the most recent observations.
# create_sequences() yields len(data) - seq_length windows, so its last window
# is data[-seq_length-1:-1] and leaves out the newest row; feeding that window
# would "forecast" a step that is already observed. Build the input from the
# final seq_length rows instead so the forecast is for the step after the data.
# NOTE(review): assumes the LSTM was trained on window -> next-step targets — confirm.
seq_length = X_lstm_seq.shape[1]
latest_window = df_lstm[lstm_features].to_numpy()[-seq_length:][np.newaxis, ...]
lstm_prediction = lstm_model.predict(latest_window)[0][0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step


In [None]:
# Show the single value produced by the LSTM.
# NOTE(review): the inputs were min-max scaled, so this number is presumably on
# the scaled axis rather than in real M$ — confirm, and inverse-transform with
# the training scaler before presenting it as M$.

print(f"LSTM Predicted Funding (M$): {lstm_prediction:.4f}")



LSTM Predicted Funding (M$): 0.3421


In [None]:
# Load the tabular startup dataset used as XGBoost input
df_xgb = pd.read_csv("preprocessed_startup_dataset.csv")

# Print the column names so they can be matched against the model's feature list
print(df_xgb.columns.tolist())

['Startup Name', 'Total Funding (M$)', 'Number of Investors', 'Investor Reputation', 'Growth Rate (%)', 'Revenue (M$)', 'Market Size', 'Years Since Founded', 'Success Score', 'Industry_AgriTech', 'Industry_Autonomous Vehicles', 'Industry_Cybersecurity', 'Industry_EdTech', 'Industry_FinTech', 'Industry_GreenTech', 'Industry_Healthcare', 'Industry_Quantum Computing']


In [None]:
# Build a one-row sample for XGBoost from df_xgb (loaded in the previous cell;
# the duplicate CSV read and column print that used to live here were redundant).
# The first row serves as the example startup; .copy() keeps the column added
# below from touching df_xgb.
xgb_sample = df_xgb.iloc[[0]].copy()

# Attach the LSTM output to the sample for reference.
# NOTE(review): this column is not part of features_used, so the XGBoost
# prediction further down does not consume it.
xgb_sample["Predicted Funding (M$)"] = lstm_prediction

['Startup Name', 'Total Funding (M$)', 'Number of Investors', 'Investor Reputation', 'Growth Rate (%)', 'Revenue (M$)', 'Market Size', 'Years Since Founded', 'Success Score', 'Industry_AgriTech', 'Industry_Autonomous Vehicles', 'Industry_Cybersecurity', 'Industry_EdTech', 'Industry_FinTech', 'Industry_GreenTech', 'Industry_Healthcare', 'Industry_Quantum Computing']


In [None]:

['Startup Name', 'Total Funding (M$)', 'Number of Investors', 'Investor Reputation', 'Growth Rate (%)', 'Revenue (M$)', 'Market Size', 'Years Since Founded', 'Success Score', 'Industry_AgriTech', 'Industry_Autonomous Vehicles', 'Industry_Cybersecurity', 'Industry_EdTech', 'Industry_FinTech', 'Industry_GreenTech', 'Industry_Healthcare', 'Industry_Quantum Computing', 'Predicted Funding (M$)']


['Startup Name',
 'Total Funding (M$)',
 'Number of Investors',
 'Investor Reputation',
 'Growth Rate (%)',
 'Revenue (M$)',
 'Market Size',
 'Years Since Founded',
 'Success Score',
 'Industry_AgriTech',
 'Industry_Autonomous Vehicles',
 'Industry_Cybersecurity',
 'Industry_EdTech',
 'Industry_FinTech',
 'Industry_GreenTech',
 'Industry_Healthcare',
 'Industry_Quantum Computing',
 'Predicted Funding (M$)']

In [None]:
# Feature columns handed to the XGBoost model: every dataset column except the
# non-numeric 'Startup Name'.
# NOTE(review): 'Revenue (M$)' is included even though the model's output is
# reported as revenue further down — confirm it is an input (current revenue)
# and not the training target.
features_used = [
    'Total Funding (M$)',
    'Number of Investors',
    'Investor Reputation',
    'Growth Rate (%)',
    'Revenue (M$)',
    'Market Size',
    'Years Since Founded',
    'Success Score',
    'Industry_AgriTech',
    'Industry_Autonomous Vehicles',
    'Industry_Cybersecurity',
    'Industry_EdTech',
    'Industry_FinTech',
    'Industry_GreenTech',
    'Industry_Healthcare',
    'Industry_Quantum Computing'
]


In [None]:

# Validate the expected features against the sample's columns.
# Silently dropping a missing column would hand XGBoost a narrower feature
# matrix than it expects, so stop with a clear error instead.
missing_features = [col for col in features_used if col not in xgb_sample.columns]
if missing_features:
    raise KeyError(f"Missing expected feature columns: {missing_features}")
valid_features = list(features_used)

# Print and double-check
print("✅ Valid features being used:", valid_features)


✅ Valid features being used: ['Total Funding (M$)', 'Number of Investors', 'Investor Reputation', 'Growth Rate (%)', 'Revenue (M$)', 'Market Size', 'Years Since Founded', 'Success Score', 'Industry_AgriTech', 'Industry_Autonomous Vehicles', 'Industry_Cybersecurity', 'Industry_EdTech', 'Industry_FinTech', 'Industry_GreenTech', 'Industry_Healthcare', 'Industry_Quantum Computing']


In [None]:
# Keep only the model's feature columns, wrap them in a DMatrix,
# and score the one-row sample with the booster.
xgb_sample_filtered = xgb_sample.loc[:, valid_features]
dtest = xgb.DMatrix(xgb_sample_filtered)
predictions = xgb_model.predict(dtest)



In [None]:

# Optional: add the XGBoost output back to the sample DataFrame.
# It is stored in its own revenue column: the booster's output is what the
# later cells report as revenue, and writing it to 'Predicted Funding (M$)'
# would overwrite the LSTM funding forecast held there.
xgb_sample['Predicted Revenue (M$)'] = predictions


In [None]:
# Confirmation message only; no data is read or modified here
print("✅ Predictions added to DataFrame!")



✅ Predictions added to DataFrame!


In [None]:
# Reuse filtered features
# (rebuilds a DMatrix from the same xgb_sample_filtered frame scored earlier)
dtest = xgb.DMatrix(xgb_sample_filtered)

# Predict revenue
# [0] picks the prediction for the single sample row
predicted_revenue = xgb_model.predict(dtest)[0]

# NOTE(review): the message says "hybrid", but xgb_sample_filtered contains only
# the features_used columns, so the LSTM output is not an input to this
# prediction — confirm whether the LSTM forecast was meant to feed the booster.
print(f"Final Predicted Revenue (M$) using hybrid model: {predicted_revenue:.4f}")


Final Predicted Revenue (M$) using hybrid model: 2.2648


In [None]:
import pandas as pd
import xgboost as xgb
import joblib


In [None]:

# Load the XGBoost model
# (same file that is loaded near the top of the notebook; presumably repeated
# so the interactive section below can be run on its own)
xgb_model = xgb.Booster()
xgb_model.load_model("xgboost_startup_model_v3.json")


In [None]:
# Define expected features: the column names, in order, selected from the
# user-input DataFrame before it is passed to the booster.
# This repeats the feature list defined earlier in the notebook; keep the two in sync.
features_used = [
    'Total Funding (M$)', 'Number of Investors', 'Investor Reputation',
    'Growth Rate (%)', 'Revenue (M$)', 'Market Size', 'Years Since Founded',
    'Success Score', 'Industry_AgriTech', 'Industry_Autonomous Vehicles',
    'Industry_Cybersecurity', 'Industry_EdTech', 'Industry_FinTech',
    'Industry_GreenTech', 'Industry_Healthcare', 'Industry_Quantum Computing'
]



In [None]:
def _ask_number(prompt, cast, low=None, high=None):
    """Prompt until the reply parses with ``cast`` and lies within the bounds.

    Args:
        prompt: Text shown to the user.
        cast: ``int`` or ``float``; applied to the raw reply.
        low: Inclusive lower bound, or ``None`` for no lower bound.
        high: Inclusive upper bound, or ``None`` for no upper bound.

    Returns:
        The parsed number.
    """
    while True:
        reply = input(prompt)
        try:
            value = cast(reply)
        except ValueError:
            print(f"'{reply}' is not a valid number, please try again.")
            continue
        # "not value >= low" (rather than "value < low") also rejects NaN.
        below = low is not None and not value >= low
        above = high is not None and not value <= high
        if below or above:
            print(f"Value must be between {low} and {high}, please try again.")
            continue
        return value


# Prompt user input.
# A typo re-prompts instead of crashing the cell, and the ranges advertised in
# the prompts (0-10 and 0-100) are enforced. Other fields are left unbounded.
print("Please enter the following startup details:\n")

user_input = {
    'Total Funding (M$)': _ask_number("Total Funding (in M$): ", float),
    'Number of Investors': _ask_number("Number of Investors: ", int),
    'Investor Reputation': _ask_number("Investor Reputation (0-10): ", float, 0, 10),
    'Growth Rate (%)': _ask_number("Growth Rate (%): ", float),
    'Revenue (M$)': _ask_number("Current Revenue (in M$): ", float),
    'Market Size': _ask_number("Market Size (in M$): ", float),
    'Years Since Founded': _ask_number("Years Since Founded: ", int),
    'Success Score': _ask_number("Success Score (0-100): ", float, 0, 100)
}


Please enter the following startup details:

Total Funding (in M$): 50000
Number of Investors: 7
Investor Reputation (0-10): 6
Growth Rate (%): 8
Current Revenue (in M$): 45000
Market Size (in M$): 6
Years Since Founded: 4
Success Score (0-100): 79


In [None]:
# Industries the user can pick from; the number shown next to each entry is
# its 1-based position in this list.
industries = [
    'AgriTech',
    'Autonomous Vehicles',
    'Cybersecurity',
    'EdTech',
    'FinTech',
    'GreenTech',
    'Healthcare',
    'Quantum Computing',
]

# Show the numbered menu
print("\nSelect Industry from the following:")
for number, label in enumerate(industries, start=1):
    print(f"{number}. {label}")



Select Industry from the following:
1. AgriTech
2. Autonomous Vehicles
3. Cybersecurity
4. EdTech
5. FinTech
6. GreenTech
7. Healthcare
8. Quantum Computing


In [None]:
industry_choice = int(input("Enter the number corresponding to the industry: "))

# Reject numbers outside the menu: otherwise every Industry_* flag would be 0
# and the model would silently score a startup that belongs to no industry.
if not 1 <= industry_choice <= len(industries):
    raise ValueError(
        f"Industry choice must be between 1 and {len(industries)}, got {industry_choice}"
    )

# One-hot encode industries: 1 for the chosen menu entry, 0 for all others
for position, industry in enumerate(industries, start=1):
    user_input[f'Industry_{industry}'] = 1 if position == industry_choice else 0

# Create a one-row input DataFrame from the collected answers
input_df = pd.DataFrame([user_input])

# Predict; selecting features_used fixes the column order passed to the booster
dtest = xgb.DMatrix(input_df[features_used])
predicted_revenue = xgb_model.predict(dtest)[0]

print(f"\n✅ Final Predicted Revenue (M$): {predicted_revenue:.4f}")

Enter the number corresponding to the industry: 5

✅ Final Predicted Revenue (M$): 2.8155
