# Prosperity Prognosticator: Startup Success Prediction
### End-to-End Machine Learning Project

## 1. Import Libraries

In [None]:
# Data handling and plotting stack used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning for the rest of the session (no category filter is
# given), so deprecation notices are hidden too; remove this while debugging.
warnings.filterwarnings('ignore')

# Modelling utilities: data splitting, grid search, the classifier, and metrics.
# NOTE(review): cross_val_score and LabelEncoder appear unused in the cells
# visible here -- confirm before removing.
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, roc_auc_score
)
# Used at the end of the notebook to persist and reload the fitted model.
import joblib

print('All libraries imported successfully!')

## 2. Load Dataset

In [None]:
# Read the startup dataset from the current working directory.
# Download from: https://www.kaggle.com/datasets/manishkc06/startup-success-prediction
csv_path = 'startup data.csv'
data = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first rows as the cell output.
dataset_shape = data.shape
print('Dataset Shape:', dataset_shape)
data.head()

In [None]:
# Schema overview: column names, per-column dtype, and missing-value counts.
column_names = data.columns.tolist()
print('Columns:', column_names)

column_dtypes = data.dtypes
print('\nData Types:')
print(column_dtypes)

# isna() is the same check as isnull(); sum() counts the missing cells per column.
null_counts = data.isna().sum()
print('\nNull Values:')
print(null_counts)

## 3. Exploratory Data Analysis

In [None]:
# Summary statistics as produced by DataFrame.describe() with default arguments.
print('Descriptive Statistics:')
summary_stats = data.describe()
# Leave the table as the last expression so the notebook renders it.
summary_stats

### 3.1 State Distribution

In [None]:
# Bucket `state_code` into five named states; everything else (including
# missing codes) becomes 'Other'. The new `State` column is reused by later plots.
major_states = ['CA', 'NY', 'MA', 'TX', 'WA']
is_major_state = data['state_code'].isin(major_states)
data['State'] = data['state_code'].where(is_major_state, 'Other')

# Pie chart of each bucket's share of startups, saved to disk and displayed.
state_counts = data['State'].value_counts()
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(state_counts, labels=state_counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Startups by State')
plt.tight_layout()
plt.savefig('state_distribution.png')
plt.show()

### 3.2 Category Distribution

In [None]:
# Keep the ten listed categories from `category_code`; every other value
# (including missing ones) is grouped under 'Other' in the new `category` column.
top_categories = [
    'software', 'web', 'mobile', 'enterprise', 'advertising',
    'games_video', 'ecommerce', 'biotech', 'consulting', 'messaging',
]
is_top_category = data['category_code'].isin(top_categories)
data['category'] = data['category_code'].where(is_top_category, 'Other')

# Pie chart of each category bucket's share, saved to disk and displayed.
cat_counts = data['category'].value_counts()
fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(cat_counts, labels=cat_counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Startups by Category')
plt.tight_layout()
plt.savefig('category_distribution.png')
plt.show()

### 3.3 Distribution of Startup Status

In [None]:
# Bar chart of how many rows carry each `status` value.
status_counts = data['status'].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
# Two colours are supplied -- assumes `status` has two distinct values; TODO confirm.
status_counts.plot(kind='bar', ax=ax, color=['steelblue', 'coral'])
ax.set_title('Distribution of Startup Status')
ax.set_xlabel('Status')
ax.set_ylabel('Count')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.tight_layout()
plt.savefig('status_distribution.png')
plt.show()

### 3.4 State vs Status

In [None]:
# Grouped bar chart: startup counts per `State` bucket, split by `status`.
# Relies on the `State` column created in the state-distribution cell.
fig, ax = plt.subplots(figsize=(10, 5))

# Rows = State, columns = status; combinations that never occur count as 0.
pair_counts = data.groupby(['State', 'status']).size()
state_status = pair_counts.unstack(fill_value=0)

state_status.plot(kind='bar', ax=ax, color=['steelblue', 'coral'])
ax.set_title('State vs Startup Status')
ax.set_xlabel('State')
ax.set_ylabel('Count')
plt.xticks(rotation=0)  # keep the state labels horizontal
ax.legend(title='Status')
plt.tight_layout()
plt.savefig('state_vs_status.png')
plt.show()

### 3.5 Funding Rounds Analysis

In [None]:
# Compare startup outcome against each funding indicator present in the data.
funding_cols = ['has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD']
# Only plot indicators that actually exist, so a trimmed dataset does not raise KeyError.
funding_available = [c for c in funding_cols if c in data.columns]

if funding_available:
    # Fixed 2x3 grid: one panel per candidate funding column.
    fig, axes = plt.subplots(2, 3, figsize=(14, 8))
    axes = axes.flatten()
    for i, col in enumerate(funding_available):
        # Rows = indicator value, columns = status; absent combinations count as 0.
        counts = data.groupby([col, 'status']).size().unstack(fill_value=0)
        counts.plot(kind='bar', ax=axes[i], color=['steelblue', 'coral'])
        axes[i].set_title(col)
        axes[i].set_xlabel('')
        axes[i].tick_params(axis='x', rotation=0)
    # When some funding columns are missing, the leftover panels would render as
    # empty framed axes in the saved figure; hide them instead.
    for unused_ax in axes[len(funding_available):]:
        unused_ax.set_visible(False)
    plt.suptitle('Funding Rounds vs Startup Status', fontsize=14)
    plt.tight_layout()
    plt.savefig('funding_rounds.png')
    plt.show()

### 3.6 Correlation Heatmap

In [None]:
# Pairwise correlation of every numeric column, drawn as a heatmap.
numeric_data = data.select_dtypes(include=[np.number])
corr_matrix = numeric_data.corr()

fig, ax = plt.subplots(figsize=(14, 10))
# Cell annotations are disabled (annot=False), so `fmt` has no visible effect here.
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.show()

## 4. Data Preprocessing

In [None]:
# Columns to exclude before modelling. 'State' and 'category' are the helper
# columns added by the EDA cells; the rest look like identifiers, dates and
# location/text fields judging by their names -- confirm against the dataset.
drop_cols = [
    'Unnamed: 0', 'Unnamed: 6', 'id', 'Unnamed: 0.1', 'name', 'labels',
    'founded_at', 'closed_at', 'first_funding_at', 'last_funding_at',
    'state_code', 'state_code.1', 'region', 'city', 'zip_code', 'country_code',
    'category_code', 'object_id', 'State', 'category',
]

# Drop only the columns that are actually present so missing ones do not raise.
drop_cols_available = [name for name in drop_cols if name in data.columns]
df = data.drop(columns=drop_cols_available)

# Encode the target as 1 = acquired, 0 = closed. Any other status value maps
# to NaN and those rows are discarded.
status_to_label = {'acquired': 1, 'closed': 0}
df['status'] = df['status'].map(status_to_label)
df = df[df['status'].notna()]

# Fill missing numeric values with the per-column median.
# NOTE(review): the medians are computed over all rows before the train/test
# split, so test rows influence the fill values -- consider imputing after the split.
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)

print('Cleaned Dataset Shape:', df.shape)
df.head()

## 5. Train-Test Split

In [None]:
# Separate the features from the binary target (1 = acquired, 0 = closed).
X = df.drop(columns='status')
y = df['status']

# Keep only numeric columns; any remaining text columns are not encoded here.
X = X.select_dtypes(include=[np.number])

# Hold out 30% of the rows for testing. The split is stratified on the target
# so both partitions keep the same acquired/closed ratio; an unstratified
# split can skew the class balance and distort the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=116, stratify=y)

print('Training set size:', X_train.shape)
print('Test set size:', X_test.shape)
print('Features used:', X.columns.tolist())

## 6. Model Building - Random Forest with Hyperparameter Tuning

In [None]:
# Baseline: a 100-tree random forest with otherwise default settings, used as
# the reference point for the tuned model. fit() returns the estimator itself.
rf_base = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Accuracy on the data the model saw versus the held-out rows.
base_train_pred = rf_base.predict(X_train)
train_acc_base = accuracy_score(y_train, base_train_pred)
base_test_pred = rf_base.predict(X_test)
test_acc_base = accuracy_score(y_test, base_test_pred)

print(f'Baseline - Train Accuracy: {train_acc_base:.4f}')
print(f'Baseline - Test Accuracy:  {test_acc_base:.4f}')

In [None]:
# Exhaustive hyperparameter search for the random forest.
# 2 * 3 * 2 * 2 * 2 = 48 parameter combinations, each scored with 5-fold
# cross-validation on the training set (240 fits in total).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False],
}

# Same seed as the baseline forest so both models share one random state.
search_estimator = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    search_estimator,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use all available CPU cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

print('Best Parameters:', grid_search.best_params_)
print('Best CV Score:', grid_search.best_score_)

## 7. Evaluating the Best Model

In [None]:
# Evaluate the best estimator found by the grid search.
best_model = grid_search.best_estimator_

# Hard predictions plus the second predict_proba column, which is used as the
# positive-class score -- assumes classes_ is ordered [0, 1]; TODO confirm.
y_pred = best_model.predict(X_test)
test_probabilities = best_model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

train_pred = best_model.predict(X_train)
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Train Accuracy: {train_acc:.4f}')
print(f'Test Accuracy:  {test_acc:.4f}')
print(f'ROC AUC Score:  {roc_auc:.4f}')
print('\nClassification Report:')
labelled_report = classification_report(y_test, y_pred, target_names=['Closed', 'Acquired'])
print(labelled_report)

# Bundle the headline outputs; the stored report uses the raw class labels
# (no target_names), unlike the printed one.
results = dict(
    predictions=y_pred,
    accuracy=test_acc,
    classification_report=classification_report(y_test, y_pred),
)

In [None]:
# Confusion matrix of the tuned model on the test set, drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred)

# Tick labels for both axes -- assumes the matrix rows/columns are ordered
# class 0 (closed) then class 1 (acquired); TODO confirm.
class_names = ['Closed', 'Acquired']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer counts
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.show()

In [None]:
# Feature importances of the tuned forest, labelled with the training columns.
feat_imp = pd.Series(best_model.feature_importances_, index=X.columns)
feat_imp_top20 = feat_imp.nlargest(20)

fig, ax = plt.subplots(figsize=(10, 7))
# Sort ascending so the most important feature ends up at the top of the
# horizontal bar chart.
ascending_top20 = feat_imp_top20.sort_values()
ascending_top20.plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('Top 20 Feature Importances')
ax.set_xlabel('Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()

In [None]:
# Side-by-side train/test accuracy for the baseline and the tuned forest.
models = ['Baseline RF', 'Tuned RF']
train_accs = [train_acc_base, train_acc]
test_accs = [test_acc_base, test_acc]

# One group per model; the two bars in a group sit half a bar-width either
# side of the group's centre.
x = np.arange(len(models))
width = 0.35

fig, ax = plt.subplots(figsize=(8, 5))
bars1 = ax.bar(x - width/2, train_accs, width, label='Train Accuracy', color='steelblue')
bars2 = ax.bar(x + width/2, test_accs, width, label='Test Accuracy', color='coral')

ax.set_title('Model Accuracy: Before vs After Hyperparameter Tuning')
ax.set_ylabel('Accuracy')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylim(0, 1.1)  # headroom above 1.0 for the value labels
ax.legend()

# Print each bar's height just above it (train bars first, then test bars).
for rect in (*bars1, *bars2):
    height = rect.get_height()
    center_x = rect.get_x() + rect.get_width()/2
    ax.annotate(f'{height:.2f}', xy=(center_x, height),
                xytext=(0, 3), textcoords='offset points', ha='center')

plt.tight_layout()
plt.savefig('accuracy_comparison.png')
plt.show()

## 8. Save the Model

In [None]:
import json

# Persist the tuned model to disk.
model_path = 'random_forest_model.pkl'
joblib.dump(best_model, model_path)
print('Model saved as random_forest_model.pkl')

# Save feature names for use in Flask app, in training column order.
feature_names = X.columns.tolist()
with open('feature_names.json', 'w') as names_file:
    json.dump(feature_names, names_file)
print('Feature names saved as feature_names.json')

# Round-trip check: reload the saved file and score it on the test set.
loaded_model = joblib.load(model_path)
test_preds = loaded_model.predict(X_test)
print('Model loaded successfully!')
loaded_acc = accuracy_score(y_test, test_preds)
print(f'Loaded model test accuracy: {loaded_acc:.4f}')