In [None]:
import pandas as pd

# Read the raw dataset from disk and preview its first rows
df = pd.read_csv("data.csv")  # replace with actual path
print(df.head())

# Relative frequency of each value in the target column 'Bankrupt?'
class_share = df['Bankrupt?'].value_counts(normalize=True)
print(class_share)

In [None]:
# Drop every row that contains a missing value (rebinding instead of mutating in place)
df = df.dropna()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Load the dataset (replace with your actual file name).
#    The CSV is re-read here, which replaces any previously cleaned `df`,
#    so rows with missing values are dropped again as part of the load.
df = pd.read_csv("data.csv").dropna()  # example: "bankruptcy_data.csv"
print(df.head())  # optional: see first few rows

# 2. Separate the feature columns from the 'Bankrupt?' target column
X = df.drop("Bankrupt?", axis=1)
y = df["Bankrupt?"]

# 3. Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 4. Feature scaling: the scaler is fitted on the training split only and then
#    applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Data loaded and preprocessed successfully!")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fit a random forest (fixed seed) on the scaled training data
model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Hard class predictions for the held-out split
y_pred = model.predict(X_test_scaled)

# Print each metric under its own heading
for heading, metric in (
    ("\n📊 Classification Report:", classification_report),
    ("\n🧠 Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix

# Same forest as before, but with class_weight='balanced' so the minority
# class gets more importance during training
model = RandomForestClassifier(random_state=42, class_weight='balanced').fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Report and confusion matrix on the test split
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the scaled training split with SMOTE (fixed seed)
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train_scaled, y_train)

# Train on the resampled data, evaluate on the untouched test split
model = RandomForestClassifier(random_state=42).fit(X_resampled, y_resampled)
y_pred = model.predict(X_test_scaled)

for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Importance of every feature of the current `model`, labelled by column name
feature_importances = pd.Series(model.feature_importances_, index=X.columns)

# Horizontal bar chart of the ten largest importances
top_ten = feature_importances.nlargest(10)
top_ten.plot(kind='barh')
plt.title("Top 10 Important Features")
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE-resample the scaled training split; the test split is left untouched
sm = SMOTE(random_state=42)
resampled_pair = sm.fit_resample(X_train_scaled, y_train)
X_train_res, y_train_res = resampled_pair

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import numpy as np

# Fit on the SMOTE-resampled training data and predict the test split
model = RandomForestClassifier(random_state=42).fit(X_train_res, y_train_res)
y_pred = model.predict(X_test_scaled)

# Evaluation on the original (non-resampled) test split
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

# Feature importance: argsort is ascending, so the last ten positions are
# the ten most important features
importances = model.feature_importances_
indices = np.argsort(importances)[-10:]  # top 10
features = X.columns[indices]

plt.figure(figsize=(8, 5))
plt.barh(features, importances[indices])
plt.xlabel("Importance")
plt.title("Top 10 Important Features")
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
# RandomizedSearchCV is used like GridSearchCV, but takes `param_distributions` (instead of `param_grid`) and samples `n_iter` combinations

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Parameter distribution for the forest. Keys carry the 'rf__' prefix because
# the forest is the step named 'rf' inside the pipeline built below.
param_distributions = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__class_weight': ['balanced', 'balanced_subsample']
}

# Initialize model
rf = RandomForestClassifier(random_state=42)

# Oversampling is a pipeline step so that, during cross-validation, SMOTE is
# fitted on each training fold only. Searching on data that was resampled
# beforehand puts synthetic points derived from training rows into the
# validation folds and inflates the F1 score.
search_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', rf),
])

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=search_pipeline,
    param_distributions=param_distributions,
    n_iter=20,             # Number of random combinations to try
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit on the scaled, NOT pre-resampled, training split; the pipeline resamples
# inside every fold. best_estimator_ is the refitted pipeline and can be used
# for predict / predict_proba directly.
random_search.fit(X_train_scaled, y_train)

# Show best parameters
print("✅ Best Parameters Found:")
print(random_search.best_params_)

In [None]:
# Take the best estimator found by `random_search` and predict the scaled test split with it
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Compute both metrics first, then print each after its heading
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("📊 Classification Report:\n", report_text)
print("🧠 Confusion Matrix:\n", matrix)

In [None]:
import joblib
# Persist the tuned estimator to disk
joblib.dump(value=best_model, filename="best_bankruptcy_model.pkl")


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values for the random forest
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Randomized search (despite the variable name) over `param_dist`, scored by
# F1 with 5-fold cross-validation. n_iter is left at its default.
# random_state is set so that the sampled parameter combinations, and therefore
# best_params_, are the same on every run.
grid_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

grid_search.fit(X_train_scaled, y_train)

In [None]:
# Report the winning parameter combination, then keep the refitted estimator.
# Note: this rebinds `best_model` to the estimator from `grid_search`.
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_


In [None]:
# Predicting through the search object delegates to its refitted best estimator
y_pred_best = grid_search.predict(X_test_scaled)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Metrics of the tuned model on the test split
report_text = classification_report(y_test, y_pred_best)
matrix = confusion_matrix(y_test, y_pred_best)

print("🔍 Classification Report:")
print(report_text)

print("\n🧠 Confusion Matrix:")
print(matrix)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Feature importances of the tuned model, ordered from most to least important
importances = grid_search.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]
positions = np.arange(len(importances))

# Plot
plt.figure(figsize=(12, 6))
plt.title("Feature Importances")
plt.bar(positions, importances[indices], align="center")
# Bars are sorted by importance, so a bar's position is its rank, not its
# column index. Label each bar with the original feature index so that the
# "Feature Index" axis is actually readable as such.
plt.xticks(positions, indices, rotation=90, fontsize=6)
plt.xlabel("Feature Index")
plt.ylabel("Importance Score")
plt.show()


In [None]:
from imblearn.over_sampling import SMOTE
# SMOTE-resample the scaled training split into X_res / y_res
sm = SMOTE(random_state=42)
resampled_pair = sm.fit_resample(X_train_scaled, y_train)
X_res, y_res = resampled_pair

In [None]:
from xgboost import XGBClassifier

# Hyperparameters collected in one place, then unpacked into the classifier
xgb_params = {
    "scale_pos_weight": 30,  # balance the class
    "max_depth": 10,
    "n_estimators": 300,
    "learning_rate": 0.05,
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)

# Trained on the scaled (not resampled) training split
xgb.fit(X_train_scaled, y_train)


In [None]:
import xgboost
# Show which xgboost version is installed in this kernel
print(xgboost.__version__)

In [None]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# SMOTE-resample the scaled training split.
# NOTE(review): X_res / y_res are not used by the fit below, which trains on
# the non-resampled split together with scale_pos_weight. Confirm which of the
# two imbalance strategies is intended.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_scaled, y_train)

# Hyperparameters collected in one place, then unpacked into the classifier
xgb_params = {
    "scale_pos_weight": 30,  # balance the class
    "max_depth": 10,
    "n_estimators": 300,
    "learning_rate": 0.05,
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train_scaled, y_train)

In [None]:
# Cut-off applied to the class-1 probability
decision_threshold = 0.3  # try 0.3 or 0.4 instead of default 0.5

# Class-1 probabilities from the tuned model, turned into 0/1 labels
y_probs = grid_search.best_estimator_.predict_proba(X_test_scaled)[:, 1]
y_custom = (y_probs > decision_threshold).astype(int)

In [None]:
import shap

# Build a SHAP explainer for the tuned model; the scaled training data is
# passed as the second argument (presumably used as background data — confirm
# against the SHAP docs for the installed version).
explainer = shap.Explainer(grid_search.best_estimator_, X_train_scaled)
# Explain every row of the scaled test split
shap_values = explainer(X_test_scaled)


In [None]:
# Extract the class 1 (bankruptcy) explanation for sample 0.
# For a classifier with one output per class, SHAP returns `values` shaped
# (n_samples, n_features, n_classes) and `base_values` shaped
# (n_samples, n_classes). All features of sample 0 for class 1 are therefore
# values[0, :, 1]; values[0, 1] would instead select feature 1 for both classes.
single_explanation = shap.Explanation(
    values=shap_values.values[0, :, 1],
    base_values=shap_values.base_values[0, 1],
    data=shap_values.data[0],
    feature_names=shap_values.feature_names
)

# Now plot it
shap.plots.waterfall(single_explanation)


In [None]:
# Class 1 explanation for sample 0, labelled with the dataset's column names.
# `values` is shaped (n_samples, n_features, n_classes), so the per-feature
# contributions for class 1 are values[0, :, 1] (not values[0, 1], which would
# pick feature 1 for both classes).
single_explanation = shap.Explanation(
    values=shap_values.values[0, :, 1],
    base_values=shap_values.base_values[0, 1],
    data=X_test_scaled[0],  # OR: X_test.iloc[0] if using DataFrame
    feature_names=X.columns.tolist()
)


In [None]:
# Predicted probabilities for class 1 (bankruptcy).
# These come from the same estimator that `shap_values` was computed for, so
# the ranking and the explanations describe one and the same model.
probs = grid_search.best_estimator_.predict_proba(X_test_scaled)[:, 1]

# Five highest-risk test rows, highest first
top_indices = probs.argsort()[-5:][::-1]

# Plot SHAP explanations for top risky companies.
# `values` is shaped (n_samples, n_features, n_classes): take all features of
# row `idx` for class 1.
for idx in top_indices:
    explanation = shap.Explanation(
        values=shap_values.values[idx, :, 1],
        base_values=shap_values.base_values[idx, 1],
        data=X_test_scaled[idx],
        feature_names=X.columns.tolist()
    )
    shap.plots.waterfall(explanation)
    

In [None]:
import yfinance as yf

# Fetch Yahoo Finance's info dictionary for Apple
ticker = yf.Ticker("AAPL")
info = ticker.info

# Pull a handful of ratios out of the info dictionary; a key that is absent
# falls back to 0.
# The net margin is read from "profitMargins", which is the key Yahoo uses for
# it. NOTE(review): "netMargins" does not appear to be a key Yahoo returns, so
# it would always fall back to 0; confirm against `info.keys()`.
features = [
    info.get("returnOnAssets", 0),
    info.get("debtToEquity", 0),
    info.get("grossMargins", 0),
    info.get("operatingMargins", 0),
    info.get("revenueGrowth", 0),
    info.get("profitMargins", 0)
]

print("AAPL features:", features)

In [None]:
import yfinance as yf

# Pick your stock symbols
symbols = ["AAPL", "MSFT", "TSLA", "GOOGL"]

# Fetch and print a short summary per symbol.
# Every field is read with .get(..., 'N/A'): the info dictionary does not
# guarantee any particular key, and a single missing key should not abort the
# loop for the remaining symbols.
for symbol in symbols:
    stock = yf.Ticker(symbol)

    # Get latest info
    info = stock.info
    print(f"\n📊 {symbol} - {info.get('longName', 'N/A')}")
    print(f"Current Price: ${info.get('regularMarketPrice', 'N/A')}")
    print(f"52-Week Range: {info.get('fiftyTwoWeekLow', 'N/A')} - {info.get('fiftyTwoWeekHigh', 'N/A')}")
    print(f"Market Cap: {info.get('marketCap', 'N/A')}")
    print(f"📈 Sector: {info.get('sector', 'N/A')}")

In [None]:
# Download TSLA price data for a one-month period at daily interval
data = yf.download("TSLA", period="1mo", interval="1d")
# Show the last rows of the downloaded table
print(data.tail())


In [None]:
# One record per symbol: the ticker plus a few ratios from its info dictionary
portfolio = []

for symbol in ["TSLA", "AAPL"]:
    stock = yf.Ticker(symbol)
    # Read the info dictionary once and reuse it for every field
    info = stock.info
    # .get() yields None for a ratio Yahoo does not report, instead of raising
    # KeyError and losing the whole portfolio
    latest_features = [
        info.get("returnOnAssets"),
        info.get("debtToEquity"),
        info.get("currentRatio")
        # Add more if needed
    ]
    portfolio.append({"symbol": symbol, "features": latest_features})

print(portfolio)


In [None]:
import pandas as pd

# Pull S&P 500 symbols from Wikipedia (first table on the page)
sp500 = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
# Wikipedia writes share classes with a dot (e.g. BRK.B) while Yahoo Finance
# tickers use a dash (BRK-B); normalise so the symbols can be looked up there.
symbols = sp500["Symbol"].str.replace(".", "-", regex=False).tolist()


In [None]:

import yfinance as yf

# Keys read from each ticker's info dictionary, in feature order
# (add more features that match your model)
INFO_KEYS = (
    "returnOnAssets",
    "debtToEquity",
    "currentRatio",
    "grossMargins",
    "quickRatio",
)

company_data = []

# Limit initially to avoid rate limits!
for symbol in symbols[:50]:
    try:
        ticker = yf.Ticker(symbol)
        info = ticker.info
        # An absent key contributes 0
        features = [info.get(key, 0) for key in INFO_KEYS]
        company_data.append({"symbol": symbol, "features": features})
    except Exception as e:
        # Report the failure and continue with the next symbol
        print(f"⚠️ Error fetching {symbol}: {e}")

print("✅ Total Companies Processed:", len(company_data))

In [None]:
import numpy as np
import pandas as pd

# Load the dataset you trained on, only to recover the feature column order
df = pd.read_csv("data.csv")  # Replace with actual file
X = df.drop("Bankrupt?", axis=1)
expected_features = X.columns.tolist()

# Assumes `scaler` and `model` are already fitted in this kernel
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestClassifier
# scaler = StandardScaler().fit(X)
# model = RandomForestClassifier().fit(X_train_scaled, y_train)

# Example company data you want to predict on.
# NOTE(review): keys must match the CSV column names exactly (including any
# leading/trailing whitespace); a key that does not match is silently ignored
# and its feature defaults to 0.0. Verify against `df.columns`.
company_stats_dict = {
    'ROA(C) before interest and depreciation before interest': 0.51,
    'ROA(A) before interest and % after tax': 0.56,
    'ROA(B) before interest and depreciation after tax': 0.55,
    'Operating Gross Margin': 0.61,
    'Realized Sales Gross Margin': 0.61,
    'Operating Profit Rate': 1.0,
    'Pre-tax net Interest Rate': 0.80,
    'After-tax net Interest Rate': 0.81,
    'Non-industry income and expenditure/revenue': 0.4,
    # ... continue to fill or default the rest
}

# Build a single-row feature vector in training column order, with 0.0 for
# every feature missing from the dictionary
company_features = [company_stats_dict.get(col, 0.0) for col in expected_features]
X_input = np.array(company_features).reshape(1, -1)
# Same row as a DataFrame with column names: a scaler that was fitted on a
# DataFrame expects named columns and warns when given a bare array
X_input_df = pd.DataFrame(X_input, columns=expected_features)

# Scale and predict
X_input_scaled = scaler.transform(X_input_df)
prediction = model.predict(X_input_scaled)[0]
prediction_prob = model.predict_proba(X_input_scaled)[0][1]

# Output result
print("🔎 Bankruptcy Risk Prediction:", "⚠️ High Risk" if prediction == 1 else "✅ Low Risk")
print(f"📊 Risk Score: {prediction_prob:.2f}")

In [None]:
import joblib

# Write the current scaler and model (e.g., RandomForest, XGBoost, etc.) to disk
for artifact, path in ((scaler, "scaler.pkl"), (model, "bankruptcy_model.pkl")):
    joblib.dump(artifact, path)

print("✅ Scaler and model saved!")


In [None]:
import numpy as np
import pandas as pd
import joblib

# Recover the feature column order from the dataset used for training
df = pd.read_csv("data.csv")  # Use the same dataset you trained on
X = df.drop(columns="Bankrupt?")
expected_features = list(X.columns)

# Restore the fitted model and scaler (both must have been written with joblib.dump())
model = joblib.load("bankruptcy_model.pkl")
scaler = joblib.load("scaler.pkl")

# Example company financial stats (partial); features that are not listed
# default to 0.0 below
company_stats_dict = {
    'ROA(C) before interest and depreciation before interest': 0.51,
    'ROA(A) before interest and % after tax': 0.56,
    'ROA(B) before interest and depreciation after tax': 0.55,
    'Operating Gross Margin': 0.61,
    'Realized Sales Gross Margin': 0.61,
    'Operating Profit Rate': 1.0,
    'Pre-tax net Interest Rate': 0.80,
    'After-tax net Interest Rate': 0.81,
    'Non-industry income and expenditure/revenue': 0.4,
}

# One value per expected feature, in training column order
company_features = []
for col in expected_features:
    company_features.append(company_stats_dict.get(col, 0.0))
X_input_df = pd.DataFrame(data=[company_features], columns=expected_features)

# Scale the single row, then take the predicted label and class-1 probability
X_input_scaled = scaler.transform(X_input_df)
prediction = model.predict(X_input_scaled)[0]
prediction_prob = model.predict_proba(X_input_scaled)[0, 1]

# Output result
risk_label = "⚠️ High Risk" if prediction == 1 else "✅ Low Risk"
print("\n🔎 Bankruptcy Risk Prediction:", risk_label)
print(f"📊 Risk Score: {prediction_prob:.2f}\n")

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

# -------------------------------
# STEP 1: Load the dataset
# -------------------------------
df = pd.read_csv("data.csv")  # Update path if needed
X = df.drop("Bankrupt?", axis=1)
y = df["Bankrupt?"]

# Persist the training column order so that inference code can rebuild
# feature vectors in the same order without re-reading the dataset
joblib.dump(X.columns.tolist(), "features_list.pkl")

# -------------------------------
# STEP 2: Train/test split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# STEP 3: Preprocessing
# -------------------------------
# Scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save for future use
joblib.dump(scaler, "scaler.pkl")

# -------------------------------
# STEP 4: Train Model
# -------------------------------
model = RandomForestClassifier(random_state=42, class_weight="balanced")
model.fit(X_train_scaled, y_train)

# Save model
joblib.dump(model, "bankruptcy_model.pkl")

# -------------------------------
# STEP 5: Use a real row from data
# -------------------------------
# Positional row 150 of the full dataset.
# NOTE(review): this row may belong to the training split, in which case the
# prediction is not an out-of-sample check.
sample_index = 150
# Double brackets keep a one-row DataFrame with column names, which is what
# the scaler was fitted on (a bare array triggers a feature-name warning)
sample_features = X.iloc[[sample_index]]
sample_scaled = scaler.transform(sample_features)

# -------------------------------
# STEP 6: Predict
# -------------------------------
prediction = model.predict(sample_scaled)[0]
prediction_prob = model.predict_proba(sample_scaled)[0][1]

print(f"\n📦 Company #{sample_index}")
print("🔎 Bankruptcy Risk Prediction:", "⚠️ High Risk" if prediction == 1 else "✅ Low Risk")
print(f"📊 Risk Score: {prediction_prob:.2f}")

In [None]:
import numpy as np
import joblib
import pandas as pd

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

# This cell trains its own scaler and model, so nothing is loaded from disk:
# a previously loaded model/scaler would be overwritten a few lines later, and
# the feature list is taken from the dataset columns rather than from a
# "features_list.pkl" file that no code in this notebook writes.

# -------------------------------
# STEP 1: Load the dataset
# -------------------------------
df = pd.read_csv("data.csv")  # Update path if needed
X = df.drop("Bankrupt?", axis=1)
y = df["Bankrupt?"]
expected_features = X.columns.tolist()

# -------------------------------
# STEP 2: Train/test split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# STEP 3: Preprocessing
# -------------------------------
# Scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save for future use
joblib.dump(scaler, "scaler.pkl")

# -------------------------------
# STEP 4: Train Model
# -------------------------------
model = RandomForestClassifier(random_state=42, class_weight="balanced")
model.fit(X_train_scaled, y_train)

# ------------------------------
# STEP 5: Partial company input (only a few features)
# ------------------------------
# Defined here so the cell does not depend on a variable left over from
# another cell.
# NOTE(review): keys must match the CSV column names exactly (including any
# whitespace); an unmatched key is ignored and its feature defaults to 0.0.
company_stats_dict = {
    'ROA(C) before interest and depreciation before interest': 0.51,
    'ROA(A) before interest and % after tax': 0.56,
    'ROA(B) before interest and depreciation after tax': 0.55,
    'Operating Gross Margin': 0.61,
    'Realized Sales Gross Margin': 0.61,
    'Operating Profit Rate': 1.0,
    'Pre-tax net Interest Rate': 0.80,
    'After-tax net Interest Rate': 0.81,
    'Non-industry income and expenditure/revenue': 0.4,
}

# ------------------------------
# STEP 6: Build full-length feature vector
# ------------------------------
company_features = [company_stats_dict.get(feature, 0.0) for feature in expected_features]
X_input = np.array(company_features).reshape(1, -1)
# Named columns, matching what the scaler was fitted on
X_input_df = pd.DataFrame(X_input, columns=expected_features)

# ------------------------------
# STEP 7: Scale and Predict
# ------------------------------
X_input_scaled = scaler.transform(X_input_df)
prediction = model.predict(X_input_scaled)[0]
prediction_prob = model.predict_proba(X_input_scaled)[0][1]

# ------------------------------
# STEP 8: Output
# ------------------------------
print("📦 Company: [Partial data input]")
print("🔎 Bankruptcy Risk Prediction:", "⚠️ High Risk" if prediction == 1 else "✅ Low Risk")
print(f"📊 Risk Score: {prediction_prob:.2f}")

In [None]:
# import numpy as np

# for company in company_data:
#     try:
#         X_input = np.array(company["features"]).reshape(1, -1)
#         X_input_scaled = scaler.transform(X_input)
#         prediction = model.predict(X_input_scaled)[0]

#         if prediction == 1:
#             print(f"🚨 {company['symbol']} → ⚠️ High bankruptcy risk!")
#         else:
#             print(f"✅ {company['symbol']} → 🟢 Stable")

#     except Exception as e:
#         print(f"Error for {company['symbol']}: {e}")

In [None]:
from sklearn.preprocessing import StandardScaler

# Re-create the scaler and fit it right away on the UNSCALED training features.
# Rebinding `scaler` to a bare StandardScaler() would leave an unfitted object
# behind, and any later scaler.transform(...) call would fail; fitting on
# already-scaled data (X_train_scaled) would learn the wrong statistics.
scaler = StandardScaler().fit(X_train)
