In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

In [3]:
print("="*50)
print("TRAINING DATASET")
print("="*50)
print(f"Shape: {train_df.shape}")
print(f"\nFirst few rows:")
print(train_df.head())
print(f"\nColumn names:")
print(train_df.columns.tolist())
print(f"\nData types:")
print(train_df.dtypes)

TRAINING DATASET
Shape: (62789, 24)

First few rows:
   Unnamed: 0 company_name   fyear status_label        X1          X2  \
0           0          C_1  1999.0        alive  511267.0  740998.000   
1           1          C_1  2000.0        alive  485856.0     701.854   
2           2          C_1  2001.0        alive  436656.0  710199.000   
3           3          C_1  2002.0        alive  396412.0     686.621   
4           4          C_1  2003.0        alive  432204.0     709.292   

         X3        X4        X5        X6  ...        X11         X12  \
0  833107.0  180447.0  18373.00  70658.00  ...     35.163  201026.000   
1  713811.0  179987.0  18577.00     45.79  ...  18531.000  204065.000   
2  526477.0  217699.0  22496.00   4711.00  ...    -58.939     139.603   
3  496747.0  164658.0  27172.00   3573.00  ...    -12.410  124106.000   
4  523302.0  248666.0     26.68  20811.00  ...   3504.000  131884.000   

          X13          X14       X15       X16          X17       X18

In [4]:
print("\n" + "="*50)
print("TEST DATASET")
print("="*50)
print(f"Shape: {test_df.shape}")
print(f"\nFirst few rows:")
print(test_df.head())


TEST DATASET
Shape: (15893, 23)

First few rows:
   Unnamed: 0 company_name   fyear        X1        X2       X3      X4  \
0          31          C_3  1999.0  9757.000  13986.00  19796.0  5974.0   
1          32          C_3  2000.0     7.884  11608.00  16506.0  4875.0   
2          33          C_3  2001.0  6494.000   8635.00     15.7  3873.0   
3          34          C_3  2002.0  5938.000      7.85  12919.0  2546.0   
4          35          C_3  2004.0  5807.000   6245.00  12018.0   222.0   

        X5        X6      X7  ...       X11     X12     X13       X14  \
0  667.000  -932.000  -265.0  ... -2207.000 -6375.0  3924.0     29.37   
1    0.700    -0.028   672.0  ...    -0.808 -7184.0  3244.0  25367.00   
2    0.761    -0.380   381.0  ...    -1.738 -8922.0  2677.0  24051.00   
3  355.000   356.000   711.0  ...    84.000 -8816.0  2465.0  20087.00   
4    0.160  1454.000  1614.0  ...  1345.000 -8974.0  2504.0  19833.00   

       X15       X16       X17       X18  Division  MajorGro

In [5]:
train_df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
company_name,0
fyear,0
status_label,0
X1,0
X2,0
X3,0
X4,0
X5,0
X6,0


In [6]:
test_df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
company_name,0
fyear,0
X1,0
X2,0
X3,0
X4,0
X5,0
X6,0
X7,0


In [7]:
print("\n" + "="*50)
print("TARGET VARIABLE DISTRIBUTION")
print("="*50)
print(train_df['status_label'].value_counts())
print(f"\nPercentage distribution:")
print(train_df['status_label'].value_counts(normalize=True) * 100)


TARGET VARIABLE DISTRIBUTION
status_label
alive     58586
failed     4203
Name: count, dtype: int64

Percentage distribution:
status_label
alive     93.306152
failed     6.693848
Name: proportion, dtype: float64


In [10]:
print("\n" + "="*50)
print("BASIC STATISTICS")
print("="*50)
print(train_df[['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']].describe())


BASIC STATISTICS
                 X1            X2            X3            X4            X5  \
count  6.278900e+04  6.278900e+04  6.278900e+04  6.278900e+04  6.278900e+04   
mean   2.150171e+05  5.940270e+05  3.789251e+05  1.465220e+05  2.449805e+04   
std    7.293540e+05  2.061058e+06  2.097401e+06  6.747103e+05  9.976495e+04   
min   -1.300000e+01  7.000000e-03 -3.666450e+05 -2.300000e+01  0.000000e+00   
25%    1.286000e+03  3.322800e+03  1.476000e+03  0.000000e+00  7.500400e+01   
50%    2.091400e+04  4.200200e+04  2.015800e+04  5.130000e+02  1.186000e+03   
75%    1.519670e+05  3.294390e+05  1.774980e+05  1.541700e+04  1.198300e+04   
max    3.132258e+07  6.723494e+07  1.632456e+08  1.829466e+07  4.440438e+06   

                 X6            X7            X8            X9           X10  
count  6.278900e+04  6.278900e+04  6.278900e+04  6.278900e+04  6.278900e+04  
mean   4.435436e+04  6.938075e+04  1.724399e+05  5.163481e+04  1.237003e+05  
std    2.309307e+05  2.923196e+05  6

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 10)

In [11]:
X_columns = [f'X{i}' for i in range(1, 19)]
for col in X_columns:
    non_numeric = train_df[col].apply(lambda x: not isinstance(x, (int, float, np.int64, np.float64)))
    if non_numeric.sum() > 0:
        print(f"{col}: {non_numeric.sum()} non-numeric values found")
        print(f"Sample values: {train_df[train_df[col].apply(lambda x: not isinstance(x, (int, float, np.int64, np.float64)))][col].head()}")

In [12]:
feature_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9',
                   'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18']

In [13]:
X_train = train_df[feature_columns]
y_train = train_df['status_label']
X_test = test_df[feature_columns]

In [14]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

In [15]:
print("="*50)
print("DATA PREPARATION COMPLETE")
print("="*50)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")

DATA PREPARATION COMPLETE
X_train shape: (62789, 18)
y_train shape: (62789,)
X_test shape: (15893, 18)


In [16]:
print(f"\nTarget encoding:")
print(f"  'alive' is encoded as: {label_encoder.transform(['alive'])[0]}")
print(f"  'failed' is encoded as: {label_encoder.transform(['failed'])[0]}")

print(f"\nTarget distribution after encoding:")
print(f"  Class 0: {(y_train_encoded == 0).sum()} samples")
print(f"  Class 1: {(y_train_encoded == 1).sum()} samples")



Target encoding:
  'alive' is encoded as: 0
  'failed' is encoded as: 1

Target distribution after encoding:
  Class 0: 58586 samples
  Class 1: 4203 samples


In [17]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [18]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train_encoded)

In [20]:
print(f"\nBefore SMOTE:")
print(f"  Class 0 (alive): {(y_train_encoded == 0).sum()} samples")
print(f"  Class 1 (failed): {(y_train_encoded == 1).sum()} samples")

print(f"\nAfter SMOTE:")
print(f"  Class 0 (alive): {(y_train_balanced == 0).sum()} samples")
print(f"  Class 1 (failed): {(y_train_balanced == 1).sum()} samples")


Before SMOTE:
  Class 0 (alive): 58586 samples
  Class 1 (failed): 4203 samples

After SMOTE:
  Class 0 (alive): 58586 samples
  Class 1 (failed): 58586 samples


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_balanced, y_train_balanced)

In [25]:
y_pred_train = log_reg.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train_encoded, y_pred_train)

In [26]:
print(f"Training Accuracy: {train_accuracy*100:.2f}%")

Training Accuracy: 37.58%


In [27]:
print("\nClassification Report:")
print(classification_report(y_train_encoded, y_pred_train, target_names=['Alive', 'Failed']))
print("\nConfusion Matrix:")
cm = confusion_matrix(y_train_encoded, y_pred_train)
print(cm)
print(f"\nInterpretation:")
print(f"  Correctly predicted Alive: {cm[0][0]}")
print(f"  Incorrectly predicted Alive as Failed: {cm[0][1]}")
print(f"  Incorrectly predicted Failed as Alive: {cm[1][0]}")
print(f"  Correctly predicted Failed: {cm[1][1]}")


Classification Report:
              precision    recall  f1-score   support

       Alive       0.95      0.35      0.51     58586
      Failed       0.08      0.77      0.14      4203

    accuracy                           0.38     62789
   macro avg       0.52      0.56      0.33     62789
weighted avg       0.90      0.38      0.48     62789


Confusion Matrix:
[[20362 38224]
 [  968  3235]]

Interpretation:
  Correctly predicted Alive: 20362
  Incorrectly predicted Alive as Failed: 38224
  Incorrectly predicted Failed as Alive: 968
  Correctly predicted Failed: 3235


In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_balanced, y_train_balanced)

In [31]:
y_pred_train_rf = rf_model.predict(X_train_scaled)
train_accuracy_rf = accuracy_score(y_train_encoded, y_pred_train_rf)

print(f"Training Accuracy: {train_accuracy_rf*100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_train_encoded, y_pred_train_rf, target_names=['Alive', 'Failed']))
print("\nConfusion Matrix:")
cm_rf = confusion_matrix(y_train_encoded, y_pred_train_rf)
print(cm_rf)
print(f"\nInterpretation:")
print(f"  Correctly predicted Alive: {cm_rf[0][0]}")
print(f"  Incorrectly predicted Alive as Failed: {cm_rf[0][1]}")
print(f"  Incorrectly predicted Failed as Alive: {cm_rf[1][0]}")
print(f"  Correctly predicted Failed: {cm_rf[1][1]}")

Training Accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

       Alive       1.00      1.00      1.00     58586
      Failed       1.00      1.00      1.00      4203

    accuracy                           1.00     62789
   macro avg       1.00      1.00      1.00     62789
weighted avg       1.00      1.00      1.00     62789


Confusion Matrix:
[[58586     0]
 [    0  4203]]

Interpretation:
  Correctly predicted Alive: 58586
  Incorrectly predicted Alive as Failed: 0
  Incorrectly predicted Failed as Alive: 0
  Correctly predicted Failed: 4203


In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [33]:
cv_scores = cross_val_score(rf_model, X_train_balanced, y_train_balanced,
                            cv=5, scoring='accuracy', n_jobs=-1)

scoring = ['accuracy', 'precision', 'recall', 'f1']
cv_results = cross_validate(rf_model, X_train_balanced, y_train_balanced,
                            cv=5, scoring=scoring, n_jobs=-1)

In [34]:
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean()*100:.2f}%")
print(f"Standard Deviation: {cv_scores.std()*100:.2f}%")

print(f"Accuracy:  {cv_results['test_accuracy'].mean()*100:.2f}% (+/- {cv_results['test_accuracy'].std()*100:.2f}%)")
print(f"Precision: {cv_results['test_precision'].mean()*100:.2f}% (+/- {cv_results['test_precision'].std()*100:.2f}%)")
print(f"Recall:    {cv_results['test_recall'].mean()*100:.2f}% (+/- {cv_results['test_recall'].std()*100:.2f}%)")
print(f"F1-Score:  {cv_results['test_f1'].mean()*100:.2f}% (+/- {cv_results['test_f1'].std()*100:.2f}%)")

Cross-Validation Scores: [0.87313847 0.94900789 0.95173679 0.94520782 0.94136724]
Mean CV Accuracy: 93.21%
Standard Deviation: 2.97%
Accuracy:  93.21% (+/- 2.97%)
Precision: 92.29% (+/- 0.61%)
Recall:    94.29% (+/- 6.17%)
F1-Score:  93.18% (+/- 3.31%)


In [35]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss', n_jobs=-1)
xgb_model.fit(X_train_balanced, y_train_balanced)

y_pred_train_xgb = xgb_model.predict(X_train_scaled)
train_accuracy_xgb = accuracy_score(y_train_encoded, y_pred_train_xgb)


cv_scores_xgb = cross_val_score(xgb_model, X_train_balanced, y_train_balanced,
                                cv=5, scoring='accuracy', n_jobs=-1)

cv_results_xgb = cross_validate(xgb_model, X_train_balanced, y_train_balanced,
                                cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'], n_jobs=-1)



In [36]:
print(f"Training Accuracy: {train_accuracy_xgb*100:.2f}%")
print(f"\nCross-Validation Scores: {cv_scores_xgb}")
print(f"Mean CV Accuracy: {cv_scores_xgb.mean()*100:.2f}%")
print(f"Standard Deviation: {cv_scores_xgb.std()*100:.2f}%")
print(f"Accuracy:  {cv_results_xgb['test_accuracy'].mean()*100:.2f}% (+/- {cv_results_xgb['test_accuracy'].std()*100:.2f}%)")
print(f"Precision: {cv_results_xgb['test_precision'].mean()*100:.2f}% (+/- {cv_results_xgb['test_precision'].std()*100:.2f}%)")
print(f"Recall:    {cv_results_xgb['test_recall'].mean()*100:.2f}% (+/- {cv_results_xgb['test_recall'].std()*100:.2f}%)")
print(f"F1-Score:  {cv_results_xgb['test_f1'].mean()*100:.2f}% (+/- {cv_results_xgb['test_f1'].std()*100:.2f}%)")

Training Accuracy: 87.94%

Cross-Validation Scores: [0.80392575 0.90471517 0.89698728 0.89340275 0.88832466]
Mean CV Accuracy: 87.75%
Standard Deviation: 3.72%
Accuracy:  87.75% (+/- 3.72%)
Precision: 85.85% (+/- 1.06%)
Recall:    90.30% (+/- 7.68%)
F1-Score:  87.88% (+/- 4.34%)


In [41]:
print("="*60)
print("         FINAL MODEL COMPARISON SUMMARY")
print("="*60)

results_summary = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'CV Accuracy': ['38.00%', '93.21%', '87.75%'],
    'Precision': ['8%', '92.29%', '85.85%'],
    'Recall': ['77%', '94.29%', '90.30%'],
    'F1-Score': ['14%', '93.18%', '87.88%'],
    'Status': ['Poor', 'BEST', 'Good']
})

print(results_summary.to_string(index=False))

print("\n" + "="*60)
print("WINNER: Random Forest with 93.21% Accuracy!")
print("="*60)
print("\n Random Forest Benefits:")
print("  • Highest accuracy (93.21%)")
print("  • Excellent precision (92.29%) - fewer false alarms")
print("  • Great recall (94.29%) - catches most bankruptcies")
print("  • Balanced F1-score (93.18%)")
print("  • Robust and reliable performance")

         FINAL MODEL COMPARISON SUMMARY
              Model CV Accuracy Precision Recall F1-Score Status
Logistic Regression      38.00%        8%    77%      14%   Poor
      Random Forest      93.21%    92.29% 94.29%   93.18%   BEST
            XGBoost      87.75%    85.85% 90.30%   87.88%   Good

WINNER: Random Forest with 93.21% Accuracy!

 Random Forest Benefits:
  • Highest accuracy (93.21%)
  • Excellent precision (92.29%) - fewer false alarms
  • Great recall (94.29%) - catches most bankruptcies
  • Balanced F1-score (93.18%)
  • Robust and reliable performance


In [40]:
print("="*60)
print("MAKING PREDICTIONS ON TEST DATASET")
print("="*60)

test_predictions = rf_model.predict(X_test_scaled)

test_predictions_labels = label_encoder.inverse_transform(test_predictions)

test_df['predicted_status'] = test_predictions_labels

print("\nPrediction Distribution:")
print(test_df['predicted_status'].value_counts())
print(f"\nPercentage:")
print(test_df['predicted_status'].value_counts(normalize=True) * 100)

print("\n" + "="*60)
print("SAMPLE PREDICTIONS (First 10 companies)")
print("="*60)
print(test_df[['company_name', 'fyear', 'predicted_status']].head(10))

print("\n Predictions completed successfully!")

MAKING PREDICTIONS ON TEST DATASET

Prediction Distribution:
predicted_status
alive     14787
failed     1106
Name: count, dtype: int64

Percentage:
predicted_status
alive     93.040961
failed     6.959039
Name: proportion, dtype: float64

SAMPLE PREDICTIONS (First 10 companies)
  company_name   fyear predicted_status
0          C_3  1999.0            alive
1          C_3  2000.0            alive
2          C_3  2001.0            alive
3          C_3  2002.0            alive
4          C_3  2004.0            alive
5          C_3  2005.0            alive
6          C_3  2006.0            alive
7          C_3  2007.0            alive
8          C_3  2008.0           failed
9          C_4  1999.0            alive

 Predictions completed successfully!


In [42]:
output_simple = test_df[['company_name', 'fyear', 'predicted_status']].copy()
output_simple.to_csv('output.csv', index=False)

print(f"Total predictions: {len(output_simple)}")
print(f"\nPrediction Summary:")
print(output_simple['predicted_status'].value_counts())

output_detailed = test_df.copy()
output_detailed.to_csv('output_detailed.csv', index=False)

Total predictions: 15893

Prediction Summary:
predicted_status
alive     14787
failed     1106
Name: count, dtype: int64


In [43]:
from google.colab import files
files.download('output.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>