In [28]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the training and test datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [29]:
print("Number of null values in train_df is:", train_df.isnull().sum().sum())
train_df.head()

Number of null values in train_df is: 0


Unnamed: 0,ID_FIRM,P1110_B,P1110_E,P1120_B,P1120_E,P1130_B,P1130_E,P1140_B,P1140_E,P1150_B,...,P2421_E,P2430_B,P2430_E,P2450_B,P2450_E,P2460_B,P2460_E,P2400_B,P2400_E,BANKR
0,1,8,7,0,0,0,0,0,0,402,...,161,0,0,0,0,0,0,1561,621,0
1,2,0,0,3318,3318,0,0,0,0,208809,...,0,0,0,0,0,0,8551,-15296,-16123,0
2,3,0,0,0,0,0,0,0,0,237,...,0,0,0,0,0,6,0,-2166,-1375,0
3,4,0,0,0,0,0,0,0,0,15428,...,-55,0,22,0,4,0,0,-3390,502,0
4,5,0,0,0,0,0,0,0,0,340249,...,-895,0,11,0,0,105,0,26131,6100,0


In [30]:
print("Number of null values in train_df is:", train_df.isnull().sum().sum())
train_df.head()

Number of null values in train_df is: 0


Unnamed: 0,ID_FIRM,P1110_B,P1110_E,P1120_B,P1120_E,P1130_B,P1130_E,P1140_B,P1140_E,P1150_B,...,P2421_E,P2430_B,P2430_E,P2450_B,P2450_E,P2460_B,P2460_E,P2400_B,P2400_E,BANKR
0,1,8,7,0,0,0,0,0,0,402,...,161,0,0,0,0,0,0,1561,621,0
1,2,0,0,3318,3318,0,0,0,0,208809,...,0,0,0,0,0,0,8551,-15296,-16123,0
2,3,0,0,0,0,0,0,0,0,237,...,0,0,0,0,0,6,0,-2166,-1375,0
3,4,0,0,0,0,0,0,0,0,15428,...,-55,0,22,0,4,0,0,-3390,502,0
4,5,0,0,0,0,0,0,0,0,340249,...,-895,0,11,0,0,105,0,26131,6100,0


In [31]:
# Store the firm_ids for the output csv later
firm_ids=test_df['ID_FIRM']
# Normalize the data
train_df=(train_df-train_df.min())/(train_df.max()-train_df.min())
train_df=train_df.drop('ID_FIRM', axis=1)
test_df=(test_df-test_df.min())/(test_df.max()-test_df.min())
test_df=test_df.drop('ID_FIRM', axis=1)

train_df.head()

Unnamed: 0,P1110_B,P1110_E,P1120_B,P1120_E,P1130_B,P1130_E,P1140_B,P1140_E,P1150_B,P1150_E,...,P2421_E,P2430_B,P2430_E,P2450_B,P2450_E,P2460_B,P2460_E,P2400_B,P2400_E,BANKR
0,1.074149e-08,9.398804e-09,0.0,0.0,0.0,0.0,0.0,0.0,5.140222e-08,5.130811e-08,...,0.19138,0.0,0.0,0.0,0.0,0.0,0.0,0.548724,0.285104,0.0
1,0.0,0.0,9e-05,9.4e-05,0.0,0.0,0.0,0.0,2.669962e-05,2.644443e-05,...,0.191379,0.0,0.0,0.0,0.0,0.0,0.000297,0.548708,0.285065,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.03043e-08,3.055282e-08,...,0.191379,0.0,0.0,0.0,0.0,1.962512e-07,0.0,0.54872,0.2851,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.97272e-06,1.980648e-06,...,0.191379,0.0,4.20329e-07,0.0,5.247966e-08,0.0,0.0,0.548719,0.285104,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.350636e-05,4.498432e-05,...,0.191374,0.0,2.101645e-07,0.0,0.0,3.434395e-06,0.0,0.548746,0.285117,0.0


In [32]:
# Split the training data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(train_df.drop('BANKR', axis=1), train_df['BANKR'], test_size=0.2, random_state=42)

# Train the model, see explanation for BRF here: https://medium.com/sfu-cspmp/surviving-in-a-random-forest-with-imbalanced-datasets-b98b963d52eb
# model = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)


# # Hyperparameter Tuning
# param_grid = {
#     'max_depth': [None, 10, 20, 30]
# }

# grid_search = GridSearchCV(BalancedRandomForestClassifier(n_estimators=100, random_state=42), param_grid)
# grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_

# Train the model with best parameters
# model = BalancedRandomForestClassifier(**best_params, random_state=42)
model = BalancedRandomForestClassifier(n_estimators=100,  random_state=42)
model.fit(X_train, y_train)



# Evaluate the model
predictions = model.predict(X_test)
print("Validation Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

  warn(
  warn(
  warn(


Validation Accuracy: 0.8191810712215745
Confusion Matrix:
 [[119598  26411]
 [   107    539]]
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.82      0.90    146009
         1.0       0.02      0.83      0.04       646

    accuracy                           0.82    146655
   macro avg       0.51      0.83      0.47    146655
weighted avg       0.99      0.82      0.90    146655



In [33]:
# Predict on test set
test_predictions = model.predict(test_df)

# Prepare submission file
submission = pd.DataFrame({'ID_FIRM': firm_ids, 'BANKR': test_predictions})
submission['BANKR'] = submission['BANKR'].astype(int)
submission.to_csv('submission.csv', index=False)

submission.head()


Unnamed: 0,ID_FIRM,BANKR
0,733272,1
1,733273,1
2,733274,0
3,733275,0
4,733276,0
