# Machine Learning Project

### Company Bankruptcy Prediction

Names:    
    - Denis Mugisha   
    - Liu Guangqiang   
    - Rachel Fanti   
    
Dataset: https://www.kaggle.com/fedesoriano/company-bankruptcy-prediction

Date: May/2021

## Libraries

In [12]:
# Basics
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd
import seaborn as sns

In [13]:
# Evaluation of the model
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, balanced_accuracy_score
from sklearn.metrics import mean_squared_error, roc_curve, auc
from sklearn.model_selection import learning_curve, validation_curve

### Evaluating the Algorithm

**Confusion matrix:**  
- True positives (TP): positive tuples correctly labeled as positive   
- True negatives (TN): negative tuples correctly labeled as negative   
- False positives (FP): negative tuples incorrectly labeled as positive   
- False negatives (FN): positive tuples incorrectly labeled as negative   

**Classification report:**   
- Accuracy = (TP+TN)/(TP+TN+FP+FN)   
- Precision = TP/(TP+FP)   
- Recall = TP/(TP+FN)   
- F1-measure = 2 * Precision * Recall / (Precision + Recall)   
- WeightedAccuracy = (Wtp*TP+Wtn*TN)/(Wtp*TP+Wtn*TN+Wfp*FP+Wfn*FN)  

**Error:**
- Mean squared error (MSE)

**Receiver Operating Characteristic (ROC) curve:** 
 - Area Under the Curve (AUC)

In [14]:
# df_results = pd.DataFrame(columns = ['Scenarios','#Features', 'Train/Test', 'Acc', 'Bal_Acc','M_P','M_R', 'M_F1', "P0", "P1", "R0", "R1", 'MSE', 'Auc'])
# print (df_results)

In [15]:
def evaluation_set(clf, y, predict, df_results, type_set, scenario_name, n_feat):
    """Compute the metrics of one scenario on one data set and record them.

    A single row is appended to ``df_results`` in place; nothing is returned.

    Parameters
    ----------
    clf
        Not used by this function.
    y
        True labels. The report is indexed with the keys ``'0'`` and ``'1'``,
        so the labels must render as those two strings.
    predict
        Predictions for the same samples as ``y``.
    df_results
        Results table that receives the new row. It is expected to have one
        column per value in the row assembled below (14 values).
    type_set
        Name of the evaluated set (e.g. train or validation/test).
    scenario_name
        Name of the scenario being evaluated.
    n_feat
        Number of features, stored as ``'<n_feat> features'``.
    """
    # Classification report: overall, macro-averaged and per-class figures
    summary = classification_report(y, predict, output_dict=True)
    accuracy = round(summary['accuracy'], 2)
    balanced = round(balanced_accuracy_score(y, predict), 2)

    macro = summary['macro avg']
    macro_scores = [round(macro[metric], 2)
                    for metric in ('precision', 'recall', 'f1-score')]

    # Ordered as P0, P1, R0, R1
    per_class = [round(summary[label][metric], 2)
                 for metric in ('precision', 'recall')
                 for label in ('0', '1')]

    # MSE - Mean squared error
    mse = round(mean_squared_error(y, predict), 3)

    # AUC
    # NOTE(review): if `predict` holds hard class labels rather than scores,
    # the ROC curve has a single operating point - confirm with the callers.
    false_pos_rate, true_pos_rate, _ = roc_curve(y, predict)
    area = round(auc(false_pos_rate, true_pos_rate), 2)

    row = [scenario_name, str(n_feat) + ' features', type_set,
           accuracy, balanced, *macro_scores, *per_class, mse, area]
    df_results.loc[len(df_results) + 1] = row

In [16]:
def evaluation_scenario(clf, y_train, y_test, predict_train, predict_test, df_results, scenario_name, n_feat):
    """Record the metrics of a scenario for the train set, then the val/test set.

    Each set adds one row to ``df_results`` through ``evaluation_set``.
    """
    per_set = (
        (y_train, predict_train, 'Train'),
        (y_test, predict_test, 'Val/Test'),
    )
    for labels, predictions, set_name in per_set:
        evaluation_set(clf, labels, predictions, df_results, set_name, scenario_name, n_feat)

In [17]:
def print_confusion_matrix(clf, X, y, type_set):
    """Print a heading and display the confusion matrix of ``clf`` on ``(X, y)``.

    ``type_set`` names the evaluated set and only appears in the heading.
    Nothing is returned.
    """
    # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0
    # and removed in 1.2; ConfusionMatrixDisplay.from_estimator replaces it.
    # Switching also requires changing the file-level import.
    heading = f'Confusion matrix - {type_set}:'
    print(heading)

    plot_confusion_matrix(clf, X, y)
    plt.show()

In [18]:
def print_classification_report(clf, y, predict, type_set):
    """Print the classification report and the MSE for one data set.

    Parameters
    ----------
    clf
        Not used by this function.
    y
        True labels.
    predict
        Predictions for the same samples as ``y``.
    type_set
        Name of the evaluated set; shown in both headings.
    """
    print(f'Classification Report - {type_set}:')
    print(classification_report(y, predict))
    print()
    # The f-prefix was missing here, so the set name was never interpolated
    # and the literal text "{type_set}" was printed instead.
    print(f'MSE - {type_set}:', round(mean_squared_error(y, predict), 3))

In [19]:
def plot_roc_curve(y, predict, type_set):
    """Draw the ROC curve of ``predict`` against ``y`` with its AUC in the legend.

    A dashed red diagonal is drawn as the random-guess baseline.
    ``type_set`` names the evaluated set and appears in the plot title.
    Nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y, predict)
    roc_label = 'ROC curve (area = %.2f)' % auc(false_pos_rate, true_pos_rate)

    # Model curve first, then the baseline
    plt.plot(false_pos_rate, true_pos_rate, label=roc_label)
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random guess')

    plt.title(f'AUC & ROC curve - {type_set}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid()
    plt.legend()
    plt.show()

In [20]:
def print_evaluation_scenario(clf, X_train, y_train, X_test, y_test, predict_train, predict_test):
    """Show the evaluation plots of a scenario for the validation/test set.

    Only the confusion matrix and the ROC curve of the val/test set are
    produced. The train-set views and the textual reports are kept below as
    commented-out toggles.
    """
    # print_confusion_matrix(clf, X_train, y_train, 'Train')
    # print_classification_report(clf, y_train, predict_train, 'Train')
    # print_classification_report(clf, y_test, predict_test, 'Val/Test')
    # plot_roc_curve(y_train, predict_train, 'Train')

    # Both helpers only display output, so their results are not kept.
    print_confusion_matrix(clf, X_test, y_test, 'Val/Test')
    plot_roc_curve(y_test, predict_test, 'Val/Test')

**Learning curve:**

In [21]:
def plot_learning_curve(estimator, X, y, ylim=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Print and plot the balanced-accuracy learning curve of ``estimator``.

    The scores come from ``learning_curve`` with stratified 5-fold cross
    validation. The mean train and test scores per training size are printed,
    then plotted with a band of one standard deviation around each curve.

    Parameters
    ----------
    estimator
        Estimator forwarded to ``learning_curve``.
    X, y
        Features and labels forwarded to ``learning_curve``.
    ylim
        Optional limits unpacked into ``plt.ylim``; skipped when ``None``.
    n_jobs
        Forwarded to ``learning_curve``.
    train_sizes
        Training-set sizes forwarded to ``learning_curve``.
    """
    # Imported locally: the visible import cells of this file do not bring
    # StratifiedKFold into scope.
    from sklearn.model_selection import StratifiedKFold

    # cv = 5, instead of 10 to avoid samples without class 1
    n_splits = 5
    #cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    cv = StratifiedKFold(n_splits=n_splits, random_state=None, shuffle=False)

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, scoring='balanced_accuracy', cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One row per training size, one column per fold: aggregate over the folds
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    print(f'#Cv - Train: {train_scores.shape} - Test: {test_scores.shape}\n')
    print('Train and test scores according to the sample size\n')
    print('Train: {0}'.format(train_scores_mean))
    print('Test:  {0}'.format(test_scores_mean))

    # The second line used to be a plain string, so the literal text "{cv}"
    # ended up in the title; interpolate the number of splits instead.
    title = ("Learning Curve for Multi Layer Preceptron (MLP)\n"
             f"Cross Validation of {n_splits} splits\n")

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Number of training examples")
    plt.ylabel("Balanced Accuracy Score")

    #plt.grid()
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Testing score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.legend(loc='best')
    plt.show()
