# Hello
In this kernel, I will be using scikit-learn to predict if a company is bankrupt.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the bankruptcy dataset inside the Kaggle input directory.
dataset_path = "../input/company-bankruptcy-prediction/data.csv"
data = pd.read_csv(dataset_path)

# Preview the first rows of the table.
data.head()

Let's see what the data looks like.

In [None]:
# Show summary statistics for the columns of the dataset.
data.describe()

Well, it looks like the dataset has a ton of columns. This should be helpful when we create a model.

In [None]:
# Pairwise correlation between the columns of the dataset
corr = data.corr()

# Boolean mask that hides the upper triangle (diagonal included), since it
# mirrors the lower triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Diverging colormap for the heatmap cells
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Figure and axes the heatmap is drawn on
f, ax = plt.subplots(figsize=(11, 9))

# Heatmap appearance: masked upper triangle, square cells, thin separators
# and a shrunken colorbar
heatmap_options = {
    "mask": mask,
    "cmap": cmap,
    "vmax": .3,
    "center": 0,
    "square": True,
    "linewidths": .5,
    "cbar_kws": {"shrink": .5},
}
sns.heatmap(corr, **heatmap_options)

This dataset has so many columns that you can't really see much in a single correlation heatmap.

Let's take a look at the individual columns to see how their values are spread out.



In [None]:
# Draw one horizontal violin plot per column to inspect its distribution.
for name in data.columns:
    # Keep the plot's Axes in `ax` and set the title as a separate step.
    # Chaining `.set_title(name)` onto the plotting call would bind `ax` to
    # the title object instead of the Axes.
    ax = sns.violinplot(data=data[name], orient="h", palette="Set2")
    ax.set_title(name)
    plt.show()

    

For this dataset we are trying to predict whether a company is bankrupt. So we will have two different groups of outputs
0. Not Bankrupt
1. Bankrupt

This means that I will need to build a classifier. 

For this task I will use one of sklearn's various models. I think an SVM should do the job

[Support Vector Machine](https://scikit-learn.org/stable/modules/svm.html)

In [None]:
# First we have to create the subsets of the data
import sklearn
from sklearn import svm
# Import the splitter explicitly. `import sklearn` on its own does not
# guarantee that the `sklearn.model_selection` submodule has been loaded, so
# reaching it as an attribute only works when another import happens to load it.
from sklearn.model_selection import train_test_split

# Name of the label column (1 = bankrupt, 0 = not bankrupt)
target = "Bankrupt?"

# Shuffle all rows, then separate the feature columns from the label column.
shuffled_data = data.sample(frac=1)
X = shuffled_data.drop(columns=[target])
y = shuffled_data[target]

# Hold out 25% of the rows for validation.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


Now let's create our initial model.

In [None]:
# Create a support vector classifier with scikit-learn's default settings.
# NOTE(review): the features are passed in unscaled; SVMs are generally
# sensitive to feature scale -- consider standardizing first (confirm against
# the scikit-learn SVM guide).
clf = svm.SVC()
# Fit the classifier on the training split.
clf.fit(x_train, y_train)

Let's score it.

In [None]:
# Predict a label for every row of the held-out validation features.
prediction = clf.predict(x_test)
# Display the predicted labels as the cell output.
prediction

In [None]:
# Pair every true label with its prediction and tally the mismatches
y_list = y_test.tolist()
counter = sum(
    1 for actual, predicted in zip(y_list, prediction) if actual != predicted
)

# Number of validation samples that were predicted incorrectly
counter

Now we can display this accuracy score as a rounded percent.

This code finds the fraction of the data points that were predicted wrong, subtracts it from 1 to get the fraction that were right, converts that fraction to a percentage, and finally rounds it to two decimal places.

In [None]:
# Convert the error count into an accuracy percentage rounded to two decimals
wrong_fraction = counter / len(prediction)
accuracy_percent = round(100 * (1 - wrong_fraction), 2)
print(f"Accuracy: {accuracy_percent}%")

Now let's create a function to do this automatically, so we can train several models and keep the best one.

In [None]:
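# It looks like the SVM does not give a different model each time it is
# retrained on the same data, like I thought it would.
# I found a classifier that is randomized (RandomForestClassifier) and I will
# use it instead, so that repeated training runs can produce different models.
# As you can see, I am a beginner and any advice would be appreciated!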

from sklearn.ensemble import RandomForestClassifier

def train_model(X, y, x_test, y_test, random=True):
    """Train a classifier, print its accuracies and return it with its error count.

    Args:
        X: Training features.
        y: Training labels.
        x_test: Validation features.
        y_test: Validation labels.
        random: If True (the default) a ``RandomForestClassifier`` is trained,
            otherwise an ``svm.SVC``.

    Returns:
        A ``(clf, validation_errors)`` tuple. ``validation_errors`` is the
        number of misclassified validation samples as counted by
        ``scoreModel`` -- an error count (lower is better), not an accuracy.
    """
    # Pick the estimator type the caller asked for
    clf = RandomForestClassifier() if random else svm.SVC()

    # Train the model
    clf.fit(X, y)

    # scoreModel returns the number of wrong predictions; turn it into the
    # percentage of correct ones for display
    training_errors = scoreModel(clf, X, y)
    print(f"\nTraining Accuracy: {round(100*(1 - training_errors/len(X)), 2)}%")

    validation_errors = scoreModel(clf, x_test, y_test)
    print(f"Validation Accuracy: {round(100*(1 - validation_errors/len(x_test)), 2)}%")

    # Only the validation error count is returned; it is measured on
    # x_test/y_test, which are not used for fitting in this function
    return clf, validation_errors

def scoreModel(model, X, y):
    """Count how many samples in ``X`` the model predicts incorrectly.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X: Feature rows to predict on.
        y: True labels, aligned positionally with the rows of ``X``. Any
            array-like is accepted (pandas Series, NumPy array, list, ...).

    Returns:
        int: Number of positions where the prediction differs from the label.

    Raises:
        ValueError: If the predictions and the labels differ in shape, since
            a positional comparison would then be meaningless.
    """
    predicted = np.asarray(model.predict(X))
    # np.asarray compares by position, ignoring any pandas index on ``y``
    actual = np.asarray(y)

    if predicted.shape != actual.shape:
        raise ValueError(
            f"predictions have shape {predicted.shape} but labels have "
            f"shape {actual.shape}"
        )

    # Vectorized mismatch count; cast so callers get a plain Python int
    return int(np.count_nonzero(predicted != actual))

Next we can run the function multiple times and save the best one.

In [None]:
# Start from the SVM as a baseline so there is always a model to fall back on
bestModel, bestScore = train_model(x_train, y_train, x_test, y_test, False) # Just in case all of our models are horrible

# Train the default (random forest) model several times
for i in range(15):
    model, counter = train_model(x_train, y_train, x_test, y_test)
    # train_model returns an error count, so a lower value is a better model
    if counter < bestScore:
        bestModel, bestScore = model, counter

# Divide by the size of the validation set the scores were computed on,
# rather than by the length of `prediction` left over from an earlier cell
print(f"\nThe best model had an accuracy of {round(100*(1 - bestScore/len(x_test)), 2)}%")


In [None]:
# Save the best model to "best_model.joblib" (relative path, so it lands in
# the current working directory).
# Author's note: not sure if this works on Kaggle, but if you download this
# file and run it, it should create a save file for the best model.
from joblib import dump, load
dump(bestModel, "best_model.joblib")

In [None]:
# To load from the save
# NOTE(review): joblib files are presumably pickle-based -- only load files
# you created yourself or otherwise trust (confirm in the joblib docs).
bestModelFromSave = load("best_model.joblib")

All Done!