In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then echo each path on its own line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This is an example of using the random forest method to predict whether a company will go bankrupt, based on the Kaggle bankruptcy-prediction dataset.

In [None]:
# Notebook configuration: values a reader may want to tune before re-running.
# Path of the dataset CSV that is read into the DataFrame with pd.read_csv.
path_str = "../input/company-bankruptcy-prediction/data.csv"
# Number of decision trees (n_estimators) used by the main random forest.
# The higher the number, the more thorough the calculation, but the longer it takes to run.
number_of_trees = 200
# Name of the column the random forest is trained to predict; it is removed from the features.
target_column_name = 'Bankrupt?'
# Maximum depth (max_depth) of the small forest whose tree is rendered as an image.
# Decision trees can be large; a depth of 3 or 4 makes the rendered tree easier to see and interpret.
tree_depth = 4

In [None]:
# Load data: read the CSV located at path_str into a pandas DataFrame.

df = pd.read_csv(path_str)

# Preview the first rows of the DataFrame (the cell's displayed output).
df.head()

In [None]:
# Check the size of the data: df.shape is the tuple (number of rows, number of columns).
df.shape

In [None]:
# Use numpy to convert to arrays.
# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, 
# along with a large collection of high-level mathematical functions to operate on these arrays.
import numpy as np

# Pull the target column out of the DataFrame as a NumPy array of labels.
target = np.array(df[target_column_name])

# Every column other than the target is a feature.
feature_frame = df.drop(columns=[target_column_name])

# Remember the feature names so later cells can label importances and tree nodes.
feature_list = feature_frame.columns.tolist()

# Array form of the feature columns, ready to be split and fed to the model.
features = np.array(feature_frame)

In [None]:
#  Using scikit-learn to split data into training and testing sets.
#  Scikit-learn (formerly scikits.learn and also known as sklearn) is a free software machine learning library for the Python programming language.
#  It features various classification, regression and clustering algorithms including support vector machines, random forests, 
#  gradient boosting, k-means and DBSCAN, and is designed to interoperate with the Python numerical and scientific libraries NumPy and SciPy.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows (test_size) for testing; the remaining rows train the model.
# The fixed random_state makes the split repeatable between runs.
# NOTE(review): if the target classes turn out to be imbalanced, consider passing
# stratify=target so both splits keep the same class ratio - confirm against the data.
split_parts = train_test_split(features, target, test_size=0.25, random_state=42)
train_features, test_features, train_target, test_target = split_parts

# Sanity check: features and targets of each split must have the same number of rows.
for label, array in (
    ('Training Features Shape:', train_features),
    ('Training target Shape:', train_target),
    ('Testing Features Shape:', test_features),
    ('Testing target Shape:', test_target),
):
    print(label, array.shape)

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Collect the forest settings in one place: n_estimators is the number of decision
# trees to grow, and the fixed random_state keeps the fit repeatable.
# NOTE(review): the target column name suggests a yes/no label; a
# RandomForestClassifier may be the more natural estimator - confirm before changing.
forest_params = {'n_estimators': number_of_trees, 'random_state': 42}
rf = RandomForestRegressor(**forest_params)

# Fit the forest on the training split; the fitted estimator is the cell's output.
rf.fit(train_features, train_target)

In [None]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
from IPython.display import Image
# pydot may need to be installed. 
try:
    import pydot
except ImportError as e:
    !pip install pydot
    import pydot

In [None]:
# Fit a deliberately shallow forest (max_depth = tree_depth) so that a single tree
# is small enough to read. random_state is fixed so the rendered tree is the same
# on every run; without it each execution would draw a different tree.
rf_small = RandomForestRegressor(n_estimators=10, max_depth=tree_depth, random_state=42)
rf_small.fit(train_features, train_target)
# Pick one of the ten shallow trees to visualize.
tree_small = rf_small.estimators_[5]
# Export the tree in Graphviz DOT format, labelling nodes with the feature names.
export_graphviz(tree_small, out_file='small_tree.dot', feature_names=feature_list, rounded=True, precision=1)
# Load the DOT file and render it once to a PNG on disk.
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')
# Display the PNG that was just written instead of rendering the graph a second time.
Image(filename='small_tree.png')

In [None]:
# Raw importance of every feature in the fitted forest, in the same order as feature_list.
importances = list(rf.feature_importances_)
# Pair each feature name with its importance and rank on the UNROUNDED value, most
# important first. Sorting after rounding would turn most features into ties and
# leave their relative order arbitrary.
ranked_importances = sorted(zip(feature_list, importances), key=lambda pair: pair[1], reverse=True)
# List of (feature, importance) tuples, rounded to two decimals for display only.
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_importances]
# Print with a plain loop: a list comprehension used only for its print side effect
# would also make the cell display a list of None values.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

**Observations:**
* From the above list of importances (listed in descending order), as well as the sample tree from the random forest, we can see that growth rate and income-to-equity are the top factors.  We do not have enough information to determine whether the growth-rate factor would also apply to large, established companies that grow more slowly.
* As a cross-check on the model, a quick sort does indeed show a high concentration of bankruptcies where net value growth rate is lowest.
* A view of the raw data shows a correlation between net-income/equity and bankruptcy cases.  Taken together, these two top factors might lead us to conclude that a low growth rate and low income are red flags for small firms.
* A second tier of factors, interest-bearing debt interest rate and borrowing dependency, likely indicates conditions typically associated with bankruptcy risk, such as high-risk/high-interest debt and being highly leveraged or dependent on borrowing to stay afloat.