In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import seaborn as sns

Load the dataset and display basic information

In [None]:
# Read the dataset CSV into `df` and print its column / dtype / non-null summary.
dataset_path = './datasets/full_cleaned_dataset.csv'
df = pd.read_csv(dataset_path)
df.info()

Calculate the Altman-Z score for each observation (where possible)
and show the top and bottom 10 companies based on the Altman-Z score.

In [None]:
# Altman Z-score, assembled from five weighted ratios:
#   Z = 1.2*A + 1.4*B + 3.3*C + 0.6*D + 0.999*E
# Each term keeps the original arithmetic order so the rounded results are unchanged.
total_assets = df["totalAssets"]

# A: current assets / total assets.
# NOTE(review): Altman's X1 is usually defined with working capital (current assets
# minus current liabilities) in the numerator; this keeps the notebook's existing
# choice of total current assets -- confirm this is intended.
term_a = 1.2 * (df["totalCurrentAssets"] / total_assets)
# B: retained earnings / total assets.
term_b = 1.4 * df["retainedEarnings"] / total_assets
# C: EBIT / total assets, with EBIT derived as EBITDA minus depreciation & amortization.
ebit = df["ebitda"] - df["depreciationAndAmortization_income_statement"]
term_c = 3.3 * ebit / total_assets
# D: market capitalisation / total liabilities.
term_d = 0.6 * df["marketcap"] / df["totalLiabilities"]
# E: revenue / total assets.
term_e = 0.999 * df["revenue"] / total_assets

z_score = term_a + term_b + term_c + term_d + term_e

# A zero denominator produces +/-inf, which dropna() does not treat as missing.
# Map it to NaN first so such rows are removed together with the incomplete ones.
df["z_score"] = z_score.replace([np.inf, -np.inf], np.nan).round(2)
# Rebind instead of inplace=True: same rows for later cells, no in-place mutation.
df = df.dropna(subset=["z_score"])

# Wide view (score plus every input) for later analysis; narrow view for printing.
df_zscore_analysis = df[['symbol', 'year', 'z_score', 'distressed', 'totalCurrentAssets', 'totalAssets', 'retainedEarnings', 'ebitda', 'depreciationAndAmortization_income_statement', 'totalLiabilities', 'marketcap', 'revenue']]
df_zscore_print = df[['symbol', 'year', 'z_score', 'distressed']]

# Ten highest and ten lowest scores.
print(df_zscore_print.sort_values(by=['z_score'], ascending=False).head(10))
print(df_zscore_print.sort_values(by=['z_score'], ascending=True).head(10))

The Altman-Z score is usually interpreted as follows:
- Z > 3.0: Safe zone
- 1.8 < Z < 3.0: Grey zone
- Z < 1.8: Distress zone

Looking at the top and bottom 10, we get very extreme values (above 25 and below -6), which are likely due to faulty or inaccurate data. These entries should be considered outliers and removed before further analysis.
Interestingly, the bottom 10 does not contain any of the companies that we considered to be distressed. As a next step, let's take a look at the scores of the distressed companies.

In [None]:
# Select the distressed rows with a boolean mask instead of where(...).dropna().
# where() blanks every non-matching row to NaN, which upcasts any integer columns to
# float in the printout, and the trailing dropna() would also discard a distressed
# row that merely has a missing value in another column (e.g. symbol or year).
distressed_rows = df_zscore_print[df_zscore_print['distressed'] == 1]
print(distressed_rows.sort_values(by=['z_score'], ascending=False))

We can see that the Altman-Z score does seem to be a decent indicator in the case of our distressed companies.
CSGN (Credit Suisse) has a very low score.
STLN (Swiss Steel Holding) had a low score in 2020, but it recovered in later years. Checking the news shows that the company went through a rebranding after some stressful years. In 2023 the score dropped again, which fits with their wish to delist from the stock market.

Now let's look at the distribution, both numerically and visually in a histogram.

In [None]:
# Summary statistics of the score, followed by its histogram.
z_scores = df['z_score']
print(z_scores.describe())

# Square-root rule: the bin count is the integer square root of the row count.
bins = int(math.sqrt(df.shape[0]))

fig, ax = plt.subplots()
ax.hist(z_scores, bins=bins, edgecolor='black')
ax.set_title('Z-Score Distribution')
ax.set_xlabel('Z-Score')
ax.set_ylabel('Frequency')
plt.show()



This is further evidence that there are extreme outliers in the data.
Let's remove the outliers based on the standard deviation (7.48) and see what the distribution looks like.

In [None]:
# Take the centre and spread from the data itself instead of copying the numbers
# from an earlier describe() printout, so the cut-offs stay correct if the data changes.
z_scores = df['z_score']
mean = z_scores.mean()
std = z_scores.std()

# Keep rows within 1 and 2 standard deviations of the mean (bounds inclusive).
# A boolean mask is used instead of where(...).dropna(): that combination would also
# drop every row with a NaN in *any* column and upcast integer columns to float.
df_1std = df[z_scores.between(mean - std, mean + std)]
df_2std = df[z_scores.between(mean - 2 * std, mean + 2 * std)]

print(df_1std['z_score'].describe())
print(df_2std['z_score'].describe())

# Square-root rule for the bin count; at least one bin so an empty frame does not break hist().
bins1 = max(1, int(math.sqrt(len(df_1std.index))))
bins2 = max(1, int(math.sqrt(len(df_2std.index))))

fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)
axs[0].hist(df_1std['z_score'], bins=bins1, edgecolor='black')
axs[0].set_title('1 Standard Deviation')
# Label the shared y-axis on the left panel; plt.ylabel() would label the current
# axes, which after plt.subplots() is the last (right-hand) panel.
axs[0].set_ylabel('Frequency')
axs[1].hist(df_2std['z_score'], bins=bins2, edgecolor='black')
axs[1].set_title('2 Standard Deviations')
for ax in axs:
    ax.set_xlabel('Z-Score')
plt.show()


Staying within a single standard deviation, the number of observations drops from 731 to 667. This is still a good number of observations to work with.
The bad news is that the mean is still very high at 3.49. The expected mean should be closer to 1.8, which is also the cutoff point for the grey zone according to Altman. However, considering that the data we're working with isn't drawn from random companies but sourced from a high-performing index, this might be a reasonable result.

The companies with negative scores are particularly interesting, so let's take a look at them.

In [None]:
# Rows of the 1-std sample with a negative Z-score. A boolean mask is used instead of
# where(...).dropna(), which would also drop rows with a NaN in any unrelated column
# and upcast integer columns to float.
negative_scores = df_1std[df_1std['z_score'] < 0]

# Keep the score and its inputs. The explicit copy makes this an independent frame,
# so adding columns to it later does not trigger SettingWithCopyWarning.
negative_z_score_analysis = negative_scores[['symbol', 'year', 'z_score', 'totalCurrentAssets', 'totalAssets', 'retainedEarnings', 'ebitda', 'depreciationAndAmortization_income_statement', 'totalLiabilities', 'marketcap', 'revenue']].copy()


In [None]:
# Rebuild the breakdown from `negative_scores` on every run. The previous version
# assigned into, and then dropped input columns from, the same variable, so
# re-running the cell raised KeyError on 'ebitda' and could emit SettingWithCopyWarning.
# assign() returns a new frame each time; the resulting column order is unchanged.
negative_z_score_analysis = (
    negative_scores[['symbol', 'year', 'z_score', 'totalCurrentAssets', 'totalAssets', 'retainedEarnings', 'ebitda', 'depreciationAndAmortization_income_statement', 'totalLiabilities', 'marketcap', 'revenue']]
    # EBIT derived as EBITDA minus depreciation & amortization.
    .assign(ebit=lambda d: d['ebitda'] - d['depreciationAndAmortization_income_statement'])
    .drop(columns=['ebitda', 'depreciationAndAmortization_income_statement'])
    # A-E are the five ratios that enter the Z-score formula.
    .assign(
        A=lambda d: d['totalCurrentAssets'] / d['totalAssets'],
        B=lambda d: d['retainedEarnings'] / d['totalAssets'],
        C=lambda d: d['ebit'] / d['totalAssets'],
        D=lambda d: d['marketcap'] / d['totalLiabilities'],
        E=lambda d: d['revenue'] / d['totalAssets'],
    )
)

# Export the breakdown for inspection outside the notebook.
negative_z_score_analysis.to_csv('./datasamples/negative_z_score_analysis.csv', index=False)

In [None]:
# Correlations within the negative-score companies, from two angles:
# the raw data points, and the five ratios (A-E) used by the Z-score formula.
corr_data_raw = negative_z_score_analysis[['z_score', 'totalCurrentAssets', 'totalAssets', 'retainedEarnings', 'ebit', 'marketcap', 'totalLiabilities', 'revenue']]
corr_data_z = negative_z_score_analysis[['z_score','A', 'B', 'C', 'D', 'E']]

matrix1 = corr_data_raw.corr(numeric_only=True)
matrix2 = corr_data_z.corr(numeric_only=True)

fig, axs = plt.subplots(ncols=2, figsize=(15, 5))
sns.heatmap(matrix1, annot=True, ax=axs[0])
sns.heatmap(matrix2, annot=True, ax=axs[1])
# Title each panel so the figure can be read without the surrounding prose.
axs[0].set_title('Raw Data Points (Negative Z-Score)')
axs[1].set_title('Altman Ratios (Negative Z-Score)')
plt.show()

Now we have two matrices showing correlations between the Altman Z-Score and the data points of the companies with negative scores.
The first matrix shows the correlations of the data points as given in the dataset.
The second matrix shows the correlations of the ratios used by the Altman Z-Score calculation.
Here we see that 'B', the retained earnings to total assets ratio, has the strongest correlation with the z-score.

Looking at the data, we see that all retained earnings are negative in this subset.

Let's create the second matrix using the entire dataset, and compare the two matrices.

In [None]:
# Same ratio breakdown as for the negative subset, on the whole 1-std sample.
# Built with assign() so columns are added to a new frame rather than to a slice of
# `df_1std`, which could emit SettingWithCopyWarning.
complete_z_score_analysis = (
    df_1std[['symbol', 'year', 'z_score', 'totalCurrentAssets', 'totalAssets', 'retainedEarnings', 'ebitda', 'depreciationAndAmortization_income_statement', 'totalLiabilities', 'marketcap', 'revenue']]
    # EBIT derived as EBITDA minus depreciation & amortization.
    .assign(ebit=lambda d: d['ebitda'] - d['depreciationAndAmortization_income_statement'])
    .drop(columns=['ebitda', 'depreciationAndAmortization_income_statement'])
    # A-E are the five ratios that enter the Z-score formula.
    .assign(
        A=lambda d: d['totalCurrentAssets'] / d['totalAssets'],
        B=lambda d: d['retainedEarnings'] / d['totalAssets'],
        C=lambda d: d['ebit'] / d['totalAssets'],
        D=lambda d: d['marketcap'] / d['totalLiabilities'],
        E=lambda d: d['revenue'] / d['totalAssets'],
    )
)

complete_corr_data = complete_z_score_analysis[['z_score','A', 'B', 'C', 'D', 'E']]

# Correlation of the score with its ratios: full 1-std sample vs. negative-score subset.
complete_matrix = complete_corr_data.corr(numeric_only=True)
negative_matrix = corr_data_z.corr(numeric_only=True)

fig, axs = plt.subplots(ncols=2, figsize=(15, 5))
sns.heatmap(complete_matrix, annot=True, ax=axs[0])
sns.heatmap(negative_matrix, annot=True, ax=axs[1])
axs[0].set_title('All Z-Score')
axs[1].set_title('Negative Z-Score')
plt.show()

In comparison, the correlation of B is weaker in the full dataset, but it's still decently strong.
The strongest correlation is now D, the market cap to total liabilities ratio, which I believe makes sense, as we're dealing with companies that have been sourced from a high-performing index.

In [None]:
# Bar chart of the Z-score per symbol, ordered from highest to lowest score.
# NOTE(review): the frame also carries a 'year' column, so a symbol can presumably
# occur in several rows; bars sharing a symbol are drawn at the same x position and
# overlap -- confirm this is intended.
df_zscore_analysis = df_zscore_analysis.sort_values(by='z_score', ascending=False)

plt.bar(x=df_zscore_analysis['symbol'], height=df_zscore_analysis['z_score'])
