In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel

In [None]:
# Read the sampled dataset (comma-separated) into the working DataFrame
df = pd.read_csv("sampled_10_percent.csv", sep=',')

In [None]:
# Row count of the freshly loaded frame
num_rows = len(df)
print("Total number of rows in the DataFrame:", num_rows)

In [None]:
# Plain-text preview of the first five rows
print(df.iloc[:5])

In [None]:
# Clean the data
# Number of most-recent rows kept per company (the notebook treats one row as one month).
N_RECENT = 36

# Delisting codes this notebook flags as bankruptcy. The comparison is done on
# numeric values so that every code matches regardless of whether DLSTCD was
# parsed as int, float or str; a mixed str/int list can only ever match one
# half of the codes. Unparseable or missing codes become NaN and are not flagged.
bankruptcy_codes = [450, 460, 470, 490, *range(550, 586)]
df['Bankruptcy'] = pd.to_numeric(df['DLSTCD'], errors='coerce').isin(bankruptcy_codes)

# Parse the date once and sort chronologically within each company
# (sorting on the parsed datetime, not on the raw string column).
df['date'] = pd.to_datetime(df['public_date'])
df = df.sort_values(by=['permno', 'date'])

# Mark all records of a company as bankrupt if identified as such at any point
df['Bankrupt'] = df.groupby('permno')['Bankruptcy'].transform('max')

# Keep only companies with at least N_RECENT rows, then keep each company's
# N_RECENT most recent rows. groupby().tail() preserves the sorted row order and
# keeps the 'permno' column without relying on groupby.apply passing the
# grouping column through.
rows_per_company = df.groupby('permno')['permno'].transform('size')
filtered_df = (
    df[rows_per_company >= N_RECENT]
    .groupby('permno')
    .tail(N_RECENT)
    .reset_index(drop=True)
)

In [None]:
# Distribution of the per-row bankruptcy flag
bankruptcy_counts = df['Bankruptcy'].value_counts()
print("Original Bankruptcy count:", bankruptcy_counts, sep="\n")

In [None]:
# Distribution of the company-level flag after propagation across each permno
bankrupt_counts = df['Bankrupt'].value_counts()
print("\nPropagated Bankrupt count:", bankrupt_counts, sep="\n")

In [None]:
# Report how many rows survive the per-company filtering
n_filtered_rows = len(filtered_df)
print("Number of rows after filtering:", n_filtered_rows)

In [None]:
# Feature columns: the ratio columns are named X1 through X71 (71 columns in total)
ratios = [f'X{idx}' for idx in range(1, 72)]

In [None]:
# Split the data
# Build features/target from filtered_df (the last-36-rows-per-company frame)
# rather than the unfiltered df, so the filtering step actually takes effect.
X = filtered_df[ratios]
y = filtered_df['Bankrupt']

# Stratify on the target so both splits keep the same bankrupt/non-bankrupt ratio.
# NOTE(review): 'Bankrupt' is constant per permno, so a row-level split places
# rows of the same company in both train and test; consider a group-aware split
# (grouping on permno) if the test set is used for model evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Fill missing values with per-column means learned from the training split only
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [None]:
# Standardise features; scaling statistics come from the training split only
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


In [None]:
# Oversample the training split with SMOTE to balance the classes
# (the test split is left untouched)
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X=X_train_scaled, y=y_train)


In [None]:
# Feature selection with LASSO
# Fit the L1-penalised linear model on the balanced training data;
# fit() returns the estimator itself, so it can be chained.
lasso = Lasso(alpha=0.001, random_state=42).fit(X_train_resampled, y_train_resampled)

# Wrap the already-fitted model (prefit=True) in a selector and reduce both
# matrices to the columns the selector keeps.
model = SelectFromModel(estimator=lasso, prefit=True)
X_train_selected, X_test_selected = (
    model.transform(X_train_resampled),
    model.transform(X_test_scaled),
)

In [None]:
# Map the selector's boolean support mask back onto the feature column names
support_mask = model.get_support()
selected_features = X.columns[support_mask]
print("Selected features:", selected_features)

In [None]:
# Extracting the coefficients from the Lasso model
lasso_coefficients = lasso.coef_

# One row per input feature with its fitted coefficient
coefficients_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': lasso_coefficients})

# Keep only the selected features and add the absolute coefficient.
# .assign() returns a new frame, so nothing is written onto a filtered slice
# of coefficients_df (which would raise SettingWithCopyWarning).
selected_coefficients_df = (
    coefficients_df[coefficients_df['Feature'].isin(selected_features)]
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
)

# Normalizing the coefficients to get their percentage contribution
total = selected_coefficients_df['Abs_Coefficient'].sum()

# Add the percentage column and sort by importance (highest to lowest)
selected_coefficients_df = (
    selected_coefficients_df
    .assign(Percentage_Contribution=lambda frame: (frame['Abs_Coefficient'] / total) * 100)
    .sort_values('Abs_Coefficient', ascending=False)
)

selected_coefficients_df

In [None]:
# Plain-text dump of the first 36 rows of the coefficient table
print(selected_coefficients_df.iloc[:36])