# **Phase-3 : Classification Model** 

This is the final phase, where we integrate the embeddings obtained from the Knowledge Graph, the financial ratios, and the volatility index (VIX) data into one dataframe. A logistic regression model is trained on this dataframe, producing separate evaluation results for the different combinations of inputs.

This file also shows how we can combine **structured data**, such as the table of financial ratios, with **unstructured data**, such as textual data (the summary and KG in this case), to get the final logistic regression result.

In [126]:
import os

import pandas as pd

# Build the workbook paths with os.path.join instead of hard-coded backslashes:
# on Windows the resulting strings are identical to the original literals, and
# on macOS/Linux they now resolve to the same relative locations.
_ratios_dir = os.path.join('Dataset', 'FinancialRatios')
bankrupt_file_path = os.path.join(_ratios_dir, 'bankrupt_financial_ratio_dataset_final.xlsx')
# NOTE(review): the space before "_final" is kept exactly as in the original
# path; presumably it matches the file name on disk -- confirm before renaming.
healthy_file_path = os.path.join(_ratios_dir, 'healthy_financial_ratio_dataset _final.xlsx')

# One workbook per class; read_excel loads the first sheet by default.
bankrupt_data = pd.read_excel(bankrupt_file_path)
healthy_data = pd.read_excel(healthy_file_path)

##### Manually mapped the names because of the complexity of retrieval

In [138]:
# Hand-maintained lookup: file-name prefix -> company name.
# The prefix is the first '_'-separated token of each output file name, and the
# company name is the value compared against the 'Folder_name' column.
_PREFIX_NAME_PAIRS = (
    ("ABGSHIP", "ABG Shipyard Limited"),
    ("ADHUNIK", "Adhunik Metaliks Limited"),
    ("ANGIND", "ANG Industries Limited"),
    ("ASHAPURMIN", "Ashapura Minechem Limited"),
    ("BAFNAPH", "Bafna Pharmaceuticals Limited"),
    ("BHUSANSTL", "Bhushan Steel Limited"),
    ("CANDC", "C & C Constructions Limited"),
    ("EASUNREYRL", "Easun Reyrolle Limited"),
    ("EDL", "Empee Distilleries Limited"),
    ("GALLANT", "Gallantt Ispat Ltd"),
    ("GEMINI", "Gemini Communication Limited"),
    ("GUJNRECOKE", "Gujarat NRE Coke Limited"),
    ("INDOSOLAR", "Indosolar Limited"),
    ("IVRCLINFRA", "IVRCL Limited"),
    ("JAIHINDPRO", "Jaihind Projects Limited"),
    ("JENSONICOL", "Jenson & Nicholson (India) Limited"),
    ("JPINFRATEC", "Jaypee Infratech Limited"),
    ("KWALITY", "kwality limited"),
    ("ADVENZYM", "Advanced Enzyme Tech Ltd."),
    ("AFFLE", "Affle (India) Ltd."),
    ("ALEMBICLTD", "Alembic Pharmaceuticals Ltd."),
    ("AMARAJABAT", "Amara Raja Batteries Ltd."),
    ("ASTERDM", "Aster DM Healthcare Ltd."),
    ("AVANTIFEED", "Avanti Feeds Ltd."),
    ("BALRAMCHIN", "Balrampur Chini Mills Ltd."),
    ("CEATLTD", "Ceat Ltd."),
)

# dict() preserves the pair order above, so iteration order is unchanged.
matches = dict(_PREFIX_NAME_PAIRS)

From the financial ratio dataset, consider only the features and files that account for a change in the classification model's accuracy.

In [139]:
import os
import json

# os.path.join yields the same strings as the original literals on Windows and
# valid relative paths on other platforms.
bankrupt_output_path = os.path.join('output', 'bankrupt')
healthy_output_path = os.path.join('output', 'healthy')


def _collect_ratio_rows(ratio_data, output_dir):
    """Return the rows of ``ratio_data`` that correspond to entries of ``output_dir``.

    Each entry name is split on ``'_'``: the first token is looked up in
    ``matches`` to obtain the ``Folder_name`` value, and the second-to-last
    token is parsed as the integer compared against ``Feature_name``.

    Args:
        ratio_data: DataFrame with ``Folder_name`` and ``Feature_name`` columns.
        output_dir: Directory whose entry names decide which rows are kept.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the selected rows in
        ``os.listdir`` order; an empty frame with ``ratio_data``'s columns when
        the directory has no entries.

    Raises:
        KeyError: If an entry's first token is not a key of ``matches``.
        IndexError: If an entry name contains no ``'_'``.
        ValueError: If the second-to-last token is not an integer.
    """
    selected = []
    # The listing order is deliberately left untouched (no sorting): later
    # cells pair these rows with other data by position.
    for file in os.listdir(output_dir):
        tokens = file.split('_')
        company = matches[tokens[0]]
        feature = int(tokens[-2])
        row = ratio_data.loc[
            (ratio_data['Folder_name'] == company)
            & (ratio_data['Feature_name'] == feature)
        ]
        selected.append(row)

    if not selected:
        return pd.DataFrame(columns=ratio_data.columns)
    # A single concat replaces the per-iteration concat onto an initially empty
    # frame: it is linear instead of quadratic and no longer depends on how
    # pandas treats empty entries when choosing result dtypes.
    return pd.concat(selected, ignore_index=True)


bankrupt_df = _collect_ratio_rows(bankrupt_data, bankrupt_output_path)
healthy_df = _collect_ratio_rows(healthy_data, healthy_output_path)

  bankrupt_df = pd.concat([bankrupt_df, row], ignore_index=True)
  healthy_df = pd.concat([healthy_df, row], ignore_index=True)


In [None]:
# Tag each class before stacking: bankrupt rows get Label 1, healthy rows Label 0.
bankrupt_df = bankrupt_df.assign(Label=1)
healthy_df = healthy_df.assign(Label=0)

# Stack both classes (bankrupt first) into one frame with a fresh 0..n-1 index.
_labelled_frames = [bankrupt_df, healthy_df]
financial_df = pd.concat(_labelled_frames, ignore_index=True)

# Number of missing cells in each column of the combined frame.
missing_values = financial_df.isna().sum()
# print(missing_values.tolist())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


Combine the **Knowledge Graph Embeddings** with the **Financial Ratios** and the **Special Features** to build the dataset that is passed as input to the logistic regression model.

In [164]:
import os

# os.path.join keeps the Windows path unchanged and makes it valid elsewhere.
_embeddings = pd.read_csv(os.path.join('output', 'embeddings.csv'))

# Column-wise concat pairs rows by index label only; no key column is involved.
# NOTE(review): this assumes row i of embeddings.csv describes row i of
# financial_df -- confirm the CSV was written in the same order. If the row
# counts differ, the unmatched rows are padded with NaN.
merged_df = pd.concat([financial_df, _embeddings], axis=1)

In [165]:
import os

# os.path.join keeps the Windows path unchanged and makes it valid elsewhere.
vix_path = os.path.join('Dataset', 'FinancialRatios', 'VIX_yearly_means.csv')
vix_df = pd.read_csv(vix_path)

# The column is addressed as 'Close ' (with the trailing space) throughout.
overall_mean_vix = vix_df['Close '].mean()

# Left join on Feature_name == Year: every existing row is kept, and rows whose
# Feature_name has no matching Year get NaN in the VIX columns.
merged_df = merged_df.merge(vix_df, left_on='Feature_name', right_on='Year', how='left')

# Only 'Close ' is back-filled, using the mean over all rows of the VIX file.
# NOTE(review): any other columns brought in from the VIX file stay NaN for
# unmatched rows -- confirm that is intended.
merged_df['Close '] = merged_df['Close '].fillna(overall_mean_vix)

# 'Year' duplicates Feature_name after the join, so drop it (rebinding instead
# of mutating in place).
merged_df = merged_df.drop(columns=['Year'])

In [158]:
from sklearn.model_selection import train_test_split

# Input columns: six "x" columns followed by k1..k10
# (presumably the financial ratios and the KG embedding dimensions -- confirm).
_ratio_columns = ["x1", "x4", "x13", "x15", "x26", "x28"]
_k_columns = [f"k{i}" for i in range(1, 11)]
selected_ratios = _ratio_columns + _k_columns

# Fix the row order before splitting so the seeded split is reproducible.
merged_df = merged_df.sort_values(by=['Folder_name', 'Feature_name'])

X = merged_df.loc[:, selected_ratios]
y = merged_df.loc[:, "Label"]

# 80/20 random hold-out with a fixed seed.
# NOTE(review): the split is row-wise; if one Folder_name has several
# Feature_name rows, the same company can appear in both train and test --
# confirm whether a grouped split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Here we include the **KG Embeddings**, the **VIX data** and the **Financial ratios** as input for the logistic regression model.

This is done so that the result can be compared with the outputs obtained from the other structured-data inputs.

In [170]:
# Input columns: the six "x" columns, k1..k10, and seven further columns whose
# names end in a trailing space (presumably the VIX fields -- confirm).
_ratio_columns_vix = ["x1", "x4", "x13", "x15", "x26", "x28"]
_k_columns_vix = [f"k{i}" for i in range(1, 11)]
_vix_columns = [
    'Open ',
    'High ',
    'Low ',
    'Close ',
    'Prev. Close ',
    'Change ',
    '% Change ',
]
selected_ratios_vix = _ratio_columns_vix + _k_columns_vix + _vix_columns

# Fix the row order before splitting so the seeded split is reproducible.
merged_df = merged_df.sort_values(by=['Folder_name', 'Feature_name'])

X_vix = merged_df.loc[:, selected_ratios_vix]
y_vix = merged_df.loc[:, "Label"]

# 80/20 random hold-out with a fixed seed.
X_train_vix, X_test_vix, y_train_vix, y_test_vix = train_test_split(
    X_vix,
    y_vix,
    test_size=0.2,
    random_state=42,
)

In [None]:
## Alternative setup using the financial ratios only (kept commented out)
# from sklearn.model_selection import train_test_split
# selected_ratios = ["x1", "x4", "x13", "x15", "x26", "x28"]

# financial_df = financial_df.sort_values(by=['Folder_name', 'Feature_name'])

# X = financial_df[selected_ratios]
# y = financial_df["Label"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [190]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Classifiers to evaluate, keyed by the name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Report name -> {metric name -> score}.
model_performance = {}

# Missing feature values are replaced by the column mean inside the pipeline.
imputer = SimpleImputer(strategy='mean')

for model_name, model in models.items():
    print(f"Evaluating {model_name} ....")

    pipeline = Pipeline([
        ('imputer', imputer),
        ('classifier', model)
    ])

    # Mean 5-fold cross-validated accuracy, computed on the training split only.
    cv_accuracy = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy').mean()

    # Refit on the full training split, then score once on the held-out split.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Probability of class 1; None for classifiers without predict_proba.
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

    model_performance[model_name] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "cv_accuracy": cv_accuracy
    }
print("----------------------------------------------")
for model_name, metrics in model_performance.items():
    print(f"\n{model_name} Performance:")
    for metric, value in metrics.items():
        # roc_auc is None when the classifier has no predict_proba; applying
        # ':.4f' to None raises TypeError, so print a placeholder instead.
        shown = "n/a" if value is None else f"{value:.4f}"
        print(f"{metric}: {shown}")


Evaluating Logistic Regression ....
----------------------------------------------

Logistic Regression Performance:
accuracy: 0.9500
precision: 1.0000
recall: 0.9091
f1: 0.9524
roc_auc: 1.0000
cv_accuracy: 0.9875


In [191]:
# Classifiers to evaluate on the feature set that includes the VIX columns,
# keyed by the name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Report name -> {metric name -> score}.
model_performance = {}

# Missing feature values are replaced by the column mean inside the pipeline.
imputer = SimpleImputer(strategy='mean')

for model_name, model in models.items():
    print(f"Evaluating {model_name} ....")

    pipeline = Pipeline([
        ('imputer', imputer),
        ('classifier', model)
    ])

    # Mean 5-fold cross-validated accuracy, computed on the training split only.
    cv_accuracy = cross_val_score(pipeline, X_train_vix, y_train_vix, cv=5, scoring='accuracy').mean()

    # Refit on the full training split, then score once on the held-out split.
    pipeline.fit(X_train_vix, y_train_vix)
    y_pred = pipeline.predict(X_test_vix)
    # Probability of class 1; None for classifiers without predict_proba.
    y_pred_proba = pipeline.predict_proba(X_test_vix)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test_vix, y_pred)
    precision = precision_score(y_test_vix, y_pred)
    recall = recall_score(y_test_vix, y_pred)
    f1 = f1_score(y_test_vix, y_pred)
    roc_auc = roc_auc_score(y_test_vix, y_pred_proba) if y_pred_proba is not None else None

    model_performance[model_name] = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "cv_accuracy": cv_accuracy
    }

for model_name, metrics in model_performance.items():
    print(f"\n{model_name} Performance:")
    for metric, value in metrics.items():
        # roc_auc is None when the classifier has no predict_proba; applying
        # ':.4f' to None raises TypeError, so print a placeholder instead.
        shown = "n/a" if value is None else f"{value:.4f}"
        print(f"{metric}: {shown}")

Evaluating Logistic Regression ....

Logistic Regression Performance:
accuracy: 0.9000
precision: 1.0000
recall: 0.8182
f1: 0.9000
roc_auc: 1.0000
cv_accuracy: 0.9875
