# Classification

In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Visegrad Group Companies Dataset

We use the Visegrad Group Companies dataset for our classification task.
We first import the dataset as follows.

https://archive.ics.uci.edu/dataset/830/visegrad+group+companies+data-2


X1: Net profit/total assets <br/>
X2: Total liabilities/total assets <br/>
X3: Working capital/total assets <br/>
X4: Current assets/short-term liabilities <br/>
X5: Retained earnings/total assets <br/>
X6: Gross profit/total assets <br/>
X7: Book value of equity/total liabilities <br/>
X8: Net sales revenue/total assets <br/>
X9: Equity/total assets <br/>
X10: (Gross profit + financial expenses)/total assets <br/>
X11: Gross profit/short-term liabilities <br/>
X12: (Gross profit + depreciation)/sales revenues <br/>
X13: EBIT/total operating costs <br/>
X14: (Gross profit + depreciation)/total liabilities <br/>
X15: Total assets/total liabilities <br/>
X16: EBIT/total liabilities <br/>
X17: Gross profit/sales revenues <br/>
X18: EBIT/total assets <br/>
X19: Net profit/sales revenues <br/>
X20: (Equity-share capital)/total assets <br/>
X21: (Net profit + depreciation)/total liabilities <br/>
X22: EBIT/financial expenses <br/>
X23: Working capital/fixed assets <br/>
X24: Logarithm of total assets <br/>
X25: (Total liabilities-cash)/sales revenues <br/>
X26: EBIT/equity <br/>
X27: Operating expenses/short-term liabilities <br/>
X28: Operating expenses/total liabilities <br/>
X29: Profit on sales/total assets <br/>
X30: Total operating revenue/total assets <br/>
X31: (Current assets-inventories)/long-term liabilities <br/>
X32: Constant capital/total assets <br/>
X33: Profit on sales/sales revenues <br/>
X34: (Current assets-inventory-receivables)/short-term liabilities <br/>
X35: EBIT/sales revenues <br/>
X36: Net profit/inventory <br/>
X37: (Current assets-inventory)/short-term liabilities <br/>
X38: EBITDA/total assets <br/>
X39: EBITDA/sales revenues <br/>
X40: Current assets/total liabilities <br/>
X41: Short-term liabilities/total assets <br/>
X42: Equity/fixed assets <br/>
X43: Constant capital/fixed assets <br/>
X44: Working capital <br/>
X45: Net profit/equity <br/>
X46: Long-term liabilities/equity <br/>
X47: Sales revenues/inventory <br/>
X48: Sales revenues/receivables <br/>
X49: Sales revenues/short-term liabilities <br/>
X50: Sales/fixed assets <br/>
X51: (Current assets-inventory-short-term liabilities)/(total operating revenues-profit before income tax-depreciation) <br/>
X52: Net profit/net cash flow from (used in) operating activities <br/>
X53: Depreciation/net cash flow from (used in) operating activities <br/>
X54: Net cash flow from (used in) operating activities/total assets <br/>
X55: Net cash flow from (used in) operating activities/income <br/>
X56: Net cash flow from (used in) operating activities/total liabilities <br/>
X57: Net cash flow from (used in) operating activities/long-term liabilities <br/>
X58: Net cash flow from (used in) operating activities/short-term liabilities <br/>
X59: Net cash flow <br/>
X60: Net cash flow from (used in) operating activities/current assets <br/>
X61: Net cash flow from (used in) operating activities/EBIT <br/>
X62: Net profit per share <br/>
X63: Income/outstanding shares <br/>
X64: Price per share/net profit per share <br/>
X65: Yearly dividend/price per share <br/>
X66: Market capitalization/book value <br/>
X67: Market capitalization/gross profit <br/>
X68: Market capitalization/EBITDA <br/>
X69: Market capitalization to EBIT <br/>
X70: Market capitalization to total assets <br/>
X71: Market capitalization/capital employed <br/>
X72: Sales revenues (n)/sales revenues (n−1) <br/>
X73: Total sales revenue (n)/total sales revenues (n−1) <br/>
X74: Total assets (n)/total assets (n−1) <br/>
X75: Current assets (n)/current assets (n−1) <br/>
X76: EBIT (n)/EBIT (n−1) <br/>
X77: Net profit (n)/net profit (n−1) <br/>
X78: Inventory (n)/inventory (n−1) <br/>
X79: Receivables (n)/receivables (n−1) <br/>
X80: Short-term liabilities (n)/short-term liabilities (n−1) <br/>
X81: Net cash flow from (used in) operating activities (n)/Net cash flow from (used in) operating activities (n−1) <br/>
X82: Net cash flow(n)/net cash flow (n−1) <br/>

<br/>
Class Labels <br/>
(S): 1. Transportation and warehousing; 2. Wholesale trade; <br/>
3. Manufacturing; 4. Retail trade; 5. Energy; 6. Construction.


### Import dataset

In [None]:
# Read the raw CSV; header="infer" lets pandas take the column names from the file's first row
raw_csv_path = 'csv_result-2019.csv'
df_full = pd.read_csv(raw_csv_path, header="infer")

# All further work happens on an independent copy, so the frame as loaded stays untouched
df_full_copy = df_full.copy()

# Last expression of the cell: rendered as the cell output
df_full_copy




### Filter to select the 10 most important attributes to measure performance of a company

To reduce the number of attributes we are working with (and make visualisation easier), we keep only the following ten:

X1: Net profit/total assets — Measures profitability relative to assets. <br/>
X6: Gross profit/total assets — Efficiency in using assets to generate profit.<br/>
X4: Current assets/short-term liabilities — Liquidity ratio indicating the ability to cover short-term obligations.<br/>
X7: Book value of equity/total liabilities — Financial leverage and ability to meet obligations with equity.<br/>
X19: Net profit/sales revenues — Profitability relative to revenue.<br/>
X2: Total liabilities/total assets — Leverage ratio, indicating the proportion of assets financed by debt.<br/>
X13: EBIT/total operating costs — Operational efficiency indicator.<br/>
X9: Equity/total assets — Shows how much of the company's assets are financed by equity.<br/>
X54: Net cash flow from operating activities/total assets — Operational cash flow efficiency.<br/>
X72: Sales revenues (n)/sales revenues (n−1) — Sales growth indicator.<br/>

In [None]:
# Keep the id / Num / Country columns, the ten selected ratios and the class label 'S'.
# .copy() makes `df` an independent frame rather than a selection of df_full_copy,
# so later in-place modifications of `df` do not raise pandas' SettingWithCopyWarning.
selected_columns = ['id', 'Num', 'Country', 'X1', 'X2', 'X4', 'X6', 'X7', 'X9', 'X13', 'X19', 'X54', 'X72', 'S']
df = df_full_copy[selected_columns].copy()

# 2. Data Preprocessing

In [None]:

# Report the size of the selected frame and the dtype pandas inferred for each column
n_rows, n_cols = df.shape
print('Number of instances = %d' % n_rows)
print('Number of attributes = %d' % n_cols)
print(df.dtypes)

# A bare expression is only rendered when it is the cell's last statement,
# so the preview of the first rows has to come last to be displayed at all.
df.head()

In [None]:
# Coerce the ten selected ratio columns to numeric and the two label columns
# ('S', 'Country') to categoricals.
# A new frame is built with assign()/astype() instead of writing into `df`
# column by column: `df` originates from a column selection of another frame,
# where in-place column writes can raise pandas' SettingWithCopyWarning.
# Rebinding `df` also keeps this cell safe to re-run.
ratio_columns = ['X1', 'X2', 'X4', 'X6', 'X7', 'X9', 'X13', 'X19', 'X54', 'X72']

df = df.assign(
    # errors='coerce' replaces any value that cannot be parsed as a number with NaN
    **{col: pd.to_numeric(df[col], errors='coerce') for col in ratio_columns}
).astype({'S': 'category', 'Country': 'category'})

# Check data types to confirm
print(df.dtypes)
print(df['S'].cat.categories)

### Missing Values


In [None]:
# Size of the frame before any rows are removed
n_rows, n_cols = df.shape
print('Number of instances = %d' % n_rows)
print('Number of attributes = %d' % n_cols)

# One line per column with its count of missing (NaN) entries
print('Number of missing values:')
for column_name, n_missing in df.isna().sum().items():
    print('\t%s: %d' % (column_name, n_missing))

In [None]:
# Preview the first rows. `head` must be called: printing the bare attribute
# only shows the repr of the bound method, not the first five rows.
df.head()

In [None]:
# Discard every row that has at least one missing value (pandas' default behaviour, spelled out)
df_cleaned = df.dropna(axis=0, how='any')

In [None]:
# Size of the frame after rows with missing values were dropped
remaining_rows, remaining_cols = df_cleaned.shape
print('Number of instances = %d' % remaining_rows)
print('Number of attributes = %d' % remaining_cols)

# Per-column count of missing entries in the cleaned frame
missing_per_column = df_cleaned.isna().sum()
print('Number of missing values:')
for column_name in df_cleaned.columns:
    print('\t%s: %d' % (column_name, missing_per_column[column_name]))

In [None]:
# Preview the first rows of the cleaned frame. `head` must be called: printing
# the bare attribute only shows the repr of the bound method.
df_cleaned.head()

In [None]:
# Keep only the ratio columns for the outlier analysis: the 'id', 'Num' and
# 'Country' columns and the class label 'S' are left out.
non_feature_columns = ['id', 'Country', 'Num', "S"]
data2 = df_cleaned.drop(columns=non_feature_columns)

# One box per remaining column, drawn on a shared axis, to spot outliers
data2.boxplot()
plt.show()

This shows that X4, X7 and X72 have abnormally high values, while X13 and X19 have abnormally low values.

In [None]:
# Standardise every column: subtract its mean, then divide by its standard deviation
column_means = data2.mean()
column_stds = data2.std()
z = data2.sub(column_means, axis='columns').div(column_stds, axis='columns')
z

In [None]:
# Attach each ratio's z-score to its row as a 'z_score_<column>' column.
# assign() aligns `z` on the row index exactly like column assignment does, but
# returns a new frame instead of writing into `df_cleaned`. That avoids pandas'
# SettingWithCopyWarning (df_cleaned is the result of dropna() on `df`) and
# replaces ten copy-pasted assignments with a single statement.
ratio_columns = ['X1', 'X2', 'X4', 'X6', 'X7', 'X9', 'X13', 'X19', 'X54', 'X72']
df_cleaned = df_cleaned.assign(
    **{'z_score_' + col: z[col] for col in ratio_columns}
)

df_cleaned


In [None]:
# Keep only the rows whose z-score lies within [-3, 3] (bounds inclusive) for
# every one of the ten ratios; a row with a missing z-score fails the
# comparison and is removed as well.
z_score_columns = ['z_score_X1', 'z_score_X2', 'z_score_X4', 'z_score_X6', 'z_score_X7',
                   'z_score_X9', 'z_score_X13', 'z_score_X19', 'z_score_X54', 'z_score_X72']
z_scores = df_cleaned[z_score_columns]
within_bounds = (z_scores.ge(-3) & z_scores.le(3)).all(axis=1)
df_cleaned = df_cleaned[within_bounds]

df_cleaned


In [None]:
# Remove the ten temporary z-score columns so only the original columns remain
z_score_columns = ['z_score_X1', 'z_score_X2', 'z_score_X4', 'z_score_X6', 'z_score_X7',
                   'z_score_X9', 'z_score_X13', 'z_score_X19', 'z_score_X54', 'z_score_X72']
df_cleaned = df_cleaned.drop(columns=z_score_columns)

df_cleaned


In [None]:
# Flag every row that exactly repeats an earlier row, then report how many were flagged
dups = df_cleaned.duplicated()
duplicate_count = dups.sum()
print('Number of duplicate rows = %d' % duplicate_count)

# 3. Classification