## Preparation of the dataset

This notebook prepares the dataset for the classifier. It analyzes the pairwise correlation between the variables and drops the variables that are (almost) perfectly correlated with another one.

### 1. Import

In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt

### 2. Loading the data

In [2]:
# load the dataset from the CSV file (path is relative to the working directory)
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a saved
# index from an earlier export; confirm before relying on column positions
df = pd.read_csv(filepath_or_buffer="final_df_dec.csv")
df.head()

Unnamed: 0,company_name,city,district,province,industry,sector,last_year_ava,plbt_2016,plbt_2015,plbt_2014,...,wc_2015,wc_2014,wc_2013,wc_2012,ebit_2016,ebit_2015,ebit_2014,ebit_2013,ebit_2012,bankrupt
0,"INTERCEMENT PORTUGAL, S.A.",Lisboa,Lisboa,Lisboa,business and other management consultancy acti...,consultancy,2018-12-31,-6942.67,-14438.98,4059.84,...,,,,,-5824.87,-13714.53,-574.51,10786.05,31216.17,0
1,"TAP - TRANSPORTES AÉREOS PORTUGUESES, SGPS, S.A.",Lisboa,Lisboa,Lisboa,activities of holding companies non financial,holding companies,2018-12-31,6612.05,-117714.28,-12043.21,...,,,,,-42123.12,5689.15,5772.95,4640.32,-17129.81,0
2,"INVESTGAVE, SGPS, S.A.",Lisboa,Lisboa,Lisboa,activities of holding companies non financial,holding companies,2018-12-31,-145848.42,-141530.59,-280881.84,...,,,,,-145182.9,-140783.81,-280590.36,-146308.52,-146.32,0
3,SOMAGUE - SOCIEDADE GESTORA DE PARTICIPAÇÕES S...,Lisboa,Lisboa,Lisboa,activities of holding companies non financial,holding companies,2018-12-31,-32358.51,-34433.79,-121.3,...,,,,,-30769.77,-32322.32,2278.59,8806.28,6533.03,0
4,"INVESTGAVE IV, SGPS, S.A.",Lisboa,Lisboa,Lisboa,activities of holding companies non financial,holding companies,2018-12-31,-40568.9,-31819.79,-37698.85,...,,,,,-12652.25,-14650.9,-17555.11,-55605.31,-11.65,0


### 3. Correlation

In [3]:
# check perfect correlation appearance
# Only numeric/bool columns can be correlated. Select them explicitly, because
# pandas >= 2.0 raises on text columns instead of silently skipping them.
corr = df.select_dtypes(include=["number", "bool"]).corr()

# positions (row, column) of every coefficient above the threshold
# NOTE(review): only positive correlations are caught; strongly negative pairs
# (corr < -0.99) are not reported; confirm whether that is intended
rows, cols = np.where(corr > 0.99)

# keep the upper triangle only (x < y): this removes the diagonal and the
# mirrored duplicate of every pair
indices = [(corr.index[x], corr.columns[y]) for x, y in zip(rows, cols) if x < y]
indices

[('plbt_2016', 'netinc_2016'),
 ('plbt_2015', 'netinc_2015'),
 ('plbt_2013', 'netinc_2013'),
 ('plbt_2012', 'netinc_2012'),
 ('totass_2013', 'totass_2012'),
 ('ecoprof_2016', 'rota_2016'),
 ('ecoprof_2015', 'rota_2015'),
 ('ecoprof_2014', 'rota_2014'),
 ('ecoprof_2013', 'rota_2013'),
 ('ecoprof_2012', 'rota_2012'),
 ('finprof_2016', 'rosf_2016'),
 ('finprof_2015', 'rosf_2015'),
 ('finprof_2014', 'rosf_2014'),
 ('finprof_2013', 'rosf_2013'),
 ('finprof_2012', 'rosf_2012'),
 ('genliq_2016', 'curr_2016'),
 ('genliq_2015', 'curr_2015'),
 ('genliq_2014', 'curr_2014'),
 ('genliq_2013', 'curr_2013'),
 ('genliq_2012', 'curr_2012'),
 ('indeptness_2015', 'indeptness_2014'),
 ('indeptness_2015', 'indeptness_2013'),
 ('indeptness_2014', 'indeptness_2013'),
 ('solr_2015', 'solr_2014'),
 ('solr_2015', 'solr_2013'),
 ('solr_2014', 'solr_2013'),
 ('totassets/empl_2013', 'totassets/empl_2012')]

### 4. Dropping entire columns because of correlation

In [4]:
# variables to drop because of perfect correlation as seen before
# The first six feature families are listed for all five years (2016 down to
# 2012); the last four names are single columns added explicitly.
feat_to_drop = [
    f"{family}_{year}"
    for family in ("plbt", "ecoprof", "finprof", "genliq", "indeptness", "solr")
    for year in range(2016, 2011, -1)
] + [
    "totass_2013",
    "totass_2012",
    "totassets/empl_2013",
    "totassets/empl_2012",
]

In [5]:
# function for dropping specific columns by means of a list of these columns
def dropping(df, feat_to_drop, start=7, stop=178):
    """Keep a positional window of columns and drop the listed ones from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reduce. It is not modified; a new frame is returned.
    feat_to_drop : list of str
        Column labels to remove after the positional selection. Every label
        must be present in the selected window, otherwise ``drop`` raises.
    start, stop : int, optional
        Positional column window ``[start, stop)`` that is kept before
        dropping. The defaults (7 and 178) reproduce the original hard-coded
        slice.
        NOTE(review): with the column order shown by ``df.head()``, position 7
        appears to be ``last_year_ava``; confirm that keeping it is intended.

    Returns
    -------
    pandas.DataFrame
        The selected columns without the ones named in ``feat_to_drop``.

    Raises
    ------
    KeyError
        If a label in ``feat_to_drop`` is not among the selected columns.
    """
    # positional selection first, so the labels are looked up in the window only
    selected = df.iloc[:, start:stop]
    return selected.drop(columns=feat_to_drop)

### 5. Dividing the dataframe in subdataframes for every year

In [6]:
# function for dividing the entire dataframe in subdataframes
def division_subdf(df, year="2016", target="bankrupt"):
    """Return the sub-dataframe of one year plus the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings containing the year.
    year : str or int, optional
        Year to select. A column is kept when ``str(year)`` occurs anywhere in
        its label (substring match). Defaults to ``"2016"``, the original
        hard-coded behaviour.
    target : str, optional
        Label of the target column appended after the year columns.
        Defaults to ``"bankrupt"``.

    Returns
    -------
    pandas.DataFrame
        The columns of the requested year, in their original order, followed
        by the target column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    year_tag = str(year)
    selected = [label for label in df.columns if year_tag in label]
    selected.append(target)
    return df.loc[:, selected]

### 6. Pipeline data preparation

In [7]:
# function for the whole preparation pipeline
def pipeline_preparation(df):
    """Run the full preparation and return both resulting frames.

    First removes the columns listed in the module-level ``feat_to_drop``
    via ``dropping``, then extracts the 2016 sub-dataframe from the reduced
    frame via ``division_subdf``.

    Returns a tuple ``(reduced_frame, frame_2016)``.
    """
    reduced = dropping(df, feat_to_drop)
    return reduced, division_subdf(reduced)

In [8]:
# calling the pipeline_preparation function
# NOTE: this rebinds `df` to the reduced frame, so the cell cannot simply be
# re-run: a second run would apply the positional slice to the already trimmed
# frame and the drop would fail on the missing columns. Reload the CSV first.
df, df_2016 = pipeline_preparation(df)