## Import Data and Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the semicolon-separated export and treat the strings "-" and "0" as missing
path = "Data/UTF_Data4.csv"
df = (
    pd.read_csv(path, sep=';')
    .replace("-", np.nan)
    .replace("0", np.nan)
)
df.head(3)

Unnamed: 0,Company name,Org. no,Last annual report,Currency,"Num. of employees, stock company 2020 (pcs)","Num. of employees, stock company 2019 (pcs)","Num. of employees, stock company 2018 (pcs)","Num. of employees, stock company 2017 (pcs)","Num. of employees, stock company 2016 (pcs)","Num. of employees, stock company 2015 (pcs)",...,Debt/equity ratio 2019 (%),Debt/equity ratio 2018 (%),Debt/equity ratio 2017 (%),Debt/equity ratio 2016 (%),Debt/equity ratio 2015 (%),Debt/equity ratio 2014 (%),Debt/equity ratio 2013 (%),Debt/equity ratio 2012 (%),Debt/equity ratio 2011 (%),Debt/equity ratio 2010 (%)
0,Ekman Invest Holding AB,5567121602,201912,SEK,,301,,283,279,43,...,672,691,523,541,515,764,614,571,597,494
1,Axfood Snabbgross AB,5560003575,201912,SEK,,411,401.0,384,365,335,...,844,1206,121,1542,1566,1161,1054,1219,1316,1534
2,LWW Group AB,5565295333,201912,SEK,,347,361.0,337,332,340,...,191,177,177,11,87,112,105,102,29,33


## Data Cleansing

### Get overview

In [3]:
# Overview: dimensions first, then summary statistics for every column
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)

# include="all" makes describe() summarise non-numeric columns as well
df.describe(include="all")

Shape of the dataset: (908, 81)


Unnamed: 0,Company name,Org. no,Last annual report,Currency,"Num. of employees, stock company 2020 (pcs)","Num. of employees, stock company 2019 (pcs)","Num. of employees, stock company 2018 (pcs)","Num. of employees, stock company 2017 (pcs)","Num. of employees, stock company 2016 (pcs)","Num. of employees, stock company 2015 (pcs)",...,Debt/equity ratio 2019 (%),Debt/equity ratio 2018 (%),Debt/equity ratio 2017 (%),Debt/equity ratio 2016 (%),Debt/equity ratio 2015 (%),Debt/equity ratio 2014 (%),Debt/equity ratio 2013 (%),Debt/equity ratio 2012 (%),Debt/equity ratio 2011 (%),Debt/equity ratio 2010 (%)
count,908,908.0,907.0,908,5.0,826.0,853.0,853.0,833.0,808.0,...,587.0,642.0,628.0,612.0,605.0,588.0,568.0,557.0,496.0,478.0
unique,908,,26.0,5,1.0,234.0,283.0,313.0,308.0,314.0,...,401.0,425.0,423.0,401.0,387.0,389.0,381.0,380.0,362.0,352.0
top,Apple Retail Sweden AB,,201912.0,SEK,200.0,200.0,200.0,200.0,200.0,200.0,...,109.0,89.0,11.0,185.0,87.0,107.0,153.0,113.0,6.0,191.0
freq,1,,702.0,593,5.0,250.0,243.0,245.0,243.0,232.0,...,6.0,6.0,5.0,6.0,7.0,6.0,6.0,5.0,5.0,5.0
mean,,3710139000.0,,,,,,,,,...,,,,,,,,,,
std,,2552399000.0,,,,,,,,,...,,,,,,,,,,
min,,10089750.0,,,,,,,,,...,,,,,,,,,,
25%,,74897920.0,,,,,,,,,...,,,,,,,,,,
50%,,5562348000.0,,,,,,,,,...,,,,,,,,,,
75%,,5566133000.0,,,,,,,,,...,,,,,,,,,,


### Remove missing values
1. List/plot the number of missing values for each column
2. Remove columns with many missing values
3. Then remove all rows with missing values

When done, use .describe() again and see the changes

In [4]:
def removeRareColumns(df, threshold=0.80):
    """Drop columns whose share of non-missing values is below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a new frame is returned.
    threshold : float, default 0.80
        Minimum fraction (0-1) of non-NaN rows a column must have to be kept.
        A column whose fill rate equals ``threshold`` exactly is kept.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the sparse columns. The name of every removed column
        is printed, in original column order.
    """
    # Fraction of non-missing values per column, computed in one vectorised pass.
    fillRate = df.count() / len(df)
    rareMask = (fillRate < threshold).to_numpy()

    for column in fillRate.index[rareMask]:
        print(column)

    # Select the surviving columns by position in a single step: this avoids
    # copying the frame once per dropped column and stays correct even when
    # column labels are duplicated.
    return df.loc[:, ~rareMask]

# Keep only the columns that are at least 80 % populated
df = removeRareColumns(df, threshold=0.80)

Num. of employees, stock company 2020 (pcs)
Num. of employees, stock company 2012 (pcs)
Num. of employees, stock company 2011 (pcs)
Num. of employees, stock company 2010 (pcs)
EBITDA 2020 (k)
EBITDA 2012 (k)
EBITDA 2011 (k)
EBITDA 2010 (k)
Gross profit/loss 2020 (k)
Gross profit/loss 2019 (k)
Gross profit/loss 2018 (k)
Gross profit/loss 2017 (k)
Gross profit/loss 2016 (k)
Gross profit/loss 2015 (k)
Gross profit/loss 2014 (k)
Gross profit/loss 2013 (k)
Gross profit/loss 2012 (k)
Gross profit/loss 2011 (k)
Gross profit/loss 2010 (k)
R & D costs 2020 (k)
R & D costs 2019 (k)
R & D costs 2018 (k)
R & D costs 2017 (k)
R & D costs 2016 (k)
R & D costs 2015 (k)
R & D costs 2014 (k)
R & D costs 2013 (k)
R & D costs 2012 (k)
R & D costs 2011 (k)
R & D costs 2010 (k)
Total assets 2020 (k)
Total assets 2012 (k)
Total assets 2011 (k)
Total assets 2010 (k)
Quick ratio 2020 (%)
Quick ratio 2012 (%)
Quick ratio 2011 (%)
Quick ratio 2010 (%)
Debt/equity ratio 2020 (%)
Debt/equity ratio 2019 (%)
Debt/e

In [5]:
# dropna() returns a new frame and leaves `df` untouched, so assign the result;
# otherwise the rows with missing values remain in `df` for every later step.
df = df.dropna()
df

Unnamed: 0,Company name,Org. no,Last annual report,Currency,"Num. of employees, stock company 2019 (pcs)","Num. of employees, stock company 2018 (pcs)","Num. of employees, stock company 2017 (pcs)","Num. of employees, stock company 2016 (pcs)","Num. of employees, stock company 2015 (pcs)","Num. of employees, stock company 2014 (pcs)",...,Total assets 2015 (k),Total assets 2014 (k),Total assets 2013 (k),Quick ratio 2019 (%),Quick ratio 2018 (%),Quick ratio 2017 (%),Quick ratio 2016 (%),Quick ratio 2015 (%),Quick ratio 2014 (%),Quick ratio 2013 (%)
1,Axfood Snabbgross AB,5560003575,201912,SEK,411,401,384,365,335,316,...,331 720,288 487,236 497,276,347,306,429,491,514,355
2,LWW Group AB,5565295333,201912,SEK,347,361,337,332,340,346,...,777 425,868 339,816 246,81,863,88,110,1218,1044,975
3,Atteviks Bil Aktiebolag,5562130400,201912,SEK,453,443,434,419,407,344,...,894 424,723 909,606 718,1245,1349,899,1011,773,83,878
5,GIPEJO Förvaltnings Aktiebolag,5563246312,201912,SEK,272,245,203,193,172,154,...,736 550,542 758,408 265,409,415,379,428,428,50,564
6,Bröderna Börjessons Bil Aktiebolag,5561762864,201912,SEK,399,399,275,234,230,222,...,749 059,578 237,463 515,735,677,712,785,1108,956,874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
901,EGONS A/S,33053835,201912,DKK,200,200,200,200,200,100,...,78 335,64 346,75 321,135,126,129,12,117,084,075
902,DEICHMANN-SKO ApS,27278388,201912,DKK,200,200,200,200,200,200,...,78 128,82 056,94 092,448,467,264,259,287,223,185
903,G. TSCHERNING A/S,26378443,202004,DKK,200,200,200,200,200,200,...,123 125,124 368,96 692,129,123,115,117,09,099,108
906,SAIPEM DRILLING NORWAY AS,998277418,201912,EUR,224,190,230,223,225,229,...,814 708,841 531,1 040 811,011,23,983,234,123,034,033


### Remove outliers 
Use winsorization:

`df = df.clip(df.quantile(0.05), df.quantile(0.95), axis=1)`

### One-hot encode the columns with text or categorical values
Probably not necessary for our case

### If needed, calculate predictor column 

### Make some interesting plots (can be skipped)

# Modeling