# Ison yritysdatamassan käsittelyä käyttökelpoisemmaksi

In [1]:
import numpy as np
import pandas as pd

## Valittavat sarakkeet

In [2]:
# Feature columns kept for each company (in addition to business_id and
# company_name) when the frame is narrowed down later in the notebook.
SELECTED_COMPANY_FEATURES = [
    'company_form_code',
    'location_municipality_code',
    'location_region_code',
    'company_status_code',
    'industry_code',
    'turnover',
    'net_profit',
    'personnel_average',
    'performer_ranking_points',
    'risk_rating_class',
]

## Ladataan data pandas-dataframeen

In [3]:
# Load the tab-separated company export. The literal "(null)" is treated as a
# missing value. Column dtypes are pinned explicitly: identifier, code, date and
# flag columns are read as text (presumably so codes such as '091' keep their
# leading zeros -- TODO confirm), and the financial figures are read as floats.
COMPANIES = pd.read_csv(
    'data/prod_data_companies_more_data_2021_09_16.csv',
    delimiter='\t',
    na_values='(null)',
    dtype={
        # Columns read as strings.
        **dict.fromkeys(
            [
                'business_id',
                'business_id_start_date',
                'company_name',
                'email_address',
                'industry_code',
                'language',
                'company_form',
                'company_form_code',
                'domicile_code',
                'region_code',
                'location_municipality_code',
                'location_region_code',
                'company_status_code',
                'balance_industry_code',
                'personnel_class',
                'turnover_class',
                'statement_date',
                'target_year',
                'concern',
                'bulk',
                'risk_rating_class',
                'latest',
            ],
            str,
        ),
        # Columns read as floats.
        **dict.fromkeys(
            [
                'statement_length',
                'turnover',
                'raw_materials_use',
                'current_ratio',
                'personnel_expenses',
                'investment_expenses_cash_flow',
                'ebitda',
                'operating_profit_per_turnover',
                'ebit',
                'turnover_growth_percent',
                'ebit_per_turnover',
                'short_term_sales_receivable',
                'net_profit',
                'net_profit_per_turnover',
                'aggregate_equity',
                'return_on_equity_percent',
                'equity_ratio',
                'long_term_sales_receivable',
                'quick_ratio',
                'buildings',
                'return_on_investment_percent',
                'total_assets',
                'external_services',
                'aggregate_current_assets',
                'personnel_average',
                'performer_ranking_points',
            ],
            float,
        ),
    },
)

print(COMPANIES.head())

  business_id business_id_start_date  \
0    31431209             2020-06-22   
1    32087307             2021-05-04   
2    18601103             2003-10-24   
3    20469041             2006-07-04   
4    24844507             2012-06-18   

                               company_name                    email_address  \
0                      Tmi Onerva Miettunen       onerva.adalmiina@gmail.com   
1                      Koti Vaihtoon LKV Oy  matti.leskinen@kotivaihtoon.com   
2  Teollisuuden Monialapalvelu T:mi Oksanen                              NaN   
3                                  POLYBEAT                              NaN   
4                               SAHOJEN PUU            kalle.kantola@hasa.fi   

  industry_code language                      company_form company_form_code  \
0         43341     CL_1  Yksityinen elinkeinonharjoittaja             CO_26   
1         68310     CL_1                        Osakeyhtiö             CO_16   
2         68209     CL_1  Yksityinen e

## Käsitellään konsernit (poistetaan)

In [4]:
# Drop the rows whose 'concern' flag is the string 'true'; rows with any other
# value, including missing ones, are kept.
COMPANIES = COMPANIES.loc[COMPANIES['concern'].ne('true')]

print(COMPANIES.head())

# Mark rows that repeat an earlier business_id; the cell output lists only the
# flagged rows, so an empty Series means there are no duplicates.
duplicates = COMPANIES.duplicated('business_id')
duplicates[duplicates]

  business_id business_id_start_date  \
0    31431209             2020-06-22   
1    32087307             2021-05-04   
2    18601103             2003-10-24   
3    20469041             2006-07-04   
4    24844507             2012-06-18   

                               company_name                    email_address  \
0                      Tmi Onerva Miettunen       onerva.adalmiina@gmail.com   
1                      Koti Vaihtoon LKV Oy  matti.leskinen@kotivaihtoon.com   
2  Teollisuuden Monialapalvelu T:mi Oksanen                              NaN   
3                                  POLYBEAT                              NaN   
4                               SAHOJEN PUU            kalle.kantola@hasa.fi   

  industry_code language                      company_form company_form_code  \
0         43341     CL_1  Yksityinen elinkeinonharjoittaja             CO_26   
1         68310     CL_1                        Osakeyhtiö             CO_16   
2         68209     CL_1  Yksityinen e

Series([], dtype: bool)

## Valitaan halutut sarakkeet

In [5]:
# Narrow the frame to the two identifying columns plus the selected features.
# .copy() makes the result an independent frame: without it, the column
# assignments in the following cells write into a slice of the filtered frame
# and pandas emits SettingWithCopyWarning.
COMPANIES = COMPANIES[['business_id', 'company_name'] + SELECTED_COMPANY_FEATURES].copy()
print(COMPANIES.head())

  business_id                              company_name company_form_code  \
0    31431209                      Tmi Onerva Miettunen             CO_26   
1    32087307                      Koti Vaihtoon LKV Oy             CO_16   
2    18601103  Teollisuuden Monialapalvelu T:mi Oksanen             CO_26   
3    20469041                                  POLYBEAT             CO_53   
4    24844507                               SAHOJEN PUU             CO_26   

  location_municipality_code location_region_code company_status_code  \
0                        091                   01                 AKT   
1                        091                   01                 AKT   
2                        NaN                  NaN                 AKT   
3                        NaN                  NaN                 AKT   
4                        NaN                  NaN                 AKT   

  industry_code  turnover  net_profit  personnel_average  \
0         43341       NaN         NaN 

## Käsitellään toimialakoodit

In [7]:
def transform_industry(val):
    """Shorten an industry code to its first two characters.

    A missing value (as judged by ``pd.isnull``) is returned as the
    string ``'NaN'`` instead.
    """
    return 'NaN' if pd.isnull(val) else val[:2]

# Replace every industry code with its shortened form, element by element.
COMPANIES['industry_code'] = COMPANIES['industry_code'].map(transform_industry)

## Käsitellään riskiluokitus

In [8]:
def transform_risk_rating(val):
    """Keep only the part of a risk rating that precedes the first underscore.

    A value without an underscore is returned unchanged; a missing value
    (as judged by ``pd.isnull``) is returned as the string ``'NaN'``.
    """
    if not pd.isnull(val):
        head, _, _ = val.partition('_')
        return head
    return 'NaN'

# Replace every risk rating with its leading part, element by element.
COMPANIES['risk_rating_class'] = COMPANIES['risk_rating_class'].map(transform_risk_rating)

## Muutetaan valittujen sarakkeiden tiedot avainsanoiksi

In [9]:
def transform_str_to_keyword(val, col_name):
    """Build the keyword ``"<col_name>_<val>"`` for a string value.

    A missing value (as judged by ``pd.isnull``) yields ``"<col_name>_NaN"``.
    """
    suffix = 'NaN' if pd.isnull(val) else val
    return col_name + '_' + suffix

# Per-column quantile values used as bin edges when the numeric columns are
# turned into keywords. numeric_only=True is passed explicitly: from pandas 2.0
# the default is False, and the string columns in COMPANIES would then make
# quantile() raise instead of being skipped.
boundaries = COMPANIES.quantile([0.2, 0.4, 0.6, 0.8, 0.9, 0.98, 0.99], numeric_only=True)
print(boundaries)

def transform_num_to_keyword(val, col_name):
    """Build the keyword for a numeric value from the quantile bin it falls in.

    The bin edges are read from the module-level ``boundaries`` frame, whose
    index holds the quantile levels in ascending order and whose columns are
    the numeric column names.

    Returns ``"<col_name>_NaN"`` for a missing value, ``"<col_name>_<q>"``
    where ``q`` is the first quantile level whose edge is strictly greater
    than ``val``, and ``"<col_name>_top"`` when ``val`` is at or above every
    edge.
    """
    if pd.isnull(val):
        return col_name + '_' + 'NaN'

    # Series.items() replaces Series.iteritems(), which was removed in
    # pandas 2.0; it yields the same (index, value) pairs.
    for quantile, upper_edge in boundaries[col_name].items():
        if val < upper_edge:
            return col_name + '_' + str(quantile)

    return col_name + '_' + 'top'

        turnover  net_profit  personnel_average  performer_ranking_points
0.20     21000.0     -4000.0                1.0                      31.0
0.40     84000.0         0.0                2.0                      46.0
0.60    221000.0      7000.0                5.0                      57.0
0.80    736000.0     40000.0               13.0                      70.0
0.90   1913000.0    112960.0               29.0                      77.0
0.98  13937520.0    699000.0              144.0                      86.0
0.99  30044720.0   1515680.0              297.0                      88.0


In [10]:
# Rewrite each selected feature column as keywords: object (string) columns go
# through transform_str_to_keyword, float64 columns through
# transform_num_to_keyword. Columns of any other dtype are left untouched.
for col_name in SELECTED_COMPANY_FEATURES:
    column = COMPANIES[col_name]
    if column.dtype == 'object':
        COMPANIES[col_name] = column.apply(transform_str_to_keyword, args=(col_name,))
    elif column.dtype == 'float64':
        COMPANIES[col_name] = column.apply(transform_num_to_keyword, args=(col_name,))

print(COMPANIES.head())

  business_id                              company_name  \
0    31431209                      Tmi Onerva Miettunen   
1    32087307                      Koti Vaihtoon LKV Oy   
2    18601103  Teollisuuden Monialapalvelu T:mi Oksanen   
3    20469041                                  POLYBEAT   
4    24844507                               SAHOJEN PUU   

         company_form_code      location_municipality_code  \
0  company_form_code_CO_26  location_municipality_code_091   
1  company_form_code_CO_16  location_municipality_code_091   
2  company_form_code_CO_26  location_municipality_code_NaN   
3  company_form_code_CO_53  location_municipality_code_NaN   
4  company_form_code_CO_26  location_municipality_code_NaN   

       location_region_code      company_status_code      industry_code  \
0   location_region_code_01  company_status_code_AKT   industry_code_43   
1   location_region_code_01  company_status_code_AKT   industry_code_68   
2  location_region_code_NaN  company_status_cod

## Pikkelöi käsitelty yritysdata

In [None]:
# Serialise the processed frame to a pickle file at this relative path.
# NOTE(review): the target directory presumably has to exist already -- confirm.
COMPANIES.to_pickle("data/pandas_pickles/prod_data_proto2.pkl")

## Heränneitä kysymyksiä

1. Minkälaiset rajat ovat parhaat tunnuslukujen diskretisoinnissa?