# Ison yritysdatamassan käsittelyä käyttökelpoisemmaksi

- Kaikki yritykset
- Viimeisimmän tilikauden tiedot, jos löytyy
- Lakanneet poistettu etukäteen
- Konsernit poistetaan tässä käsittelyssä

In [22]:
import numpy as np
import pandas as pd

WORKING_DIRECTORY = '/mnt/d/git/masters-thesis-code/jupyter/code/'

## Valittavat sarakkeet

In [23]:
SELECTED_COMPANY_FEATURES = ['company_form_code', 'location_municipality_code', 
                             'location_region_code', 'company_status_code', 'industry_code', 'turnover', 
                             'net_profit', 'personnel_average', 'performer_ranking_points', 'risk_rating_class']

## Ladataan data pandas-dataframeen

In [None]:
COMPANIES = pd \
        .read_csv(WORKING_DIRECTORY + 'data/prod_data_companies_more_data_2021_09_16.csv',
                  delimiter='\t',
                  na_values='(null)',
                  dtype={
                      'business_id': str,
                      'business_id_start_date': str,
                      'company_name': str,
                      'email_address': str,
                      'industry_code': str,
                      'language': str,
                      'company_form': str,
                      'company_form_code': str,
                      'domicile_code': str,
                      'region_code': str,
                      'location_municipality_code': str,
                      'location_region_code': str,
                      'company_status_code': str,
                      'balance_industry_code': str,
                      'personnel_class': str,
                      'turnover_class': str,
                      'statement_date': str,
                      'statement_length': float,
                      'target_year': str,
                      'turnover': float,
                      'raw_materials_use': float,
                      'current_ratio': float,
                      'personnel_expenses': float,
                      'investment_expenses_cash_flow': float,
                      'ebitda': float,
                      'operating_profit_per_turnover': float,
                      'ebit': float,
                      'turnover_growth_percent': float,
                      'ebit_per_turnover': float,
                      'short_term_sales_receivable': float,
                      'net_profit': float,
                      'net_profit_per_turnover': float,
                      'aggregate_equity': float,
                      'return_on_equity_percent': float,
                      'equity_ratio': float,
                      'long_term_sales_receivable': float,
                      'quick_ratio': float,
                      'buildings': float,
                      'return_on_investment_percent': float,
                      'total_assets': float,
                      'external_services': float,
                      'aggregate_current_assets': float,
                      'personnel_average': float,
                      'concern': str,
                      'bulk': str,
                      'performer_ranking_points': float,
                      'risk_rating_class': str,
                      'latest': str
                  }
                  )

COMPANIES.head()

## Käsitellään konsernit (prefiksoi)

In [25]:
def handle_concerns(row):
    if (row.concern == 'true'):
        return 'K-' + row.business_id
    return row.business_id

COMPANIES['business_id'] = COMPANIES.apply(handle_concerns, axis=1)



## Tsekataan muokatut yritystunnisteet

In [26]:
COMPANIES[COMPANIES['company_name'] == 'Leipomo Rosten Oy']

Unnamed: 0,business_id,business_id_start_date,company_name,email_address,industry_code,language,company_form,company_form_code,domicile_code,region_code,...,return_on_investment_percent,total_assets,external_services,aggregate_current_assets,personnel_average,concern,bulk,performer_ranking_points,risk_rating_class,latest
1333142,01370820,1978-03-15,Leipomo Rosten Oy,,10710,CL_1,Osakeyhtiö,CO_16,853,RE_02,...,8.2,13753000.0,2971000.0,1157000.0,117.0,False,False,50.0,GREEN_A_PLUS,True
1333143,K-01370820,1978-03-15,Leipomo Rosten Oy,,10710,CL_1,Osakeyhtiö,CO_16,853,RE_02,...,14.7,15891000.0,3732000.0,1741000.0,211.0,True,False,45.0,GREEN_A_PLUS,True


In [None]:
COMPANIES

## Valitaan halutut sarakkeet

In [28]:
COMPANIES = COMPANIES[['business_id', 'company_name'] + SELECTED_COMPANY_FEATURES]
COMPANIES.head()

Unnamed: 0,business_id,company_name,company_form_code,location_municipality_code,location_region_code,company_status_code,industry_code,turnover,net_profit,personnel_average,performer_ranking_points,risk_rating_class
0,31431209,Tmi Onerva Miettunen,CO_26,91.0,1.0,AKT,43341.0,,,,,
1,32087307,Koti Vaihtoon LKV Oy,CO_16,91.0,1.0,AKT,68310.0,,,,,
2,18601103,Teollisuuden Monialapalvelu T:mi Oksanen,CO_26,,,AKT,68209.0,,,,,
3,20469041,POLYBEAT,CO_53,,,AKT,74909.0,,,,,
4,24844507,SAHOJEN PUU,CO_26,,,AKT,,,,,,


## Käsitellään toimialakoodit

In [29]:
def transform_industry(row):
    industry_code = row.industry_code
    if (pd.isnull(industry_code)):
        return 'NaN'
    return industry_code[0:2]

COMPANIES['industry_code'] = COMPANIES.apply(transform_industry, axis=1)
COMPANIES['industry_code']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COMPANIES['industry_code'] = COMPANIES.apply(transform_industry, axis=1)


0           43
1           68
2           68
3           74
4          NaN
          ... 
1337863     70
1337864     78
1337865     42
1337866     62
1337867     95
Name: industry_code, Length: 1337868, dtype: object

## Käsitellään riskiluokitus

In [None]:
def transform_risk_rating(row):
    risk_rating = row.risk_rating_class
    if (pd.isnull(risk_rating)):
        return 'NaN'
    return risk_rating.split('_')[0]

COMPANIES['risk_rating_class'] = COMPANIES.apply(transform_risk_rating, axis=1)
COMPANIES

## Muutetaan valittujen sarakkeiden tiedot avainsanoiksi

In [31]:
def transform_str_to_keyword(val, col_name):
    if (pd.isnull(val)):
        return col_name + '+' + 'NaN'
    return col_name + '+' + val

boundaries = COMPANIES.quantile([0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.98])
print(boundaries)

def transform_num_to_keyword(val, col_name):
    if (pd.isnull(val)):
        return col_name + '+' + 'NaN'
    
    col_boundaries = boundaries[col_name]
    for index, item in col_boundaries.iteritems():
        if (val < item):
            return col_name + '+' + str(index)
        
    return col_name + '+' + 'top'

        turnover  net_profit  personnel_average  performer_ranking_points
0.20     22000.0     -4000.0               1.00                      31.0
0.40     87000.0         0.0               2.00                      46.0
0.60    234000.0      8000.0               5.00                      56.0
0.80    825000.0     43000.0              16.00                      70.0
0.90   2390000.0    125000.0              41.00                      77.0
0.95   6721200.0    305000.0              95.95                      82.0
0.98  23293680.0    950780.0             294.38                      86.0


In [None]:
for col_name in SELECTED_COMPANY_FEATURES:
    dtype = COMPANIES[col_name].dtypes
    if dtype == 'object':
        COMPANIES[col_name] = COMPANIES[col_name].apply(lambda x: transform_str_to_keyword(x, col_name))
    if dtype == 'float64':
        COMPANIES[col_name] = COMPANIES[col_name].apply(lambda x: transform_num_to_keyword(x, col_name))
        
COMPANIES

## Pikkelöi käsitelty yritysdata

In [33]:
COMPANIES.to_pickle(WORKING_DIRECTORY + "data/pandas_pickles/prod_data_proto2.pkl")

## Heränneitä kysymyksiä

1. Minkälaiset rajat tunnuslukujen diskretisoinnissa parhaat?
2. Mitkä avainsanat merkittäviä?