# Tutkitaan avainsanojen esiintyvyyttä/erottelukykyä

In [39]:
import numpy as np
import pandas as pd
import re

## Ladataan yritysdata ja vuorovaikutusdata

In [15]:
# Company columns whose distinct values are treated as candidate feature tokens.
SELECTED_COMPANY_FEATURES = [
    'company_form_code',
    'location_municipality_code',
    'location_region_code',
    'company_status_code',
    'industry_code',
    'turnover',
    'net_profit',
    'personnel_average',
    'performer_ranking_points',
    'risk_rating_class',
]

In [16]:
# Load the company data from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
COMPANIES = pd.read_pickle("data/pandas_pickles/prod_data_proto2.pkl")

In [17]:
# Distinct business IDs present in the company data, as a plain list.
ITEM_IDS = list(COMPANIES['business_id'].unique())
# NOTE(review): the bare expression below displays the entire list; consider ITEM_IDS[:10] to keep the output short.
ITEM_IDS

['31431209',
 '32087307',
 '18601103',
 '20469041',
 '24844507',
 '01015810',
 '01119927',
 '02357927',
 '02488679',
 '05075714',
 '06460411',
 '07574322',
 '08764347',
 '09708355',
 '10117341',
 '10299456',
 '10415357',
 '14961099',
 '16149387',
 '17235578',
 '17849078',
 '20218672',
 '19495891',
 '21972159',
 '21518909',
 '20907849',
 '23612800',
 '23087997',
 '22887116',
 '22430888',
 '24876568',
 '24235437',
 '23992604',
 '24017691',
 '27232477',
 '29193547',
 '29213405',
 '28746293',
 '28758024',
 '28578007',
 '31019003',
 '31352025',
 '01029964',
 '01197028',
 '04225637',
 '05163087',
 '04979017',
 '04804735',
 '08237512',
 '08140892',
 '08727546',
 '08956429',
 '10456837',
 '14086095',
 '13980124',
 '14284040',
 '14146588',
 '28694323',
 '14676984',
 '14776563',
 '15080621',
 '15662840',
 '17406496',
 '18116781',
 '18477018',
 '19516832',
 '19384278',
 '22217884',
 '21787615',
 '22798535',
 '23089474',
 '25586028',
 '25535511',
 '25410145',
 '25082276',
 '24854318',
 '24809509',

In [18]:
# Read the raw, tab-separated interaction log; the ID columns are read as strings.
interactions_tmp = pd.read_csv(
    'data/interactions_2021_08_19.csv',
    sep='\t',
    dtype={'group_id': 'string', 'business_id': 'string', 'owner': 'string'},
)

# Drop interactions whose business ID is not found among the known items.
INTERACTIONS = interactions_tmp.loc[interactions_tmp['business_id'].isin(ITEM_IDS)]

# Row/column counts before and after the filtering.
print(interactions_tmp.shape)
print(INTERACTIONS.shape)

(548198, 3)
(525513, 3)


In [19]:
# Distinct values of each selected company column (one array per column) ...
features_tmp = [COMPANIES[column].unique() for column in SELECTED_COMPANY_FEATURES]
# ... flattened into a single list of feature tokens.
FEATURES = [token for column_values in features_tmp for token in column_values]
# NOTE(review): this is not the last expression of the cell, so its value is not displayed.
len(FEATURES)
FEATURES

['company_form_code+CO_26',
 'company_form_code+CO_16',
 'company_form_code+CO_53',
 'company_form_code+CO_2',
 'company_form_code+CO_10',
 'company_form_code+CO_13',
 'company_form_code+CO_19',
 'company_form_code+CO_5',
 'company_form_code+CO_14',
 'company_form_code+CO_6',
 'company_form_code+CO_63',
 'company_form_code+CO_51',
 'company_form_code+CO_56',
 'company_form_code+CO_60',
 'company_form_code+CO_17',
 'company_form_code+CO_0',
 'company_form_code+CO_54',
 'company_form_code+CO_18',
 'company_form_code+CO_52',
 'company_form_code+CO_50',
 'company_form_code+CO_57',
 'company_form_code+CO_31',
 'company_form_code+CO_29',
 'company_form_code+CO_41',
 'company_form_code+CO_71',
 'company_form_code+CO_39',
 'company_form_code+CO_36',
 'company_form_code+CO_90',
 'company_form_code+CO_48',
 'company_form_code+CO_44',
 'company_form_code+CO_59',
 'company_form_code+CO_55',
 'company_form_code+CO_42',
 'company_form_code+CO_40',
 'company_form_code+CO_15',
 'company_form_code+CO_7

## Luodaan yritys/feature-matriisi Gini-indeksin laskemista varten

In [150]:
def calculate_gini_for_word(word, companies=None, interactions=None, verbose=True):
    """Compute the Gini impurity of a feature token with respect to interaction.

    The companies whose feature column equals ``word`` are split into two
    classes -- those whose business ID appears in the interaction data and
    those whose business ID does not -- and the Gini impurity
    ``1 - (p_interacted**2 + p_not_interacted**2)`` of that split is returned.

    Both counts are taken over *unique* business IDs, so duplicate company
    rows cannot distort the proportions.

    Args:
        word: Feature token of the form ``"<column>+<value>"``. The text
            before the first ``+`` is used as the column name.
        companies: DataFrame with a ``business_id`` column and the feature
            column. Defaults to the notebook-level ``COMPANIES``.
        interactions: DataFrame with a ``business_id`` column. Defaults to
            the notebook-level ``INTERACTIONS``.
        verbose: When true (the default), print the number of matched
            companies followed by the number of those that have interactions.

    Returns:
        The Gini impurity as a float in ``[0, 0.5]``, or ``nan`` when no
        company carries the token (the impurity of an empty set is undefined).

    Raises:
        KeyError: If the column derived from ``word`` does not exist.
    """
    # Resolve the defaults lazily so the function also works on explicit frames.
    if companies is None:
        companies = COMPANIES
    if interactions is None:
        interactions = INTERACTIONS

    col_name = word.split('+', 1)[0]
    matched_ids = companies.loc[companies[col_name] == word, 'business_id'].unique()
    matched_docs_total = len(matched_ids)

    if matched_docs_total == 0:
        # Nothing matched: avoid dividing by zero and signal "undefined".
        if verbose:
            print(matched_docs_total)
            print(0)
        return float('nan')

    has_match = interactions['business_id'].isin(matched_ids)
    interacted_docs_count = len(interactions.loc[has_match, 'business_id'].unique())
    non_interacted_docs_count = matched_docs_total - interacted_docs_count

    share_interacted = interacted_docs_count / matched_docs_total
    share_non_interacted = non_interacted_docs_count / matched_docs_total
    gini_index = 1 - (share_interacted ** 2 + share_non_interacted ** 2)

    if verbose:
        print(matched_docs_total)
        print(interacted_docs_count)

    return gini_index

In [188]:
# Example: Gini index for the feature token 'performer_ranking_points+0.2'.
print(calculate_gini_for_word('performer_ranking_points+0.2'))

26223
16515
0.46630877860073205


In [169]:
# Show every company whose risk rating token is RED.
COMPANIES.loc[COMPANIES['risk_rating_class'] == 'risk_rating_class+RED']

Unnamed: 0,business_id,company_name,company_form_code,location_municipality_code,location_region_code,company_status_code,industry_code,turnover,net_profit,personnel_average,performer_ranking_points,risk_rating_class
158,28280569,Industry62 Oy,company_form_code+CO_16,location_municipality_code+837,location_region_code+06,company_status_code+AKT,industry_code+62,turnover+0.8,net_profit+0.2,personnel_average+0.8,performer_ranking_points+0.2,risk_rating_class+RED
263,27880714,MikkoTalot Rakentaa Oy,company_form_code+CO_16,location_municipality_code+698,location_region_code+19,company_status_code+AKT,industry_code+41,turnover+0.8,net_profit+0.8,personnel_average+0.6,performer_ranking_points+0.6,risk_rating_class+RED
268,30932079,AlterAlfa Oy,company_form_code+CO_16,location_municipality_code+NaN,location_region_code+NaN,company_status_code+AKT,industry_code+81,turnover+0.8,net_profit+0.2,personnel_average+NaN,performer_ranking_points+0.2,risk_rating_class+RED
373,27687199,Wasco Coatings Finland Oy,company_form_code+CO_16,location_municipality_code+285,location_region_code+08,company_status_code+AKT,industry_code+32,turnover+top,net_profit+0.98,personnel_average+0.98,performer_ranking_points+0.2,risk_rating_class+RED
417,24493815,Eira Estetica Oy,company_form_code+CO_16,location_municipality_code+091,location_region_code+01,company_status_code+AKT,industry_code+96,turnover+0.4,net_profit+0.2,personnel_average+0.4,performer_ranking_points+0.2,risk_rating_class+RED
...,...,...,...,...,...,...,...,...,...,...,...,...
1337137,29626963,Lapin OmaTaxi Oy,company_form_code+CO_16,location_municipality_code+698,location_region_code+19,company_status_code+AKT,industry_code+49,turnover+0.4,net_profit+0.4,personnel_average+NaN,performer_ranking_points+0.2,risk_rating_class+RED
1337219,21443986,Remake EkoDesign Oy,company_form_code+CO_16,location_municipality_code+245,location_region_code+01,company_status_code+AKT,industry_code+13,turnover+0.2,net_profit+0.2,personnel_average+NaN,performer_ranking_points+0.2,risk_rating_class+RED
1337224,28779511,Navettanikkarit Oy,company_form_code+CO_16,location_municipality_code+481,location_region_code+02,company_status_code+AKT,industry_code+43,turnover+0.2,net_profit+0.2,personnel_average+NaN,performer_ranking_points+0.2,risk_rating_class+RED
1337822,02074521,Enon Höyläämö Oy,company_form_code+CO_16,location_municipality_code+NaN,location_region_code+NaN,company_status_code+SELTILA,industry_code+16,turnover+0.8,net_profit+0.8,personnel_average+0.6,performer_ranking_points+0.2,risk_rating_class+RED


In [44]:
# Check how re.escape rewrites a feature token: '+' and '.' are regex metacharacters and get backslash-escaped.
re.escape('turnover+0.99')

'turnover\\+0\\.99'

- Tätähän voi ehkä oikeasti käyttää "sanojen" valintaan!
    - Esim. nähdään, että merkitystä on sillä, kuuluuko yritys ylimpiin persentiileihin
    - Lisäksi merkitystä on sillä, löytyykö tietoa ylipäätään -> NaN-luokat ovat lähes järjestään merkittävimpien sanojen joukossa
    
- Gini-indeksissä on kuitenkin vaarana, että riittää, että yritys on yhdessäkin ryhmässä, jolloin se nousee vuorovaikutettujen kohteiden joukkoon -> tietyt poiminnat voivat nostaa selvästi joidenkin sanojen arvoa
    - Kannattaa perehtyä myös johonkin muuhun metriikkaan