# Tutkitaan avainsanojen esiintyvyyttä/erottelukykyä

In [39]:
import numpy as np
import pandas as pd
import re

## Ladataan yritysdata ja vuorovaikutusdata

In [15]:
# Columns of the company table whose distinct values are scored as "words".
SELECTED_COMPANY_FEATURES = [
    'company_form_code',
    'location_municipality_code',
    'location_region_code',
    'company_status_code',
    'industry_code',
    'turnover',
    'net_profit',
    'personnel_average',
    'performer_ranking_points',
    'risk_rating_class',
]

In [16]:
# Load the company (item) table from a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source — confirm where the pickle is produced.
COMPANIES = pd.read_pickle("data/pandas_pickles/prod_data_proto2.pkl")

In [17]:
# Distinct business IDs present in the company table; used further down to
# restrict the interaction data to known companies.
ITEM_IDS = list(COMPANIES['business_id'].unique())
ITEM_IDS  # last expression of the cell: displays the ID list in the notebook

['31431209',
 '32087307',
 '18601103',
 '20469041',
 '24844507',
 '01015810',
 '01119927',
 '02357927',
 '02488679',
 '05075714',
 '06460411',
 '07574322',
 '08764347',
 '09708355',
 '10117341',
 '10299456',
 '10415357',
 '14961099',
 '16149387',
 '17235578',
 '17849078',
 '20218672',
 '19495891',
 '21972159',
 '21518909',
 '20907849',
 '23612800',
 '23087997',
 '22887116',
 '22430888',
 '24876568',
 '24235437',
 '23992604',
 '24017691',
 '27232477',
 '29193547',
 '29213405',
 '28746293',
 '28758024',
 '28578007',
 '31019003',
 '31352025',
 '01029964',
 '01197028',
 '04225637',
 '05163087',
 '04979017',
 '04804735',
 '08237512',
 '08140892',
 '08727546',
 '08956429',
 '10456837',
 '14086095',
 '13980124',
 '14284040',
 '14146588',
 '28694323',
 '14676984',
 '14776563',
 '15080621',
 '15662840',
 '17406496',
 '18116781',
 '18477018',
 '19516832',
 '19384278',
 '22217884',
 '21787615',
 '22798535',
 '23089474',
 '25586028',
 '25535511',
 '25410145',
 '25082276',
 '24854318',
 '24809509',

In [18]:
# Read the tab-separated interaction export. The identifier columns are
# forced to the pandas string dtype (presumably so that IDs with leading
# zeros are not parsed as numbers — verify against the CSV).
interactions_tmp = pd.read_csv(
    'data/interactions_2021_08_19.csv',
    delimiter='\t',
    dtype={'group_id': 'string', 'business_id': 'string', 'owner': 'string'},
)

# Drop interactions whose business ID is not found among the items
INTERACTIONS = interactions_tmp.loc[interactions_tmp['business_id'].isin(ITEM_IDS)]

# Row/column counts before and after the filtering
print(interactions_tmp.shape)
print(INTERACTIONS.shape)

(548198, 3)
(525513, 3)


In [19]:
# One array of distinct values per selected company column ...
features_tmp = [
    COMPANIES[column].unique()
    for column in SELECTED_COMPANY_FEATURES
]
# ... flattened into a single list of feature "words".
FEATURES = [
    value
    for column_values in features_tmp
    for value in column_values
]
len(FEATURES)
FEATURES

['company_form_code+CO_26',
 'company_form_code+CO_16',
 'company_form_code+CO_53',
 'company_form_code+CO_2',
 'company_form_code+CO_10',
 'company_form_code+CO_13',
 'company_form_code+CO_19',
 'company_form_code+CO_5',
 'company_form_code+CO_14',
 'company_form_code+CO_6',
 'company_form_code+CO_63',
 'company_form_code+CO_51',
 'company_form_code+CO_56',
 'company_form_code+CO_60',
 'company_form_code+CO_17',
 'company_form_code+CO_0',
 'company_form_code+CO_54',
 'company_form_code+CO_18',
 'company_form_code+CO_52',
 'company_form_code+CO_50',
 'company_form_code+CO_57',
 'company_form_code+CO_31',
 'company_form_code+CO_29',
 'company_form_code+CO_41',
 'company_form_code+CO_71',
 'company_form_code+CO_39',
 'company_form_code+CO_36',
 'company_form_code+CO_90',
 'company_form_code+CO_48',
 'company_form_code+CO_44',
 'company_form_code+CO_59',
 'company_form_code+CO_55',
 'company_form_code+CO_42',
 'company_form_code+CO_40',
 'company_form_code+CO_15',
 'company_form_code+CO_7

## Luodaan yritys/feature-matriisi Gini-indeksin laskemista varten

In [199]:
def calculate_gini_for_word(word):
    """Return the Gini impurity of "interacted / not interacted" for one word.

    ``word`` is expected in the form ``'<column>+<value>'``: the text before
    the first ``'+'`` names the column of the module-level ``COMPANIES``
    frame that is compared against the full ``word``. The matching companies
    are split into those whose ``business_id`` occurs in the module-level
    ``INTERACTIONS`` frame and those that do not, and the impurity
    ``1 - (p_interacted**2 + p_not_interacted**2)`` of that split is returned.

    As a diagnostic side effect the number of matched companies and the
    number of interacted companies are printed.

    Args:
        word: Feature value to score, e.g. ``'risk_rating_class+RED'``.

    Returns:
        The Gini impurity as a float in ``[0, 0.5]``, or ``nan`` when no
        company matches ``word`` (the impurity of an empty set is undefined).

    Raises:
        KeyError: If the column named by ``word`` is not in ``COMPANIES``.
    """
    col_name = word.split('+')[0]
    matches_df = COMPANIES[COMPANIES[col_name] == word]

    # Count distinct business IDs rather than rows: the interacted count
    # below is also a count of distinct IDs, so both sides of the split use
    # the same unit even if COMPANIES repeats a business ID.
    match_bids = matches_df['business_id'].unique()
    matched_docs_total = len(match_bids)
    print(matched_docs_total)

    # Nothing matched: avoid dividing by zero.
    if matched_docs_total == 0:
        return float('nan')

    matching_interactions_df = INTERACTIONS[INTERACTIONS['business_id'].isin(match_bids)]

    interacted_docs_count = len(matching_interactions_df['business_id'].unique())
    non_interacted_docs_count = matched_docs_total - interacted_docs_count

    interacted_share = interacted_docs_count / matched_docs_total
    non_interacted_share = non_interacted_docs_count / matched_docs_total
    gini_index = 1 - (interacted_share ** 2 + non_interacted_share ** 2)

    print(interacted_docs_count)

    return gini_index

In [207]:
# Spot check: score the "municipality code missing" word. The function itself
# also prints the matched and interacted counts before the score.
print(calculate_gini_for_word('location_municipality_code+NaN'))

888180
39115
0.08420003849323976


In [None]:
# Inspect the companies whose risk rating word is 'risk_rating_class+RED'.
COMPANIES[COMPANIES['risk_rating_class'] == 'risk_rating_class+RED']

In [192]:
# Score every feature word; each entry is a (word, gini score) pair.
GINI_SCORES = [
    (word, calculate_gini_for_word(word))
    for word in FEATURES
]

In [194]:
# Tabulate the (word, score) pairs so they can be sorted and inspected.
GINI_DF = pd.DataFrame(GINI_SCORES, columns=['feature', 'gini_score'])

In [197]:
# Temporarily lift the pandas display limits so the full table is printed,
# ordered from the lowest Gini score to the highest.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(GINI_DF.sort_values(by='gini_score'))

                            feature  gini_score
58          company_form_code+CO_20    0.000000
54          company_form_code+CO_35    0.000000
43          company_form_code+CO_38    0.000000
59           company_form_code+CO_8    0.000000
53           company_form_code+CO_1    0.000000
52           company_form_code+CO_9    0.000000
515          personnel_average+0.99    0.000000
51          company_form_code+CO_28    0.000000
15           company_form_code+CO_0    0.000000
60           company_form_code+CO_7    0.000000
27          company_form_code+CO_90    0.000000
483                industry_code+97    0.000000
48          company_form_code+CO_21    0.000000
61            company_form_code+NaN    0.000000
516           personnel_average+top    0.000000
489                industry_code+05    0.000000
63          company_form_code+CO_61    0.000000
64          company_form_code+CO_23    0.000000
488                industry_code+12    0.000000
487                industry_code+06    0

- Tätähän voi ehkä oikeesti käyttää "sanojen" valintaan!
    - Esim. nähdään, että se "kiinnostaa", kuuluuko yritys ylimpiin persentiileihin
    - Lisäksi se "kiinnostaa", löytyykö jotain tietoa -> NaN-luokat ovat lähes järjestään merkittävimpien sanojen joukossa
    
- Gini-indeksissä on kyllä vaarana se, että riittää, että jokin yritys on yhdessä ryhmässä, ja se nousee vuorovaikutettujen kohteiden joukkoon -> tietyt poiminnat voivat nostaa selvästi joidenkin sanojen arvoa
    - Kannattaa perehtyä johonkin muuhunkin metriikkaan
    
- Jonkun rajan voisi määrittää, kuinka monta kertaa sanan pitää esiintyä, jotta se otetaan mukaan