# iteration3

Data:
- interaktiot kaikkien muiden ryhmistä paitsi Alman kehittäjien
- interaktiot ryhmäkohtaisia, ei käyttäjäkohtaisia
- kaikki yritykset - konsernit mukana lisäämällä "K-" y-tunnuksen eteen
- metadatana perustietoa yrityksistä - numeeriset tilikausitiedot diskretisoitu hieman persentiilejä mukaileviin custom-luokkiin
- **data esikäsitelty iteration3_feature_selection**-notebookissa
- Warp-malli käytössä
- Minimiryhmäkoko = 2

Kysymyksiä:

1. Miten uudet muokatut tilinpäätöstiedon luokat + location_municipalityn pudotus toimii?
2. Miten gini-indeksillä painotettu item_feature-matriisi toimii eri painotuksilla?
3. Mitä jos NaN-featureja ei anneta?

## Importit

In [59]:
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import reciprocal_rank
from lightfm.data import Dataset

import numpy as np
import pandas as pd

import statistics
import functools

from sklearn.model_selection import train_test_split

# Base directory that the data paths loaded below are resolved against.
# NOTE(review): hard-coded absolute path to one machine's checkout - adjust it
# (or make it configurable) before running the notebook anywhere else.
WORKING_DIRECTORY = '/mnt/d/git/masters-thesis-code/jupyter/code/'

## Valitut metadatat yrityksille

In [60]:
# Company metadata columns that are used as item features (order matters:
# feature labels are collected and emitted in this order).
SELECTED_COMPANY_FEATURES = [
    'company_form_code',
    'location_region_code',
    'company_status_code',
    'industry_code',
    'turnover',
    'net_profit',
    'personnel_average',
    'performer_ranking_points',
    'risk_rating_class',
]

## Ladataan yritysdata

In [72]:
# Load the preprocessed company metadata table.
# NOTE(review): read_pickle can execute arbitrary code while loading - only
# open pickles that were produced by a trusted notebook.
COMPANIES_DF = pd.read_pickle(WORKING_DIRECTORY + "data/pandas_pickles/company_data_iteration3.pkl")

# Every business id present in the company data; used below to drop
# interactions that point at unknown companies.
ITEM_IDS_COMPANIES = list(COMPANIES_DF['business_id'].unique())

In [None]:
# Preview of the loaded company table (display only).
COMPANIES_DF

## Ladataan vuorovaikutusdata

In [75]:
# Read the tab-separated interaction export. The id columns are read as strings
# (business ids such as 07577937 carry leading zeros).
interactions_tmp = pd \
    .read_csv(WORKING_DIRECTORY + 'data/interactions_2021_08_19.csv',
             delimiter='\t',
             dtype={
                 'group_id': 'string',
                 'business_id': 'string',
                 'owner': 'string'
             })

# Remove groups of size 1. Group size is the number of interaction rows per
# group_id, counted here before interactions with unknown companies are dropped.
group_sizes = interactions_tmp['group_id'].value_counts()
group_sizes_df = pd.DataFrame({'group_id': group_sizes.index, 'group_size': group_sizes.values})

INTERACTIONS_WITH_GROUP_SIZES_DF = interactions_tmp.merge(group_sizes_df, on='group_id')             
interactions_tmp = INTERACTIONS_WITH_GROUP_SIZES_DF[INTERACTIONS_WITH_GROUP_SIZES_DF.group_size >= 2]
# Disabled alternative filter: keep only groups with at most 3000 rows.
#interactions_tmp = INTERACTIONS_WITH_GROUP_SIZES_DF[INTERACTIONS_WITH_GROUP_SIZES_DF.group_size <= 3000]
# Display only: the sorted frame is shown but not assigned.
interactions_tmp.sort_values('group_size')



Unnamed: 0,group_id,business_id,owner,group_size
155696,3e9dd356-2b21-45ae-9ee4-7cd6cc122fe1,07577937,5e87095492119e00066e7158,2
106198,31503959-943a-4081-abcc-dc80e5cb0402,15093748,5db034c64320cd0006d2b788,2
313746,cab22fae-db47-46b6-b902-3d9a1b1051f6,01163004,5e4534bc7bf061000697e940,2
313747,cab22fae-db47-46b6-b902-3d9a1b1051f6,10410900,5e4534bc7bf061000697e940,2
545392,0967d6ed-88b7-4023-a720-f09f7051f24d,17944788,5efdbc656488210007bc27f6,2
...,...,...,...,...
8042,a5c6ce2e-22ab-4871-bd72-e5da294b33cc,16029641,5e1489f3c2f568000654ecbb,3999
8043,a5c6ce2e-22ab-4871-bd72-e5da294b33cc,16030167,5e1489f3c2f568000654ecbb,3999
8044,a5c6ce2e-22ab-4871-bd72-e5da294b33cc,16030415,5e1489f3c2f568000654ecbb,3999
8031,a5c6ce2e-22ab-4871-bd72-e5da294b33cc,16001948,5e1489f3c2f568000654ecbb,3999


In [76]:
# Add interactions for the corporate-group entries: every interaction is
# duplicated with a "K-" prefixed business id, and only those prefixed ids that
# actually exist in the company data are kept.
concern_candidates = interactions_tmp.assign(
    business_id='K-' + interactions_tmp['business_id'].astype(str)
)
is_known_concern = concern_candidates['business_id'].isin(ITEM_IDS_COMPANIES)
concern_interactions = concern_candidates[is_known_concern]
concern_interactions

Unnamed: 0,group_id,business_id,owner,group_size
5,c2626398-faac-4ff3-b02d-cdc64b50cdaa,K-01681709,60646431ae18cb00063ed63f,1862
6,c2626398-faac-4ff3-b02d-cdc64b50cdaa,K-15055514,60646431ae18cb00063ed63f,1862
7,c2626398-faac-4ff3-b02d-cdc64b50cdaa,K-01876143,60646431ae18cb00063ed63f,1862
9,c2626398-faac-4ff3-b02d-cdc64b50cdaa,K-05363070,60646431ae18cb00063ed63f,1862
10,c2626398-faac-4ff3-b02d-cdc64b50cdaa,K-01387534,60646431ae18cb00063ed63f,1862
...,...,...,...,...
548074,8b0915ff-a0cb-4520-9160-8d783a6bf308,K-02106319,6110c56241e21e000857ca77,131
548110,8b0915ff-a0cb-4520-9160-8d783a6bf308,K-20333371,6110c56241e21e000857ca77,131
548137,8b0915ff-a0cb-4520-9160-8d783a6bf308,K-07027249,6110c56241e21e000857ca77,131
548162,8b0915ff-a0cb-4520-9160-8d783a6bf308,K-02011774,6110c56241e21e000857ca77,131


In [77]:
# Combine the corporate-group ("K-") interactions with the regular ones and
# drop every interaction whose business id is not found in the company data.
INTERACTIONS_DF = pd.concat([interactions_tmp, concern_interactions])
INTERACTIONS_DF = INTERACTIONS_DF[INTERACTIONS_DF.business_id.isin(ITEM_IDS_COMPANIES)]

# Group ids in order of first appearance. A set would yield a different order
# on every kernel start (string hashing is randomised), which would change the
# group -> user index mapping built from this list between runs. ITEM_IDS is
# built with .unique() in the same way.
USER_IDS = list(INTERACTIONS_DF['group_id'].unique())
INTERACTIONS_DF

Unnamed: 0,group_id,business_id,owner,group_size
0,3a63222b-86b2-4293-bd2e-171011190ae6,31291154,603e1524d377150007c2dbea,5
1,3a63222b-86b2-4293-bd2e-171011190ae6,08544918,603e1524d377150007c2dbea,5
2,3a63222b-86b2-4293-bd2e-171011190ae6,10134299,603e1524d377150007c2dbea,5
3,3a63222b-86b2-4293-bd2e-171011190ae6,26404708,603e1524d377150007c2dbea,5
4,3a63222b-86b2-4293-bd2e-171011190ae6,29747968,603e1524d377150007c2dbea,5
...,...,...,...,...
548074,8b0915ff-a0cb-4520-9160-8d783a6bf308,K-02106319,6110c56241e21e000857ca77,131
548110,8b0915ff-a0cb-4520-9160-8d783a6bf308,K-20333371,6110c56241e21e000857ca77,131
548137,8b0915ff-a0cb-4520-9160-8d783a6bf308,K-07027249,6110c56241e21e000857ca77,131
548162,8b0915ff-a0cb-4520-9160-8d783a6bf308,K-02011774,6110c56241e21e000857ca77,131


In [78]:
# Drop the companies that have no interactions at all.
ITEM_IDS = list(INTERACTIONS_DF['business_id'].unique())

has_interactions = COMPANIES_DF['business_id'].isin(ITEM_IDS)
COMPANIES_DF = COMPANIES_DF[has_interactions]

# Distinct values of every selected metadata column; flattened, column by
# column, they form the list of item-feature labels.
item_features_tmp = []
ITEM_FEATURE_LABELS = []
for feature_column in SELECTED_COMPANY_FEATURES:
    distinct_values = COMPANIES_DF[feature_column].unique()
    item_features_tmp.append(distinct_values)
    ITEM_FEATURE_LABELS.extend(distinct_values)

COMPANIES_DF

Unnamed: 0,business_id,company_name,company_form_code,location_region_code,company_status_code,industry_code,turnover,net_profit,personnel_average,performer_ranking_points,risk_rating_class
13,09708355,Kyröntarhat Oy,company_form_code+CO_16,location_region_code+02,company_status_code+AKT,industry_code+01,turnover+4,net_profit+0,personnel_average+1,performer_ranking_points+1,risk_rating_class+GREEN
20,17849078,Kuopion Keskus-Burger Oy,company_form_code+CO_16,location_region_code+11,company_status_code+AKT,industry_code+56,turnover+3,net_profit+2,personnel_average+NaN,performer_ranking_points+1,risk_rating_class+YELLOW
28,22887116,Elämysapteekki Oy,company_form_code+CO_16,location_region_code+02,company_status_code+AKT,industry_code+47,turnover+4,net_profit+5,personnel_average+1,performer_ranking_points+3,risk_rating_class+GREEN
30,24876568,Plastvo Oy,company_form_code+CO_16,location_region_code+07,company_status_code+AKT,industry_code+22,turnover+4,net_profit+5,personnel_average+1,performer_ranking_points+3,risk_rating_class+GREEN
31,24235437,Niko Lindholm Oy,company_form_code+CO_16,location_region_code+NaN,company_status_code+AKT,industry_code+41,turnover+2,net_profit+1,personnel_average+NaN,performer_ranking_points+1,risk_rating_class+NaN
...,...,...,...,...,...,...,...,...,...,...,...
1337857,21145030,Tmi Markku Vuorentausta,company_form_code+CO_26,location_region_code+12,company_status_code+AKT,industry_code+47,turnover+NaN,net_profit+NaN,personnel_average+NaN,performer_ranking_points+NaN,risk_rating_class+NaN
1337858,23088228,HTK Liikenne Oy,company_form_code+CO_16,location_region_code+NaN,company_status_code+AKT,industry_code+49,turnover+2,net_profit+2,personnel_average+0,performer_ranking_points+1,risk_rating_class+GREEN
1337859,24318670,Rabbit Hole Oy,company_form_code+CO_16,location_region_code+01,company_status_code+AKT,industry_code+85,turnover+NaN,net_profit+0,personnel_average+NaN,performer_ranking_points+NaN,risk_rating_class+NaN
1337861,K-26538773,Etelä-Savon Työterveys Oy,company_form_code+CO_16,location_region_code+10,company_status_code+AKT,industry_code+86,turnover+top,net_profit+5,personnel_average+2,performer_ranking_points+1,risk_rating_class+GREEN


In [79]:
def print_interactions_meta_data(interactions_df):
    """Print the number of groups, interaction rows and companies in a frame.

    Groups and companies are counted as the distinct values of the
    ``group_id`` and ``business_id`` columns; interactions are counted as rows.

    Args:
        interactions_df: DataFrame with ``group_id`` and ``business_id`` columns.
    """
    group_count = len(interactions_df['group_id'].unique())
    interaction_count = interactions_df.shape[0]
    company_count = len(interactions_df['business_id'].unique())

    summary_template = 'ryhmiä: {groups}, interaktioita {interactions}, yrityksiä {companies}'
    print(summary_template.format(groups=group_count,
                                  interactions=interaction_count,
                                  companies=company_count))

# Summary of the final interaction set (only groups with group_size >= 2 remain).
print('----- group_size>=2 -----')
print_interactions_meta_data(INTERACTIONS_DF)

----- group_size>=2 -----
ryhmiä: 1312, interaktioita 598703, yrityksiä 143839


## Luodaan cross-validationia varten ositetut datasetit

## Luodaan LightFM:n ymmärtämät Dataset-oliot

In [80]:
def create_item_features_ds():
    """Build the unweighted (item id, feature labels) pairs for every company.

    Reads the module-level ``COMPANIES_DF`` and, for each of its rows, pairs
    the row's ``business_id`` with the list of its values in the
    ``SELECTED_COMPANY_FEATURES`` columns (in that order). Only labels are
    supplied, no explicit weights.

    Returns:
        List of ``(business_id, [feature_label, ...])`` tuples, one per row.
    """
    item_features = []
    for company in COMPANIES_DF.to_dict(orient='records'):
        labels = [company[feature] for feature in SELECTED_COMPANY_FEATURES]
        item_features.append((company['business_id'], labels))
    return item_features


In [81]:
def calculate_gini_for_word(word, train_interactions_df, alpha):
    """Compute a Gini-impurity based weight for one item-feature label.

    The rows of the module-level ``COMPANIES_DF`` that carry ``word`` are split
    into companies that appear in ``train_interactions_df`` and companies that
    do not; the Gini impurity of that two-way split is subtracted from
    ``alpha``.

    Args:
        word: Feature label of the form ``'<column>+<value>'``; the part before
            the first ``'+'`` names the ``COMPANIES_DF`` column to search.
        train_interactions_df: Training interactions with a ``business_id``
            column.
        alpha: Constant from which the Gini impurity is subtracted.

    Returns:
        Tuple ``(word, alpha - gini, interacted_count, matched_count)``.

    Raises:
        ZeroDivisionError: If no row of ``COMPANIES_DF`` carries ``word``.
    """
    feature_column = word.split('+')[0]
    companies_with_word = COMPANIES_DF[COMPANIES_DF[feature_column] == word]
    matched_docs_total = len(companies_with_word)

    candidate_ids = list(companies_with_word['business_id'].unique())
    is_candidate = train_interactions_df['business_id'].isin(candidate_ids)
    interacted_ids = train_interactions_df.loc[is_candidate, 'business_id'].unique()

    interacted_docs_count = len(interacted_ids)
    non_interacted_docs_count = matched_docs_total - interacted_docs_count

    # Gini impurity of the interacted / not-interacted split.
    interacted_share = interacted_docs_count / matched_docs_total
    non_interacted_share = non_interacted_docs_count / matched_docs_total
    gini_index = 1 - (interacted_share ** 2 + non_interacted_share ** 2)

    return (word, alpha - gini_index, interacted_docs_count, matched_docs_total)

def create_gini_weighted_item_features(train_interactions_df, alpha):
    """Build (item id, {feature label: weight}) pairs for every company.

    A weight is computed once per label in the module-level
    ``ITEM_FEATURE_LABELS`` with ``calculate_gini_for_word`` (second element of
    its result). Each row of ``COMPANIES_DF`` is then mapped to a dict holding
    the weights of the labels found in its ``SELECTED_COMPANY_FEATURES``
    columns.

    Args:
        train_interactions_df: Training interactions forwarded to
            ``calculate_gini_for_word``.
        alpha: Constant forwarded to ``calculate_gini_for_word``.

    Returns:
        List of ``(business_id, {feature_label: weight})`` tuples, one per row.
    """
    weight_by_label = {
        label: calculate_gini_for_word(label, train_interactions_df, alpha)[1]
        for label in ITEM_FEATURE_LABELS
    }

    weighted_item_features = []
    for company in COMPANIES_DF.to_dict(orient='records'):
        company_weights = {}
        for feature in SELECTED_COMPANY_FEATURES:
            label = company[feature]
            company_weights[label] = weight_by_label[label]
        weighted_item_features.append((company['business_id'], company_weights))
    return weighted_item_features


In [82]:
def _interaction_pairs(interactions_df):
    """Return the (group_id, business_id) pairs of ``interactions_df`` as a list."""
    return list(zip(interactions_df['group_id'], interactions_df['business_id']))


def create_dataset(train_interactions_df, test_interactions_df, alpha=None):
    """Build the interaction and item-feature matrices for one train/test split.

    A ``Dataset`` is fitted on the module-level ``USER_IDS``, ``ITEM_IDS`` and
    ``ITEM_FEATURE_LABELS`` and then used to build both interaction matrices
    and the item-feature matrix, so all three come from one shared id mapping.

    Args:
        train_interactions_df: Training interactions with ``group_id`` and
            ``business_id`` columns.
        test_interactions_df: Test interactions with the same columns.
        alpha: ``None`` builds unweighted item features via
            ``create_item_features_ds``. Any other value (including ``0``) is
            passed to ``create_gini_weighted_item_features`` together with the
            training interactions to build weighted features.

    Returns:
        Tuple ``(train_interactions_ds, test_interactions_ds, item_features_ds)``.
    """
    dataset = Dataset(user_identity_features=False)
    dataset.fit(users=USER_IDS, items=ITEM_IDS, item_features=ITEM_FEATURE_LABELS)

    (train_interactions_ds, _) = dataset.build_interactions(_interaction_pairs(train_interactions_df))
    (test_interactions_ds, _) = dataset.build_interactions(_interaction_pairs(test_interactions_df))

    # Identity check: only a literal None selects the unweighted features.
    if alpha is None:
        raw_item_features = create_item_features_ds()
    else:
        raw_item_features = create_gini_weighted_item_features(train_interactions_df, alpha)

    item_features_ds = dataset.build_item_features(raw_item_features, normalize=False)
    return (train_interactions_ds, test_interactions_ds, item_features_ds)

## Arvioidaan mallien laatua

In [83]:
# Number of threads handed to model fitting and to every evaluation call.
NUM_THREADS = 10

def run_evaluation_function(model, test_ds, train_ds, evaluation_function, name, item_features=None):
    """Evaluate one metric on the train and test interactions and report both means.

    The per-user results of both evaluations are written to
    ``iteration3-train-results-<name>.txt`` and
    ``iteration3-test-results-<name>.txt`` in the current working directory.

    Args:
        model: Fitted model passed through to ``evaluation_function``.
        test_ds: Test interactions.
        train_ds: Training interactions.
        evaluation_function: Callable invoked as
            ``f(model, interactions[, train_interactions], item_features=...,
            num_threads=...)`` that returns an array with a ``mean()`` method.
        name: Label used in the log lines and in the result file names.
        item_features: Item-feature matrix forwarded to the evaluation.

    Returns:
        Tuple ``(train_metric, test_metric)`` of the two mean values.
    """
    shared_kwargs = {'item_features': item_features, 'num_threads': NUM_THREADS}

    print('Calculating {name} for train dataset...'.format(name=name))
    train_results = evaluation_function(model, train_ds, **shared_kwargs)
    np.savetxt('iteration3-train-results-{}.txt'.format(name), train_results)
    train_metric = train_results.mean()

    print('Calculating {name} for test dataset...'.format(name=name))
    # The training interactions go in as the third positional argument
    # (presumably so known training positives are excluded - confirm against
    # the evaluation function's documentation).
    test_results = evaluation_function(model, test_ds, train_ds, **shared_kwargs)
    np.savetxt('iteration3-test-results-{}.txt'.format(name), test_results)
    test_metric = test_results.mean()

    summary_template = '{name}: train {train_metric:.4f}, test {test_metric:.4f}'
    print(summary_template.format(name=name, train_metric=train_metric, test_metric=test_metric))
    print('\n')
    return (train_metric, test_metric)

def run_evaluations_for_ds(model, train_ds, test_ds, model_name, item_features=None):
    """Run the AUC and precision evaluations for one fitted model.

    Each metric is evaluated through ``run_evaluation_function`` under the
    name ``'<METRIC>_<model_name>'``. Recall and reciprocal rank are currently
    not evaluated.

    Args:
        model: Fitted model to evaluate.
        train_ds: Training interactions.
        test_ds: Test interactions.
        model_name: Suffix for the metric names.
        item_features: Item-feature matrix forwarded to the evaluations.

    Returns:
        Tuple ``(auc, precision)`` where each element is the
        ``(train_metric, test_metric)`` pair of that metric.
    """
    metric_results = []
    for prefix, metric_function in (('AUC_', auc_score), ('PRECISION_', precision_at_k)):
        # run_evaluation_function takes the test set before the train set.
        metric_results.append(
            run_evaluation_function(model, test_ds, train_ds, metric_function,
                                    prefix + model_name, item_features))
    return tuple(metric_results)


In [84]:
def run_evaluations(interactions_df_cv, epochs=5, random_state=None):
    """Cross-validate the unweighted WARP model over pre-partitioned folds.

    For every fold ``i`` the fold is held out as the test set, all other folds
    are concatenated into the training set, a fresh ``LightFM(loss='warp')``
    model is fitted and then evaluated with ``run_evaluations_for_ds``.

    Args:
        interactions_df_cv: Sequence of interaction DataFrames, one per fold.
        epochs: Number of training epochs per fold (default 5, the value that
            was previously hard-coded).
        random_state: Seed passed to the ``LightFM`` constructor. ``None``
            (default) keeps the previous unseeded behaviour.
            NOTE(review): with more than one training thread the fit may still
            not be bit-for-bit reproducible - confirm against LightFM's docs.

    Returns:
        Dict mapping the model name ``'NO_WEIGHTING'`` to a list with one
        ``run_evaluations_for_ds`` result per fold.
    """
    folds = list(interactions_df_cv)
    results = {
        'NO_WEIGHTING': []
    }

    for i, test_interactions_df in enumerate(folds):
        print('Starting iteration {}...'.format(i))

        # Every fold except the held-out one forms the training set.
        train_interactions_df = pd.concat([fold for j, fold in enumerate(folds) if j != i])

        print('test_interactions', test_interactions_df.shape)
        print('train_interactions', train_interactions_df.shape)

        ##### NO_WEIGHTING #####
        name = 'NO_WEIGHTING'
        (train_interactions_ds, test_interactions_ds, item_features_ds) = create_dataset(train_interactions_df, test_interactions_df)

        # A fresh model per fold so nothing learned on one split leaks into the next.
        model = LightFM(loss='warp', random_state=random_state)
        model.fit(train_interactions_ds, item_features=item_features_ds, epochs=epochs, num_threads=NUM_THREADS, verbose=True)

        results[name].append(run_evaluations_for_ds(model, train_interactions_ds, test_interactions_ds, '{}_{}'.format(name, i), item_features_ds))

    return results

In [85]:
def print_metric_result(result_arr, model_name):
    """Print the cross-validation mean and per-fold values of one metric.

    Args:
        result_arr: List with one ``(train_value, test_value)`` pair per fold.
        model_name: Heading printed above the values.
    """
    train_results = []
    test_results = []
    for fold_result in result_arr:
        train_results.append(fold_result[0])
        test_results.append(fold_result[1])

    def as_four_decimals(values):
        return ['%.4f' % value for value in values]

    report_template = '{name}:\n train mean {train_mean:.4f} ({train_arr})\n test mean {test_mean:.4f} ({test_arr})\n'
    print(report_template.format(train_mean=statistics.mean(train_results),
                                 test_mean=statistics.mean(test_results),
                                 train_arr=as_four_decimals(train_results),
                                 test_arr=as_four_decimals(test_results),
                                 name=model_name))
    

def print_all_results(results):
    """Print every metric's per-model cross-validation summary.

    Args:
        results: Dict mapping a model name to a list of per-fold results, where
            each per-fold result holds the AUC pair at index 0 and the
            precision pair at index 1.
    """
    metric_names = ('AUC', 'PRECISION')
    for metric_index in range(len(metric_names)):
        print('\n-----{}-----'.format(metric_names[metric_index]))
        for model_name in results:
            per_fold_values = [fold_result[metric_index] for fold_result in results[model_name]]
            print_metric_result(per_fold_values, model_name)
    

### Tulokset

In [86]:
def create_partitioned_datasets(interactions_df, n_splits=5, random_state=None):
    """Shuffle the interaction rows and split them into disjoint folds.

    The rows are permuted once and the permutation is cut into ``n_splits``
    consecutive chunks whose sizes differ by at most one row, so every row
    lands in exactly one fold.

    Args:
        interactions_df: DataFrame whose rows are to be partitioned.
        n_splits: Number of folds to create (default 5).
        random_state: Seed for the shuffle (an int, or ``None`` for a
            different random partition on every call). Pass a fixed value to
            make the cross-validation folds reproducible.

    Returns:
        List of ``n_splits`` DataFrames that together contain every row of
        ``interactions_df`` exactly once.

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError('n_splits must be at least 1, got {}'.format(n_splits))

    rng = np.random.default_rng(random_state)
    shuffled_positions = rng.permutation(len(interactions_df))

    # array_split tolerates lengths that are not divisible by n_splits: the
    # first (len % n_splits) folds simply receive one extra row.
    return [interactions_df.iloc[fold_positions]
            for fold_positions in np.array_split(shuffled_positions, n_splits)]
    

In [87]:
# Partition all interactions into the cross-validation folds and show each fold's shape.
INTERACTIONS_CV = create_partitioned_datasets(INTERACTIONS_DF)
for cv in INTERACTIONS_CV:
    print(cv.shape)

(119741, 4)
(119741, 4)
(119741, 4)
(119740, 4)
(119740, 4)


In [88]:
# Run the full cross-validation; RESULTS maps a model name to its per-fold metric tuples.
RESULTS = run_evaluations(INTERACTIONS_CV)

Starting iteration 0...
test_interactions (119741, 4)
train_interactions (478962, 4)


Epoch: 100%|██████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.39it/s]


Calculating AUC_NO_WEIGHTING_0 for train dataset...
Calculating AUC_NO_WEIGHTING_0 for test dataset...
AUC_NO_WEIGHTING_0: train 0.9735, test 0.9435


Calculating PRECISION_NO_WEIGHTING_0 for train dataset...
Calculating PRECISION_NO_WEIGHTING_0 for test dataset...
PRECISION_NO_WEIGHTING_0: train 0.2421, test 0.1335


Starting iteration 1...
test_interactions (119741, 4)
train_interactions (478962, 4)


Epoch: 100%|██████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.47it/s]


Calculating AUC_NO_WEIGHTING_1 for train dataset...
Calculating AUC_NO_WEIGHTING_1 for test dataset...
AUC_NO_WEIGHTING_1: train 0.9750, test 0.9463


Calculating PRECISION_NO_WEIGHTING_1 for train dataset...
Calculating PRECISION_NO_WEIGHTING_1 for test dataset...
PRECISION_NO_WEIGHTING_1: train 0.2379, test 0.1379


Starting iteration 2...
test_interactions (119741, 4)
train_interactions (478962, 4)


Epoch: 100%|██████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.39it/s]


Calculating AUC_NO_WEIGHTING_2 for train dataset...
Calculating AUC_NO_WEIGHTING_2 for test dataset...
AUC_NO_WEIGHTING_2: train 0.9746, test 0.9447


Calculating PRECISION_NO_WEIGHTING_2 for train dataset...
Calculating PRECISION_NO_WEIGHTING_2 for test dataset...
PRECISION_NO_WEIGHTING_2: train 0.2387, test 0.1306


Starting iteration 3...
test_interactions (119740, 4)
train_interactions (478963, 4)


Epoch: 100%|██████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.40it/s]


Calculating AUC_NO_WEIGHTING_3 for train dataset...
Calculating AUC_NO_WEIGHTING_3 for test dataset...
AUC_NO_WEIGHTING_3: train 0.9731, test 0.9468


Calculating PRECISION_NO_WEIGHTING_3 for train dataset...
Calculating PRECISION_NO_WEIGHTING_3 for test dataset...
PRECISION_NO_WEIGHTING_3: train 0.2313, test 0.1215


Starting iteration 4...
test_interactions (119740, 4)
train_interactions (478963, 4)


Epoch: 100%|██████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.40it/s]


Calculating AUC_NO_WEIGHTING_4 for train dataset...
Calculating AUC_NO_WEIGHTING_4 for test dataset...
AUC_NO_WEIGHTING_4: train 0.9745, test 0.9485


Calculating PRECISION_NO_WEIGHTING_4 for train dataset...
Calculating PRECISION_NO_WEIGHTING_4 for test dataset...
PRECISION_NO_WEIGHTING_4: train 0.2283, test 0.1366




In [89]:
# Print the per-fold values and the means of every evaluated metric.
print_all_results(RESULTS)


-----AUC-----
NO_WEIGHTING:
 train mean 0.9741 (['0.9735', '0.9750', '0.9746', '0.9731', '0.9745'])
 test mean 0.9460 (['0.9435', '0.9463', '0.9447', '0.9468', '0.9485'])


-----PRECISION-----
NO_WEIGHTING:
 train mean 0.2357 (['0.2421', '0.2379', '0.2387', '0.2313', '0.2283'])
 test mean 0.1320 (['0.1335', '0.1379', '0.1306', '0.1215', '0.1366'])



In [91]:
# Retrain one model on the first cross-validation split (fold 0 held out as the
# test set) so that it can be inspected in the cells below.
test_interactions_df = INTERACTIONS_CV[0]
train_df_tmp = INTERACTIONS_CV[1:]
train_interactions_df = pd.concat(train_df_tmp)

train_interactions_ds, test_interactions_ds, item_features_ds = create_dataset(
    train_interactions_df, test_interactions_df)

MODEL = LightFM(loss='warp')
MODEL.fit(train_interactions_ds,
          item_features=item_features_ds,
          epochs=5,
          num_threads=NUM_THREADS,
          verbose=True)

Epoch: 100%|██████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.37it/s]


<lightfm.lightfm.LightFM at 0x7ffa06430160>

In [92]:
# Evaluate the single fold-0 model; the displayed value is
# ((train AUC, test AUC), (train precision, test precision)).
run_evaluations_for_ds(MODEL, train_interactions_ds, test_interactions_ds, '{}_{}'.format('MODEL', 0), item_features_ds)

Calculating AUC_MODEL_0 for train dataset...
Calculating AUC_MODEL_0 for test dataset...
AUC_MODEL_0: train 0.9743, test 0.9449


Calculating PRECISION_MODEL_0 for train dataset...
Calculating PRECISION_MODEL_0 for test dataset...
PRECISION_MODEL_0: train 0.2493, test 0.1387




((0.97430086, 0.9449179), (0.24931298, 0.13866995))

In [119]:
# For a few example user indices, print the ten highest-scoring items as
# (item index, predicted score) pairs.
all_item_indices = list(range(0, train_interactions_ds.shape[1]))
for user in [0,200,300,755]:
    scores = MODEL.predict(user, all_item_indices, item_features_ds)
    top10 = np.argsort(-scores)[:10]
    results = [(item_index, scores[item_index]) for item_index in top10]

    print(results)
    print('-----\n')

[(141526, 8.767214), (4183, 8.471357), (3617, 8.469233), (141421, 8.25195), (141849, 8.114233), (2845, 8.08995), (140745, 8.048396), (4102, 8.046736), (140753, 7.972617), (141319, 7.9710445)]
-----

[(141294, 7.7536077), (3831, 7.623009), (141107, 7.4643497), (141238, 7.41244), (141147, 7.373005), (3386, 7.313701), (3643, 7.2656317), (3399, 7.2534795), (141157, 7.2487884), (141202, 7.241895)]
-----

[(141544, 7.2408957), (140800, 7.236505), (140730, 7.088708), (141541, 7.081325), (141252, 7.0352135), (3447, 6.948887), (3704, 6.7679443), (3267, 6.6863594), (141319, 6.626699), (141073, 6.6070137)]
-----

[(141326, 3.651312), (141491, 3.6285238), (140774, 3.5445611), (141189, 3.5404978), (3869, 3.4524057), (3508, 3.450136), (3817, 3.3963726), (2857, 3.3894675), (141774, 3.343518), (2039, 3.302548)]
-----

