# Proto2

In [1]:
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import reciprocal_rank
from lightfm.data import Dataset

import numpy as np
import pandas as pd

import statistics

In [2]:
# Columns of the company table that are used as item features.
SELECTED_COMPANY_FEATURES = [
    'company_form_code',
    'location_municipality_code',
    'location_region_code',
    'company_status_code',
    'industry_code',
    'turnover',
    'net_profit',
    'personnel_average',
    'performer_ranking_points',
    'risk_rating_class',
]

In [3]:
# Load the company table from a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
COMPANIES_DF = pd.read_pickle(
    "data/pandas_pickles/prod_data_proto2.pkl"
)

In [4]:
# Distinct business ids found in the company table; used as the item ids.
ITEM_IDS = list(COMPANIES_DF.loc[:, 'business_id'].unique())

In [7]:
# For every selected feature column, collect the distinct values that occur in it.
features_tmp = []
for feature_name in SELECTED_COMPANY_FEATURES:
    features_tmp.append(COMPANIES_DF[feature_name].unique())

# Flatten the per-column arrays into one list holding every item feature label.
ITEM_FEATURE_LABELS = []
for column_values in features_tmp:
    ITEM_FEATURE_LABELS.extend(column_values)

# One (business_id, [feature values]) tuple per row of the company table,
# with the feature values in the order of SELECTED_COMPANY_FEATURES.
ITEM_FEATURES = []
for company_record in COMPANIES_DF.to_dict(orient='records'):
    company_features = [company_record[name] for name in SELECTED_COMPANY_FEATURES]
    ITEM_FEATURES.append((company_record['business_id'], company_features))

print(ITEM_FEATURES[0:10])

[('31431209', ['company_form_code+CO_26', 'location_municipality_code+091', 'location_region_code+01', 'company_status_code+AKT', 'industry_code+43', 'turnover+NaN', 'net_profit+NaN', 'personnel_average+NaN', 'performer_ranking_points+NaN', 'risk_rating_class+NaN']), ('32087307', ['company_form_code+CO_16', 'location_municipality_code+091', 'location_region_code+01', 'company_status_code+AKT', 'industry_code+68', 'turnover+NaN', 'net_profit+NaN', 'personnel_average+NaN', 'performer_ranking_points+NaN', 'risk_rating_class+NaN']), ('18601103', ['company_form_code+CO_26', 'location_municipality_code+NaN', 'location_region_code+NaN', 'company_status_code+AKT', 'industry_code+68', 'turnover+NaN', 'net_profit+NaN', 'personnel_average+NaN', 'performer_ranking_points+NaN', 'risk_rating_class+NaN']), ('20469041', ['company_form_code+CO_53', 'location_municipality_code+NaN', 'location_region_code+NaN', 'company_status_code+AKT', 'industry_code+74', 'turnover+NaN', 'net_profit+NaN', 'personnel_av

In [8]:
# Load the tab-separated interactions file; the id columns are read as strings
# instead of letting pandas infer their dtype.
interactions_tmp = pd.read_csv(
    'data/interactions_2021_08_19.csv',
    sep='\t',
    dtype={'group_id': 'string', 'business_id': 'string', 'owner': 'string'},
)

# Drop interactions whose business id is not found among the items.
known_item_mask = interactions_tmp['business_id'].isin(ITEM_IDS)
INTERACTIONS_DF = interactions_tmp[known_item_mask]

## Otetaan minimiryhmäkoko käyttöön

In [12]:
# Minimum number of interaction rows a group must have to be kept.
MIN_GROUP_SIZE = 50

# Number of interaction rows per group, as a two-column frame that can be merged.
group_sizes = INTERACTIONS_DF['group_id'].value_counts()
group_sizes_df = pd.DataFrame({'group_id': group_sizes.index, 'group_size': group_sizes.values})

# Attach the size of its group to every interaction row.
INTERACTIONS_WITH_GROUP_SIZES_DF = INTERACTIONS_DF.merge(group_sizes_df, on='group_id')

# Keep only the interactions of groups that reach the minimum size.
INTERACTIONS_50_DF = INTERACTIONS_WITH_GROUP_SIZES_DF[
    INTERACTIONS_WITH_GROUP_SIZES_DF.group_size >= MIN_GROUP_SIZE
]

# (group_id, business_id) pairs; zipping the two columns avoids materialising
# a full dict for every row.
INTERACTIONS_50 = list(zip(INTERACTIONS_50_DF['group_id'], INTERACTIONS_50_DF['business_id']))

# unique() keeps the ids in order of first appearance. The previous list(set(...))
# produced an arbitrary order that can change between kernel sessions, which made
# the user -> index mapping built from this list non-reproducible.
USER_IDS_50 = list(INTERACTIONS_50_DF['group_id'].unique())

print(INTERACTIONS_50[0:10])


[('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '01681709'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '15055514'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '01876143'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '01863991'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '05363070'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '01387534'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '01372818'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '18348689'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '01421229'), ('c2626398-faac-4ff3-b02d-cdc64b50cdaa', '01446661')]


## Luodaan LightFM-datasetti

In [17]:
# LightFM dataset with the per-user and per-item identity features switched off.
DATASET_50 = Dataset(
    user_identity_features=False,
    item_identity_features=False,
)

# There are no user features, at least not yet, so only item feature labels are registered.
DATASET_50.fit(
    users=USER_IDS_50,
    items=ITEM_IDS,
    item_features=ITEM_FEATURE_LABELS,
)

# Item feature matrix built from the (business_id, [features]) tuples, without normalisation.
ITEM_FEATURES_50_DS = DATASET_50.build_item_features(ITEM_FEATURES, normalize=False)

# Interaction and weight matrices built from the (group_id, business_id) pairs.
(INTERACTIONS_50_DS, WEIGHTS_50_DS) = DATASET_50.build_interactions(INTERACTIONS_50)

# Fetch the mapping tuple once and pick out entries 0, 2 and 3
# (user ids, item ids and item features, going by the names used here).
dataset_mappings = DATASET_50.mapping()
USER_MAP_50_DS = dataset_mappings[0]
ITEM_MAP_50_DS = dataset_mappings[2]
ITEM_FEATURE_MAP_50_DS = dataset_mappings[3]


## Arvioidaan mallien laatua

### Ajetaan evaluaatiot ja otetaan tulokset talteen 5:ltä ajokerralta

In [19]:
def run_evaluation(model, train, test, evaluation_function, name, item_features=None):
    """Score ``model`` on the train and test data with one metric and report the means.

    Args:
        model: Fitted model, passed as the first argument to ``evaluation_function``.
        train: Train interactions, passed as the second argument for the train score.
        test: Test interactions, passed as the second argument for the test score.
        evaluation_function: Callable invoked as
            ``evaluation_function(model, interactions, item_features=item_features)``;
            its result must provide ``.mean()``.
        name: Label used in the progress and result messages.
        item_features: Forwarded unchanged to ``evaluation_function``.

    Returns:
        Tuple ``(train_metric, test_metric)`` holding the mean score on each split.
    """
    mean_scores = []
    # Train is always evaluated before test, matching the order of the printed messages.
    for split_label, interactions in (("train", train), ("test", test)):
        print("Calculating %s for %s dataset..." % (name, split_label))
        scores = evaluation_function(model, interactions, item_features=item_features)
        mean_scores.append(scores.mean())

    train_metric, test_metric = mean_scores
    print('%s: train %.2f, test %.2f.' % (name, train_metric, test_metric))
    print("\n")
    return (train_metric, test_metric)

# Result accumulators, one list per model variant and metric. Every evaluation
# run appends the (train_metric, test_metric) tuple returned by run_evaluation.

# WARP loss, trained with item features.
WARP_AUC, WARP_PRECISION, WARP_RECALL, WARP_RECIPROCAL = [], [], [], []

# BPR loss, trained with item features.
BPR_AUC, BPR_PRECISION, BPR_RECALL, BPR_RECIPROCAL = [], [], [], []

# WARP loss, trained without item features.
WARP_NO_ITEM_AUC, WARP_NO_ITEM_PRECISION, WARP_NO_ITEM_RECALL, WARP_NO_ITEM_RECIPROCAL = [], [], [], []

# BPR loss, trained without item features.
BPR_NO_ITEM_AUC, BPR_NO_ITEM_PRECISION, BPR_NO_ITEM_RECALL, BPR_NO_ITEM_RECIPROCAL = [], [], [], []

# Base seed for the train/test split and the model initialisation, so that a
# re-run on a fresh kernel is repeatable.
RANDOM_SEED = 42

# NOTE(review): the section heading mentions five runs, but only a single
# iteration is executed here; extend the list to collect more runs.
for i in [1]:
    print("Starting iteration %d" % i)

    # Vary the seed per iteration so repeated runs do not all reuse the same split.
    iteration_seed = RANDOM_SEED + i

    (TRAIN, TEST) = random_train_test_split(
        INTERACTIONS_50_DS, test_percentage=0.2, random_state=iteration_seed)

    # Models trained on the interactions together with the item features.
    MODEL_WARP = LightFM(loss='warp', random_state=iteration_seed)
    MODEL_WARP.fit(TRAIN, item_features=ITEM_FEATURES_50_DS, epochs=10, verbose=True)

    MODEL_BPR = LightFM(loss='bpr', random_state=iteration_seed)
    MODEL_BPR.fit(TRAIN, item_features=ITEM_FEATURES_50_DS, epochs=10, verbose=True)

    # Baseline models trained on the interactions only.
    MODEL_WARP_NO_ITEM = LightFM(loss='warp', random_state=iteration_seed)
    MODEL_WARP_NO_ITEM.fit(TRAIN, epochs=10, verbose=True)

    MODEL_BPR_NO_ITEM = LightFM(loss='bpr', random_state=iteration_seed)
    MODEL_BPR_NO_ITEM.fit(TRAIN, epochs=10, verbose=True)

    # Evaluate the models trained with item features. They must be scored with the
    # same feature matrix they were trained on (ITEM_FEATURES_50_DS); the previous
    # code passed ITEM_FEATURES_DS, which is not defined anywhere in this notebook.
    # Each row: (metric function, label prefix, WARP result list, BPR result list).
    metrics_with_item_features = [
        (auc_score, "AUC", WARP_AUC, BPR_AUC),
        (precision_at_k, "PRECISION", WARP_PRECISION, BPR_PRECISION),
        (recall_at_k, "RECALL", WARP_RECALL, BPR_RECALL),
        (reciprocal_rank, "RECIPROCAL", WARP_RECIPROCAL, BPR_RECIPROCAL),
    ]
    for metric_function, label, warp_results, bpr_results in metrics_with_item_features:
        warp_results.append(run_evaluation(
            MODEL_WARP, TRAIN, TEST, metric_function, label + "_WARP", ITEM_FEATURES_50_DS))
        bpr_results.append(run_evaluation(
            MODEL_BPR, TRAIN, TEST, metric_function, label + "_BPR", ITEM_FEATURES_50_DS))

    # Evaluate the baseline models; no item features are passed, matching their training.
    metrics_without_item_features = [
        (auc_score, "AUC", WARP_NO_ITEM_AUC, BPR_NO_ITEM_AUC),
        (precision_at_k, "PRECISION", WARP_NO_ITEM_PRECISION, BPR_NO_ITEM_PRECISION),
        (recall_at_k, "RECALL", WARP_NO_ITEM_RECALL, BPR_NO_ITEM_RECALL),
        (reciprocal_rank, "RECIPROCAL", WARP_NO_ITEM_RECIPROCAL, BPR_NO_ITEM_RECIPROCAL),
    ]
    for metric_function, label, warp_results, bpr_results in metrics_without_item_features:
        warp_results.append(run_evaluation(
            MODEL_WARP_NO_ITEM, TRAIN, TEST, metric_function, label + "_WARP_NO_ITEM"))
        bpr_results.append(run_evaluation(
            MODEL_BPR_NO_ITEM, TRAIN, TEST, metric_function, label + "_BPR_NO_ITEM"))

Starting iteration 1


Epoch: 100%|██████████| 10/10 [00:17<00:00,  1.73s/it]
Epoch: 100%|██████████| 10/10 [00:11<00:00,  1.17s/it]
Epoch: 100%|██████████| 10/10 [00:13<00:00,  1.40s/it]
Epoch: 100%|██████████| 10/10 [00:07<00:00,  1.38it/s]


Calculating AUC_WARP for train dataset...
Calculating AUC_WARP for test dataset...
AUC_WARP: train 0.99, test 0.99.


Calculating AUC_BPR for train dataset...
Calculating AUC_BPR for test dataset...
AUC_BPR: train 0.97, test 0.97.


Calculating PRECISION_WARP for train dataset...
Calculating PRECISION_WARP for test dataset...
PRECISION_WARP: train 0.24, test 0.06.


Calculating PRECISION_BPR for train dataset...
Calculating PRECISION_BPR for test dataset...
PRECISION_BPR: train 0.26, test 0.07.


Calculating RECALL_WARP for train dataset...
Calculating RECALL_WARP for test dataset...
RECALL_WARP: train 0.01, test 0.01.


Calculating RECALL_BPR for train dataset...
Calculating RECALL_BPR for test dataset...
RECALL_BPR: train 0.01, test 0.01.


Calculating RECIPROCAL_WARP for train dataset...
Calculating RECIPROCAL_WARP for test dataset...
RECIPROCAL_WARP: train 0.39, test 0.16.


Calculating RECIPROCAL_BPR for train dataset...
Calculating RECIPROCAL_BPR for test dataset...
RECIPROCAL_BP